From 6a3fdc6e5552e120335c06e660a9fb067f337212 Mon Sep 17 00:00:00 2001
From: German Ivanov
Date: Wed, 15 Jun 2022 21:33:02 +0300
Subject: [PATCH 1/9] Improve duplicate searcher

---
 src/duplicate_searcher/main.py | 53 +++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 26 deletions(-)
 mode change 100644 => 100755 src/duplicate_searcher/main.py

diff --git a/src/duplicate_searcher/main.py b/src/duplicate_searcher/main.py
old mode 100644
new mode 100755
index 0e5e475..8a3e966
--- a/src/duplicate_searcher/main.py
+++ b/src/duplicate_searcher/main.py
@@ -1,20 +1,19 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
+"""
+This script finds duplicate files by comparing their hashes.
+For more info, see README.md.
+License: MIT
+"""
 
-# This program is free software. It comes without any warranty, to
-# * the extent permitted by applicable law. You can redistribute it
-# * and/or modify it under the terms of the Do What The Fuck You Want
-# * To Public License, Version 2, as published by Sam Hocevar. See
-# * http://www.wtfpl.net/ for more details.
-
-# This script is searches file duplicates.
-from hashlib import sha256
-from os import listdir
-from os.path import isfile, isdir
+import sys
 import threading
+from hashlib import sha1
+from os import listdir
+from os.path import isdir, isfile
 
-
+HASH_FUNCTION = sha1
 CHUNK_SIZE = 100 * 1024**2  # 100MiB
@@ -24,22 +23,23 @@ def __init__(self):
         # hash:[filepath1, filepath2, ...]
         self.thread_processed_files = dict()
 
-    # encodes files with sha256 to check for uniqueness
-    def sha_encoder(self, filepath: str) -> str:
+    @staticmethod
+    def sha_encoder(filepath: str) -> str:
+        """Function to encode files with HASH_FUNCTION."""
         try:
-            encoder = sha256()
+            encoder = HASH_FUNCTION()
             with open(file=filepath, mode="rb") as file:
                 chunk = 0
                 while chunk != b"":
                     chunk = file.read(CHUNK_SIZE)
                     encoder.update(chunk)
             return encoder.hexdigest()
-        except Exception as e:
-            print("Unknown exception: ", e)
+        except Exception as ex:
+            print("Unknown exception: ", ex)
             return "An error occurred while trying to encode this file"
 
-    # function that calculates and saves hash values for list of files
     def executor(self, files_path: str, unprocessed_files: list[str]) -> None:
+        """Function to calculate hashes and save them in dictionary."""
         for file in unprocessed_files:
             file = f"{files_path}/{file}"
             t_hash_key = self.sha_encoder(file)
@@ -54,9 +54,7 @@ def executor(self, files_path: str, unprocessed_files: list[str]) -> None:
 
 
 def duplicate_detector(path: str) -> None:
-    """
-    This function finds all duplicates in specified directory recursively.
-    """
+    """This function finds all duplicates in specified directory recursively."""
     directories = []
 
     for element in listdir(path):
@@ -80,8 +78,8 @@ def duplicate_detector(path: str) -> None:
         duplicate_detector(directory)
 
 
-# function to get dictionaries from all threads
 def get_processed_files() -> dict[str, list[str]]:
+    """Function to get dictionaries from all threads."""
    processed_files = {}
    processed_files_keys = set()
    for encoder_thread in encoders_list:
@@ -102,7 +100,10 @@ def get_processed_files() -> dict[str, list[str]]:
 
 
 if __name__ == "__main__":
-    root_path = input("Enter path to the root directory: ")
+    if len(sys.argv) > 1:
+        root_path = sys.argv[1]
+    else:
+        root_path = input("Enter path to the root directory: ")
     try:
         print("Starting threads...")
         duplicate_detector(root_path)
@@ -111,12 +112,12 @@ def get_processed_files() -> dict[str, list[str]]:
             thread.join()
         print("Done. Counting duplicate files...")
         processed_files = get_processed_files()
-        for hash_key in processed_files.keys():
-            if len(processed_files[hash_key]) > 1:
+        for hash_key, files in processed_files.items():
+            if len(files) > 1:
                 print(
                     "#" * 100,
-                    *processed_files[hash_key],
-                    f"Total: {len(processed_files[hash_key])} duplicates\n",
+                    *files,
+                    f"Total: {len(files)} duplicates\n",
                     sep="\n",
                 )
     except RecursionError:
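The chunked hashing that `sha_encoder` settles on above can be sketched as a standalone function (a minimal sketch, not part of the patch; the walrus-operator loop is equivalent to the `while chunk != b"":` loop in the diff):

```python
# Minimal sketch of the chunked hashing used in sha_encoder above
# (illustrative only, not part of the patch).
from hashlib import sha1

HASH_FUNCTION = sha1        # same configurable constant as in main.py
CHUNK_SIZE = 100 * 1024**2  # 100MiB, as in main.py


def hash_file(filepath: str) -> str:
    """Hash a file in fixed-size chunks so it never has to fit in RAM."""
    encoder = HASH_FUNCTION()
    with open(filepath, mode="rb") as file:
        # file.read() returns b"" at EOF, which ends the loop
        while chunk := file.read(CHUNK_SIZE):
            encoder.update(chunk)
    return encoder.hexdigest()
```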
From 25fc2b34680345aee44e840c114c75d247deb3aa Mon Sep 17 00:00:00 2001
From: German Ivanov
Date: Wed, 15 Jun 2022 21:33:22 +0300
Subject: [PATCH 2/9] Add a newline char

---
 src/duplicate_searcher/main.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/duplicate_searcher/main.py b/src/duplicate_searcher/main.py
index 8a3e966..fb18bc0 100755
--- a/src/duplicate_searcher/main.py
+++ b/src/duplicate_searcher/main.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
+
 """
 This script finds duplicate files by comparing their hashes.
 For more info, see README.md.

From 601be0c0caa451048b63d2ef14e21c5d47d4b9be Mon Sep 17 00:00:00 2001
From: German Ivanov
Date: Wed, 15 Jun 2022 22:01:32 +0300
Subject: [PATCH 3/9] Bug fixes, Docs improvements, issue #4

- Add plan.txt to docs
- Move duplicate_searcher docs to its folder
- Close issue #4
- Change .gitignore
- Add .gitattributes

(It might be necessary to remove cached files for the changes to take effect)
---
 .gitattributes                     |  1 +
 .gitignore                         |  9 ++--
 .../duplicate_searcher/README.md   | 48 +++++++++++++++----
 src/duplicate_searcher/main.py     | 10 +++-
 src/duplicate_searcher/plan_ru.txt | 11 -----
 5 files changed, 55 insertions(+), 24 deletions(-)
 create mode 100644 .gitattributes
 rename docs/duplicate_searcher.md => src/duplicate_searcher/README.md (53%)
 delete mode 100644 src/duplicate_searcher/plan_ru.txt

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..a147805
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+*text=auto
diff --git a/.gitignore b/.gitignore
index c52804f..754025c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
-.DS_Store
-.idea
-venv
-__pycache__
\ No newline at end of file
+**/.DS_Store
+**/.idea
+**/venv
+**/__pycache__
+**/.vscode
\ No newline at end of file
diff --git a/docs/duplicate_searcher.md b/src/duplicate_searcher/README.md
similarity index 53%
rename from docs/duplicate_searcher.md
rename to src/duplicate_searcher/README.md
index 98a2613..4b046e7 100644
--- a/docs/duplicate_searcher.md
+++ b/src/duplicate_searcher/README.md
@@ -1,24 +1,56 @@
 # Duplicate Searcher
+
 ## About
+
+This program allows you to quickly find duplicate files in the specified directory recursively. Currently, there are 2 versions available: normal (with a recursive function, limited to a maximum of 999 nested directories by default), and alternative (might be more stable in some edge cases, though it is slower now).
+
 ## How to use:
+
 ### Input
+
+Use any path to the directory, as your OS supports it. Files relative to the user directory (~) in \*nix systems are now also supported (only in the normal version).
+
 ### Output
+
 For each group of duplicates, the program outputs them as follows:
+
 ```
 ####################################################################################################
 /path/to/duplicate1
 ...
 /path/to/duplicateN
 Total: N duplicates
 ```
+
+### Customizations
+
+You can change the hashing algorithm, chunk size (in bytes, should not be more than available RAM), and recursion limit (maximum nested directory depth) in the following lines (`main.py`, not available in the alternative version):
+
+```python
+HASH_FUNCTION = sha1
+CHUNK_SIZE = 100 * 1024**2
+RECURSION_LIMIT = 1000
+```
+
+Please note that `HASH_FUNCTION` is called from the code, so when you change it, do not forget to either import it from a library or define it in the code.
+
 ## Other information
-### Speed
-No trusted measures yet, but the normal version uses threads to utilize all of your CPU, and reads files in chunks to preserve memory. Please note, that number of threads is limited to a number of files, with no more than 1 thread per file available for stability reasons. However, the program is capable of creating threads for each file, which will be executed in the order your OS believes works best for your computer. We believe reading in chunks provides best average-case time when they are about 100MiB in size, however, if you do not have that much RAM or know the exact number that works best for you, feel free to change their size in the 18 line of _main.py_ (size in bytes, text after "#" is ignored, math operations supported, "**" means raising to some power):
-```python3
-CHUNK_SIZE = 100 * 1024**2  # 100MiB
-```
+
+### Speed
+
+No trusted measures yet, but the normal version uses threads to utilize all of your CPU and reads files in chunks to preserve memory. Please note that the number of threads is limited to the number of files, with no more than one thread per file for stability reasons. The program still creates a thread for every file, and these threads are executed in the order your OS believes works best for your computer. We believe reading in chunks provides the best average-case time when chunks are about 100MiB in size; however, this value can be changed if necessary.
+
 ### Stability
-A lot of exception-catching is done inside, though beware of files without reading permission: those might and will be marked as duplicates if there are more than 2 of them. Hidden files work in any OS regardless of what name they have. If you have strongly limited RAM, see previous paragraph with information on how to change chunk size and decrease/increase memory usage.
+
+A lot of exception-catching is done inside, though beware of files without read permission: if two or more of them exist, they will be marked as duplicates of each other. Hidden files work in any OS regardless of their names. If you have strongly limited RAM, you can change the chunk size to a smaller value.
+
+## TODO
+
+- [x] Multithreading
+- [x] Exception Handling
+- [x] Nesting level limitation
+- [ ] Logging
+- [ ] Silent mode
+- [x] Additional code comments
+- [x] Launch with terminal parameter
+- [x] Docs
diff --git a/src/duplicate_searcher/main.py b/src/duplicate_searcher/main.py
index fb18bc0..ff81900 100755
--- a/src/duplicate_searcher/main.py
+++ b/src/duplicate_searcher/main.py
@@ -8,14 +8,16 @@
 License: MIT
 """
 
+import os
 import sys
 import threading
 from hashlib import sha1
 from os import listdir
 from os.path import isdir, isfile
 
-HASH_FUNCTION = sha1
+HASH_FUNCTION = sha1  # function to use for hashing files
 CHUNK_SIZE = 100 * 1024**2  # 100MiB
+RECURSION_LIMIT = 1000  # Max recursion level
 
 
 class EncoderThread:
@@ -101,10 +103,16 @@ def get_processed_files() -> dict[str, list[str]]:
 
 
 if __name__ == "__main__":
+    sys.setrecursionlimit(RECURSION_LIMIT)
     if len(sys.argv) > 1:
         root_path = sys.argv[1]
     else:
         root_path = input("Enter path to the root directory: ")
+
+    # Fix issue #4 (https://github.com/Formak21/PyFile-Scripts/issues/4)
+    if "~" in root_path:
+        root_path = os.path.expanduser(root_path)
+
     try:
         print("Starting threads...")
         duplicate_detector(root_path)
diff --git a/src/duplicate_searcher/plan_ru.txt b/src/duplicate_searcher/plan_ru.txt
deleted file mode 100644
index 715ff8a..0000000
--- a/src/duplicate_searcher/plan_ru.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-Currently planned features:
- - Multithreading
- - Auto-detection of the number of cores
- - Exception handling
- - Nesting depth limits
- - Cuda ???
- - Logging to a file
- - Silent mode
- - Additional comments
- - Launch with terminal parameters
- - Documentation
\ No newline at end of file
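The `~` handling added above relies on `os.path.expanduser`, which rewrites a leading `~` or `~user` into the corresponding home directory and returns every other path unchanged, so the `if "~" in root_path` guard is a shortcut rather than a requirement. A small illustrative sketch (not part of the patch; the printed home directory is an assumption):

```python
# Illustrative sketch of the issue #4 fix above (not part of the patch).
import os

for raw in ("~/Downloads", "/tmp/files"):
    # expanduser only rewrites a leading "~"; other paths pass through as-is
    print(raw, "->", os.path.expanduser(raw))
# Assuming HOME=/home/me, this prints:
#   ~/Downloads -> /home/me/Downloads
#   /tmp/files -> /tmp/files
```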
From 98a0fd5ef801af257b442bef6e1bc024a4119823 Mon Sep 17 00:00:00 2001
From: German Ivanov
Date: Wed, 15 Jun 2022 22:15:54 +0300
Subject: [PATCH 4/9] Fix a typo

---
 .gitignore                         | 2 +-
 src/duplicate_searcher/main_alt.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 754025c..1cb086b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,4 @@
 **/.idea
 **/venv
 **/__pycache__
-**/.vscode
\ No newline at end of file
+**/.vscode
diff --git a/src/duplicate_searcher/main_alt.py b/src/duplicate_searcher/main_alt.py
index de3c37f..28d06c6 100644
--- a/src/duplicate_searcher/main_alt.py
+++ b/src/duplicate_searcher/main_alt.py
@@ -8,7 +8,7 @@
 import threading
 
 # Parameters
-CHUNK_SIZE = 100 * 1024 ** 2  # 100MiB
+CHUNK_SIZE = 100 * 1024**2  # 100MiB
 ROOT_PATH = "C:/Users/Form49d/Desktop"
 EXPORT_FILENAME = "Duplicates.txt"
 LOG_FILENAME = "Errors.log"
From e2109896e4b7103a942927db6d233d42d7c77227 Mon Sep 17 00:00:00 2001
From: German Ivanov
Date: Wed, 15 Jun 2022 22:24:37 +0300
Subject: [PATCH 5/9] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d935913..d42ea5e 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 A useful set of tools to work with large number of files, written in Python. Provided under the terms of MIT License (see LICENSE).
 
 ## Tools available:
-- Duplicate_Searcher (testing) — Advanced multithread file duplicates searcher. Documentation is available [here](docs/duplicate_searcher.md).
+- Duplicate Searcher (β) — Advanced multithread file duplicates searcher. [More…](src/duplicate_searcher/README.md)
 
 ## Contributors
From a18e3ddffd3d381758499b9756ae6bca492cabaf Mon Sep 17 00:00:00 2001
From: German Ivanov
Date: Wed, 15 Jun 2022 23:47:11 +0300
Subject: [PATCH 6/9] Change folder layout, add Content Searcher

I find placing different tools in their own directories, and then into
another (`src`) directory, completely pointless since Python is
interpreted anyway (at least CPython), and I removed `src` to avoid
confusion.
---
 README.md                          |  14 ++-
 content_searcher/README.md         |  65 +++++++++++
 content_searcher/main.py           | 103 ++++++++++++++++++
 .../README.md                      |   0
 .../experiements/read.py           |   0
 .../experiements/read_chunks.py    |   0
 .../experiements/read_chunks_mt.py |   0
 .../main.py                        |   0
 .../main_alt.py                    |   0
 9 files changed, 177 insertions(+), 5 deletions(-)
 create mode 100644 content_searcher/README.md
 create mode 100644 content_searcher/main.py
 rename {src/duplicate_searcher => duplicate_finder}/README.md (100%)
 rename {src/duplicate_searcher => duplicate_finder}/experiements/read.py (100%)
 rename {src/duplicate_searcher => duplicate_finder}/experiements/read_chunks.py (100%)
 rename {src/duplicate_searcher => duplicate_finder}/experiements/read_chunks_mt.py (100%)
 rename {src/duplicate_searcher => duplicate_finder}/main.py (100%)
 rename {src/duplicate_searcher => duplicate_finder}/main_alt.py (100%)

diff --git a/README.md b/README.md
index d42ea5e..89ae092 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,19 @@
 # PyFile-Scripts
+
 ## About
-A useful set of tools to work with large number of files, written in Python. Provided under the terms of MIT License (see LICENSE).
+
+A useful set of tools to work with large number of files, written in Python. Provided under the terms of MIT License (see [LICENSE](LICENSE)).
 
 ## Tools available:
-- Duplicate Searcher (β) — Advanced multithread file duplicates searcher. [More…](src/duplicate_searcher/README.md)
+- Duplicate Finder (β) — Advanced multithread file duplicates searcher. [More…](src/duplicate_finder/README.md)
+- Content Searcher (α) — Search for files using their content. [More…](src/content_searcher/README.md)
 
 ## Contributors
-Everyone is allowed to open issues, suggest changes, or change code and add pull requests. Any kind of help will be highly appreciated.
+
+Everyone is allowed to open issues, suggest changes, change code, and add pull requests. Any kind of help will be highly appreciated.
+
 #### Developers:
+
 - [Formak21](https://github.com/Formak21) (Original Creator)
 - [German Ivanov](https://github.com/germanivanov0719)
-
-
diff --git a/content_searcher/README.md b/content_searcher/README.md
new file mode 100644
index 0000000..67a7cf7
--- /dev/null
+++ b/content_searcher/README.md
@@ -0,0 +1,65 @@
+# Content Searcher
+
+## About
+
+This program helps you to quickly search for any string in all files in a given directory. It uses multithreading to speed up the search, and also supports multiple encodings.
+
+## How to use:
+
+### Input
+
+You can provide the path and the search query, in this order, as terminal arguments, or input them during runtime. Use any path to the directory, as supported by your OS.
+
+### Output
+
+Apart from progress information, the program outputs results in the following format:
+
+```
+/full/path/to/file1
+/full/path/to/file2
+...
+/full/path/to/fileN
+
+Total: N
+```
+
+### Customizations
+
+You can change some settings in `main.py`. These are the defaults:
+
+```python
+# User-defined settings
+RECURSION_LIMIT = 1000
+LOGGING = False
+SILENT = False
+ENCODING = "utf-8"
+```
+
+`ENCODING` must be one of the encodings supported by Python when opening a file.
+
+## Other information
+
+### Speed
+
+Not measured, but it should not be limited by anything other than your memory, disk I/O, and CPU speed.
+
+### Stability
+
+A lot of error-handling was done, though there are still a few restrictions, such as:
+
+- If you do not have enough memory to load a file, the search cannot be performed because of an OS error.
+- If a file uses a different encoding, you have to specify it, or the search will not work.
+- If different files use different encodings, chances are the search results will be incomplete.
+
+Moreover, each file has to be loaded into RAM completely before searching, which might lead to temporary performance degradation.
+
+## TODO
+
+- [x] Multithreading
+- [x] Exception Handling
+- [x] Nesting level limitation
+- [ ] Logging
+- [x] Silent mode
+- [x] Launch with terminal parameters
+- [ ] Docs
+- [ ] Regex
diff --git a/content_searcher/main.py b/content_searcher/main.py
new file mode 100644
index 0000000..17f291f
--- /dev/null
+++ b/content_searcher/main.py
@@ -0,0 +1,103 @@
+import os
+import sys
+import threading
+
+# User-defined settings
+RECURSION_LIMIT = 1000
+LOGGING = False
+SILENT = False
+ENCODING = "utf-8"
+
+# OS-provided settings
+SEPARATOR = os.path.sep
+
+
+def log(message, end="\n"):
+    if LOGGING:
+        pass
+    if not SILENT:
+        print(message, end=end, flush=True)
+
+
+def get_files(d: str) -> list[str]:
+    files = []
+    try:
+        for p in os.listdir(d):
+            if os.path.isfile(d + SEPARATOR + p):
+                files.append(d + SEPARATOR + p)
+            elif os.path.isdir(d + SEPARATOR + p):
+                for file in get_files(d + SEPARATOR + p):
+                    files.append(file)
+    except Exception as e:
+        if isinstance(e, OSError) and str(e)[7:9] == "12":
+            print(f"Not enough memory for {d}")
+        else:
+            print(f"Unknown exception while checking directory {d}: {str(e)}")
+    return files
+
+
+class QuickFinder:
+    def __init__(self, path: str, query: str):
+        self.result = False
+        self.path = path
+        self.query = query
+
+    def check_query(self) -> bool:
+        try:
+            with open(self.path, "rt", encoding=ENCODING) as file:
+                if self.query in file.read():
+                    self.result = True
+                    return True
+        except UnicodeDecodeError:
+            pass
+        except Exception as e:
+            print(
+                f"Unknown exception while reading file {self.path}: {str(e)} {str(type(e))}"
+            )
+        return False
+
+
+def check_files(files: list[str], query: str) -> list[str]:
+    threads = []
+    result = []
+    log("- Creating threads...", end="\t")
+    for file in files:
+        qf = QuickFinder(file, query)
+        t = threading.Thread(target=qf.check_query, daemon=True)
+        t.start()
+        threads.append((qf, t))
+    log("Done.")
+    log("- Waiting for threads to finish...", end="\t")
+    for thread in threads:
+        thread[1].join()
+        if thread[0].result:
+            result.append(thread[0].path)
+    log("Done.")
+    return result
+
+
+def search(path: str, query: str) -> list[str]:
+    log(f'Getting all files recursively from "{path}"...')
+    files = get_files(path)
+    log(f"Done. Found {len(files)} files...")
+    log(f'Looking for "{query}":')
+    results = check_files(files, query)
+    log(f"Done. Found {len(results)} results.", end="\n\n")
+    return results
+
+
+if __name__ == "__main__":
+    sys.setrecursionlimit(RECURSION_LIMIT)
+    if len(sys.argv) > 2:
+        path = sys.argv[1]
+        query = sys.argv[2]
+    else:
+        path = input("Path: ")
+        query = input("Query: ")
+
+    # Issue #4 workaround (https://github.com/Formak21/PyFile-Scripts/issues/4)
+    if "~" in path:
+        path = os.path.expanduser(path)
+
+    r = search(path, query)
+    print(*r, f"\nTotal: {len(r)}", sep="\n")
diff --git a/src/duplicate_searcher/README.md b/duplicate_finder/README.md
similarity index 100%
rename from src/duplicate_searcher/README.md
rename to duplicate_finder/README.md
diff --git a/src/duplicate_searcher/experiements/read.py b/duplicate_finder/experiements/read.py
similarity index 100%
rename from src/duplicate_searcher/experiements/read.py
rename to duplicate_finder/experiements/read.py
diff --git a/src/duplicate_searcher/experiements/read_chunks.py b/duplicate_finder/experiements/read_chunks.py
similarity index 100%
rename from src/duplicate_searcher/experiements/read_chunks.py
rename to duplicate_finder/experiements/read_chunks.py
diff --git a/src/duplicate_searcher/experiements/read_chunks_mt.py b/duplicate_finder/experiements/read_chunks_mt.py
similarity index 100%
rename from src/duplicate_searcher/experiements/read_chunks_mt.py
rename to duplicate_finder/experiements/read_chunks_mt.py
diff --git a/src/duplicate_searcher/main.py b/duplicate_finder/main.py
similarity index 100%
rename from src/duplicate_searcher/main.py
rename to duplicate_finder/main.py
diff --git a/src/duplicate_searcher/main_alt.py b/duplicate_finder/main_alt.py
similarity index 100%
rename from src/duplicate_searcher/main_alt.py
rename to duplicate_finder/main_alt.py
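One caveat in `get_files` above: the out-of-memory test `str(e)[7:9] == "12"` keys off the textual form of the `OSError` (errno 12 is `ENOMEM` on Linux), which is brittle. A hedged alternative, suggested here rather than taken from the patch, is to check the exception's `errno` attribute directly:

```python
# Suggested (not part of the patch): a more robust out-of-memory check
# that compares OSError.errno to errno.ENOMEM instead of slicing the
# exception's string form.
import errno

def is_out_of_memory(e: Exception) -> bool:
    return isinstance(e, OSError) and e.errno == errno.ENOMEM
```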
From 577bc0e6f3d34186b93a6475c84eda221cd6e75e Mon Sep 17 00:00:00 2001
From: German Ivanov
Date: Wed, 15 Jun 2022 23:50:23 +0300
Subject: [PATCH 7/9] Specify types of contributions for developers

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 89ae092..b770439 100644
--- a/README.md
+++ b/README.md
@@ -15,5 +15,5 @@
 
 #### Developers:
 
-- [Formak21](https://github.com/Formak21) (Original Creator)
-- [German Ivanov](https://github.com/germanivanov0719)
+- [Formak21](https://github.com/Formak21) (Original Author, Duplicate Searcher, Duplicate Searcher Alternative)
+- [German Ivanov](https://github.com/germanivanov0719) (Content Searcher, other contributions)
From 74da657b7c1eaa90636a7cda87a5f20a572925d9 Mon Sep 17 00:00:00 2001
From: German Ivanov
Date: Wed, 15 Jun 2022 23:51:06 +0300
Subject: [PATCH 8/9] Add more clarity in contribution examples

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b770439..38a8b27 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ A useful set of tools to work with large number of files, written in Python. Pro
 
 ## Contributors
 
-Everyone is allowed to open issues, suggest changes, change code, and add pull requests. Any kind of help will be highly appreciated.
+Everyone is allowed to open issues, suggest changes, fork this project, modify code, and add pull requests. Any kind of help will be highly appreciated.
 
 #### Developers:
From a2677fc48fc5a4cf6f7a6c8701f618e54a20f9a7 Mon Sep 17 00:00:00 2001
From: German Ivanov
Date: Wed, 15 Jun 2022 23:51:41 +0300
Subject: [PATCH 9/9] Migrate to non-src paths

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 38a8b27..77c71e4 100644
--- a/README.md
+++ b/README.md
@@ -6,8 +6,8 @@
 
 ## Tools available:
 
-- Duplicate Finder (β) — Advanced multithread file duplicates searcher. [More…](src/duplicate_finder/README.md)
-- Content Searcher (α) — Search for files using their content. [More…](src/content_searcher/README.md)
+- Duplicate Finder (β) — Advanced multithread file duplicates searcher. [More…](duplicate_finder/README.md)
+- Content Searcher (α) — Search for files using their content. [More…](content_searcher/README.md)
 
 ## Contributors
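With the final layout in place, both tools take their inputs as terminal arguments, and `content_searcher`'s `search` helper can also be imported and driven from Python. A sketch under stated assumptions: the repository root is on `sys.path`, and the path and query below are placeholders:

```python
# Hypothetical programmatic use of Content Searcher (assumes the
# post-patch layout with content_searcher/ at the repository root).
from content_searcher.main import search

matches = search("/tmp/project", "TODO")  # example path and query
print(f"{len(matches)} file(s) contain the query")
```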