diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..a147805
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+* text=auto
diff --git a/.gitignore b/.gitignore
index c52804f..1cb086b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
-.DS_Store
-.idea
-venv
-__pycache__
\ No newline at end of file
+**/.DS_Store
+**/.idea
+**/venv
+**/__pycache__
+**/.vscode
diff --git a/README.md b/README.md
index d935913..77c71e4 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,19 @@
 # PyFile-Scripts
+
 ## About
-A useful set of tools to work with large number of files, written in Python. Provided under the terms of MIT License (see LICENSE).
+
+A useful set of tools to work with a large number of files, written in Python. Provided under the terms of the MIT License (see [LICENSE](LICENSE)).
 
 ## Tools available:
-- Duplicate_Searcher (testing) — Advanced multithread file duplates searcher. Documentation is available [here](docs/duplicate_searcher.md).
+
+- Duplicate Finder (β) — An advanced multithreaded duplicate file finder. [More…](duplicate_finder/README.md)
+- Content Searcher (α) — Search for files by their content. [More…](content_searcher/README.md)
 
 ## Contributors
-Everyone is allowed to open issues, suggest changes, or change code and add pull requests. Any kind of help will be highly appreciated.
-#### Developers:
-- [Formak21](https://github.com/Formak21) (Original Creator)
-- [German Ivanov](https://github.com/germanivanov0719)
+
+Everyone is welcome to open issues, suggest changes, fork this project, modify the code, and submit pull requests. Any kind of help is highly appreciated.
+
+#### Developers:
+
+- [Formak21](https://github.com/Formak21) (Original Author, Duplicate Searcher, Duplicate Searcher Alternative)
+- [German Ivanov](https://github.com/germanivanov0719) (Content Searcher, other contributions)
diff --git a/content_searcher/README.md b/content_searcher/README.md
new file mode 100644
index 0000000..67a7cf7
--- /dev/null
+++ b/content_searcher/README.md
@@ -0,0 +1,65 @@
+# Content Searcher
+
+## About
+
+This program helps you quickly search for any string in all files in a given directory. It uses multithreading to speed up the search, and the file encoding can be configured.
+
+## How to use:
+
+### Input
+
+You can provide the path and the search query, in this order, as terminal arguments, or enter them at runtime. Use any directory path supported by your OS.
+
+### Output
+
+Apart from progress information, the program outputs results in the following format:
+
+```
+/full/path/to/file1
+/full/path/to/file2
+...
+/full/path/to/fileN
+
+Total: N
+```
+
+### Customizations
+
+You can change some settings in `main.py`. These are the defaults:
+
+```python
+# User-defined settings
+RECURSION_LIMIT = 1000
+LOGGING = False
+SILENT = False
+ENCODING = "utf-8"
+```
+
+`ENCODING` must be one of the encodings supported by Python when opening a file.
+
+## Other information
+
+### Speed
+
+Not measured yet, but the search should not be limited by anything other than your memory, disk I/O, and CPU speed.
+
+### Stability
+
+A lot of error handling is in place, though there are still a few restrictions:
+
+- If you do not have enough memory to load a file, the search cannot be performed because of an OS error.
+- If a file uses a different encoding, you have to specify it, or the search will not work.
+- If different files use different encodings, the search results will likely be incomplete.
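+
+For example, here is a minimal, illustrative sketch (not part of this tool) of one way to soften the single-encoding restriction by trying several candidate encodings in turn; the helper name and the fallback list below are only examples:
+
+```python
+def read_with_fallback(path, encodings=("utf-8", "cp1251", "latin-1")):
+    """Try each candidate encoding in turn and return the decoded text."""
+    for enc in encodings:
+        try:
+            with open(path, "rt", encoding=enc) as f:
+                return f.read()
+        except UnicodeDecodeError:
+            continue  # wrong encoding, try the next candidate
+    # "latin-1" accepts any byte sequence, so with the default list this
+    # fallback is only reached when the candidate list is customized.
+    return None
+```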
+
+Note also that each file has to be loaded into RAM completely before searching, which might lead to temporary performance degradation.
+
+## TODO
+
+- [x] Multithreading
+- [x] Exception Handling
+- [x] Nesting level limitation
+- [ ] Logging
+- [x] Silent mode
+- [x] Launch with terminal parameters
+- [ ] Docs
+- [ ] Regex
diff --git a/content_searcher/main.py b/content_searcher/main.py
new file mode 100644
index 0000000..17f291f
--- /dev/null
+++ b/content_searcher/main.py
@@ -0,0 +1,103 @@
+import os
+import sys
+import threading
+
+# User-defined settings
+RECURSION_LIMIT = 1000
+LOGGING = False
+SILENT = False
+ENCODING = "utf-8"
+
+# OS-provided settings
+SEPARATOR = os.path.sep
+
+
+def log(message, end="\n"):
+    if LOGGING:
+        pass  # File logging is not implemented yet (see TODO in README.md)
+    if not SILENT:
+        print(message, end=end, flush=True)
+
+
+def get_files(d: str) -> list[str]:
+    files = []
+    try:
+        for p in os.listdir(d):
+            if os.path.isfile(d + SEPARATOR + p):
+                files.append(d + SEPARATOR + p)
+            elif os.path.isdir(d + SEPARATOR + p):
+                for file in get_files(d + SEPARATOR + p):
+                    files.append(file)
+    except Exception as e:
+        if isinstance(e, OSError) and e.errno == 12:  # errno 12: ENOMEM
+            print(f"Not enough memory for {d}")
+        else:
+            print(f"Unknown exception while checking directory {d}: {str(e)}")
+    return files
+
+
+class QuickFinder:
+    def __init__(self, path: str, query: str):
+        self.result = False
+        self.path = path
+        self.query = query
+
+    def check_query(self) -> bool:
+        try:
+            with open(self.path, "rt", encoding=ENCODING) as file:
+                if self.query in file.read():
+                    self.result = True
+                    return True
+        except UnicodeDecodeError:
+            pass
+        except Exception as e:
+            print(
+                f"Unknown exception while reading file {self.path}: {str(e)} {str(type(e))}"
+            )
+        return False
+
+
+def check_files(files: list[str], query: str) -> list[str]:
+    threads = []
+    result = []
+    log("- Creating threads...", end="\t")
+    for file in files:
+        qf = QuickFinder(file, query)
+        t = threading.Thread(target=qf.check_query, daemon=True)
+        t.start()
+        threads.append((qf, t))
+    log("Done.")
+    log("- Waiting for threads to finish...", end="\t")
+    for thread in threads:
+        thread[1].join()
+        if thread[0].result:
+            result.append(thread[0].path)
+    log("Done.")
+    return result
+
+
+def search(path: str, query: str) -> list[str]:
+    log(f'Getting all files recursively from "{path}"...')
+    files = get_files(path)
+    log(f"Done. Found {len(files)} files...")
+    log(f'Looking for "{query}":')
+    results = check_files(files, query)
+    log(f"Done. Found {len(results)} results.", end="\n\n")
+    return results
+
+
+if __name__ == "__main__":
+    sys.setrecursionlimit(RECURSION_LIMIT)
+    if len(sys.argv) > 2:
+        path = sys.argv[1]
+        query = sys.argv[2]
+    else:
+        path = input("Path: ")
+        query = input("Query: ")
+
+    # Issue #4 workaround (https://github.com/Formak21/PyFile-Scripts/issues/4)
+    if "~" in path:
+        path = os.path.expanduser(path)
+
+    r = search(path, query)
+    print(*r, f"\nTotal: {len(r)}", sep="\n")
diff --git a/docs/duplicate_searcher.md b/duplicate_finder/README.md
similarity index 53%
rename from docs/duplicate_searcher.md
rename to duplicate_finder/README.md
index 98a2613..4b046e7 100644
--- a/docs/duplicate_searcher.md
+++ b/duplicate_finder/README.md
@@ -1,24 +1,56 @@
 # Duplicate Searcher
+
 ## About
-This program allows you to quickly find duplicate files in specified directory recursively. Currently, there are 2 versions available: normal (with recursive function, limited to max of 999 nested directories), and alternative (might be more stable in some edge cases, though it is slower now).
+
+This program allows you to quickly find duplicate files in a specified directory recursively. Currently, there are 2 versions available: normal (with a recursive function, limited to a maximum of 999 nested directories by default) and alternative (might be more stable in some edge cases, though it is currently slower).
+
 ## How to use:
+
 ### Input
-Use any path to the directory, as your OS supports it. Unfortunately, files relative to the user directory (~) in *nix systems are not supported as of now, but you still can specify them relative to your root directory (/) or current folder (.).
+
+Use any path to the directory, as supported by your OS. Files relative to the user directory (~) on \*nix systems are now also supported (only in the normal version).
+
 ### Output
+
 For each group of duplicates, the program outputs them as following:
+
 ```
 ####################################################################################################
 /path/to/duplicate1
 ...
 /path/to/duplicateN
 Total: N duplicates
+```
+
+### Customizations
+
+You can change the hashing algorithm, the chunk size (in bytes; it should not be larger than your available RAM), and the recursion limit (maximum nested directory depth) in the following lines of `main.py` (not available in the alternative version):
+
+```python
+HASH_FUNCTION = sha1
+CHUNK_SIZE = 100 * 1024**2
+RECURSION_LIMIT = 1000
 ```
+
+Please note that `HASH_FUNCTION` is called from the code, so when you change it, do not forget to either import it from a library or add it to the code.
+
 ## Other information
-### Speed
-No trusted measures yet, but the normal version uses threads to utilize all of your CPU, and reads files in chunks to preserve memory. Please note, that number of threads is limited to a number of files, with no more than 1 thread per file available for stability reasons. However, the program is capable of creating threads for each file, which will be executed in the order your OS believes works best for your computer. We believe reading in chunks provides best average-case time when they are about 100MiB in size, however, if you do not have that much RAM or know the exact number that works best for you, feel free to change their size in the 18 line of _main.py_ (size in bytes, text after "#" is ignored, math operations supported, "**" means raising to some power):
-```python3
-CHUNK_SIZE = 100 * 1024**2 # 100MiB
-```
+
+### Speed
+
+No trusted measurements yet, but the normal version uses threads to utilize all of your CPU and reads files in chunks to preserve memory. Please note that the number of threads is limited to the number of files, with no more than 1 thread per file for stability reasons. However, the program is capable of creating a thread for each file, and these threads are executed in whatever order your OS believes works best for your computer. We believe reading in chunks gives the best average-case time when the chunks are about 100MiB in size; however, this value can be changed if necessary.
+
 ### Stability
+
+A lot of exception catching is done inside, though beware of files without read permission: those might and will be marked as duplicates if there are 2 or more of them. Hidden files work in any OS, regardless of their names. If your RAM is very limited, you can change the chunk size to a smaller value.
+
+## TODO
+
+- [x] Multithreading
+- [x] Exception Handling
+- [x] Nesting level limitation
+- [ ] Logging
+- [ ] Silent mode
+- [x] Additional code comments
+- [x] Launch with terminal parameter
+- [x] Docs
diff --git a/src/duplicate_searcher/experiements/read.py b/duplicate_finder/experiements/read.py
similarity index 100%
rename from src/duplicate_searcher/experiements/read.py
rename to duplicate_finder/experiements/read.py
diff --git a/src/duplicate_searcher/experiements/read_chunks.py b/duplicate_finder/experiements/read_chunks.py
similarity index 100%
rename from src/duplicate_searcher/experiements/read_chunks.py
rename to duplicate_finder/experiements/read_chunks.py
diff --git a/src/duplicate_searcher/experiements/read_chunks_mt.py b/duplicate_finder/experiements/read_chunks_mt.py
similarity index 100%
rename from src/duplicate_searcher/experiements/read_chunks_mt.py
rename to duplicate_finder/experiements/read_chunks_mt.py
diff --git a/src/duplicate_searcher/main.py b/duplicate_finder/main.py
old mode 100644
new mode 100755
similarity index 71%
rename from src/duplicate_searcher/main.py
rename to duplicate_finder/main.py
index a3c8dfc..c46e869
--- a/src/duplicate_searcher/main.py
+++ b/duplicate_finder/main.py
@@ -1,14 +1,23 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
+"""
+This script finds duplicate files by comparing their hashes.
+For more info, see README.md.
 
-# This script is searches file duplicates.
-from hashlib import sha512
-from os import listdir
-from os.path import isfile, isdir
+License: MIT
+"""
+
+import os
+import sys
 import threading
+from hashlib import sha1
+from os import listdir
+from os.path import isdir, isfile
 
-CHUNK_SIZE = 100 * 1024 ** 2  # 100MiB
+HASH_FUNCTION = sha1  # function to use for hashing files
+RECURSION_LIMIT = 1000  # Max recursion level
+CHUNK_SIZE = 100 * 1024**2  # 100MiB
 
 
 class EncoderThread:
@@ -17,11 +26,12 @@ def __init__(self):
         # hash:[filepath1, filepath2, ...]
         self.thread_processed_files = dict()
 
-    # encodes files with sha512 to check for uniqueness
-    def sha_encoder(self, file_path: str) -> str:
+    @staticmethod
+    def sha_encoder(filepath: str) -> str:
+        """Hash a single file with HASH_FUNCTION, reading it in chunks."""
         try:
-            encoder = sha512()
-            with open(file=file_path, mode="rb") as file:
+            encoder = HASH_FUNCTION()
+            with open(file=filepath, mode="rb") as file:
                 chunk = file.read(CHUNK_SIZE)
                 while chunk != b"":
                     encoder.update(chunk)
@@ -31,8 +41,8 @@ def sha_encoder(self, file_path: str) -> str:
             print(f"Unknown exception: {ex}")
             return "-1"
 
-    # function that calculates and saves hash values for list of files
     def executor(self, files_path: str, unprocessed_files: list[str]) -> None:
+        """Calculate hashes for a list of files and save them in the thread's dictionary."""
         for file in unprocessed_files:
             file = f"{files_path}/{file}"
             t_hash_key = self.sha_encoder(file)
@@ -47,9 +57,7 @@ def executor(self, files_path: str, unprocessed_files: list[str]) -> None:
 
 
 def duplicate_detector(path: str) -> None:
-    """
-    This function finds all duplicates in specified directory recursively.
-    """
+    """This function finds all duplicates in specified directory recursively."""
 
     directories = []
     for element in listdir(path):
@@ -73,8 +81,8 @@ def duplicate_detector(path: str) -> None:
         duplicate_detector(directory)
 
 
-# function to get dictionaries from all threads
 def get_processed_files() -> dict[str, list[str]]:
+    """Function to get dictionaries from all threads."""
    processed_files = {}
    processed_files_keys = set()
    for encoder_thread in encoders_list:
@@ -95,7 +103,16 @@ def get_processed_files() -> dict[str, list[str]]:
 
 
 if __name__ == "__main__":
-    root_path = input("Enter path to the root directory: ")
+    sys.setrecursionlimit(RECURSION_LIMIT)
+    if len(sys.argv) > 1:
+        root_path = sys.argv[1]
+    else:
+        root_path = input("Enter path to the root directory: ")
+
+    # Fix issue #4 (https://github.com/Formak21/PyFile-Scripts/issues/4)
+    if "~" in root_path:
+        root_path = os.path.expanduser(root_path)
+
     try:
         print("Starting threads...")
         duplicate_detector(root_path)
@@ -104,12 +121,12 @@ def get_processed_files() -> dict[str, list[str]]:
             thread.join()
         print("Done. Counting duplicate files...")
         processed_files = get_processed_files()
-        for hash_key in processed_files.keys():
-            if len(processed_files[hash_key]) > 1:
+        for hash_key, files in processed_files.items():
+            if len(files) > 1:
                 print(
                     "#" * 100,
-                    *processed_files[hash_key],
-                    f"Total: {len(processed_files[hash_key])} duplicates\n",
+                    *files,
+                    f"Total: {len(files)} duplicates\n",
                     sep="\n",
                 )
     except RecursionError:
diff --git a/src/duplicate_searcher/main_alt.py b/duplicate_finder/main_alt.py
similarity index 99%
rename from src/duplicate_searcher/main_alt.py
rename to duplicate_finder/main_alt.py
index de3c37f..28d06c6 100644
--- a/src/duplicate_searcher/main_alt.py
+++ b/duplicate_finder/main_alt.py
@@ -8,7 +8,7 @@
 import threading
 
 # Parameters
-CHUNK_SIZE = 100 * 1024 ** 2  # 100MiB
+CHUNK_SIZE = 100 * 1024**2  # 100MiB
 ROOT_PATH = "C:/Users/Form49d/Desktop"
 EXPORT_FILENAME = "Duplicates.txt"
 LOG_FILENAME = "Errors.log"
diff --git a/src/duplicate_searcher/plan_ru.txt b/src/duplicate_searcher/plan_ru.txt
deleted file mode 100644
index 715ff8a..0000000
--- a/src/duplicate_searcher/plan_ru.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-На данный момент планируется реализовать:
- - Многопоточность
- - Авто-определение количества ядер
- - Исключения
- - Ограничения на глубину
- - Cuda ???
- - Логирование в файл
- - Silent режим
- - Дополнительные комментарии
- - Запуск с параметрами из терминала
- - Документацию
\ No newline at end of file
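
For reference, the chunked hashing performed by `sha_encoder` (and controlled by the new `HASH_FUNCTION` and `CHUNK_SIZE` settings) boils down to the following minimal, self-contained sketch. The helper name `hash_file`, the `blake2b` choice, and the smaller chunk size are only illustrative stand-ins for whatever `hashlib` constructor and size you actually configure:

```python
from hashlib import blake2b


def hash_file(path: str, hash_function=blake2b, chunk_size=1024**2) -> str:
    """Hash a file in fixed-size chunks so it never has to fit into RAM at once."""
    encoder = hash_function()
    with open(path, mode="rb") as file:
        chunk = file.read(chunk_size)
        while chunk != b"":
            encoder.update(chunk)
            chunk = file.read(chunk_size)
    return encoder.hexdigest()
```

Reading in chunks keeps memory usage bounded by the chunk size regardless of file size, which is why the README suggests lowering `CHUNK_SIZE` on machines with little RAM.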