From 8ff868ddbb1d54951338b367d91fca7afd8c6d5e Mon Sep 17 00:00:00 2001 From: Ricardo Date: Sat, 29 Nov 2025 22:12:57 -0500 Subject: [PATCH 1/6] Implemented scan_directory and added necessary testing files --- python/src/magika/magika.py | 27 +++++++++++++++++ python/tests/test_magika_python_module.py | 6 ++++ python/tests/utils.py | 6 ++++ test_magika.py | 24 +++++++++++++++ tests_data/scan_directory/code2.py | 32 ++++++++++++++++++++ tests_data/scan_directory/test.json | 31 +++++++++++++++++++ tests_data/scan_directory/test.rs | 36 +++++++++++++++++++++++ tests_data/scan_directory/test.ts | 36 +++++++++++++++++++++++ 8 files changed, 198 insertions(+) create mode 100644 test_magika.py create mode 100644 tests_data/scan_directory/code2.py create mode 100644 tests_data/scan_directory/test.json create mode 100644 tests_data/scan_directory/test.rs create mode 100644 tests_data/scan_directory/test.ts diff --git a/python/src/magika/magika.py b/python/src/magika/magika.py index fa2152ea..ece293c1 100644 --- a/python/src/magika/magika.py +++ b/python/src/magika/magika.py @@ -167,6 +167,33 @@ def identify_paths( return self._get_results_from_paths(paths_) + def scan_directory(self, directory: Union[str, os.PathLike]) -> List[MagikaResult]: + """Identify the content type of a all files in a directory given its path.""" + path_obj = Path(directory) + + # Guard clause: check if directory exists + if not path_obj.exists() or not path_obj.is_dir(): + raise FileNotFoundError(f"The directory '{directory}' does not exist or is not a directory.") + + collected_paths: List[Union[str, os.PathLike]] = [] + + # rglob('*') recursively finds all files and directories + for item in path_obj.rglob('*'): + # We only want files, not sub-directories themselves + if item.is_file(): + collected_paths.append(item) + + paths_ = [] + for path in collected_paths: + if isinstance(path, str) or isinstance(path, os.PathLike): + paths_.append(Path(path)) + else: + raise TypeError( + f"Input '{path}' is invalid: input path should be of type `Union[str, os.PathLike]`" + ) + + return self._get_results_from_paths(paths_) + def identify_bytes(self, content: bytes) -> MagikaResult: """Identify the content type of raw bytes.""" if not isinstance(content, bytes): diff --git a/python/tests/test_magika_python_module.py b/python/tests/test_magika_python_module.py index e25bd40b..c7595e64 100644 --- a/python/tests/test_magika_python_module.py +++ b/python/tests/test_magika_python_module.py @@ -72,6 +72,12 @@ def test_magika_module_with_explicit_model_dir() -> None: with open(test_path, "rb") as f: _ = m.identify_stream(f) +def test_magika_module_with_basic_tests_by_directory() -> None: + tests_paths = utils.get_directory_test_dir() + + m = Magika() + results = m.scan_directory(tests_paths) + def test_magika_module_with_basic_tests_by_paths() -> None: tests_paths = utils.get_basic_test_files_paths() diff --git a/python/tests/utils.py b/python/tests/utils.py index 375dcd8c..8a38eac1 100644 --- a/python/tests/utils.py +++ b/python/tests/utils.py @@ -42,6 +42,12 @@ def get_basic_tests_files_dir() -> Path: return tests_files_dir +def get_directory_test_dir() -> Path: + tests_files_dir = get_tests_data_dir() / "scan_directory" + assert tests_files_dir.is_dir() + return tests_files_dir + + def get_mitra_tests_files_dir() -> Path: tests_files_dir = get_tests_data_dir() / "mitra" assert tests_files_dir.is_dir() diff --git a/test_magika.py b/test_magika.py new file mode 100644 index 00000000..a5272fd3 --- /dev/null +++ b/test_magika.py @@ -0,0 +1,24 @@ +#from magika import Magika +import sys +import os + +# Get the absolute path to the folder containing magika.py +# We go: current dir -> python -> src -> magika +module_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'python', 'src')) + +# --- THE FIX --- +# Use insert(0, path) instead of append(path) +# This puts your folder at the TOP of the search list +if module_path not in sys.path: + sys.path.insert(0, module_path) + +import magika +#m = Magika() +m = magika.Magika() +#res = m.identify_path('./tests_data/basic/python') +res = m.scan_directory('./tests_data/scan_directory') +print("------------------ PREDICTIONS ------------------") +for result in res: + print(result) + print() +#print(res) \ No newline at end of file diff --git a/tests_data/scan_directory/code2.py b/tests_data/scan_directory/code2.py new file mode 100644 index 00000000..a2931d6d --- /dev/null +++ b/tests_data/scan_directory/code2.py @@ -0,0 +1,32 @@ +import random +import time + +class NumberGuesser: + def __init__(self, limit=100): + self.limit = limit + self.target = random.randint(1, limit) + self.attempts = 0 + + def guess(self, user_input): + self.attempts += 1 + try: + val = int(user_input) + except ValueError: + return "Please enter a valid integer." + + if val < self.target: + return "Too low!" + elif val > self.target: + return "Too high!" + else: + return f"Correct! It took you {self.attempts} tries." + +def main(): + game = NumberGuesser() + print("I'm thinking of a number between 1 and 100.") + # Simulation of a game loop + for i in range(5): + print(f"Simulation guess {i}: {game.guess(i * 20)}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests_data/scan_directory/test.json b/tests_data/scan_directory/test.json new file mode 100644 index 00000000..df29c36a --- /dev/null +++ b/tests_data/scan_directory/test.json @@ -0,0 +1,31 @@ +{ + "project": { + "id": 1024, + "name": "MagikaTest", + "isActive": true, + "tags": ["python", "cpp", "automation"], + "metadata": { + "created_at": "2023-10-27T10:00:00Z", + "author": "dev_user", + "version": 1.2 + } + }, + "employees": [ + { + "id": 1, + "name": "Alice Smith", + "role": "Lead Developer", + "skills": ["Rust", "C++"] + }, + { + "id": 2, + "name": "Bob Jones", + "role": "QA Engineer", + "skills": ["Python", "Selenium"] + } + ], + "settings": { + "retry_attempts": 5, + "timeout_ms": 3000 + } +} \ No newline at end of file diff --git a/tests_data/scan_directory/test.rs b/tests_data/scan_directory/test.rs new file mode 100644 index 00000000..3ea8704d --- /dev/null +++ b/tests_data/scan_directory/test.rs @@ -0,0 +1,36 @@ +use std::collections::HashMap; + +struct User { + username: String, + email: String, + sign_in_count: u64, + active: bool, +} + +impl User { + fn new(username: String, email: String) -> User { + User { + username, + email, + sign_in_count: 1, + active: true, + } + } + + fn deactivate(&mut self) { + self.active = false; + println!("User {} has been deactivated.", self.username); + } +} + +fn main() { + let mut users = HashMap::new(); + let user1 = User::new(String::from("rust_fan"), String::from("rust@example.com")); + + users.insert("u1", user1); + + match users.get_mut("u1") { + Some(u) => u.deactivate(), + None => println!("User not found"), + } +} \ No newline at end of file diff --git a/tests_data/scan_directory/test.ts b/tests_data/scan_directory/test.ts new file mode 100644 index 00000000..5773bc3a --- /dev/null +++ b/tests_data/scan_directory/test.ts @@ -0,0 +1,36 @@ +interface Task { + id: number; + title: string; + completed: boolean; + completedAt?: Date; // Optional property +} + +class TaskManager { + private tasks: Task[] = []; + + addTask(title: string): void { + const newTask: Task = { + id: this.tasks.length + 1, + title: title, + completed: false + }; + this.tasks.push(newTask); + } + + completeTask(id: number): void { + const task = this.tasks.find(t => t.id === id); + if (task) { + task.completed = true; + task.completedAt = new Date(); + console.log(`Task '${task.title}' marked as done.`); + } + } + + listTasks(): Task[] { + return this.tasks; + } +} + +const manager = new TaskManager(); +manager.addTask("Learn TypeScript"); +manager.completeTask(1); \ No newline at end of file From 57843b130a5671bdf6469a653d94e0626757f99e Mon Sep 17 00:00:00 2001 From: Ricardo Date: Sat, 29 Nov 2025 22:23:40 -0500 Subject: [PATCH 2/6] cleanup --- test_magika.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test_magika.py b/test_magika.py index a5272fd3..301779a0 100644 --- a/test_magika.py +++ b/test_magika.py @@ -2,13 +2,8 @@ import sys import os -# Get the absolute path to the folder containing magika.py -# We go: current dir -> python -> src -> magika module_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'python', 'src')) -# --- THE FIX --- -# Use insert(0, path) instead of append(path) -# This puts your folder at the TOP of the search list if module_path not in sys.path: sys.path.insert(0, module_path) From 23602718124dc4fcd56403c7278f34e32efc6822 Mon Sep 17 00:00:00 2001 From: Christian Lopez Date: Sun, 30 Nov 2025 17:23:01 -0400 Subject: [PATCH 3/6] added recursive scanning to scan_directory --- python/src/magika/magika.py | 12 +++++++----- test_magika.py | 19 ------------------- 2 files changed, 7 insertions(+), 24 deletions(-) delete mode 100644 test_magika.py diff --git a/python/src/magika/magika.py b/python/src/magika/magika.py index ece293c1..fb1ba574 100644 --- a/python/src/magika/magika.py +++ b/python/src/magika/magika.py @@ -166,9 +166,9 @@ def identify_paths( ) return self._get_results_from_paths(paths_) - - def scan_directory(self, directory: Union[str, os.PathLike]) -> List[MagikaResult]: - """Identify the content type of a all files in a directory given its path.""" + + def scan_directory(self, directory: Union[str, os.PathLike], recursive_scan=False) -> List[MagikaResult]: + """Identify the content type of all files in a directory given its path.""" path_obj = Path(directory) # Guard clause: check if directory exists @@ -177,8 +177,10 @@ def scan_directory(self, directory: Union[str, os.PathLike]) -> List[MagikaResul collected_paths: List[Union[str, os.PathLike]] = [] - # rglob('*') recursively finds all files and directories - for item in path_obj.rglob('*'): + # Use rglob('*') for recursive scan, glob('*') for single directory + glob_pattern = path_obj.rglob('*') if recursive_scan else path_obj.glob('*') + + for item in glob_pattern: # We only want files, not sub-directories themselves if item.is_file(): collected_paths.append(item) diff --git a/test_magika.py b/test_magika.py deleted file mode 100644 index 301779a0..00000000 --- a/test_magika.py +++ /dev/null @@ -1,19 +0,0 @@ -#from magika import Magika -import sys -import os - -module_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'python', 'src')) - -if module_path not in sys.path: - sys.path.insert(0, module_path) - -import magika -#m = Magika() -m = magika.Magika() -#res = m.identify_path('./tests_data/basic/python') -res = m.scan_directory('./tests_data/scan_directory') -print("------------------ PREDICTIONS ------------------") -for result in res: - print(result) - print() -#print(res) \ No newline at end of file From 6b451b0872d11d5f4710076a549e1c20ef852ff4 Mon Sep 17 00:00:00 2001 From: Anthony Vargas <90121982+Speedrunyourknowledge@users.noreply.github.com> Date: Mon, 1 Dec 2025 16:53:31 -0500 Subject: [PATCH 4/6] Fix syntax of scan_directory test --- python/tests/test_magika_python_module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/test_magika_python_module.py b/python/tests/test_magika_python_module.py index c7595e64..a1d6fd57 100644 --- a/python/tests/test_magika_python_module.py +++ b/python/tests/test_magika_python_module.py @@ -76,7 +76,7 @@ def test_magika_module_with_basic_tests_by_directory() -> None: tests_paths = utils.get_directory_test_dir() m = Magika() - results = m.scan_directory(tests_paths) + _ = m.scan_directory(tests_paths) def test_magika_module_with_basic_tests_by_paths() -> None: From b7978e205f862626c5212762af63f0e4d69daabc Mon Sep 17 00:00:00 2001 From: Anthony Vargas <90121982+Speedrunyourknowledge@users.noreply.github.com> Date: Mon, 1 Dec 2025 17:15:00 -0500 Subject: [PATCH 5/6] Fix format with ruff --- python/src/magika/magika.py | 16 ++++++++++------ python/tests/test_magika_python_module.py | 1 + 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/python/src/magika/magika.py b/python/src/magika/magika.py index fb1ba574..081ab4ce 100644 --- a/python/src/magika/magika.py +++ b/python/src/magika/magika.py @@ -166,20 +166,24 @@ def identify_paths( ) return self._get_results_from_paths(paths_) - - def scan_directory(self, directory: Union[str, os.PathLike], recursive_scan=False) -> List[MagikaResult]: + + def scan_directory( + self, directory: Union[str, os.PathLike], recursive_scan=False + ) -> List[MagikaResult]: """Identify the content type of all files in a directory given its path.""" path_obj = Path(directory) - + # Guard clause: check if directory exists if not path_obj.exists() or not path_obj.is_dir(): - raise FileNotFoundError(f"The directory '{directory}' does not exist or is not a directory.") + raise FileNotFoundError( + f"The directory '{directory}' does not exist or is not a directory." + ) collected_paths: List[Union[str, os.PathLike]] = [] # Use rglob('*') for recursive scan, glob('*') for single directory - glob_pattern = path_obj.rglob('*') if recursive_scan else path_obj.glob('*') - + glob_pattern = path_obj.rglob("*") if recursive_scan else path_obj.glob("*") + for item in glob_pattern: # We only want files, not sub-directories themselves if item.is_file(): diff --git a/python/tests/test_magika_python_module.py b/python/tests/test_magika_python_module.py index a1d6fd57..d31b7dfa 100644 --- a/python/tests/test_magika_python_module.py +++ b/python/tests/test_magika_python_module.py @@ -72,6 +72,7 @@ def test_magika_module_with_explicit_model_dir() -> None: with open(test_path, "rb") as f: _ = m.identify_stream(f) + def test_magika_module_with_basic_tests_by_directory() -> None: tests_paths = utils.get_directory_test_dir() From fc502c7be5293a7bc088c3ed2d87e1c511edc881 Mon Sep 17 00:00:00 2001 From: Anthony Vargas <90121982+Speedrunyourknowledge@users.noreply.github.com> Date: Mon, 1 Dec 2025 17:19:59 -0500 Subject: [PATCH 6/6] Fix missing type annotation --- python/src/magika/magika.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/src/magika/magika.py b/python/src/magika/magika.py index 081ab4ce..120efad4 100644 --- a/python/src/magika/magika.py +++ b/python/src/magika/magika.py @@ -168,7 +168,7 @@ def identify_paths( return self._get_results_from_paths(paths_) def scan_directory( - self, directory: Union[str, os.PathLike], recursive_scan=False + self, directory: Union[str, os.PathLike], recursive_scan: bool = False ) -> List[MagikaResult]: """Identify the content type of all files in a directory given its path.""" path_obj = Path(directory)