diff --git a/python/src/magika/magika.py b/python/src/magika/magika.py
index fa2152ea..120efad4 100644
--- a/python/src/magika/magika.py
+++ b/python/src/magika/magika.py
@@ -167,6 +167,41 @@ def identify_paths(
 
         return self._get_results_from_paths(paths_)
 
+    def scan_directory(
+        self, directory: Union[str, os.PathLike], recursive_scan: bool = False
+    ) -> List[MagikaResult]:
+        """Identify the content type of all files in a directory.
+
+        Args:
+            directory: Path of the directory to scan.
+            recursive_scan: If True, also scan nested sub-directories;
+                otherwise only the directory's direct children are scanned.
+
+        Returns:
+            Whatever `_get_results_from_paths` returns for the files found,
+            which are passed to it in sorted path order.
+
+        Raises:
+            FileNotFoundError: If `directory` does not exist or is not a
+                directory.
+        """
+        dir_path = Path(directory)
+
+        # `is_dir()` is already False for a missing path, so a separate
+        # `exists()` check would be redundant.
+        if not dir_path.is_dir():
+            raise FileNotFoundError(
+                f"The directory '{directory}' does not exist or is not a directory."
+            )
+
+        entries = dir_path.rglob("*") if recursive_scan else dir_path.glob("*")
+
+        # Keep files only, not the sub-directories themselves. Glob order is
+        # filesystem-dependent, so sort to make the result order reproducible.
+        paths_ = sorted(entry for entry in entries if entry.is_file())
+
+        return self._get_results_from_paths(paths_)
+
     def identify_bytes(self, content: bytes) -> MagikaResult:
         """Identify the content type of raw bytes."""
         if not isinstance(content, bytes):
diff --git a/python/tests/test_magika_python_module.py b/python/tests/test_magika_python_module.py
index e25bd40b..d31b7dfa 100644
--- a/python/tests/test_magika_python_module.py
+++ b/python/tests/test_magika_python_module.py
@@ -73,6 +73,14 @@ def test_magika_module_with_explicit_model_dir() -> None:
         _ = m.identify_stream(f)
 
 
+def test_magika_module_with_basic_tests_by_directory() -> None:
+    tests_dir = utils.get_directory_test_dir()
+
+    m = Magika()
+    _ = m.scan_directory(tests_dir)
+    _ = m.scan_directory(tests_dir, recursive_scan=True)
+
+
 def test_magika_module_with_basic_tests_by_paths() -> None:
     tests_paths = utils.get_basic_test_files_paths()
 
diff --git a/python/tests/utils.py b/python/tests/utils.py
index 375dcd8c..8a38eac1 100644
--- a/python/tests/utils.py
+++ b/python/tests/utils.py
@@ -42,6 +42,12 @@ def get_basic_tests_files_dir() -> Path:
     return tests_files_dir
 
 
+def get_directory_test_dir() -> Path:
+    tests_files_dir = get_tests_data_dir() / "scan_directory"
+    assert tests_files_dir.is_dir()
+    return tests_files_dir
+
+
 def get_mitra_tests_files_dir() -> Path:
     tests_files_dir = get_tests_data_dir() / "mitra"
     assert tests_files_dir.is_dir()
diff --git a/tests_data/scan_directory/code2.py b/tests_data/scan_directory/code2.py
new file mode 100644
index 00000000..a2931d6d
--- /dev/null
+++ b/tests_data/scan_directory/code2.py
@@ -0,0 +1,32 @@
+import random
+import time
+
+class NumberGuesser:
+    def __init__(self, limit=100):
+        self.limit = limit
+        self.target = random.randint(1, limit)
+        self.attempts = 0
+
+    def guess(self, user_input):
+        self.attempts += 1
+        try:
+            val = int(user_input)
+        except ValueError:
+            return "Please enter a valid integer."
+
+        if val < self.target:
+            return "Too low!"
+        elif val > self.target:
+            return "Too high!"
+        else:
+            return f"Correct! It took you {self.attempts} tries."
+
+def main():
+    game = NumberGuesser()
+    print("I'm thinking of a number between 1 and 100.")
+    # Simulation of a game loop
+    for i in range(5):
+        print(f"Simulation guess {i}: {game.guess(i * 20)}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/tests_data/scan_directory/test.json b/tests_data/scan_directory/test.json
new file mode 100644
index 00000000..df29c36a
--- /dev/null
+++ b/tests_data/scan_directory/test.json
@@ -0,0 +1,31 @@
+{
+  "project": {
+    "id": 1024,
+    "name": "MagikaTest",
+    "isActive": true,
+    "tags": ["python", "cpp", "automation"],
+    "metadata": {
+      "created_at": "2023-10-27T10:00:00Z",
+      "author": "dev_user",
+      "version": 1.2
+    }
+  },
+  "employees": [
+    {
+      "id": 1,
+      "name": "Alice Smith",
+      "role": "Lead Developer",
+      "skills": ["Rust", "C++"]
+    },
+    {
+      "id": 2,
+      "name": "Bob Jones",
+      "role": "QA Engineer",
+      "skills": ["Python", "Selenium"]
+    }
+  ],
+  "settings": {
+    "retry_attempts": 5,
+    "timeout_ms": 3000
+  }
+}
\ No newline at end of file
diff --git a/tests_data/scan_directory/test.rs b/tests_data/scan_directory/test.rs
new file mode 100644
index 00000000..3ea8704d
--- /dev/null
+++ b/tests_data/scan_directory/test.rs
@@ -0,0 +1,36 @@
+use std::collections::HashMap;
+
+struct User {
+    username: String,
+    email: String,
+    sign_in_count: u64,
+    active: bool,
+}
+
+impl User {
+    fn new(username: String, email: String) -> User {
+        User {
+            username,
+            email,
+            sign_in_count: 1,
+            active: true,
+        }
+    }
+
+    fn deactivate(&mut self) {
+        self.active = false;
+        println!("User {} has been deactivated.", self.username);
+    }
+}
+
+fn main() {
+    let mut users = HashMap::new();
+    let user1 = User::new(String::from("rust_fan"), String::from("rust@example.com"));
+
+    users.insert("u1", user1);
+
+    match users.get_mut("u1") {
+        Some(u) => u.deactivate(),
+        None => println!("User not found"),
+    }
+}
\ No newline at end of file
diff --git a/tests_data/scan_directory/test.ts b/tests_data/scan_directory/test.ts
new file mode 100644
index 00000000..5773bc3a
--- /dev/null
+++ b/tests_data/scan_directory/test.ts
@@ -0,0 +1,36 @@
+interface Task {
+    id: number;
+    title: string;
+    completed: boolean;
+    completedAt?: Date; // Optional property
+}
+
+class TaskManager {
+    private tasks: Task[] = [];
+
+    addTask(title: string): void {
+        const newTask: Task = {
+            id: this.tasks.length + 1,
+            title: title,
+            completed: false
+        };
+        this.tasks.push(newTask);
+    }
+
+    completeTask(id: number): void {
+        const task = this.tasks.find(t => t.id === id);
+        if (task) {
+            task.completed = true;
+            task.completedAt = new Date();
+            console.log(`Task '${task.title}' marked as done.`);
+        }
+    }
+
+    listTasks(): Task[] {
+        return this.tasks;
+    }
+}
+
+const manager = new TaskManager();
+manager.addTask("Learn TypeScript");
+manager.completeTask(1);
\ No newline at end of file