From 3dedbae1b64b21b0988f70908c169a60e781c44e Mon Sep 17 00:00:00 2001 From: fcdl94 Date: Fri, 3 Oct 2025 12:36:37 +0000 Subject: [PATCH 1/2] feat(system): enhance archive extraction by removing __MACOSX directory and validating train/validation splits - Added functionality to remove the __MACOSX directory after extraction. - Implemented checks to ensure the presence of 'train' and 'validation' directories in the extracted content, raising errors if not found. --- focoos/utils/system.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/focoos/utils/system.py b/focoos/utils/system.py index 8217e6f8..fdfd1564 100644 --- a/focoos/utils/system.py +++ b/focoos/utils/system.py @@ -1,6 +1,7 @@ import importlib.metadata as metadata import os import platform +import shutil import subprocess import sys import tarfile @@ -340,9 +341,25 @@ def extract_archive( logger.info(f"[elapsed {t1 - t0:.3f} ] Extracted archive to: {extracted_dir}") comm.synchronize() + # Remove __MACOSX directory + if "__MACOSX" in os.listdir(extracted_dir): + shutil.rmtree(os.path.join(extracted_dir, "__MACOSX")) + if len(list_dir(extracted_dir)) == 1: extracted_dir = list_dir(extracted_dir)[0] + POSSIBLE_TRAIN_DIRS = ["train", "training"] + POSSIBLE_VAL_DIRS = ["valid", "val", "validation"] + if len(list_dir(extracted_dir)) > 1: + if not any(dir in POSSIBLE_TRAIN_DIRS for dir in os.listdir(extracted_dir)): + raise FileNotFoundError( + f"Train split not found in {extracted_dir}: {[str(x) for x in list_dir(extracted_dir)]}. You should provide a zip dataset with only a root folder or train and val subfolders." + ) + if not any(dir in POSSIBLE_VAL_DIRS for dir in os.listdir(extracted_dir)): + raise FileNotFoundError( + f"Validation split not found in {extracted_dir}: {[str(x) for x in list_dir(extracted_dir)]}. You should provide a zip dataset with only a root folder or train and val subfolders." + ) + # Optionally delete the original archive if delete_original: os.remove(archive_path) From 058b6b2c47e6a34a0adc902336d414d1ceba4991 Mon Sep 17 00:00:00 2001 From: fcdl94 Date: Fri, 3 Oct 2025 14:27:14 +0000 Subject: [PATCH 2/2] refactor(system): rename list_dir to list_directories and streamline directory validation - Renamed the function `list_dir` to `list_directories` for clarity. - Updated the extraction logic to use the new function name. - Improved validation checks for 'train' and 'validation' directories by using `Path` objects for better consistency and readability. --- focoos/utils/system.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/focoos/utils/system.py b/focoos/utils/system.py index fdfd1564..70ab5af8 100644 --- a/focoos/utils/system.py +++ b/focoos/utils/system.py @@ -267,7 +267,7 @@ def is_inside_sagemaker(): return res -def list_dir(base_directory: Union[str, Path]) -> List[Path]: +def list_directories(base_directory: Union[str, Path]) -> List[Path]: """ A function that lists directories within a base directory. @@ -311,10 +311,9 @@ def extract_archive( # Determine the extraction path t0 = time.time() base_dir = os.path.dirname(archive_path) + extracted_dir = base_dir if destination is not None: extracted_dir = os.path.join(base_dir, destination) - else: - extracted_dir = base_dir if comm.is_main_process(): logger.info(f"Extracting archive: {archive_path} to {extracted_dir}") @@ -345,20 +344,20 @@ def extract_archive( if "__MACOSX" in os.listdir(extracted_dir): shutil.rmtree(os.path.join(extracted_dir, "__MACOSX")) - if len(list_dir(extracted_dir)) == 1: - extracted_dir = list_dir(extracted_dir)[0] + if len(list_directories(extracted_dir)) == 1: + extracted_dir = list_directories(extracted_dir)[0] POSSIBLE_TRAIN_DIRS = ["train", "training"] POSSIBLE_VAL_DIRS = ["valid", "val", "validation"] - if len(list_dir(extracted_dir)) > 1: - if not any(dir in POSSIBLE_TRAIN_DIRS for dir in os.listdir(extracted_dir)): - raise FileNotFoundError( - f"Train split not found in {extracted_dir}: {[str(x) for x in list_dir(extracted_dir)]}. You should provide a zip dataset with only a root folder or train and val subfolders." - ) - if not any(dir in POSSIBLE_VAL_DIRS for dir in os.listdir(extracted_dir)): - raise FileNotFoundError( - f"Validation split not found in {extracted_dir}: {[str(x) for x in list_dir(extracted_dir)]}. You should provide a zip dataset with only a root folder or train and val subfolders." - ) + inner_dirs = list_directories(extracted_dir) + if not any(dir.name in POSSIBLE_TRAIN_DIRS for dir in inner_dirs): + raise FileNotFoundError( + f"Train split not found in {extracted_dir}: {[str(x) for x in inner_dirs]}. You should provide a zip dataset with only a root folder or train and val subfolders." + ) + if not any(dir.name in POSSIBLE_VAL_DIRS for dir in inner_dirs): + raise FileNotFoundError( + f"Validation split not found in {extracted_dir}: {[str(x) for x in inner_dirs]}. You should provide a zip dataset with only a root folder or train and val subfolders." + ) # Optionally delete the original archive if delete_original: