diff --git a/Dockerfile b/Dockerfile index 1c23e327..14b95998 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,8 +5,8 @@ LABEL authors="FeTS_Admin " RUN apt-get update && apt-get update --fix-missing && apt-get install -y libnss3 libnspr4 libxcursor1 libxcursor-dev libasound2 libdbus-1-dev libglfw3-dev libgles2-mesa-dev ffmpeg libsm6 libxext6 python3.8 python3.8-venv python3.8-dev python3-setuptools ENV PATH=/CaPTk/bin/qt/5.12.1/bin:/CaPTk/bin/qt/5.12.1/libexec:$PATH -ENV CMAKE_PREFIX_PATH=/CaPTk/bin/ITK-build:/CaPTk/bin/DCMTK-build:/CaPTk/bin/qt/5.12.1/lib/cmake/Qt5:$CMAKE_PREFIX_PATH - +ENV CMAKE_PREFIX_PATH=/CaPTk/bin/ITK-build:/CaPTk/bin/DCMTK-build:/CaPTk/bin/qt/5.12.1/lib/cmake/Qt5 +ENV DCMTK_DIR=/CaPTk/bin/DCMTK-build RUN pwd && ls -l WORKDIR /Front-End @@ -88,13 +88,10 @@ RUN pip install git+https://github.com/mlcommons/GaNDLF@616b37bafad8f89d5c816a88 # setup a separate env for nnunet RUN python -m venv /nnunet_env && /nnunet_env/bin/pip install --upgrade pip -RUN /nnunet_env/bin/pip install torch==1.12.1+cu102 torchvision==0.13.1+cu102 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu102 +RUN /nnunet_env/bin/pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118 RUN /nnunet_env/bin/pip install git+https://github.com/MIC-DKFZ/nnUNet.git@nnunetv1 -ENV nnUNet_raw_data_base="/tmp/nnUNet_raw_data_base" -ENV nnUNet_preprocessed="/tmp/nnUNet_preprocessed" -# see https://docs.docker.com/config/containers/resource_constraints/#gpu for detailed explanation ENV CUDA_VISIBLE_DEVICES="0" COPY ./mlcubes/data_preparation/project /project diff --git a/docs/assets/img/rano_docker.png b/docs/assets/img/rano_docker.png new file mode 100644 index 00000000..f80ad17a Binary files /dev/null and b/docs/assets/img/rano_docker.png differ diff --git a/mlcubes/.gitignore b/mlcubes/.gitignore index be8d1082..e883eb1d 100644 --- a/mlcubes/.gitignore +++ b/mlcubes/.gitignore @@ -7,4 +7,5 @@ */mlcube/workspace/* !requirements.txt !*/mlcube/workspace/parameters.yaml -models \ No newline at end of file +models +tmpmodel \ No newline at end of file diff --git a/mlcubes/data_preparation/mlcube/clean.sh b/mlcubes/data_preparation/mlcube/clean.sh new file mode 100644 index 00000000..d0bbc3cc --- /dev/null +++ b/mlcubes/data_preparation/mlcube/clean.sh @@ -0,0 +1,5 @@ +rm -rf workspace/data +rm -rf workspace/labels +rm -rf workspace/metadata +rm -rf workspace/report +rm -rf workspace/statistics diff --git a/mlcubes/data_preparation/mlcube/mlcube.yaml b/mlcubes/data_preparation/mlcube/mlcube.yaml index cd837282..bed72694 100644 --- a/mlcubes/data_preparation/mlcube/mlcube.yaml +++ b/mlcubes/data_preparation/mlcube/mlcube.yaml @@ -8,7 +8,7 @@ platform: docker: # Image name - image: mlcommons/rano-data-prep-mlcube:latest + image: mlcommons/rano-data-prep-mlcube:1.0.10 # Docker build context relative to $MLCUBE_ROOT. Default is `build`. build_context: "../project" # Docker file name within docker build context, default is `Dockerfile`. 
diff --git a/mlcubes/data_preparation/mlcube/tests.sh b/mlcubes/data_preparation/mlcube/tests.sh new file mode 100644 index 00000000..8c788080 --- /dev/null +++ b/mlcubes/data_preparation/mlcube/tests.sh @@ -0,0 +1,63 @@ + +DATA=./workspace/data + +run() { +mlcube run --mlcube ./mlcube.yaml --task prepare --network=none --mount=ro \ + report_file=report/report.yaml \ + labels_path=input_data \ + -Pdocker.cpu_args="-u $(id -u):$(id -g)" \ + -Pdocker.gpu_args="-u $(id -u):$(id -g)" +} + +run_other() { +mlcube run --mlcube ./mlcube.yaml --task sanity_check --network=none --mount=ro \ + -Pdocker.cpu_args="-u $(id -u):$(id -g)" \ + -Pdocker.gpu_args="-u $(id -u):$(id -g)" + +mlcube run --mlcube ./mlcube.yaml --task statistics --network=none --mount=ro \ + output_path=statistics/statistics.yaml \ + -Pdocker.cpu_args="-u $(id -u):$(id -g)" \ + -Pdocker.gpu_args="-u $(id -u):$(id -g)" +} + +STARTTIME=$(date +%s.%N) + + +run + +# manual review +cp $DATA/tumor_extracted/DataForQC/AAAC_1/2008.03.31/TumorMasksForQC/AAAC_1_2008.03.31_tumorMask_model_0.nii.gz \ + $DATA/tumor_extracted/DataForQC/AAAC_1/2008.03.31/TumorMasksForQC/finalized/AAAC_1_2008.03.31_tumorMask_model_0.nii.gz + +cp $DATA/tumor_extracted/DataForQC/AAAC_1/2012.01.02/TumorMasksForQC/AAAC_1_2012.01.02_tumorMask_model_0.nii.gz \ + $DATA/tumor_extracted/DataForQC/AAAC_1/2012.01.02/TumorMasksForQC/finalized/AAAC_1_2012.01.02_tumorMask_model_0.nii.gz + +cp $DATA/tumor_extracted/DataForQC/AAAC_2/2001.01.01/TumorMasksForQC/AAAC_2_2001.01.01_tumorMask_model_0.nii.gz \ + $DATA/tumor_extracted/DataForQC/AAAC_2/2001.01.01/TumorMasksForQC/finalized/AAAC_2_2001.01.01_tumorMask_model_0.nii.gz +# end manual review + +run & +PID=$! + +# prompt response +BREAK=0 +while [ $BREAK -eq "0" ] +do +if [ -f $DATA/".prompt.txt" ]; + then BREAK=1; +else + sleep 0.1s; +fi + +done + +echo -n "y" >> $DATA/.response.txt +# end prompt response + +wait ${PID} + +ENDTIME=$(date +%s.%N) +DIFF=$(echo "$ENDTIME - $STARTTIME" | bc) +echo $DIFF + +run_other \ No newline at end of file diff --git a/mlcubes/data_preparation/mlcube/tests_sing.sh b/mlcubes/data_preparation/mlcube/tests_sing.sh new file mode 100644 index 00000000..df9762c3 --- /dev/null +++ b/mlcubes/data_preparation/mlcube/tests_sing.sh @@ -0,0 +1,62 @@ + +DATA=./workspace/data + +run() { +mlcube run --mlcube ./mlcube.yaml --task prepare --network=none --mount=ro --platform=singularity \ + report_file=report/report.yaml \ + labels_path=input_data \ + -Psingularity.run_args="-nce" +} + +run_other() { +mlcube run --mlcube ./mlcube.yaml --task sanity_check --network=none --mount=ro --platform=singularity \ + -Psingularity.run_args="-nce" + + +mlcube run --mlcube ./mlcube.yaml --task statistics --network=none --mount=ro --platform=singularity \ + output_path=statistics/statistics.yaml \ + -Psingularity.run_args="-nce" + +} + +STARTTIME=$(date +%s.%N) + + +run + +# manual review +cp $DATA/tumor_extracted/DataForQC/AAAC_1/2008.03.31/TumorMasksForQC/AAAC_1_2008.03.31_tumorMask_model_0.nii.gz \ + $DATA/tumor_extracted/DataForQC/AAAC_1/2008.03.31/TumorMasksForQC/finalized/AAAC_1_2008.03.31_tumorMask_model_0.nii.gz + +cp $DATA/tumor_extracted/DataForQC/AAAC_1/2012.01.02/TumorMasksForQC/AAAC_1_2012.01.02_tumorMask_model_0.nii.gz \ + $DATA/tumor_extracted/DataForQC/AAAC_1/2012.01.02/TumorMasksForQC/finalized/AAAC_1_2012.01.02_tumorMask_model_0.nii.gz + +cp $DATA/tumor_extracted/DataForQC/AAAC_2/2001.01.01/TumorMasksForQC/AAAC_2_2001.01.01_tumorMask_model_0.nii.gz \ + 
$DATA/tumor_extracted/DataForQC/AAAC_2/2001.01.01/TumorMasksForQC/finalized/AAAC_2_2001.01.01_tumorMask_model_0.nii.gz +# end manual review + +run & +PID=$! + +# prompt response +BREAK=0 +while [ $BREAK -eq "0" ] +do +if [ -f $DATA/".prompt.txt" ]; + then BREAK=1; +else + sleep 0.1s; +fi + +done + +echo -n "y" >> $DATA/.response.txt +# end prompt response + +wait ${PID} + +ENDTIME=$(date +%s.%N) +DIFF=$(echo "$ENDTIME - $STARTTIME" | bc) +echo $DIFF + +run_other \ No newline at end of file diff --git a/mlcubes/data_preparation/project/Dockerfile.dev b/mlcubes/data_preparation/project/Dockerfile.dev new file mode 100644 index 00000000..4eb71cf2 --- /dev/null +++ b/mlcubes/data_preparation/project/Dockerfile.dev @@ -0,0 +1,17 @@ +FROM hasan7/baselocal:0.0.0 + +COPY ./atlasImage_0.125.nii.gz /project +COPY ./tmpmodel /project + +# use a downsampled reference image for DICOM to NIFTI conversion +RUN mv /project/atlasImage_0.125.nii.gz /Front-End/bin/install/appdir/usr/data/sri24/atlasImage.nii.gz + +# remove heavy brain extraction models +RUN rm -rf /project/stages/data_prep_models/brain_extraction/model_0/ +RUN rm -rf /project/stages/data_prep_models/brain_extraction/model_1/ + +# use dummy brain extraction models +RUN cp -r /project/tmpmodel /project/stages/data_prep_models/brain_extraction/model_0 +RUN mv /project/tmpmodel /project/stages/data_prep_models/brain_extraction/model_1 + +ENTRYPOINT ["python", "/project/mlcube.py"] \ No newline at end of file diff --git a/mlcubes/data_preparation/project/README.md b/mlcubes/data_preparation/project/README.md new file mode 100644 index 00000000..46e8cd47 --- /dev/null +++ b/mlcubes/data_preparation/project/README.md @@ -0,0 +1,12 @@ +# How to run tests + +1. Download and extract (sha256: 701fbba8b253fc5b2f54660837c493a38dec986df9bdbf3d97f07c8bc276a965): + + +2. Move `additional_files` and `input_data` to the mlcube workspace +3. Move `tmpmodel` and `atlasImage_0.125.nii.gz` to the mlcube project folder + +4. Build the base docker image from the repo's root folder Dockerfile +5. Build the dev docker image using `Dockerfile.dev` in the mlcube project folder. +6. Then change the docker image name in `mlcube.yaml` according to step 5. +7. 
Then go to the `mlcube` folder and run the test scripts. diff --git a/mlcubes/data_preparation/project/mlcube.py b/mlcubes/data_preparation/project/mlcube.py index f798c49d..7d0c96e6 100644 --- a/mlcubes/data_preparation/project/mlcube.py +++ b/mlcubes/data_preparation/project/mlcube.py @@ -1,12 +1,13 @@ """MLCube handler file""" +import os import typer import subprocess - +import shutil app = typer.Typer() -def exec_python(cmd: str) -> None: +def exec_python(cmd: str, check_for_failure=True) -> None: """Execute a python script as a subprocess Args: @@ -15,7 +16,8 @@ def exec_python(cmd: str) -> None: splitted_cmd = cmd.split() process = subprocess.Popen(splitted_cmd, cwd=".") process.wait() - assert process.returncode == 0, f"command failed: {cmd}" + if check_for_failure: + assert process.returncode == 0, f"command failed: {cmd}" @app.command("prepare") @@ -29,10 +31,9 @@ def prepare( report_file: str = typer.Option(..., "--report_file"), metadata_path: str = typer.Option(..., "--metadata_path"), ): - cmd = f"python3 project/prepare.py --data_path={data_path} --labels_path={labels_path} --models_path={models_path} --data_out={output_path} --labels_out={output_labels_path} --report={report_file} --parameters={parameters_file} --metadata_path={metadata_path}" + cmd = f"python3 /project/prepare.py --data_path={data_path} --labels_path={labels_path} --models_path={models_path} --data_out={output_path} --labels_out={output_labels_path} --report={report_file} --parameters={parameters_file} --metadata_path={metadata_path}" exec_python(cmd) - @app.command("sanity_check") def sanity_check( data_path: str = typer.Option(..., "--data_path"), @@ -41,7 +42,7 @@ def sanity_check( metadata_path: str = typer.Option(..., "--metadata_path"), ): # Modify the sanity_check command as needed - cmd = f"python3 project/sanity_check.py --data_path={data_path} --labels_path={labels_path} --metadata={metadata_path}" + cmd = f"python3 /project/sanity_check.py --data_path={data_path} --labels_path={labels_path} --metadata={metadata_path}" exec_python(cmd) @@ -54,8 +55,8 @@ def sanity_check( out_path: str = typer.Option(..., "--output_path"), ): # Modify the statistics command as needed - cmd = f"python3 project/statistics.py --data_path={data_path} --labels_path={labels_path} --out_file={out_path} --metadata={metadata_path}" - exec_python(cmd) + cmd = f"python3 /project/statistics.py --data_path={data_path} --labels_path={labels_path} --out_file={out_path} --metadata={metadata_path}" + exec_python(cmd, check_for_failure=False) # Don't throw an error if it fails, to avoid confusing users with a traceback if __name__ == "__main__": diff --git a/mlcubes/data_preparation/project/prepare.py b/mlcubes/data_preparation/project/prepare.py index 754d1446..e0eb3bf9 100644 --- a/mlcubes/data_preparation/project/prepare.py +++ b/mlcubes/data_preparation/project/prepare.py @@ -1,4 +1,5 @@ import os +import shutil import argparse import pandas as pd import yaml @@ -157,10 +158,25 @@ def init_report(args) -> pd.DataFrame: def main(): args = setup_argparser() - os.environ["RESULTS_FOLDER"] = os.path.join(args.models, "nnUNet_trained_models") + output_path = args.data_out + models_path = args.models + + tmpfolder = os.path.join(output_path, ".tmp") + cbica_tmpfolder = os.path.join(tmpfolder, ".cbicaTemp") + os.environ["TMPDIR"] = tmpfolder + os.environ["CBICA_TEMP_DIR"] = cbica_tmpfolder + os.makedirs(tmpfolder, exist_ok=True) + os.makedirs(cbica_tmpfolder, exist_ok=True) + os.environ["RESULTS_FOLDER"] = os.path.join(models_path,
"nnUNet_trained_models") + os.environ["nnUNet_raw_data_base"] = os.path.join(tmpfolder, "nnUNet_raw_data_base") + os.environ["nnUNet_preprocessed"] = os.path.join(tmpfolder, "nnUNet_preprocessed") + report = init_report(args) pipeline = init_pipeline(args) pipeline.run(report, args.report) + # cleanup tmp folder + shutil.rmtree(tmpfolder, ignore_errors=True) + if __name__ == "__main__": main() \ No newline at end of file diff --git a/mlcubes/data_preparation/project/requirements.txt b/mlcubes/data_preparation/project/requirements.txt index f07ba674..baa55254 100644 --- a/mlcubes/data_preparation/project/requirements.txt +++ b/mlcubes/data_preparation/project/requirements.txt @@ -1,10 +1,10 @@ -typer -pandas -PyYAML +typer==0.9.0 +pandas==1.5.3 +PyYAML==6.0.1 # Include all your requirements here -SimpleITK -tqdm -scikit-image +SimpleITK==2.3.1 +tqdm==4.66.2 +scikit-image==0.21.0 FigureGenerator==0.0.4 gandlf==0.0.16 labelfusion==1.0.14 diff --git a/mlcubes/data_preparation/project/sanity_check.py b/mlcubes/data_preparation/project/sanity_check.py index e4da55bb..d2137e37 100644 --- a/mlcubes/data_preparation/project/sanity_check.py +++ b/mlcubes/data_preparation/project/sanity_check.py @@ -14,9 +14,9 @@ def sanity_check(data_path: str, labels_path: str): """ # Here you must add all the checks you consider important regarding the # state of the data - assert has_prepared_folder_structure( - data_path, labels_path - ), "The contents of the labels and data don't ressemble a prepared dataset" + if not has_prepared_folder_structure(data_path, labels_path): + print("The contents of the labels and data don't resemble a prepared dataset", flush=True) + exit(1) if __name__ == "__main__": diff --git a/mlcubes/data_preparation/project/stages/comparison.py b/mlcubes/data_preparation/project/stages/comparison.py index 9d17f2ad..32a21747 100644 --- a/mlcubes/data_preparation/project/stages/comparison.py +++ b/mlcubes/data_preparation/project/stages/comparison.py @@ -106,6 +106,7 @@ def __report_success( return report def could_run(self, index: Union[str, int], report: DataFrame) -> bool: + print(f"Checking if {self.name} can run") # Ensure a single reviewed segmentation file exists path = self.__get_input_path(index) gt_path = self.__get_backup_path(index) @@ -125,6 +126,7 @@ def could_run(self, index: Union[str, int], report: DataFrame) -> bool: prev_hash = report.loc[index]["segmentation_hash"] hash_changed = prev_hash != reviewed_hash + print(f"{path_exists=} and {contains_case=} and {gt_path_exists=} and {hash_changed=}") is_valid = path_exists and contains_case and gt_path_exists and hash_changed return is_valid diff --git a/mlcubes/data_preparation/project/stages/confirm.py b/mlcubes/data_preparation/project/stages/confirm.py index 63f706e9..ae830748 100644 --- a/mlcubes/data_preparation/project/stages/confirm.py +++ b/mlcubes/data_preparation/project/stages/confirm.py @@ -127,6 +127,7 @@ def __process_row(self, row: pd.Series) -> pd.Series: return row def could_run(self, report: DataFrame) -> bool: + print(f"Checking if {self.name} can run") # could run once all cases have been compared to the ground truth missing_voxels = report["num_changed_voxels"].isnull().values.any() prev_path_exists = os.path.exists(self.prev_stage_path) @@ -134,6 +135,7 @@ def could_run(self, report: DataFrame) -> bool: if prev_path_exists: empty_prev_path = len(os.listdir(self.prev_stage_path)) == 0 + print(f"{prev_path_exists=} and not {empty_prev_path=} and not {missing_voxels=}") return prev_path_exists and not 
empty_prev_path and not missing_voxels def execute(self, report: DataFrame) -> Tuple[DataFrame, bool]: diff --git a/mlcubes/data_preparation/project/stages/extract.py b/mlcubes/data_preparation/project/stages/extract.py index c8987391..8c76d96d 100644 --- a/mlcubes/data_preparation/project/stages/extract.py +++ b/mlcubes/data_preparation/project/stages/extract.py @@ -57,8 +57,11 @@ def could_run(self, index: Union[str, int], report: pd.DataFrame) -> bool: Returns: bool: Wether this stage could be executed for the given case """ + print(f"Checking if {self.name} can run") prev_paths = self.__get_paths(index, self.prev_path, self.prev_subpath) - return all([os.path.exists(path) for path in prev_paths]) + is_valid = all([os.path.exists(path) for path in prev_paths]) + print(f"{is_valid=}") + return is_valid def execute( self, index: Union[str, int], report: pd.DataFrame @@ -97,6 +100,7 @@ def __copy_case(self, index: Union[str, int]): prev_paths = self.__get_paths(index, self.prev_path, self.prev_subpath) copy_paths = self.__get_paths(index, self.out_path, self.prev_subpath) for prev, copy in zip(prev_paths, copy_paths): + shutil.rmtree(copy, ignore_errors=True) shutil.copytree(prev, copy, dirs_exist_ok=True) def _process_case(self, index: Union[str, int]): diff --git a/mlcubes/data_preparation/project/stages/generate_report.py b/mlcubes/data_preparation/project/stages/generate_report.py index a7661137..44317935 100644 --- a/mlcubes/data_preparation/project/stages/generate_report.py +++ b/mlcubes/data_preparation/project/stages/generate_report.py @@ -296,7 +296,9 @@ def execute(self, report: pd.DataFrame) -> Tuple[pd.DataFrame, bool]: "labels_path", "input_hash", ] + print("Initializing report") if report is None: + print("No previous report was identified. Creating a new one") report = pd.DataFrame(columns=cols) input_is_prepared = has_prepared_folder_structure( @@ -304,6 +306,7 @@ def execute(self, report: pd.DataFrame) -> Tuple[pd.DataFrame, bool]: ) if input_is_prepared: # If prepared, store data directly in the data folder + print("Input data looks prepared already. Skipping preprocessing") self.output_path = self.done_data_out_path observed_cases = set() @@ -357,15 +360,23 @@ def execute(self, report: pd.DataFrame) -> Tuple[pd.DataFrame, bool]: if index in report.index: # Case has already been identified, see if input hash is different + # or if status is corrupted # if so, override the contents and restart the state for that case - if report.loc[index]["input_hash"] == input_hash: + case = report.loc[index] + has_not_changed = case["input_hash"] == input_hash + has_a_valid_status = not np.isnan(case["status"]) + if has_not_changed and has_a_valid_status: continue + print(f"Case {index} has either changed ({not has_not_changed}) or has a corrupted status ({not has_a_valid_status}). Starting from scratch") + shutil.rmtree(out_tp_path, ignore_errors=True) shutil.copytree(in_tp_path, out_tp_path) report = report.drop(index) else: # New case not identified by the report. Add it + print(f"New case identified: {index}. 
Adding to report") + shutil.rmtree(out_tp_path, ignore_errors=True) shutil.copytree(in_tp_path, out_tp_path) data = { diff --git a/mlcubes/data_preparation/project/stages/get_csv.py b/mlcubes/data_preparation/project/stages/get_csv.py index 417a7153..ac75692e 100644 --- a/mlcubes/data_preparation/project/stages/get_csv.py +++ b/mlcubes/data_preparation/project/stages/get_csv.py @@ -47,10 +47,12 @@ def could_run(self, index: Union[str, int], report: pd.DataFrame) -> bool: Returns: bool: wether this stage could be executed """ + print(f"Checking if {self.name} can run") id, tp = get_id_tp(index) prev_case_path = os.path.join(self.prev_stage_path, id, tp) - - return os.path.exists(prev_case_path) + is_valid = os.path.exists(prev_case_path) + print(f"{is_valid=}") + return is_valid def execute( self, index: Union[str, int], report: pd.DataFrame @@ -73,6 +75,7 @@ def execute( # We will first copy the timepoint to the out folder # This is so, if successful, the csv will point to the data # in the next stage, instead of the previous + shutil.rmtree(tp_out_path, ignore_errors=True) shutil.copytree(tp_path, tp_out_path) try: @@ -92,23 +95,35 @@ def execute( update_row_with_dict(report, report_data, index) return report, False - if f"{id}_{tp}" in self.csv_processor.subject_timepoint_missing_modalities: - shutil.rmtree(tp_out_path, ignore_errors=True) - # Differentiate errors by floating point value - status_code = -self.status_code - 0.1 # -1.1 - report_data["status"] = status_code - report_data["data_path"] = tp_path - success = False - elif f"{id}_{tp}" in self.csv_processor.subject_timepoint_extra_modalities: - shutil.rmtree(tp_out_path, ignore_errors=True) - # Differentiate errors by floating point value - status_code = -self.status_code - 0.2 # -1.2 - report_data["status"] = status_code - report_data["data_path"] = tp_path - success = False - else: + missing = self.csv_processor.subject_timepoint_missing_modalities + extra = self.csv_processor.subject_timepoint_extra_modalities + + success = True + report_data["comment"] = "" + for missing_subject, msg in missing: + if f"{id}_{tp}" in missing_subject: + # Differentiate errors by floating point value + status_code = -self.status_code - 0.1 # -1.1 + report_data["status"] = status_code + report_data["data_path"] = tp_path + report_data["comment"] += "\n\n" + msg + success = False + + for extra_subject, msg in extra: + if f"{id}_{tp}" in extra_subject: + # Differentiate errors by floating point value + status_code = -self.status_code - 0.2 # -1.2 + report_data["status"] = status_code + report_data["data_path"] = tp_path + report_data["comment"] += "\n\n" + msg + success = False + + if success: shutil.rmtree(tp_path) - success = True + else: + shutil.rmtree(tp_out_path, ignore_errors=True) + + report_data["comment"] = report_data["comment"].strip() update_row_with_dict(report, report_data, index) diff --git a/mlcubes/data_preparation/project/stages/manual.py b/mlcubes/data_preparation/project/stages/manual.py index 89ce48a7..a3bc5509 100644 --- a/mlcubes/data_preparation/project/stages/manual.py +++ b/mlcubes/data_preparation/project/stages/manual.py @@ -97,7 +97,7 @@ def __report_step_missing( return report def __report_multiple_cases_error( - self, index: Union[str, int], report: pd.DataFrame + self, index: Union[str, int], report: pd.DataFrame, cases: list ) -> pd.DataFrame: path = self.__get_output_path(index) data_path = report.loc[index, "data_path"] @@ -106,6 +106,8 @@ def __report_multiple_cases_error( "status": -self.status_code - 0.1, # 
-5.1 "data_path": data_path, "labels_path": path, + "comment": f"Multiple files were identified in the labels path: {cases}. " \ + + "Please ensure that there is only the manually corrected segmentation file." } update_row_with_dict(report, report_data, index) return report @@ -155,6 +157,7 @@ def __report_rollback( return report def could_run(self, index: Union[str, int], report: pd.DataFrame) -> bool: + print(f"Checking if {self.name} can run") out_path = self.__get_output_path(index) cases = [] if os.path.exists(out_path): @@ -169,6 +172,7 @@ def could_run(self, index: Union[str, int], report: pd.DataFrame) -> bool: segmentation_exists = os.path.exists(in_path) annotation_exists = len(cases) == 1 brain_mask_changed = brain_mask_hash != expected_brain_mask_hash + print(f"{segmentation_exists=} and (not {annotation_exists=} or {brain_mask_changed=})") return segmentation_exists and (not annotation_exists or brain_mask_changed) def execute( @@ -213,7 +217,7 @@ def execute( if len(cases) > 1: # Found more than one reviewed case - return self.__report_multiple_cases_error(index, report), False + return self.__report_multiple_cases_error(index, report, cases), False elif not len(cases): # Found no cases yet reviewed return self.__report_step_missing(index, report), False diff --git a/mlcubes/data_preparation/project/stages/nifti_transform.py b/mlcubes/data_preparation/project/stages/nifti_transform.py index 5ff1fc61..9267c393 100644 --- a/mlcubes/data_preparation/project/stages/nifti_transform.py +++ b/mlcubes/data_preparation/project/stages/nifti_transform.py @@ -42,10 +42,13 @@ def could_run(self, index: Union[str, int], report: pd.DataFrame) -> bool: Returns: bool: Wether this stage could be executed for the given case """ + print(f"Checking if {self.name} can run") id, tp = get_id_tp(index) prev_case_path = os.path.join(self.prev_stage_path, id, tp) if os.path.exists(prev_case_path): - return len(os.listdir(prev_case_path)) > 0 + is_valid = len(os.listdir(prev_case_path)) > 0 + print(f"{is_valid}") + return is_valid return False def execute(self, index: Union[str, int], report: pd.DataFrame) -> pd.DataFrame: @@ -60,12 +63,23 @@ def execute(self, index: Union[str, int], report: pd.DataFrame) -> pd.DataFrame: """ self.__prepare_exec() self.__process_case(index) + self.__cleanup_artifacts(index) report, success = self.__update_report(index, report) self.prep.write() self.__update_metadata() return report, success + def __cleanup_artifacts(self, index): + unused_artifacts_substrs = ["raw", "to_SRI", ".mat"] + _, out_path = self.__get_output_paths(index) + root_artifacts = os.listdir(out_path) + for artifact in root_artifacts: + if not any([substr in artifact for substr in unused_artifacts_substrs]): + continue + artifact_path = os.path.join(out_path, artifact) + os.remove(artifact_path) + def __get_output_paths(self, index: Union[str, int]): id, tp = get_id_tp(index) fets_path = os.path.join(self.prep.final_output_dir, id, tp) @@ -88,7 +102,7 @@ def __process_case(self, index: Union[str, int]): def __update_prev_stage_state(self, index: Union[str, int], report: pd.DataFrame): prev_data_path = report.loc[index]["data_path"] prev_data_path = unnormalize_path(prev_data_path, self.data_out) - shutil.rmtree(prev_data_path) + shutil.rmtree(prev_data_path, ignore_errors=True) def __undo_current_stage_changes(self, index: Union[str, int]): fets_path, qc_path = self.__get_output_paths(index) diff --git a/mlcubes/data_preparation/project/stages/pipeline.py 
b/mlcubes/data_preparation/project/stages/pipeline.py index 99fa2935..b876c851 100644 --- a/mlcubes/data_preparation/project/stages/pipeline.py +++ b/mlcubes/data_preparation/project/stages/pipeline.py @@ -2,6 +2,7 @@ from typing import Union, List, Tuple from tqdm import tqdm import traceback +from pathlib import Path import yaml import os @@ -30,8 +31,12 @@ def normalize_report_paths(report: DataFrame) -> DataFrame: def write_report(report: DataFrame, filepath: str): report = normalize_report_paths(report) report_dict = report.to_dict() - with open(filepath, "w") as f: + + # Use a temporary file to avoid collisions and corruption from rapid successive writes + temp_path = Path(filepath).parent / ".report.yaml" + with open(temp_path, 'w') as f: yaml.dump(report_dict, f) + os.rename(temp_path, filepath) class Pipeline: @@ -140,6 +145,8 @@ def determine_next_stage( runnable_stage = self.stages[i] could_run_stages.append(runnable_stage) + print(f"Possible next stages: {[stage.name for stage in could_run_stages]}") + # TODO: split into a function if len(could_run_stages) == 1: stage = could_run_stages[0] @@ -154,6 +161,8 @@ def determine_next_stage( # Either no stage can be executed (len(could_run_stages == 0)) # or multiple stages can be executed (len(could_run_stages > 1)) report_stage = self.__get_report_stage_to_run(subject, report) + if report_stage is not None: + print(f"Reported next stage: {report_stage.name}") # TODO: split into a function if len(could_run_stages) == 0: @@ -232,9 +241,13 @@ def process_subject( # Filter out invalid subjects working_report = report[~report.index.isin(invalid_subjects)].copy() + print(f"Determining next stage for {subject}", flush=True) stage, done = self.determine_next_stage(subject, working_report) + if stage is not None: + print(f"Next stage for {subject}: {stage.name}", flush=True) if done: + print(f"Subject {subject} is Done", flush=True) break try: diff --git a/src/applications/BraTSPipeline.cxx b/src/applications/BraTSPipeline.cxx index 5a49fce3..901a5ea4 100644 --- a/src/applications/BraTSPipeline.cxx +++ b/src/applications/BraTSPipeline.cxx @@ -155,13 +155,11 @@ int main(int argc, char** argv) return EXIT_FAILURE; } - auto tempOutput = cbica::getEnvironmentVariableValue("OUTPUT_DIR_FROM_MEDPERF"); - if (tempOutput == "") - { - tempOutput = cbica::createTemporaryDirectory(); - } + auto tempOutput = cbica::createTemporaryDirectory(); //construct command std::string fullCommandToRun = cbica::normPath(m_exe) + " -o " + cbica::normPath(tempOutput) + " -z y \"" + cbica::normPath(dicomFolderPath) + "\""; + std::cerr << "Executing DICOM conversion with the following command" << std::endl; + std::cerr << fullCommandToRun << std::endl; //run command via system call if (std::system((fullCommandToRun).c_str()) != 0) diff --git a/src/applications/CreateCSVForDICOMs.py b/src/applications/CreateCSVForDICOMs.py index aaf48a42..f7c03c3c 100644 --- a/src/applications/CreateCSVForDICOMs.py +++ b/src/applications/CreateCSVForDICOMs.py @@ -18,15 +18,26 @@ def verify_dicom_folder(dicom_folder: str) -> (bool, str): Returns: bool: True if the folder is a valid DICOM folder or a 3D NIfTI file, False otherwise. str: The path to the first DICOM file in the folder if the folder is a valid DICOM folder, the NIfTI file itself otherwise. + str: If verification failed, a message explaining the cause of failure is returned. Otherwise an empty string is returned.
""" if os.path.isdir(dicom_folder): series_IDs = sitk.ImageSeriesReader.GetGDCMSeriesIDs(dicom_folder) if not series_IDs: - return False, None + msg = ( + f"No valid or readable DICOM files were found in `{dicom_folder}`. " + "Please convert your images using an external tool (such as dcm2niix) " + "and try again by passing the NIfTI images in the CSV files instead of the DICOM images." + ) + return False, None, msg if len(series_IDs) > 1: - return False, None + msg = ( + f"More than 1 DICOM series was found in `{dicom_folder}`, " + "which usually means that multiple modalities or timepoints " + "are present in that folder. Please ensure that only 1 modality is present per folder and try again." + ) + return False, None, msg series_file_names = sitk.ImageSeriesReader.GetGDCMSeriesFileNames( dicom_folder, series_IDs[0] @@ -41,9 +52,14 @@ def verify_dicom_folder(dicom_folder: str) -> (bool, str): series_file_names = [dicom_folder] if image.GetDimension() != 3: - return False, None + msg = ( + f"Only 3D volumes are supported, but the image in `{dicom_folder}` " + f"appears to have {int(image.GetDimension())} dimensions. If your are scans " + "have been encoded differently, please contact your vendor to convert them to 3D volumes." + ) + return False, None, msg - return True, series_file_names[0] + return True, series_file_names[0], "" def setup_argparser(): @@ -105,13 +121,23 @@ def process_timepoint(self, timepoint, subject, subject_dir): return modality_folders = os.listdir(timepoint_dir) + modality_folders = [folder for folder in modality_folders if not folder.startswith(".")] # check if there are missing modalities + subject_tp = subject + "_" + timepoint if len(modality_folders) < 4: - self.subject_timepoint_missing_modalities.append(subject + "_" + timepoint) + msg = ( + f"Less than 4 modalities where identified: {modality_folders}. " + "Please ensure all modalities are present." + ) + self.subject_timepoint_missing_modalities.append((subject_tp, msg)) return # check if there are extra modalities if len(modality_folders) > 4: - self.subject_timepoint_extra_modalities.append(subject + "_" + timepoint) + msg = ( + f"More than 4 modalities where identified: {modality_folders}. " + "Please review your data and remove any unwanted files/folders." + ) + self.subject_timepoint_extra_modalities.append((subject_tp, msg)) return # goldilocks zone @@ -133,23 +159,25 @@ def process_timepoint(self, timepoint, subject, subject_dir): if modality_id != modality_norm: continue - valid_dicom, first_dicom_file = verify_dicom_folder(modality_path) + valid_dicom, first_dicom_file, msg = verify_dicom_folder(modality_path) if valid_dicom: detected_modalities[modality_to_check] = first_dicom_file break else: - self.subject_timepoint_missing_modalities.append( - subject + "_" + timepoint + "_" + modality - ) + subject_tp_mod = subject + "_" + timepoint + "_" + modality + self.subject_timepoint_missing_modalities.append((subject_tp_mod, msg)) # check if any modalities are missing modalities_missing = False for modality in detected_modalities: if detected_modalities[modality] is None: modalities_missing = True - self.subject_timepoint_missing_modalities.append( - subject + "_" + timepoint + "_" + modality + subject_tp_mod = subject + "_" + timepoint + "_" + modality + msg = ( + f"A valid file/directory corresponding to modality {modality} could not be found. " + "Please ensure all modalities are included and are valid." 
) + self.subject_timepoint_missing_modalities.append((subject_tp_mod, msg)) if modalities_missing: return @@ -199,12 +227,12 @@ def main(inputDir: str, outputCSV: str): if len(missing) > 0: print( "WARNING: The following subject timepoints are missing modalities: ", - missing, + [subject for subject, _ in missing], ) if len(extra) > 0: print( "WARNING: The following subject timepoints have extra modalities: ", - extra, + [subject for subject, _ in extra], ) print("Done!") diff --git a/src/applications/PrepareDataset.py b/src/applications/PrepareDataset.py index 2edef449..de39132a 100644 --- a/src/applications/PrepareDataset.py +++ b/src/applications/PrepareDataset.py @@ -8,9 +8,8 @@ import numpy as np from skimage.measure import label from copy import deepcopy +import subprocess -from FigureGenerator.screenshot_maker import figure_generator -from GANDLF.cli import main_run from LabelFusion.wrapper import fuse_images from .constants import * @@ -111,7 +110,7 @@ def _get_relevant_dicom_tags(filename: str) -> dict: } for key in keys_to_extract: output_dict[keys_to_extract[key]] = series_reader.GetMetaData(0, key) - except RuntimeError as e: + except (RuntimeError, IndexError) as e: # print( # f"WARNING: Could not read DICOM tags from {input_dicom_dir}.", # ) @@ -142,14 +141,11 @@ def save_screenshot( ) ylabels = (",").join(MODALITIES_LIST) - figure_generator( - input_images=images, - ylabels=ylabels, - output=output_filename, - input_mask=input_mask, - flip_sagittal=True, - flip_coronal=True, - ) + cmd_path = os.path.join(os.path.dirname(__file__), "figure_generator_wrapper.py") + cmd = f"python3 {cmd_path} -i={images} -ylabels={ylabels} -output={output_filename}" + if input_mask is not None: + cmd += f" -masks={input_mask}" + subprocess.run(cmd.split(), check=True) def _read_image_with_min_check(filename): @@ -188,7 +184,7 @@ def _read_image_with_min_check(filename): sitk.WriteImage(output_image, filename) return 0 else: - return counts.astype(int) + return counts.sum().astype(int) return 0 @@ -330,16 +326,9 @@ def _run_brain_extraction_using_gandlf( config_file = posixpath.join(model_dir, file) break - main_run( - data_csv=data_path, - config_file=config_file, - model_dir=model_dir, - train_mode=False, - device="cpu", - resume=False, - reset=False, - output_dir=model_output_dir, - ) + cmd = f"gandlf_run -c={config_file} -i={data_path} -t=False -m={model_dir} -d=cpu -o={model_output_dir}" + subprocess.run(cmd.split(), check=True) + model_output_dir_testing = posixpath.join(model_output_dir, TESTING_FOLDER) modality_outputs = os.listdir(model_output_dir_testing) @@ -423,16 +412,8 @@ def _run_tumor_segmentation_using_gandlf( # parameters["model"]["type"] = "openvino" # yaml.safe_dump(parameters, open(config_file, "w")) - main_run( - data_csv=data_path, - config_file=config_file, - model_dir=model_dir, - train_mode=False, - device="cpu", - resume=False, - reset=False, - output_dir=model_output_dir, - ) + cmd = f"gandlf_run -c={config_file} -i={data_path} -t=False -m={model_dir} -d=cpu -o={model_output_dir}" + subprocess.run(cmd.split(), check=True) model_output_dir_testing = posixpath.join(model_output_dir, TESTING_FOLDER) # We expect one subject (one output modality, one file). 
@@ -620,6 +601,7 @@ def convert_to_dicom(self, idx: int, row: pd.Series, pbar: tqdm): command = ( bratsPipeline_exe + + " -d " + " -t1 " + row[parsed_headers["T1"]] + " -t1c " diff --git a/src/applications/figure_generator_wrapper.py b/src/applications/figure_generator_wrapper.py new file mode 100644 index 00000000..83a3a085 --- /dev/null +++ b/src/applications/figure_generator_wrapper.py @@ -0,0 +1,29 @@ +import argparse +from FigureGenerator.screenshot_maker import figure_generator + +def setup_argparser(): + parser = argparse.ArgumentParser("Figure Generator Wrapper") + parser.add_argument("-i", type=str) + parser.add_argument("-ylabels", type=str) + parser.add_argument("-output", type=str) + parser.add_argument("-masks", type=str, nargs='?', const=None, default=None) + return parser + +def main(): + args = setup_argparser().parse_args() + images = args.i + ylabels = args.ylabels + output_filename = args.output + input_mask = args.masks + + figure_generator( + input_images=images, + ylabels=ylabels, + output=output_filename, + input_mask=input_mask, + flip_sagittal=True, + flip_coronal=True, + ) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/cbica_toolkit/src/cbicaUtilities.cpp b/src/cbica_toolkit/src/cbicaUtilities.cpp index 7cfb7610..66b7e083 100644 --- a/src/cbica_toolkit/src/cbicaUtilities.cpp +++ b/src/cbica_toolkit/src/cbicaUtilities.cpp @@ -162,7 +162,11 @@ namespace cbica homeEnv = "HOME"; #endif - auto exeTempDir = cbica::getEnvironmentVariableValue(homeEnv) + "/.cbicaTemp/"/* + cbica::getExecutableName()*/; + auto exeTempDir = cbica::getEnvironmentVariableValue("CBICA_TEMP_DIR"); + if (exeTempDir == "") + { + exeTempDir = cbica::getEnvironmentVariableValue(homeEnv) + "/.cbicaTemp/"/* + cbica::getExecutableName()*/; + } if (!directoryExists(exeTempDir)) { createDir(exeTempDir);