From 97aa3734d9a477f734761b09df88f527411a1718 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 16 Jun 2025 15:32:41 -0700 Subject: [PATCH 1/4] feat: init capDSD --- docker-wrappers/capDSD/Dockerfile | 6 ++ docker-wrappers/capDSD/README.md | 3 + spras/capDSD.py | 91 +++++++++++++++++++ spras/runner.py | 1 + .../expected/capdsd-matrix-expected.txt | 5 + test/capDSD/input/capdsd-ppi.txt | 2 + test/capDSD/input/capdsd-ppip.txt | 1 + test/capDSD/test_capDSD.py | 61 +++++++++++++ .../expected/capdsd-ppi-expected.txt | 2 + test/generate-inputs/test_generate_inputs.py | 3 +- 10 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 docker-wrappers/capDSD/Dockerfile create mode 100644 docker-wrappers/capDSD/README.md create mode 100644 spras/capDSD.py create mode 100644 test/capDSD/expected/capdsd-matrix-expected.txt create mode 100644 test/capDSD/input/capdsd-ppi.txt create mode 100644 test/capDSD/input/capdsd-ppip.txt create mode 100644 test/capDSD/test_capDSD.py create mode 100644 test/generate-inputs/expected/capdsd-ppi-expected.txt diff --git a/docker-wrappers/capDSD/Dockerfile b/docker-wrappers/capDSD/Dockerfile new file mode 100644 index 000000000..cf4f2fafb --- /dev/null +++ b/docker-wrappers/capDSD/Dockerfile @@ -0,0 +1,6 @@ +FROM python:2.7.18 + +RUN pip install numpy==1.16.6 +RUN wget https://web.archive.org/web/20250616194746/http://dsd.cs.tufts.edu/capdsd/files//capDSD-src.zip + +RUN unzip capDSD-src.zip -d capDSD/ diff --git a/docker-wrappers/capDSD/README.md b/docker-wrappers/capDSD/README.md new file mode 100644 index 000000000..caa8b7400 --- /dev/null +++ b/docker-wrappers/capDSD/README.md @@ -0,0 +1,3 @@ +# capDSD Docker Image + +A Docker image for [capDSD](https://doi.org/10.1093/bioinformatics/btu263) that is available on [DockerHub](https://hub.docker.com/repository/docker/reedcompbio/capdsd). 
\ No newline at end of file diff --git a/spras/capDSD.py b/spras/capDSD.py new file mode 100644 index 000000000..0acdf4a78 --- /dev/null +++ b/spras/capDSD.py @@ -0,0 +1,91 @@ +from pathlib import Path + +from spras.containers import prepare_volume, run_container_and_log +from spras.dataset import Dataset +from spras.interactome import convert_directed_to_undirected +from spras.prm import PRM + + +__all__ = ['CapDSD'] + +class CapDSD(PRM): + required_inputs = ['ppi', 'ppip'] + + @staticmethod + def generate_inputs(data: Dataset, filename_map: dict[str, str]): + """ + Access fields from the dataset and write the required input files + @param data: dataset + @param filename_map: a dict mapping file types in the required_inputs to the filename for that type + """ + for input_type in CapDSD.required_inputs: + if input_type not in filename_map: + raise ValueError(f"{input_type} filename is missing") + + # create the ppi + ppi = data.get_interactome() + ppi = convert_directed_to_undirected(ppi) + ppi.to_csv(filename_map['ppi'], sep='\t', index=False, columns=["Interactor1", "Interactor2", "Weight"], + header=False) + + # then, we want to 'guide' the ppi with a .ppip file, which is a secondary, + # trusted interactome: we use the directed edges from the interactome as our + # trusted edges. 
+ ppip = data.get_interactome() + ppip = ppip[ppip["Direction"] == "D"] + ppip.to_csv(filename_map['ppip'], sep='\t', index=False, columns=["Interactor1", "Interactor2"], header=False) + + @staticmethod + def run(ppi=None, ppip=None, output_file=None, container_framework="docker"): + if not ppi or not ppip or not output_file: + raise ValueError("Required capDSD arguments are missing") + + work_dir = '/capDSD' + + volumes = list() + + bind_path, ppi_file = prepare_volume(ppi, work_dir) + volumes.append(bind_path) + + bind_path, ppip_file = prepare_volume(ppip, work_dir) + volumes.append(bind_path) + + # Create a prefix for the output filename and ensure the directory exists + out_dir = Path(output_file).parent + out_dir.mkdir(parents=True, exist_ok=True) + bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir) + volumes.append(bind_path) + mapped_out_prefix = mapped_out_dir + '/output' + + container_suffix = "capdsd" + + + # First, we move ppip_file to a subdirectory + run_container_and_log('capDSD', + container_framework, + container_suffix, + ['sh', '-c', f'mkdir ppip && mv {ppip_file} ppip/{ppip_file}'], + volumes, + work_dir) + + command = ['python', + '/capDSD/DSD.py', + '-pathmode', '1', + '-p', str(Path(ppip_file).parent), + ppi_file, mapped_out_prefix] + + + run_container_and_log('capDSD', + container_framework, + container_suffix, + command, + volumes, + work_dir) + + output_matrix = Path(out_dir) / 'output.dsd' + output_matrix.rename(output_file) + + @staticmethod + def parse_output(raw_pathway_file: str, standardized_pathway_file: str): + pass + \ No newline at end of file diff --git a/spras/runner.py b/spras/runner.py index 8490644c1..52810af01 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -1,6 +1,7 @@ # supported algorithm imports from spras.allpairs import AllPairs as allpairs from spras.dataset import Dataset +from spras.capDSD import CapDSD as capdsd from spras.domino import DOMINO as domino from spras.meo import MEO as meo from 
spras.mincostflow import MinCostFlow as mincostflow diff --git a/test/capDSD/expected/capdsd-matrix-expected.txt b/test/capDSD/expected/capdsd-matrix-expected.txt new file mode 100644 index 000000000..76753d4eb --- /dev/null +++ b/test/capDSD/expected/capdsd-matrix-expected.txt @@ -0,0 +1,5 @@ + A B C D +A 0.0 0.0 16.0 16.0 +B 0.0 0.0 16.0 16.0 +C 16.0 16.0 0.0 0.0 +D 16.0 16.0 0.0 0.0 diff --git a/test/capDSD/input/capdsd-ppi.txt b/test/capDSD/input/capdsd-ppi.txt new file mode 100644 index 000000000..4e05c5b63 --- /dev/null +++ b/test/capDSD/input/capdsd-ppi.txt @@ -0,0 +1,2 @@ +A B 0.5 +C D 0.75 \ No newline at end of file diff --git a/test/capDSD/input/capdsd-ppip.txt b/test/capDSD/input/capdsd-ppip.txt new file mode 100644 index 000000000..5aa7ba857 --- /dev/null +++ b/test/capDSD/input/capdsd-ppip.txt @@ -0,0 +1 @@ +B C \ No newline at end of file diff --git a/test/capDSD/test_capDSD.py b/test/capDSD/test_capDSD.py new file mode 100644 index 000000000..bed97c196 --- /dev/null +++ b/test/capDSD/test_capDSD.py @@ -0,0 +1,61 @@ +import filecmp +import shutil +from pathlib import Path + +import pytest + +import spras.config as config +from spras.capDSD import CapDSD + +config.init_from_file("config/config.yaml") + +TEST_DIR = Path('test', 'capDSD') +IN_DIR = TEST_DIR / 'input' +OUT_DIR = TEST_DIR / 'output' +EXPECTED_DIR = TEST_DIR / 'expected' + +INPUT_PPI = IN_DIR / 'capdsd-ppi.txt' +INPUT_PPIP = IN_DIR / 'capdsd-ppip.txt' + +OUT_FILE = OUT_DIR / 'output.txt' +EXPECTED_FILE = EXPECTED_DIR / 'capdsd-matrix-expected.txt' + +class TestCapDSD: + """ + Run capDSD tests in the Docker image + """ + def test_capdsd_required(self): + OUT_FILE.unlink(missing_ok=True) + # Only include required arguments + CapDSD.run( + ppi=INPUT_PPI, + ppip=INPUT_PPIP, + output_file=OUT_FILE + ) + assert OUT_FILE.exists() + + assert filecmp.cmp(OUT_FILE, EXPECTED_FILE) + + def test_capdsd_missing(self): + # Test the expected error is raised when required arguments are missing + with 
pytest.raises(ValueError): + # No PPI + CapDSD.run( + ppip=INPUT_PPIP, + output_file=OUT_FILE + ) + + # Only run Singularity test if the binary is available on the system + # spython is only available on Unix, but do not explicitly skip non-Unix platforms + @pytest.mark.skipif(not shutil.which('singularity'), reason='Singularity not found on system') + def test_capdsd_singularity(self): + OUT_FILE.unlink(missing_ok=True) + # Only include required arguments and run with Singularity + CapDSD.run( + ppi=INPUT_PPI, + ppip=INPUT_PPIP, + output_file=OUT_FILE, + container_framework="singularity") + assert OUT_FILE.exists() + + assert filecmp.cmp(OUT_FILE, EXPECTED_FILE) diff --git a/test/generate-inputs/expected/capdsd-ppi-expected.txt b/test/generate-inputs/expected/capdsd-ppi-expected.txt new file mode 100644 index 000000000..8334ffd53 --- /dev/null +++ b/test/generate-inputs/expected/capdsd-ppi-expected.txt @@ -0,0 +1,2 @@ +test_A B 0.98 +B C 0.77 diff --git a/test/generate-inputs/test_generate_inputs.py b/test/generate-inputs/test_generate_inputs.py index 6d732d315..0726e919c 100644 --- a/test/generate-inputs/test_generate_inputs.py +++ b/test/generate-inputs/test_generate_inputs.py @@ -16,7 +16,8 @@ 'omicsintegrator2': 'edges', 'domino': 'network', 'pathlinker': 'network', - 'allpairs': 'network' + 'allpairs': 'network', + 'capdsd': 'ppi' } From 6b05cde77679ff11e6b07e27d507fe6d6f242d1a Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Mon, 16 Jun 2025 15:37:02 -0700 Subject: [PATCH 2/4] fix: add missing cmts and such --- .github/workflows/build-containers.yml | 5 +++++ config/config.yaml | 4 ++++ docker-wrappers/capDSD/Dockerfile | 2 ++ spras/capDSD.py | 7 +++++++ test/parse-outputs/test_parse_outputs.py | 2 +- 5 files changed, 19 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-containers.yml b/.github/workflows/build-containers.yml index 8ebc59435..f371e30fd 100644 --- a/.github/workflows/build-containers.yml +++ b/.github/workflows/build-containers.yml @@ -48,6 +48,11 @@ jobs: with: path: docker-wrappers/Cytoscape container: reedcompbio/py4cytoscape + build-and-remove-capdsd: + uses: "./.github/workflows/build-and-remove-template.yml" + with: + path: docker-wrappers/capDSD + container: reedcompbio/capdsd build-and-remove-spras: uses: "./.github/workflows/build-and-remove-template.yml" with: diff --git a/config/config.yaml b/config/config.yaml index 4f16beded..fa6c17d59 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -96,6 +96,10 @@ algorithms: slice_threshold: [0.3] module_threshold: [0.05] + - name: "capdsd" + params: + include: true + # Here we specify which pathways to run and other file location information. # DataLoader.py can currently only load a single dataset # Assume that if a dataset label does not change, the lists of associated input files do not change diff --git a/docker-wrappers/capDSD/Dockerfile b/docker-wrappers/capDSD/Dockerfile index cf4f2fafb..4fb7b38ae 100644 --- a/docker-wrappers/capDSD/Dockerfile +++ b/docker-wrappers/capDSD/Dockerfile @@ -1,6 +1,8 @@ FROM python:2.7.18 RUN pip install numpy==1.16.6 +# Since this is an arbitrary internet ZIP file, we use the web archive link instead. +# TODO: checksum? 
RUN wget https://web.archive.org/web/20250616194746/http://dsd.cs.tufts.edu/capdsd/files//capDSD-src.zip RUN unzip capDSD-src.zip -d capDSD/ diff --git a/spras/capDSD.py b/spras/capDSD.py index 0acdf4a78..85695e85c 100644 --- a/spras/capDSD.py +++ b/spras/capDSD.py @@ -37,6 +37,13 @@ def generate_inputs(data: Dataset, filename_map: dict[str, str]): @staticmethod def run(ppi=None, ppip=None, output_file=None, container_framework="docker"): + """ + Run capDSD with Docker + @param ppi: input interactome file containing only undirected edges (required) + @param ppip: input interactome file containing only directed edges (required) + @param output_file: path to the output matrix (required) + @param container_framework: specify a container framework + """ if not ppi or not ppip or not output_file: raise ValueError("Required capDSD arguments are missing") diff --git a/test/parse-outputs/test_parse_outputs.py b/test/parse-outputs/test_parse_outputs.py index 49baf10f8..309d2920c 100644 --- a/test/parse-outputs/test_parse_outputs.py +++ b/test/parse-outputs/test_parse_outputs.py @@ -12,7 +12,7 @@ # the DOMINO output of the network dip.sif and the nodes tnfa_active_genes_file.txt # from https://github.com/Shamir-Lab/DOMINO/tree/master/examples -algorithms = ['mincostflow', 'meo', 'omicsintegrator1', 'omicsintegrator2', 'pathlinker', 'allpairs', 'domino'] +algorithms = ['mincostflow', 'meo', 'omicsintegrator1', 'omicsintegrator2', 'pathlinker', 'allpairs', 'domino', 'capdsd'] class TestParseOutputs: From e3896f07a7ab49a0354883530aed8b70c3e37867 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Mon, 16 Jun 2025 15:37:20 -0700 Subject: [PATCH 3/4] style: fmt --- spras/capDSD.py | 14 ++++++-------- spras/runner.py | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/spras/capDSD.py b/spras/capDSD.py index 85695e85c..1bf2b0b6b 100644 --- a/spras/capDSD.py +++ b/spras/capDSD.py @@ -5,7 +5,6 @@ from spras.interactome import convert_directed_to_undirected from spras.prm import PRM - __all__ = ['CapDSD'] class CapDSD(PRM): @@ -21,13 +20,13 @@ def generate_inputs(data: Dataset, filename_map: dict[str, str]): for input_type in CapDSD.required_inputs: if input_type not in filename_map: raise ValueError(f"{input_type} filename is missing") - + # create the ppi ppi = data.get_interactome() ppi = convert_directed_to_undirected(ppi) ppi.to_csv(filename_map['ppi'], sep='\t', index=False, columns=["Interactor1", "Interactor2", "Weight"], header=False) - + # then, we want to 'guide' the ppi with a .ppip file, which is a secondary, # trusted interactome: we use the directed edges from the interactome as our # trusted edges. 
@@ -46,7 +45,7 @@ def run(ppi=None, ppip=None, output_file=None, container_framework="docker"): """ if not ppi or not ppip or not output_file: raise ValueError("Required capDSD arguments are missing") - + work_dir = '/capDSD' volumes = list() @@ -65,7 +64,7 @@ def run(ppi=None, ppip=None, output_file=None, container_framework="docker"): mapped_out_prefix = mapped_out_dir + '/output' container_suffix = "capdsd" - + # First, we move ppip_file to a subdirectory run_container_and_log('capDSD', @@ -80,7 +79,7 @@ def run(ppi=None, ppip=None, output_file=None, container_framework="docker"): '-pathmode', '1', '-p', str(Path(ppip_file).parent), ppi_file, mapped_out_prefix] - + run_container_and_log('capDSD', container_framework, @@ -88,11 +87,10 @@ def run(ppi=None, ppip=None, output_file=None, container_framework="docker"): command, volumes, work_dir) - + output_matrix = Path(out_dir) / 'output.dsd' output_matrix.rename(output_file) @staticmethod def parse_output(raw_pathway_file: str, standardized_pathway_file: str): pass - \ No newline at end of file diff --git a/spras/runner.py b/spras/runner.py index 52810af01..8eb42d49f 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -1,7 +1,7 @@ # supported algorithm imports from spras.allpairs import AllPairs as allpairs -from spras.dataset import Dataset from spras.capDSD import CapDSD as capdsd +from spras.dataset import Dataset from spras.domino import DOMINO as domino from spras.meo import MEO as meo from spras.mincostflow import MinCostFlow as mincostflow From 9f7720c5872f9f9d8f4e4eec0f3779c4fe75bf00 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 16 Jun 2025 15:50:39 -0700 Subject: [PATCH 4/4] fix: actually integrate ppip using .ppip instead of .txt from the patch provided in #235. 
--- Snakefile | 12 +++++++++--- config/config.yaml | 2 +- spras/capDSD.py | 15 ++++----------- spras/util.py | 10 ++++++++++ test/capDSD/expected/capdsd-matrix-expected.txt | 8 ++++---- .../input/{capdsd-ppip.txt => capdsd-ppip.ppip} | 0 6 files changed, 28 insertions(+), 19 deletions(-) rename test/capDSD/input/{capdsd-ppip.txt => capdsd-ppip.ppip} (100%) diff --git a/Snakefile b/Snakefile index df90f8e4a..d9ff74e56 100644 --- a/Snakefile +++ b/Snakefile @@ -6,6 +6,7 @@ from spras.dataset import Dataset from spras.evaluation import Evaluation from spras.analysis import ml, summary, graphspace, cytoscape import spras.config as _config +from spras.util import extend_filename # Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037 # and using the wrong separator prevents Snakemake from matching filenames to the rules that can produce them @@ -189,7 +190,9 @@ checkpoint prepare_input: # Use the algorithm's generate_inputs function to load the merged dataset, extract the relevant columns, # and write the output files specified by required_inputs # The filename_map provides the output file path for each required input file type - filename_map = {input_type: SEP.join([out_dir, 'prepared', f'{wildcards.dataset}-{wildcards.algorithm}-inputs', f'{input_type}.txt']) for input_type in runner.get_required_inputs(wildcards.algorithm)} + filename_map = {input_type: SEP.join( + [out_dir, 'prepared', f'{wildcards.dataset}-{wildcards.algorithm}-inputs', extend_filename(input_type)] + ) for input_type in runner.get_required_inputs(wildcards.algorithm)} runner.prepare_inputs(wildcards.algorithm, input.dataset_file, filename_map) # Collect the prepared input files from the specified directory @@ -207,7 +210,7 @@ def collect_prepared_input(wildcards): prepared_dir = SEP.join([out_dir, 'prepared', f'{wildcards.dataset}-{wildcards.algorithm}-inputs']) # Construct the list of expected prepared input files for the reconstruction algorithm - 
prepared_inputs = expand(f'{prepared_dir}{SEP}{{type}}.txt',type=runner.get_required_inputs(algorithm=wildcards.algorithm)) + prepared_inputs = expand(f'{prepared_dir}{SEP}{{type}}',type=map(extend_filename, runner.get_required_inputs(algorithm=wildcards.algorithm))) # If the directory is missing, do nothing because the missing output triggers running prepare_input if os.path.isdir(prepared_dir): # If the directory exists, confirm all prepared input files exist as well (as opposed to some or none) @@ -238,7 +241,10 @@ rule reconstruct: # Create a copy so that the updates are not written to the parameters logfile params = reconstruction_params(wildcards.algorithm, wildcards.params).copy() # Add the input files - params.update(dict(zip(runner.get_required_inputs(wildcards.algorithm), *{input}, strict=True))) + params.update(dict(zip( + [inp.replace(".", "_") for inp in runner.get_required_inputs(wildcards.algorithm)], + *{input}, strict=True + ))) # Add the output file # All run functions can accept a relative path to the output file that should be written that is called 'output_file' params['output_file'] = output.pathway_file diff --git a/config/config.yaml b/config/config.yaml index fa6c17d59..7e52876f6 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -24,7 +24,7 @@ container_registry: base_url: docker.io # The owner or project of the registry # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs - owner: reedcompbio + owner: pubtristanf # This list of algorithms should be generated by a script which checks the filesystem for installs. # It shouldn't be changed by mere mortals. 
(alternatively, we could add a path to executable for each algorithm diff --git a/spras/capDSD.py b/spras/capDSD.py index 1bf2b0b6b..f2d41c577 100644 --- a/spras/capDSD.py +++ b/spras/capDSD.py @@ -8,7 +8,7 @@ __all__ = ['CapDSD'] class CapDSD(PRM): - required_inputs = ['ppi', 'ppip'] + required_inputs = ['ppi', 'ppip.ppip'] @staticmethod def generate_inputs(data: Dataset, filename_map: dict[str, str]): @@ -32,7 +32,7 @@ def generate_inputs(data: Dataset, filename_map: dict[str, str]): # trusted edges. ppip = data.get_interactome() ppip = ppip[ppip["Direction"] == "D"] - ppip.to_csv(filename_map['ppip'], sep='\t', index=False, columns=["Interactor1", "Interactor2"], header=False) + ppip.to_csv(filename_map['ppip.ppip'], sep='\t', index=False, columns=["Interactor1", "Interactor2"], header=False) @staticmethod def run(ppi=None, ppip=None, output_file=None, container_framework="docker"): @@ -65,15 +65,8 @@ def run(ppi=None, ppip=None, output_file=None, container_framework="docker"): container_suffix = "capdsd" - - # First, we move ppip_file to a subdirectory - run_container_and_log('capDSD', - container_framework, - container_suffix, - ['sh', '-c', f'mkdir ppip && mv {ppip_file} ppip/{ppip_file}'], - volumes, - work_dir) - + # Since the volumes are bound under different folders, we can safely + # use the ppip_file's parent. command = ['python', '/capDSD/DSD.py', '-pathmode', '1', '-p', str(Path(ppip_file).parent), ppi_file, mapped_out_prefix] diff --git a/spras/util.py b/spras/util.py index 83cca945d..8e9b6865f 100644 --- a/spras/util.py +++ b/spras/util.py @@ -105,3 +105,13 @@ def duplicate_edges(df: pd.DataFrame) -> (pd.DataFrame, bool): unique_edges_df = df_sorted.drop_duplicates(subset=["Node1", "Node2", "Direction"], keep="first", ignore_index=True) return unique_edges_df, not unique_edges_df.equals(df) + +# https://stackoverflow.com/a/49689414/7589775 +def extend_filename(file_name: str, extension=".txt") -> str: + """ + Adds a default file extension if none is provided. 
+ """ + root, ext = os.path.splitext(file_name) + if not ext: + ext = extension + return f'{root}{ext}' diff --git a/test/capDSD/expected/capdsd-matrix-expected.txt b/test/capDSD/expected/capdsd-matrix-expected.txt index 76753d4eb..98ec7ffc5 100644 --- a/test/capDSD/expected/capdsd-matrix-expected.txt +++ b/test/capDSD/expected/capdsd-matrix-expected.txt @@ -1,5 +1,5 @@ A B C D -A 0.0 0.0 16.0 16.0 -B 0.0 0.0 16.0 16.0 -C 16.0 16.0 0.0 0.0 -D 16.0 16.0 0.0 0.0 +A 0.0 1.9999962366471538 4.153838337651781 4.153838337651781 +B 1.9999962366471538 0.0 2.153842101004627 2.153842101004627 +C 4.153838337651781 2.153842101004627 0.0 0.0 +D 4.153838337651781 2.153842101004627 0.0 0.0 diff --git a/test/capDSD/input/capdsd-ppip.txt b/test/capDSD/input/capdsd-ppip.ppip similarity index 100% rename from test/capDSD/input/capdsd-ppip.txt rename to test/capDSD/input/capdsd-ppip.ppip