45 commits
b0327a2
feat: spras_revision
tristan-f-r Jul 9, 2025
8cec738
style: fmt
tristan-f-r Jul 9, 2025
5683392
test: summary
tristan-f-r Jul 10, 2025
af90ce0
docs(test_summary): mention preprocessing motivation
tristan-f-r Jul 10, 2025
6141874
test(analysis/summary): use input from /input instead
tristan-f-r Jul 10, 2025
440a2d4
docs(test/analysis): mention dual integration testing
tristan-f-r Jul 10, 2025
d9e852b
test(analysis/summary): use test/analysis provided gold standard
tristan-f-r Jul 10, 2025
abb0eb9
style: fmt
tristan-f-r Jul 10, 2025
60185fc
chore: don't repeat docs inside analysis configs
tristan-f-r Jul 10, 2025
e6bd6a0
feat: get working with cytoscape
tristan-f-r Jul 11, 2025
f9a3081
style: fmt
tristan-f-r Jul 11, 2025
77fc3b4
test: remove nondet from analysis
tristan-f-r Jul 11, 2025
0592850
fix: get input pathways at runtime
tristan-f-r Jul 11, 2025
0b6413d
Merge branch 'umain' into hash
tristan-f-r Aug 4, 2025
1817157
fix: rm run
tristan-f-r Aug 4, 2025
c077d91
Merge branch 'main' into hash
tristan-f-r Aug 14, 2025
50f2195
fix: correct for pydantic
tristan-f-r Aug 14, 2025
d3a088b
fix: attach spras revision inside gs_values
tristan-f-r Aug 14, 2025
8e3b898
chore: drop re import
tristan-f-r Aug 14, 2025
1ada504
Merge branch 'main' into hash
tristan-f-r Aug 27, 2025
34a40ad
fix: correct tests
tristan-f-r Aug 27, 2025
5d2c6d0
Merge branch 'main' into hash
tristan-f-r Sep 9, 2025
ef15781
Merge branch 'main' into hash
tristan-f-r Sep 24, 2025
8d5019b
fix: correct Snakefile
tristan-f-r Sep 24, 2025
9949572
fix: use correct gs variable
tristan-f-r Sep 25, 2025
3cd25e8
Merge branch 'main' into hash
tristan-f-r Oct 24, 2025
0965a68
test: correct config
tristan-f-r Oct 25, 2025
a169505
fix: correct name again
tristan-f-r Oct 25, 2025
eec09f2
Merge branch 'main' into hash
tristan-f-r Jan 10, 2026
a8d71bd
test: fix files
tristan-f-r Jan 10, 2026
e12fc75
apply suggestions
tristan-f-r Jan 17, 2026
977bf5a
clean, fix: strip project_directory
tristan-f-r Jan 17, 2026
8500bcb
fix: correct equality on not SPRAS pyproject.toml
tristan-f-r Jan 17, 2026
112db39
chore: grammar
tristan-f-r Jan 17, 2026
c7262ed
chore: move attach_spras_revision out of Snakefile
tristan-f-r Jan 18, 2026
f69a0f3
Merge branch 'main' into hash
tristan-f-r Jan 31, 2026
72e30bf
fix: properly resolve merge conflict
tristan-f-r Jan 31, 2026
c71b652
fix: undo mistaken merge conflict
tristan-f-r Jan 31, 2026
6b941e0
chore: drop unnecessary self.datasets initialization
tristan-f-r Jan 31, 2026
fbf0ceb
feat: dynamic spras versioning
tristan-f-r Jan 31, 2026
edc0369
chore: error handling on setup.pu
tristan-f-r Jan 31, 2026
3a1251d
docs: note on git commit hashes
tristan-f-r Jan 31, 2026
d330d6a
chore: drop git magic
tristan-f-r Jan 31, 2026
5e31d06
feat: correctly parse RECORD
tristan-f-r Jan 31, 2026
dba2b45
style: fmt
tristan-f-r Jan 31, 2026
3 changes: 1 addition & 2 deletions Snakefile
@@ -34,7 +34,6 @@ def get_dataset(_datasets, label):
algorithms = list(algorithm_params)
algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()]
dataset_labels = list(_config.config.datasets.keys())

dataset_gold_standard_node_pairs = [f"{dataset}-{gs['label']}" for gs in _config.config.gold_standards.values() if gs['node_files'] for dataset in gs['dataset_labels']]
dataset_gold_standard_edge_pairs = [f"{dataset}-{gs['label']}" for gs in _config.config.gold_standards.values() if gs['edge_files'] for dataset in gs['dataset_labels']]

@@ -282,7 +281,7 @@ rule reconstruct:
# Original pathway reconstruction output to universal output
# Use PRRunner as a wrapper to call the algorithm-specific parse_output
rule parse_output:
    input:
        raw_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'raw-pathway.txt']),
        dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle'])
    output: standardized_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'pathway.txt'])
2 changes: 1 addition & 1 deletion spras/analysis/summary.py
@@ -7,7 +7,7 @@


def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
                       algo_with_params: list) -> pd.DataFrame:
                       algo_with_params: list[str]) -> pd.DataFrame:
"""
Generate a table that aggregates summary information about networks in file_paths, including which nodes are present
in node_table columns. Network directionality is ignored and all edges are treated as undirected. The order of the
45 changes: 43 additions & 2 deletions spras/config/config.py
@@ -13,7 +13,11 @@
"""

import copy as copy
import functools
import hashlib
import importlib.metadata
import itertools as it
import sysconfig
import warnings
from pathlib import Path
from typing import Any
@@ -27,6 +31,31 @@

config = None

@functools.cache
def spras_revision() -> str:
    """
    Gets the current revision of SPRAS.

    A few notes:
    - This depends on neither the SPRAS version nor the git commit, but solely on the PyPA RECORD file
      (https://packaging.python.org/en/latest/specifications/recording-installed-packages/#the-record-file),
      which contains hashes of all files associated with the package distribution [other than itself]
      and is included in the distribution.
    - This means that, when developing SPRAS, spras_revision is updated when spras is initially installed,
      but not as files change during development.
    """
    try:
        record_path = Path(
            # The directory for site-packages, where .dist-info is located.
            sysconfig.get_path("purelib"),
            str(importlib.metadata.distribution('spras').locate_file(f"spras-{importlib.metadata.version('spras')}.dist-info/RECORD")))
        with open(record_path, 'rb', buffering=0) as f:
            # Truncated to the magic value 8, the length of the short git revision.
            return hashlib.file_digest(f, 'sha256').hexdigest()[:8]
    except importlib.metadata.PackageNotFoundError as err:
        raise RuntimeError('spras is not an installed pip-module: did you forget to install SPRAS as a module?') from err


def attach_spras_revision(label: str) -> str:
    return f"{label}_{spras_revision()}"
Collaborator
I'm thinking through whether there are other ways to get this same behavior without making filenames longer. The subdirectory names that follow the `{dataset}-{algorithm}-params-{params_hash}` pattern are already long, and now we're extending them. The only other idea is to use subdirectories instead, which isn't necessarily an improvement.

Collaborator Author
@tristan-f-r Jan 17, 2026
This is a little concerning, though thanks to #434, I'm not too worried about files being the primary interface for organizing SPRAS output. We should still document this file directory naming once we have actual SPRAS workflow documentation.

Collaborator
Unresolving this for now so we can get broader feedback from @annaritz and @ntalluri. It can be a meeting agenda item if needed.


# This will get called in the Snakefile, instantiating the singleton with the raw config
def init_global(config_dict):
    global config
@@ -117,6 +146,12 @@ def process_datasets(self, raw_config: RawConfig):
        # Currently assumes all datasets have a label and the labels are unique
        # When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts
        # Convert to dicts to simplify the yaml logging

        for dataset in raw_config.datasets:
            dataset.label = attach_spras_revision(dataset.label)
        for gold_standard in raw_config.gold_standards:
            gold_standard.label = attach_spras_revision(gold_standard.label)

        for dataset in raw_config.datasets:
            label = dataset.label
            if label.lower() in [key.lower() for key in self.datasets.keys()]:
@@ -130,8 +165,11 @@
        dataset_labels = set(self.datasets.keys())
        gold_standard_dataset_labels = {dataset_label for value in self.gold_standards.values() for dataset_label in value['dataset_labels']}
        for label in gold_standard_dataset_labels:
            if label not in dataset_labels:
            if attach_spras_revision(label) not in dataset_labels:
                raise ValueError(f"Dataset label '{label}' provided in gold standards does not exist in the existing dataset labels.")
        # We attach the SPRAS revision to the individual dataset labels afterwards for a cleaner error message above.
        for key, gold_standard in self.gold_standards.items():
            self.gold_standards[key]["dataset_labels"] = list(map(attach_spras_revision, gold_standard["dataset_labels"]))

# Code snipped from Snakefile that may be useful for assigning default labels
# dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)]
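
A short sketch of the ordering above, with a made-up label and revision suffix: dataset labels are revisioned up front, the gold standard check revisions each label only for the comparison so the error can report the label as the user wrote it, and the stored dataset_labels are rewritten afterwards.

# Hypothetical walk-through; the label and suffix are illustrative.
dataset_labels = {"tps_egfr_3f9c2a1b"}    # dataset labels, already revisioned
gs_labels = ["tps_egfr"]                  # gold standard labels, as configured
for label in gs_labels:
    if f"{label}_3f9c2a1b" not in dataset_labels:
        raise ValueError(f"Dataset label '{label}' ...")  # clean label in message
gs_labels = [f"{label}_3f9c2a1b" for label in gs_labels]  # now matches datasets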
@@ -187,7 +225,10 @@ def process_algorithms(self, raw_config: RawConfig):
            run_dict[param] = float(value)
        if isinstance(value, np.ndarray):
            run_dict[param] = value.tolist()
        params_hash = hash_params_sha1_base32(run_dict, self.hash_length, cls=NpHashEncoder)
        # Incorporates the `spras_revision` into the hash
        hash_run_dict = copy.deepcopy(run_dict)
        hash_run_dict["_spras_rev"] = spras_revision()
        params_hash = hash_params_sha1_base32(hash_run_dict, self.hash_length, cls=NpHashEncoder)
        if params_hash in prior_params_hashes:
            raise ValueError(f'Parameter hash collision detected. Increase the hash_length in the config file '
                             f'(current length {self.hash_length}).')
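
To see why this invalidates cached outputs across SPRAS revisions, here is a self-contained sketch of the idea; the real hash_params_sha1_base32 and NpHashEncoder live in SPRAS and may differ in detail, and the revision strings are made up:

# Standalone sketch: hashing the same parameter combination under two
# different SPRAS revisions yields two different hashes, so outputs
# from an older revision are never silently reused.
import base64
import hashlib
import json

def sha1_base32_sketch(params: dict, length: int = 7) -> str:
    digest = hashlib.sha1(json.dumps(params, sort_keys=True).encode()).digest()
    return base64.b32encode(digest).decode()[:length]

run_dict = {"k": 10}
old_hash = sha1_base32_sketch({**run_dict, "_spras_rev": "3f9c2a1b"})
new_hash = sha1_base32_sketch({**run_dict, "_spras_rev": "5e31d06a"})
assert old_hash != new_hash  # same params, different revision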
14 changes: 4 additions & 10 deletions test/analysis/expected_output/expected_egfr_summary.txt
@@ -1,10 +1,4 @@
Name Number of nodes Number of edges Number of connected components Density Max degree Median degree Max diameter Average path length Nodes in prize Nodes in sources Nodes in targets Nodes in active Nodes in dummy Parameter combination
test/analysis/input/egfr/tps-egfr-domino-params-V3X4RW7_pathway.txt 48 45 3 0.0398936170212766 5 2.0 16 3.882808476926124 27 0 27 27 0 {'module_threshold': 0.05, 'slice_threshold': 0.3}
test/analysis/input/egfr/tps-egfr-meo-params-GKEDDFZ_pathway.txt 1877 12845 1 0.007295700506524384 469 6.0 6 2.7973618474338107 621 1 620 621 1 {'max_path_length': 3, 'local_search': True, 'rand_restarts': 10}
test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt 28 20 8 0.05291005291005291 4 1.0 5 1.306439393939394 28 1 27 28 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 10.0, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01}
test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt 39 31 8 0.04183535762483131 6 1.0 5 1.5084498834498834 39 1 38 39 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 2.0, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01}
test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt 14 9 5 0.0989010989010989 4 1.0 2 1.1866666666666668 14 0 14 14 0 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 0.55, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01}
test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-EHHWPMD_pathway.txt 593 591 2 0.0033669841848593955 32 1.0 30 6.72248989073389 531 1 530 531 1 {'w': 5.0, 'b': 4.0, 'g': 0.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None}
test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-IV3IPCJ_pathway.txt 704 702 2 0.002836867968446916 35 1.0 24 6.038766691954387 616 1 615 616 1 {'w': 5.0, 'b': 2.0, 'g': 3.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None}
test/analysis/input/egfr/tps-egfr-pathlinker-params-7S4SLU6_pathway.txt 14 17 1 0.18681318681318682 6 2.0 7 2.857142857142857 6 1 5 6 1 {'k': 10}
test/analysis/input/egfr/tps-egfr-pathlinker-params-TCEMRS7_pathway.txt 25 32 1 0.10666666666666667 8 2.0 7 3.486666666666667 11 1 10 11 1 {'k': 20}
Number of nodes Number of edges Number of connected components Density Max degree Median degree Max diameter Average path length Nodes in prize Nodes in sources Nodes in targets Nodes in active Nodes in dummy Parameter combination
14 17 1 0.18681318681318682 6 2.0 7 2.857142857142857 6 1 5 6 1 {'k': 10}
25 32 1 0.10666666666666667 8 2.0 7 3.486666666666667 11 1 10 11 1 {'k': 20}
1874 12845 1 0.007319084148670001 469 6.0 6 2.7952001166950904 621 1 620 621 1 {'max_path_length': 3, 'local_search': True, 'rand_restarts': 10}
19 changes: 6 additions & 13 deletions test/analysis/expected_output/expected_example_summary.txt
@@ -1,13 +1,6 @@
Name Number of nodes Number of edges Number of connected components Density Max degree Median degree Max diameter Average path length Nodes in prize Nodes in active Nodes in dummy Nodes in sources Nodes in targets Parameter combination
test/analysis/input/example/data0-allpairs-params-BEH6YB2_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {}
test/analysis/input/example/data0-domino-params-V3X4RW7_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'module_threshold': 0.05, 'slice_threshold': 0.3}
test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'max_path_length': 3, 'local_search': True, 'rand_restarts': 10}
test/analysis/input/example/data0-mincostflow-params-SZPZVU6_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'flow': 1, 'capacity': 1}
test/analysis/input/example/data0-omicsintegrator1-params-E3LSEZQ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.0, 'b': 6.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0}
test/analysis/input/example/data0-omicsintegrator1-params-NFIPHUX_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.0, 'b': 5.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0}
test/analysis/input/example/data0-omicsintegrator1-params-SU2S63Y_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 5.0, 'b': 5.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0}
test/analysis/input/example/data0-omicsintegrator1-params-V26JBGX_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 5.0, 'b': 6.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0}
test/analysis/input/example/data0-omicsintegrator2-params-EHHWPMD_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'w': 5.0, 'b': 4.0, 'g': 0.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None}
test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'w': 5.0, 'b': 2.0, 'g': 3.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None}
test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'k': 200}
test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'k': 100}
Number of nodes Number of edges Number of connected components Density Max degree Median degree Max diameter Average path length Nodes in prize Nodes in active Nodes in dummy Nodes in sources Nodes in targets Parameter combination
3 2 1 0.6666666666666666 2 1 2 1.3333333333333333 2 2 0 1 1 {'flow': 1, 'capacity': 1}
3 2 1 0.6666666666666666 2 1 2 1.3333333333333333 2 2 0 1 1 {'k': 100}
3 2 1 0.6666666666666666 2 1 2 1.3333333333333333 2 2 0 1 1 {'k': 200}
3 2 1 0.6666666666666666 2 1 2 1.3333333333333333 2 2 0 1 1 {'max_path_length': 3, 'local_search': True, 'rand_restarts': 10}
3 2 1 0.6666666666666666 2 1 2 1.3333333333333333 2 2 0 1 1 {}
1 change: 1 addition & 0 deletions test/analysis/input/.gitignore
@@ -0,0 +1 @@
run
114 changes: 0 additions & 114 deletions test/analysis/input/config.yaml

This file was deleted.

81 changes: 8 additions & 73 deletions test/analysis/input/egfr.yaml
@@ -1,92 +1,27 @@
# The length of the hash used to identify a parameter combination
hash_length: 7

containers:
  # Specify the container framework used by each PRM wrapper. Valid options include:
  # - docker (default if not specified)
  # - singularity -- Also known as apptainer, useful in HPC/HTC environments where docker isn't allowed
  # - dsub -- experimental with limited support, used for running on Google Cloud with the All of Us cloud environment.
  # - There is no support for other environments at the moment.
  framework: docker

  # Only used if container_framework is set to singularity, this will unpack the singularity containers
  # to the local filesystem. This is useful when PRM containers need to run inside another container,
  # such as would be the case in an HTCondor/OSPool environment.
  # NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way
  # that persists after the workflow is complete. To clean up the unpacked containers, the user must
  # manually delete them. For convenience, these unpacked files will exist in the current working directory
  # under `unpacked`.
  unpack_singularity: false

  # Allow the user to configure which container registry containers should be pulled from
  # Note that this assumes container names are consistent across registries, and that the
  # registry being passed doesn't require authentication for pull actions
  registry:
    base_url: docker.io
    # The owner or project of the registry
    # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs
    owner: reedcompbio

algorithms:
  - name: pathlinker
    include: true
    runs:
      run1:
        k:
          - 10
          - 20
  - name: omicsintegrator1
    include: true
    runs:
      run1:
        b:
          - 0.55
          - 2
          - 10
        d:
          - 10
        g:
          - 1e-3
        r:
          - 0.01
        w:
          - 0.1
        mu:
          - 0.008
        dummy_mode: ["file"]
  - name: omicsintegrator2
    include: true
    runs:
      run1:
        b:
          - 4
        g:
          - 0
      run2:
        b:
          - 2
        g:
          - 3
        k: [10, 20]
  - name: meo
    include: true
    runs:
      run1:
        local_search:
          - true
        max_path_length:
          - 3
        rand_restarts:
          - 10
  - name: domino
    include: true
    runs:
      run1:
        slice_threshold:
          - 0.3
        module_threshold:
          - 0.05
        local_search: true
        max_path_length: 3
        rand_restarts: 10
datasets:
  - data_dir: input
  - data_dir: "input"
    edge_files:
      - phosphosite-irefindex13.0-uniprot.txt
    label: tps_egfr
@@ -95,12 +30,12 @@ datasets:
    other_files: []
reconstruction_settings:
  locations:
    reconstruction_dir: output/egfr
    reconstruction_dir: "test/analysis/input/run/egfr"
analysis:
  cytoscape:
    include: true
    include: false
  summary:
    include: true
    include: false
  ml:
    include: false
  evaluation: