45 commits
b0327a2
feat: spras_revision
tristan-f-r Jul 9, 2025
8cec738
style: fmt
tristan-f-r Jul 9, 2025
5683392
test: summary
tristan-f-r Jul 10, 2025
af90ce0
docs(test_summary): mention preprocessing motivation
tristan-f-r Jul 10, 2025
6141874
test(analysis/summary): use input from /input instead
tristan-f-r Jul 10, 2025
440a2d4
docs(test/analysis): mention dual integration testing
tristan-f-r Jul 10, 2025
d9e852b
test(analysis/summary): use test/analysis provided gold standard
tristan-f-r Jul 10, 2025
abb0eb9
style: fmt
tristan-f-r Jul 10, 2025
60185fc
chore: don't repeat docs inside analysis configs
tristan-f-r Jul 10, 2025
e6bd6a0
feat: get working with cytoscape
tristan-f-r Jul 11, 2025
f9a3081
style: fmt
tristan-f-r Jul 11, 2025
77fc3b4
test: remove nondet from analysis
tristan-f-r Jul 11, 2025
0592850
fix: get input pathways at runtime
tristan-f-r Jul 11, 2025
0b6413d
Merge branch 'umain' into hash
tristan-f-r Aug 4, 2025
1817157
fix: rm run
tristan-f-r Aug 4, 2025
c077d91
Merge branch 'main' into hash
tristan-f-r Aug 14, 2025
50f2195
fix: correct for pydantic
tristan-f-r Aug 14, 2025
d3a088b
fix: attach spras revision inside gs_values
tristan-f-r Aug 14, 2025
8e3b898
chore: drop re import
tristan-f-r Aug 14, 2025
1ada504
Merge branch 'main' into hash
tristan-f-r Aug 27, 2025
34a40ad
fix: correct tests
tristan-f-r Aug 27, 2025
5d2c6d0
Merge branch 'main' into hash
tristan-f-r Sep 9, 2025
ef15781
Merge branch 'main' into hash
tristan-f-r Sep 24, 2025
8d5019b
fix: correct Snakefile
tristan-f-r Sep 24, 2025
9949572
fix: use correct gs variable
tristan-f-r Sep 25, 2025
3cd25e8
Merge branch 'main' into hash
tristan-f-r Oct 24, 2025
0965a68
test: correct config
tristan-f-r Oct 25, 2025
a169505
fix: correct name again
tristan-f-r Oct 25, 2025
eec09f2
Merge branch 'main' into hash
tristan-f-r Jan 10, 2026
a8d71bd
test: fix files
tristan-f-r Jan 10, 2026
e12fc75
apply suggestions
tristan-f-r Jan 17, 2026
977bf5a
clean, fix: strip project_directory
tristan-f-r Jan 17, 2026
8500bcb
fix: correct equality on not SPRAS pyproject.toml
tristan-f-r Jan 17, 2026
112db39
chore: grammar
tristan-f-r Jan 17, 2026
c7262ed
chore: move attach_spras_revision out of Snakefile
tristan-f-r Jan 18, 2026
f69a0f3
Merge branch 'main' into hash
tristan-f-r Jan 31, 2026
72e30bf
fix: properly resolve merge conflict
tristan-f-r Jan 31, 2026
c71b652
fix: undo mistaken merge conflict
tristan-f-r Jan 31, 2026
6b941e0
chore: drop unnecessary self.datasets initialization
tristan-f-r Jan 31, 2026
fbf0ceb
feat: dynamic spras versioning
tristan-f-r Jan 31, 2026
edc0369
chore: error handling on setup.pu
tristan-f-r Jan 31, 2026
3a1251d
docs: note on git commit hashes
tristan-f-r Jan 31, 2026
d330d6a
chore: drop git magic
tristan-f-r Jan 31, 2026
5e31d06
feat: correctly parse RECORD
tristan-f-r Jan 31, 2026
dba2b45
style: fmt
tristan-f-r Jan 31, 2026
3 changes: 1 addition & 2 deletions Snakefile
@@ -34,7 +34,6 @@ def get_dataset(_datasets, label):
algorithms = list(algorithm_params)
algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()]
dataset_labels = list(_config.config.datasets.keys())

dataset_gold_standard_node_pairs = [f"{dataset}-{gs['label']}" for gs in _config.config.gold_standards.values() if gs['node_files'] for dataset in gs['dataset_labels']]
dataset_gold_standard_edge_pairs = [f"{dataset}-{gs['label']}" for gs in _config.config.gold_standards.values() if gs['edge_files'] for dataset in gs['dataset_labels']]

@@ -282,7 +281,7 @@ rule reconstruct:
# Original pathway reconstruction output to universal output
# Use PRRunner as a wrapper to call the algorithm-specific parse_output
rule parse_output:
    input:
        raw_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'raw-pathway.txt']),
        dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle'])
    output: standardized_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'pathway.txt'])
2 changes: 1 addition & 1 deletion spras/analysis/summary.py
@@ -7,7 +7,7 @@


def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
                       algo_with_params: list) -> pd.DataFrame:
                       algo_with_params: list[str]) -> pd.DataFrame:
"""
Generate a table that aggregates summary information about networks in file_paths, including which nodes are present
in node_table columns. Network directionality is ignored and all edges are treated as undirected. The order of the
45 changes: 43 additions & 2 deletions spras/config/config.py
@@ -13,7 +13,11 @@
"""

import copy as copy
import functools
import hashlib
import importlib.metadata
import itertools as it
import sysconfig
import warnings
from pathlib import Path
from typing import Any
@@ -27,6 +31,31 @@

config = None

@functools.cache
def spras_revision() -> str:
    """
    Gets the current revision of SPRAS.

    A few notes:
    - This depends on neither the SPRAS version nor the git commit, but solely on the PyPA RECORD file
      (https://packaging.python.org/en/latest/specifications/recording-installed-packages/#the-record-file),
      which contains hashes of all files associated with the package distribution [other than itself]
      and is included in the distribution.
    - This means that, when developing SPRAS, spras_revision is updated when spras is initially installed,
      but not as files change during development.
    """
    try:
        record_path = Path(
            # The directory for site-packages, where .dist-info is located.
            sysconfig.get_path("purelib"),
            str(importlib.metadata.distribution('spras').locate_file(f"spras-{importlib.metadata.version('spras')}.dist-info/RECORD")))
        with open(record_path, 'rb', buffering=0) as f:
            # Truncated to the magic value 8, the length of the short git revision.
            return hashlib.file_digest(f, 'sha256').hexdigest()[:8]
    except importlib.metadata.PackageNotFoundError as err:
        raise RuntimeError('spras is not an installed pip-module: did you forget to install SPRAS as a module?') from err


def attach_spras_revision(label: str) -> str:
    return f"{label}_{spras_revision()}"
Collaborator
I'm thinking through whether there are other ways to get this same behavior without making filenames longer. The subdirectory names that follow the `{dataset}-{algorithm}-params-{params_hash}` pattern are already long, and now we're extending them. The only other idea is to use subdirectories instead, which isn't necessarily an improvement.

Collaborator Author
@tristan-f-r Jan 17, 2026
This is a little concerning, though thanks to #434, I'm not too worried about files being the primary interface for organizing SPRAS output. We should still document this file directory naming once we have actual SPRAS workflow documentation.

Collaborator
Unresolving this for now so we can get broader feedback from @annaritz and @ntalluri. It can be a meeting agenda item if needed.


# This will get called in the Snakefile, instantiating the singleton with the raw config
def init_global(config_dict):
    global config
@@ -117,6 +146,12 @@ def process_datasets(self, raw_config: RawConfig):
        # Currently assumes all datasets have a label and the labels are unique
        # When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts
        # Convert to dicts to simplify the yaml logging

        for dataset in raw_config.datasets:
            dataset.label = attach_spras_revision(dataset.label)
        for gold_standard in raw_config.gold_standards:
            gold_standard.label = attach_spras_revision(gold_standard.label)

        for dataset in raw_config.datasets:
            label = dataset.label
            if label.lower() in [key.lower() for key in self.datasets.keys()]:
@@ -130,8 +165,11 @@
        dataset_labels = set(self.datasets.keys())
        gold_standard_dataset_labels = {dataset_label for value in self.gold_standards.values() for dataset_label in value['dataset_labels']}
        for label in gold_standard_dataset_labels:
            if label not in dataset_labels:
            if attach_spras_revision(label) not in dataset_labels:
                raise ValueError(f"Dataset label '{label}' provided in gold standards does not exist in the existing dataset labels.")
        # We attach the SPRAS revision to the individual dataset labels afterwards for a cleaner error message above.
        for key, gold_standard in self.gold_standards.items():
            self.gold_standards[key]["dataset_labels"] = list(map(attach_spras_revision, gold_standard["dataset_labels"]))

# Code snipped from Snakefile that may be useful for assigning default labels
# dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)]
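
A short sketch of the ordering above, with a made-up label and revision suffix: dataset labels are revisioned up front, the gold standard check revisions each label only for the comparison so the error can report the label as the user wrote it, and the stored dataset_labels are rewritten afterwards.

# Hypothetical walk-through; the label and suffix are illustrative.
dataset_labels = {"tps_egfr_3f9c2a1b"}    # dataset labels, already revisioned
gs_labels = ["tps_egfr"]                  # gold standard labels, as configured
for label in gs_labels:
    if f"{label}_3f9c2a1b" not in dataset_labels:
        raise ValueError(f"Dataset label '{label}' ...")  # clean label in message
gs_labels = [f"{label}_3f9c2a1b" for label in gs_labels]  # now matches datasets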
@@ -187,7 +225,10 @@ def process_algorithms(self, raw_config: RawConfig):
            run_dict[param] = float(value)
        if isinstance(value, np.ndarray):
            run_dict[param] = value.tolist()
        params_hash = hash_params_sha1_base32(run_dict, self.hash_length, cls=NpHashEncoder)
        # Incorporates the `spras_revision` into the hash
        hash_run_dict = copy.deepcopy(run_dict)
        hash_run_dict["_spras_rev"] = spras_revision()
        params_hash = hash_params_sha1_base32(hash_run_dict, self.hash_length, cls=NpHashEncoder)
        if params_hash in prior_params_hashes:
            raise ValueError(f'Parameter hash collision detected. Increase the hash_length in the config file '
                             f'(current length {self.hash_length}).')
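
To see why this invalidates cached outputs across SPRAS revisions, here is a self-contained sketch of the idea; the real hash_params_sha1_base32 and NpHashEncoder live in SPRAS and may differ in detail, and the revision strings are made up:

# Standalone sketch: hashing the same parameter combination under two
# different SPRAS revisions yields two different hashes, so outputs
# from an older revision are never silently reused.
import base64
import hashlib
import json

def sha1_base32_sketch(params: dict, length: int = 7) -> str:
    digest = hashlib.sha1(json.dumps(params, sort_keys=True).encode()).digest()
    return base64.b32encode(digest).decode()[:length]

run_dict = {"k": 10}
old_hash = sha1_base32_sketch({**run_dict, "_spras_rev": "3f9c2a1b"})
new_hash = sha1_base32_sketch({**run_dict, "_spras_rev": "5e31d06a"})
assert old_hash != new_hash  # same params, different revision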
14 changes: 4 additions & 10 deletions test/analysis/expected_output/expected_egfr_summary.txt
@@ -1,10 +1,4 @@
Name Number of nodes Number of edges Number of connected components Density Max degree Median degree Max diameter Average path length Nodes in prize Nodes in sources Nodes in targets Nodes in active Nodes in dummy Parameter combination
test/analysis/input/egfr/tps-egfr-domino-params-V3X4RW7_pathway.txt 48 45 3 0.0398936170212766 5 2.0 16 3.882808476926124 27 0 27 27 0 {'module_threshold': 0.05, 'slice_threshold': 0.3}
test/analysis/input/egfr/tps-egfr-meo-params-GKEDDFZ_pathway.txt 1877 12845 1 0.007295700506524384 469 6.0 6 2.7973618474338107 621 1 620 621 1 {'max_path_length': 3, 'local_search': True, 'rand_restarts': 10}
test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt 28 20 8 0.05291005291005291 4 1.0 5 1.306439393939394 28 1 27 28 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 10.0, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01}
test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt 39 31 8 0.04183535762483131 6 1.0 5 1.5084498834498834 39 1 38 39 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 2.0, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01}
test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt 14 9 5 0.0989010989010989 4 1.0 2 1.1866666666666668 14 0 14 14 0 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.1, 'b': 0.55, 'd': 10, 'mu': 0.008, 'noise': None, 'g': 0.001, 'r': 0.01}
test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-EHHWPMD_pathway.txt 593 591 2 0.0033669841848593955 32 1.0 30 6.72248989073389 531 1 530 531 1 {'w': 5.0, 'b': 4.0, 'g': 0.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None}
test/analysis/input/egfr/tps-egfr-omicsintegrator2-params-IV3IPCJ_pathway.txt 704 702 2 0.002836867968446916 35 1.0 24 6.038766691954387 616 1 615 616 1 {'w': 5.0, 'b': 2.0, 'g': 3.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None}
test/analysis/input/egfr/tps-egfr-pathlinker-params-7S4SLU6_pathway.txt 14 17 1 0.18681318681318682 6 2.0 7 2.857142857142857 6 1 5 6 1 {'k': 10}
test/analysis/input/egfr/tps-egfr-pathlinker-params-TCEMRS7_pathway.txt 25 32 1 0.10666666666666667 8 2.0 7 3.486666666666667 11 1 10 11 1 {'k': 20}
Number of nodes Number of edges Number of connected components Density Max degree Median degree Max diameter Average path length Nodes in prize Nodes in sources Nodes in targets Nodes in active Nodes in dummy Parameter combination
14 17 1 0.18681318681318682 6 2.0 7 2.857142857142857 6 1 5 6 1 {'k': 10}
25 32 1 0.10666666666666667 8 2.0 7 3.486666666666667 11 1 10 11 1 {'k': 20}
1874 12845 1 0.007319084148670001 469 6.0 6 2.7952001166950904 621 1 620 621 1 {'max_path_length': 3, 'local_search': True, 'rand_restarts': 10}
19 changes: 6 additions & 13 deletions test/analysis/expected_output/expected_example_summary.txt
@@ -1,13 +1,6 @@
Name Number of nodes Number of edges Number of connected components Density Max degree Median degree Max diameter Average path length Nodes in prize Nodes in active Nodes in dummy Nodes in sources Nodes in targets Parameter combination
test/analysis/input/example/data0-allpairs-params-BEH6YB2_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {}
test/analysis/input/example/data0-domino-params-V3X4RW7_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'module_threshold': 0.05, 'slice_threshold': 0.3}
test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'max_path_length': 3, 'local_search': True, 'rand_restarts': 10}
test/analysis/input/example/data0-mincostflow-params-SZPZVU6_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'flow': 1, 'capacity': 1}
test/analysis/input/example/data0-omicsintegrator1-params-E3LSEZQ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.0, 'b': 6.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0}
test/analysis/input/example/data0-omicsintegrator1-params-NFIPHUX_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 0.0, 'b': 5.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0}
test/analysis/input/example/data0-omicsintegrator1-params-SU2S63Y_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 5.0, 'b': 5.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0}
test/analysis/input/example/data0-omicsintegrator1-params-V26JBGX_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'dummy_mode': 'file', 'mu_squared': False, 'exclude_terms': False, 'noisy_edges': 0, 'shuffled_prizes': 0, 'random_terminals': 0, 'seed': None, 'w': 5.0, 'b': 6.0, 'd': 10, 'mu': 0.0, 'noise': None, 'g': 0.001, 'r': 0.0}
test/analysis/input/example/data0-omicsintegrator2-params-EHHWPMD_pathway.txt 0 0 0 0.0 0 0.0 0 0.0 0 0 0 0 0 {'w': 5.0, 'b': 4.0, 'g': 0.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None}
test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'w': 5.0, 'b': 2.0, 'g': 3.0, 'noise': None, 'noisy_edges': None, 'random_terminals': None, 'dummy_mode': None, 'seed': None}
test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'k': 200}
test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt 3 2 1 0.6666666666666666 2 1.0 2 1.3333333333333333 2 2 1 1 1 {'k': 100}
Number of nodes Number of edges Number of connected components Density Max degree Median degree Max diameter Average path length Nodes in prize Nodes in active Nodes in dummy Nodes in sources Nodes in targets Parameter combination
3 2 1 0.6666666666666666 2 1 2 1.3333333333333333 2 2 0 1 1 {'flow': 1, 'capacity': 1}
3 2 1 0.6666666666666666 2 1 2 1.3333333333333333 2 2 0 1 1 {'k': 100}
3 2 1 0.6666666666666666 2 1 2 1.3333333333333333 2 2 0 1 1 {'k': 200}
3 2 1 0.6666666666666666 2 1 2 1.3333333333333333 2 2 0 1 1 {'max_path_length': 3, 'local_search': True, 'rand_restarts': 10}
3 2 1 0.6666666666666666 2 1 2 1.3333333333333333 2 2 0 1 1 {}
1 change: 1 addition & 0 deletions test/analysis/input/.gitignore
@@ -0,0 +1 @@
run
114 changes: 0 additions & 114 deletions test/analysis/input/config.yaml

This file was deleted.

81 changes: 8 additions & 73 deletions test/analysis/input/egfr.yaml
@@ -1,92 +1,27 @@
# The length of the hash used to identify a parameter combination
hash_length: 7

containers:
  # Specify the container framework used by each PRM wrapper. Valid options include:
  # - docker (default if not specified)
  # - singularity -- Also known as apptainer, useful in HPC/HTC environments where docker isn't allowed
  # - dsub -- experimental with limited support, used for running on Google Cloud with the All of Us cloud environment.
  # - There is no support for other environments at the moment.
  framework: docker

  # Only used if container_framework is set to singularity, this will unpack the singularity containers
  # to the local filesystem. This is useful when PRM containers need to run inside another container,
  # such as would be the case in an HTCondor/OSPool environment.
  # NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way
  # that persists after the workflow is complete. To clean up the unpacked containers, the user must
  # manually delete them. For convenience, these unpacked files will exist in the current working directory
  # under `unpacked`.
  unpack_singularity: false

  # Allow the user to configure which container registry containers should be pulled from
  # Note that this assumes container names are consistent across registries, and that the
  # registry being passed doesn't require authentication for pull actions
  registry:
    base_url: docker.io
    # The owner or project of the registry
    # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs
    owner: reedcompbio

algorithms:
  - name: pathlinker
    include: true
    runs:
      run1:
        k:
          - 10
          - 20
  - name: omicsintegrator1
    include: true
    runs:
      run1:
        b:
          - 0.55
          - 2
          - 10
        d:
          - 10
        g:
          - 1e-3
        r:
          - 0.01
        w:
          - 0.1
        mu:
          - 0.008
        dummy_mode: ["file"]
  - name: omicsintegrator2
    include: true
    runs:
      run1:
        b:
          - 4
        g:
          - 0
      run2:
        b:
          - 2
        g:
          - 3
        k: [10, 20]
  - name: meo
    include: true
    runs:
      run1:
        local_search:
          - true
        max_path_length:
          - 3
        rand_restarts:
          - 10
  - name: domino
    include: true
    runs:
      run1:
        slice_threshold:
          - 0.3
        module_threshold:
          - 0.05
        local_search: true
        max_path_length: 3
        rand_restarts: 10
datasets:
  - data_dir: input
  - data_dir: "input"
    edge_files:
      - phosphosite-irefindex13.0-uniprot.txt
    label: tps_egfr
@@ -95,12 +30,12 @@ datasets:
    other_files: []
reconstruction_settings:
  locations:
    reconstruction_dir: output/egfr
    reconstruction_dir: "test/analysis/input/run/egfr"
analysis:
  cytoscape:
    include: true
    include: false
  summary:
    include: true
    include: false
  ml:
    include: false
  evaluation: