From c2e64f7eb0c6fbe0ab15b1e8f0707a2c63bbec47 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 25 Jun 2025 12:47:04 -0700 Subject: [PATCH 01/34] feat: begin config refactor --- config/config.yaml | 6 +- config/egfr-param-tuning.yaml | 1 - config/egfr.yaml | 1 - docker-wrappers/SPRAS/example_config.yaml | 2 - environment.yml | 1 + pyproject.toml | 1 + spras/{ => config}/config.py | 169 +++++++++++----------- spras/config/raw_config.py | 64 ++++++++ spras/containers.py | 2 +- test/AllPairs/test_ap.py | 2 +- test/DOMINO/test_domino.py | 2 +- test/LocalNeighborhood/test_ln.py | 2 +- test/MEO/test_meo.py | 2 +- test/MinCostFlow/test_mcf.py | 2 +- test/OmicsIntegrator1/test_oi1.py | 2 +- test/OmicsIntegrator2/test_oi2.py | 2 +- test/PathLinker/test_pathlinker.py | 2 +- test/analysis/input/config.yaml | 1 - test/analysis/input/egfr.yaml | 1 - test/analysis/test_cytoscape.py | 2 +- test/analysis/test_summary.py | 2 +- test/test_config.py | 2 +- test/test_util.py | 2 +- 23 files changed, 169 insertions(+), 104 deletions(-) rename spras/{ => config}/config.py (80%) create mode 100644 spras/config/raw_config.py diff --git a/config/config.yaml b/config/config.yaml index 1f246dd15..3179dfedc 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -63,6 +63,11 @@ algorithms: - name: "omicsintegrator2" params: include: true + runs: + - b: 4 + g: 0 + - b: 2 + g: 3 run1: b: 4 g: 0 @@ -144,7 +149,6 @@ reconstruction_settings: # TODO move to global reconstruction_dir: "output" - run: true analysis: # Create one summary per pathway file and a single summary table for all pathways for each dataset diff --git a/config/egfr-param-tuning.yaml b/config/egfr-param-tuning.yaml index a0a965b70..50a4788a2 100644 --- a/config/egfr-param-tuning.yaml +++ b/config/egfr-param-tuning.yaml @@ -3440,7 +3440,6 @@ gold_standards: reconstruction_settings: locations: reconstruction_dir: output/tps_egfr - run: true analysis: summary: include: true diff --git a/config/egfr.yaml b/config/egfr.yaml 
index b8c5138b8..cea3ad54b 100644 --- a/config/egfr.yaml +++ b/config/egfr.yaml @@ -74,7 +74,6 @@ datasets: reconstruction_settings: locations: reconstruction_dir: output/egfr - run: true analysis: graphspace: include: false diff --git a/docker-wrappers/SPRAS/example_config.yaml b/docker-wrappers/SPRAS/example_config.yaml index f7fd74e98..9791c2f23 100644 --- a/docker-wrappers/SPRAS/example_config.yaml +++ b/docker-wrappers/SPRAS/example_config.yaml @@ -123,8 +123,6 @@ reconstruction_settings: # TODO move to global reconstruction_dir: "output" - run: true - analysis: # Create one summary per pathway file and a single summary table for all pathways for each dataset summary: diff --git a/environment.yml b/environment.yml index 6694b9812..7d14e3ea4 100644 --- a/environment.yml +++ b/environment.yml @@ -8,6 +8,7 @@ dependencies: - matplotlib=3.6 - networkx=2.8 - pandas=1.5 + - pydantic=2.11.7 - numpy=1.26.4 - pre-commit=2.20 # Only required for development - pytest=8.0 # Only required for development diff --git a/pyproject.toml b/pyproject.toml index 3e90f7b1e..27dee2693 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "matplotlib==3.6", "networkx==2.8", "pandas==1.5", + "pydantic==2.11.7", "numpy==1.26.4", "pip==22.1", "requests==2.28", diff --git a/spras/config.py b/spras/config/config.py similarity index 80% rename from spras/config.py rename to spras/config/config.py index 7bbf9cd1b..f6c5ada8b 100644 --- a/spras/config.py +++ b/spras/config/config.py @@ -16,19 +16,19 @@ import itertools as it import os import re +import warnings from collections.abc import Iterable import numpy as np import yaml from spras.util import NpHashEncoder, hash_params_sha1_base32 - -# The default length of the truncated hash used to identify parameter combinations -DEFAULT_HASH_LENGTH = 7 -DEFAULT_CONTAINER_PREFIX = "docker.io/reedcompbio" +from spras.config.raw_config import ContainerFramework, RawConfig, DEFAULT_HASH_LENGTH config = None 
+DEFAULT_CONTAINER_PREFIX = "docker.io/reedcompbio" + # This will get called in the Snakefile, instantiating the singleton with the raw config def init_global(config_dict): global config @@ -43,11 +43,9 @@ def init_from_file(filepath): with open(filepath, 'r') as yaml_file: config_dict = yaml.safe_load(yaml_file) except FileNotFoundError: - print(f"Error: The specified config '{filepath}' could not be found.") - return False + raise RuntimeError(f"Error: The specified config '{filepath}' could not be found.") except yaml.YAMLError as e: - print(f"Error: Failed to parse config '{filepath}': {e}") - return False + raise RuntimeError(f"Error: Failed to parse config '{filepath}': {e}") # And finally, initialize config = Config(config_dict) @@ -55,18 +53,15 @@ def init_from_file(filepath): class Config: def __init__(self, raw_config): - # Since process_config winds up modifying the raw_config passed to it as a side effect, - # we'll make a deep copy here to guarantee we don't break anything. This preserves the - # config as it's given to the Snakefile by Snakemake + # Member vars populated by process_config. Any values that don't have sensible initial values are set to None + # before they are populated for __init__ to show exactly what is being configured. - # Member vars populated by process_config. Set to None before they are populated so that our - # __init__ makes clear exactly what is being configured. # Directory used for storing output self.out_dir = None # Container framework used by PRMs. Valid options are "docker", "dsub", and "singularity" self.container_framework = None # The container prefix (host and organization) to use for images. Default is "docker.io/reedcompbio" - self.container_prefix = DEFAULT_CONTAINER_PREFIX + self.container_prefix: str = DEFAULT_CONTAINER_PREFIX # A Boolean specifying whether to unpack singularity containers. 
Default is False self.unpack_singularity = False # A dictionary to store configured datasets against which SPRAS will be run @@ -74,7 +69,7 @@ def __init__(self, raw_config): # A dictionary to store configured gold standard data against output of SPRAS runs self.gold_standards = None # The hash length SPRAS will use to identify parameter combinations. Default is 7 - self.hash_length = DEFAULT_HASH_LENGTH + self.hash_length: int = DEFAULT_HASH_LENGTH # The list of algorithms to run in the workflow. Each is a dict with 'name' as an expected key. self.algorithms = None # A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations. @@ -107,44 +102,24 @@ def __init__(self, raw_config): # A Boolean specifying whether to run the evaluation per algorithm analysis self.analysis_include_evaluation_aggregate_algo = None - _raw_config = copy.deepcopy(raw_config) - self.process_config(_raw_config) - - def process_config(self, raw_config): + # Since snakemake provides an empty config, we provide this + # wrapper error first before passing validation to pydantic. if raw_config == {}: raise ValueError("Config file cannot be empty. Use --configfile to set a config file.") - # Set up a few top-level config variables - self.out_dir = raw_config["reconstruction_settings"]["locations"]["reconstruction_dir"] - - # We allow the container framework not to be defined in the config. In the case it isn't, default to docker. - # However, if we get a bad value, we raise an exception. - if "container_framework" in raw_config: - container_framework = raw_config["container_framework"].lower() - if container_framework not in ("docker", "singularity", "dsub"): - msg = "SPRAS was configured to run with an unknown container framework: '" + raw_config["container_framework"] + "'. Accepted values are 'docker', 'singularity' or 'dsub'." 
- raise ValueError(msg) - if container_framework == "dsub": - print("Warning: 'dsub' framework integration is experimental and may not be fully supported.") - self.container_framework = container_framework - else: - self.container_framework = "docker" - - # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container. - if "unpack_singularity" in raw_config: - # The value in the config is a string, and we need to convert it to a bool. - unpack_singularity = raw_config["unpack_singularity"] - if unpack_singularity and self.container_framework != "singularity": - print("Warning: unpack_singularity is set to True, but the container framework is not singularity. This setting will have no effect.") - self.unpack_singularity = unpack_singularity - - # Grab registry from the config, and if none is provided default to docker - if "container_registry" in raw_config and raw_config["container_registry"]["base_url"] != "" and raw_config["container_registry"]["owner"] != "": - self.container_prefix = raw_config["container_registry"]["base_url"] + "/" + raw_config["container_registry"]["owner"] - - # Parse dataset information - # Datasets is initially a list, where each list entry has a dataset label and lists of input files - # Convert the dataset list into a dict where the label is the key and update the config data structure + # Since process_config winds up modifying the raw_config passed to it as a side effect, + # we'll make a deep copy here to guarantee we don't break anything. 
This preserves the + # config as it's given to the Snakefile by Snakemake + _raw_config = copy.deepcopy(raw_config) + parsed_raw_config = RawConfig.model_validate_json(_raw_config) + self.process_config(parsed_raw_config) + + def process_datasets(self, raw_config: RawConfig): + """ + Parse dataset information + Datasets is initially a list, where each list entry has a dataset label and lists of input files + Convert the dataset list into a dict where the label is the key and update the config data structure + """ # TODO allow labels to be optional and assign default labels # TODO check for collisions in dataset labels, warn, and make the labels unique # Need to work more on input file naming to make less strict assumptions @@ -152,24 +127,20 @@ def process_config(self, raw_config): # Currently assumes all datasets have a label and the labels are unique # When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts # Convert to dicts to simplify the yaml logging - self.datasets = {dataset["label"]: dict(dataset) for dataset in raw_config["datasets"]} - - for key in self.datasets: - pattern = r'^\w+$' - if not bool(re.match(pattern, key)): - raise ValueError(f"Dataset label \'{key}\' contains invalid values. Dataset labels can only contain letters, numbers, or underscores.") + self.datasets = {} + for dataset in raw_config.datasets: + label = dataset.label + if label in self.datasets: + raise ValueError(f"Datasets must have unique labels, but the label {label} appears at least twice.") + self.datasets[label] = dict(dataset) + + # Validate dataset labels + label_pattern = r'^\w+$' + if not bool(re.match(label_pattern, label)): + raise ValueError(f"Dataset label '{label}' contains invalid values. 
Dataset labels can only contain letters, numbers, or underscores.") # parse gold standard information - try: - self.gold_standards = {gold_standard["label"]: dict(gold_standard) for gold_standard in raw_config["gold_standards"]} - except: - self.gold_standards = {} - - # check that gold_standard labels are formatted correctly - for key in self.gold_standards: - pattern = r'^\w+$' - if not bool(re.match(pattern, key)): - raise ValueError(f"Gold standard label \'{key}\' contains invalid values. Gold standard labels can only contain letters, numbers, or underscores.") + self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards} # check that all the dataset labels in the gold standards are existing datasets labels dataset_labels = set(self.datasets.keys()) @@ -182,33 +153,30 @@ def process_config(self, raw_config): # dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)] # Maps from the dataset label to the dataset list index # dataset_dict = {dataset.get('label', f'dataset{index}'): index for index, dataset in enumerate(datasets)} - - # Override the default parameter hash length if specified in the config file - if "hash_length" in raw_config and raw_config["hash_length"] != "": - self.hash_length = int(raw_config["hash_length"]) - + + def process_algorithms(self, raw_config: RawConfig): + """ + Parse algorithm information + Each algorithm's parameters are provided as a list of dictionaries + Defaults are handled in the Python function or class that wraps + running that algorithm + Keys in the parameter dictionary are strings + """ prior_params_hashes = set() - - # Parse algorithm information - # Each algorithm's parameters are provided as a list of dictionaries - # Defaults are handled in the Python function or class that wraps - # running that algorithm - # Keys in the parameter dictionary are strings self.algorithm_params = dict() self.algorithm_directed = dict() - 
self.algorithms = raw_config["algorithms"] + self.algorithms = raw_config.algorithms for alg in self.algorithms: - cur_params = alg["params"] - if "include" in cur_params and cur_params.pop("include"): + cur_params = alg.params + if cur_params.include: # This dict maps from parameter combinations hashes to parameter combination dictionaries - self.algorithm_params[alg["name"]] = dict() + self.algorithm_params[alg.name] = dict() else: # Do not parse the rest of the parameters for this algorithm if it is not included continue - if "directed" in cur_params: - print("UPDATE: we no longer use the directed key in the config file") - cur_params.pop("directed") + if cur_params.directed != None: + warnings.warn("UPDATE: we no longer use the directed key in the config file") # The algorithm has no named arguments so create a default placeholder if len(cur_params) == 0: @@ -265,6 +233,39 @@ def process_config(self, raw_config): f'(current length {self.hash_length}).') self.algorithm_params[alg["name"]][params_hash] = run_dict + def process_config(self, raw_config: RawConfig): + # Set up a few top-level config variables + self.out_dir = raw_config.reconstruction_settings.locations.reconstruction_dir + + # We allow the container framework not to be defined in the config. In the case it isn't, default to docker. + # However, if we get a bad value, we raise an exception. + if raw_config.container_framework != None: + container_framework = raw_config.container_framework + if container_framework == ContainerFramework.dsub: + warnings.warn("'dsub' framework integration is experimental and may not be fully supported.") + self.container_framework = container_framework + else: + self.container_framework = "docker" + + # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container. + if raw_config.unpack_singularity: + # The value in the config is a string, and we need to convert it to a bool. 
+ unpack_singularity = raw_config["unpack_singularity"] + if unpack_singularity and self.container_framework != "singularity": + warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. This setting will have no effect.") + self.unpack_singularity = unpack_singularity + + # Grab registry from the config, and if none is provided default to docker + if raw_config.container_registry and raw_config["container_registry"]["base_url"] != "" and raw_config["container_registry"]["owner"] != "": + self.container_prefix = raw_config["container_registry"]["base_url"] + "/" + raw_config["container_registry"]["owner"] + + # Override the default parameter hash length if specified in the config file + if "hash_length" in raw_config and raw_config["hash_length"] != "": + self.hash_length = int(raw_config["hash_length"]) + + self.process_datasets(raw_config) + self.process_algorithms(raw_config) + self.analysis_params = raw_config["analysis"] if "analysis" in raw_config else {} self.ml_params = self.analysis_params["ml"] if "ml" in self.analysis_params else {} self.evaluation_params = self.analysis_params["evaluation"] if "evaluation" in self.analysis_params else {} diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py new file mode 100644 index 000000000..1810bf0e5 --- /dev/null +++ b/spras/config/raw_config.py @@ -0,0 +1,64 @@ +""" +Contains the raw pydantic schema for the configuration file. 
+""" + +from enum import Enum +from pydantic import BaseModel, ConfigDict, Field +from typing import Optional + +# The default length of the truncated hash used to identify parameter combinations +DEFAULT_HASH_LENGTH = 7 + +class ContainerFramework(str, Enum): + docker = 'docker' + # TODO: add apptainer variant once #260 gets merged + singularity = 'singularity' + dsub = 'dsub' + +class ContainerRegistry(BaseModel): + base_url: str + owner: str = Field(description="The owner or project of the registry") + +class AlgorithmParams(BaseModel): + include: bool = Field(default=False) + directed: Optional[bool] + # TODO + +class Algorithm(BaseModel): + name: str + params: AlgorithmParams + +class Dataset(BaseModel): + label: str + node_files: list[str] + edge_files: list[str] + other_files: list[str] + data_dir: str + +class GoldStandard(BaseModel): + label: str + node_files: list[str] + data_dir: str + dataset_labels: list[str] + +class Locations(BaseModel): + reconstruction_dir: str + +class ReconstructionSettings(BaseModel): + locations: Locations + +class RawConfig(BaseModel): + # TODO: move this to nested container key + container_framework: Optional[ContainerFramework] + unpack_singularity: bool = Field(default=False) + container_registry: ContainerRegistry + + hash_length: Optional[int] = Field( + description="The length of the hash used to identify a parameter combination", + default=DEFAULT_HASH_LENGTH) + + algorithms: list[Algorithm] + datasets: list[Dataset] + gold_standards: list[GoldStandard] = Field(default=[]) + + reconstruction_settings: ReconstructionSettings diff --git a/spras/containers.py b/spras/containers.py index a1fda05f2..3e6c7c3fc 100644 --- a/spras/containers.py +++ b/spras/containers.py @@ -7,7 +7,7 @@ import docker -import spras.config as config +import spras.config.config as config from spras.logging import indent from spras.util import hash_filename diff --git a/test/AllPairs/test_ap.py b/test/AllPairs/test_ap.py index 442b26a73..31dd612d9 
100644 --- a/test/AllPairs/test_ap.py +++ b/test/AllPairs/test_ap.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.allpairs import AllPairs # Note that we don't directly use the config in the test, but we need the config diff --git a/test/DOMINO/test_domino.py b/test/DOMINO/test_domino.py index 7f09fa975..4323ea4c9 100644 --- a/test/DOMINO/test_domino.py +++ b/test/DOMINO/test_domino.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.domino import DOMINO, post_domino_id_transform, pre_domino_id_transform config.init_from_file("config/config.yaml") diff --git a/test/LocalNeighborhood/test_ln.py b/test/LocalNeighborhood/test_ln.py index fbee54902..9093efc68 100644 --- a/test/LocalNeighborhood/test_ln.py +++ b/test/LocalNeighborhood/test_ln.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config config.init_from_file("config/config.yaml") diff --git a/test/MEO/test_meo.py b/test/MEO/test_meo.py index e2abdb72d..32958be20 100644 --- a/test/MEO/test_meo.py +++ b/test/MEO/test_meo.py @@ -3,7 +3,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.meo import MEO, write_properties config.init_from_file("config/config.yaml") diff --git a/test/MinCostFlow/test_mcf.py b/test/MinCostFlow/test_mcf.py index 89bd61d0b..c777a665d 100644 --- a/test/MinCostFlow/test_mcf.py +++ b/test/MinCostFlow/test_mcf.py @@ -3,7 +3,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.mincostflow import MinCostFlow config.init_from_file("config/config.yaml") diff --git a/test/OmicsIntegrator1/test_oi1.py b/test/OmicsIntegrator1/test_oi1.py index 35b41d428..a484c0af3 100644 --- a/test/OmicsIntegrator1/test_oi1.py +++ b/test/OmicsIntegrator1/test_oi1.py @@ -3,7 +3,7 @@ import pytest -import spras.config as config +import spras.config.config 
as config from spras.omicsintegrator1 import OmicsIntegrator1, write_conf config.init_from_file("config/config.yaml") diff --git a/test/OmicsIntegrator2/test_oi2.py b/test/OmicsIntegrator2/test_oi2.py index 2a0a3e3c1..13f7f30b6 100644 --- a/test/OmicsIntegrator2/test_oi2.py +++ b/test/OmicsIntegrator2/test_oi2.py @@ -3,7 +3,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.omicsintegrator2 import OmicsIntegrator2 config.init_from_file("config/config.yaml") diff --git a/test/PathLinker/test_pathlinker.py b/test/PathLinker/test_pathlinker.py index 3fd6a96bd..ed9f10670 100644 --- a/test/PathLinker/test_pathlinker.py +++ b/test/PathLinker/test_pathlinker.py @@ -3,7 +3,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.pathlinker import PathLinker config.init_from_file("config/config.yaml") diff --git a/test/analysis/input/config.yaml b/test/analysis/input/config.yaml index 833e6c4bb..49879e461 100644 --- a/test/analysis/input/config.yaml +++ b/test/analysis/input/config.yaml @@ -102,7 +102,6 @@ reconstruction_settings: locations: #place the save path here reconstruction_dir: "output" - run: true analysis: # Create one summary per pathway file and a single summary table for all pathways for each dataset diff --git a/test/analysis/input/egfr.yaml b/test/analysis/input/egfr.yaml index 1ddac1cae..281ecb495 100644 --- a/test/analysis/input/egfr.yaml +++ b/test/analysis/input/egfr.yaml @@ -91,7 +91,6 @@ datasets: reconstruction_settings: locations: reconstruction_dir: output/egfr - run: true analysis: graphspace: include: true diff --git a/test/analysis/test_cytoscape.py b/test/analysis/test_cytoscape.py index 7451b9876..68a77cd07 100644 --- a/test/analysis/test_cytoscape.py +++ b/test/analysis/test_cytoscape.py @@ -2,7 +2,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.analysis.cytoscape import run_cytoscape 
config.init_from_file("test/analysis/input/config.yaml") diff --git a/test/analysis/test_summary.py b/test/analysis/test_summary.py index 4ff5396da..0400d1f1b 100644 --- a/test/analysis/test_summary.py +++ b/test/analysis/test_summary.py @@ -3,7 +3,7 @@ import pandas as pd -import spras.config as config +import spras.config.config as config from spras.analysis.summary import summarize_networks from spras.dataset import Dataset diff --git a/test/test_config.py b/test/test_config.py index 6c773ddc0..26b18a4e9 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -3,7 +3,7 @@ import numpy as np import pytest -import spras.config as config +import spras.config.config as config # Set up a dummy config for testing. For now, only include things that MUST exist in the dict diff --git a/test/test_util.py b/test/test_util.py index baf9db0ed..2a25fc0d1 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -2,7 +2,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.containers import convert_docker_path, prepare_path_docker, prepare_volume from spras.util import hash_params_sha1_base32 From 4d1a19c54e22052d8e82f9c10bfdc791e4df5143 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Wed, 25 Jun 2025 20:49:36 +0000 Subject: [PATCH 02/34] feat: mostly structured config --- spras/config/__init__.py | 0 spras/config/config.py | 113 +++++++++++++--------------- spras/config/raw_config.py | 27 ++++++- spras/config/raw_config_analysis.py | 33 ++++++++ 4 files changed, 109 insertions(+), 64 deletions(-) create mode 100644 spras/config/__init__.py create mode 100644 spras/config/raw_config_analysis.py diff --git a/spras/config/__init__.py b/spras/config/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spras/config/config.py b/spras/config/config.py index f6c5ada8b..751c774a5 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -18,12 +18,13 @@ import re import warnings from collections.abc import Iterable +from typing import Any import numpy as np import yaml from spras.util import NpHashEncoder, hash_params_sha1_base32 -from spras.config.raw_config import ContainerFramework, RawConfig, DEFAULT_HASH_LENGTH +from spras.config.raw_config import ContainerFramework, RawConfig config = None @@ -52,12 +53,15 @@ def init_from_file(filepath): class Config: - def __init__(self, raw_config): - # Member vars populated by process_config. Any values that don't have sensible initial values are set to None + def __init__(self, raw_config: dict[str, Any]): + parsed_raw_config = RawConfig.model_validate(raw_config) + self.process_config(parsed_raw_config) + + # Member vars populated by process_config. Any values that don't have quick initial values are set to None # before they are populated for __init__ to show exactly what is being configured. # Directory used for storing output - self.out_dir = None + self.out_dir = parsed_raw_config.reconstruction_settings.locations.reconstruction_dir # Container framework used by PRMs. Valid options are "docker", "dsub", and "singularity" self.container_framework = None # The container prefix (host and organization) to use for images. 
Default is "docker.io/reedcompbio" @@ -68,8 +72,8 @@ def __init__(self, raw_config): self.datasets = None # A dictionary to store configured gold standard data against output of SPRAS runs self.gold_standards = None - # The hash length SPRAS will use to identify parameter combinations. Default is 7 - self.hash_length: int = DEFAULT_HASH_LENGTH + # The hash length SPRAS will use to identify parameter combinations. + self.hash_length = parsed_raw_config.hash_length # The list of algorithms to run in the workflow. Each is a dict with 'name' as an expected key. self.algorithms = None # A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations. @@ -106,13 +110,6 @@ def __init__(self, raw_config): # wrapper error first before passing validation to pydantic. if raw_config == {}: raise ValueError("Config file cannot be empty. Use --configfile to set a config file.") - - # Since process_config winds up modifying the raw_config passed to it as a side effect, - # we'll make a deep copy here to guarantee we don't break anything. 
This preserves the - # config as it's given to the Snakefile by Snakemake - _raw_config = copy.deepcopy(raw_config) - parsed_raw_config = RawConfig.model_validate_json(_raw_config) - self.process_config(parsed_raw_config) def process_datasets(self, raw_config: RawConfig): """ @@ -178,8 +175,12 @@ def process_algorithms(self, raw_config: RawConfig): if cur_params.directed != None: warnings.warn("UPDATE: we no longer use the directed key in the config file") + cur_params = cur_params.__pydantic_extra__ + if not cur_params: + raise RuntimeError("An internal error occured: ConfigDict extra should be set on AlgorithmParams.") + # The algorithm has no named arguments so create a default placeholder - if len(cur_params) == 0: + if len(cur_params.keys()) == 0: cur_params["run1"] = {"spras_placeholder": ["no parameters"]} # Each set of runs should be 1 level down in the config file @@ -210,7 +211,7 @@ def process_algorithms(self, raw_config: RawConfig): # Catch-all for strings obj = [obj] if not isinstance(obj, Iterable): - raise ValueError(f"The object `{obj}` in algorithm {alg['name']} at key '{p}' in run '{run_params}' is not iterable!") from None + raise ValueError(f"The object `{obj}` in algorithm {alg.name} at key '{p}' in run '{run_params}' is not iterable!") from None all_runs.append(obj) run_list_tuples = list(it.product(*all_runs)) param_name_tuple = tuple(param_name_list) @@ -231,47 +232,18 @@ def process_algorithms(self, raw_config: RawConfig): if params_hash in prior_params_hashes: raise ValueError(f'Parameter hash collision detected. Increase the hash_length in the config file ' f'(current length {self.hash_length}).') - self.algorithm_params[alg["name"]][params_hash] = run_dict - - def process_config(self, raw_config: RawConfig): - # Set up a few top-level config variables - self.out_dir = raw_config.reconstruction_settings.locations.reconstruction_dir - - # We allow the container framework not to be defined in the config. 
In the case it isn't, default to docker. - # However, if we get a bad value, we raise an exception. - if raw_config.container_framework != None: - container_framework = raw_config.container_framework - if container_framework == ContainerFramework.dsub: - warnings.warn("'dsub' framework integration is experimental and may not be fully supported.") - self.container_framework = container_framework - else: - self.container_framework = "docker" - - # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container. - if raw_config.unpack_singularity: - # The value in the config is a string, and we need to convert it to a bool. - unpack_singularity = raw_config["unpack_singularity"] - if unpack_singularity and self.container_framework != "singularity": - warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. This setting will have no effect.") - self.unpack_singularity = unpack_singularity - - # Grab registry from the config, and if none is provided default to docker - if raw_config.container_registry and raw_config["container_registry"]["base_url"] != "" and raw_config["container_registry"]["owner"] != "": - self.container_prefix = raw_config["container_registry"]["base_url"] + "/" + raw_config["container_registry"]["owner"] + self.algorithm_params[alg.name][params_hash] = run_dict - # Override the default parameter hash length if specified in the config file - if "hash_length" in raw_config and raw_config["hash_length"] != "": - self.hash_length = int(raw_config["hash_length"]) + def process_analysis(self, raw_config: RawConfig): + if not raw_config.analysis: + return - self.process_datasets(raw_config) - self.process_algorithms(raw_config) - - self.analysis_params = raw_config["analysis"] if "analysis" in raw_config else {} - self.ml_params = self.analysis_params["ml"] if "ml" in self.analysis_params else {} - self.evaluation_params = self.analysis_params["evaluation"] if 
"evaluation" in self.analysis_params else {} + self.analysis_params = raw_config.analysis + self.ml_params = self.analysis_params.ml if self.analysis_params.ml else {} + self.evaluation_params = self.analysis_params.evaluation if self.analysis_params.evaluation else {} self.pca_params = {} - if "components" in self.ml_params: + if self.ml_params.components: self.pca_params["components"] = self.ml_params["components"] if "labels" in self.ml_params: self.pca_params["labels"] = self.ml_params["labels"] @@ -282,14 +254,14 @@ def process_config(self, raw_config: RawConfig): if "metric" in self.ml_params: self.hac_params["metric"] = self.ml_params ["metric"] - self.analysis_include_summary = raw_config["analysis"]["summary"]["include"] - self.analysis_include_graphspace = raw_config["analysis"]["graphspace"]["include"] - self.analysis_include_cytoscape = raw_config["analysis"]["cytoscape"]["include"] - self.analysis_include_ml = raw_config["analysis"]["ml"]["include"] - self.analysis_include_evaluation = raw_config["analysis"]["evaluation"]["include"] + self.analysis_include_summary = raw_config.analysis.summary.include + self.analysis_include_graphspace = raw_config.analysis.graphspace.include + self.analysis_include_cytoscape = raw_config.analysis.cytoscape.include + self.analysis_include_ml = raw_config.analysis.ml.include + self.analysis_include_evaluation = raw_config.analysis.evaluation.include # Only run ML aggregate per algorithm if analysis include ML is set to True - if 'aggregate_per_algorithm' in self.ml_params and self.analysis_include_ml: + if self.ml_params.aggregate_per_algorithm and self.analysis_include_ml: self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] else: self.analysis_include_ml_aggregate_algo = False @@ -304,11 +276,32 @@ def process_config(self, raw_config: RawConfig): self.analysis_include_evaluation = False # Only run Evaluation aggregate per algorithm if analysis include ML is set to True - if 
'aggregate_per_algorithm' in self.evaluation_params and self.analysis_include_evaluation: - self.analysis_include_evaluation_aggregate_algo = raw_config["analysis"]["evaluation"]["aggregate_per_algorithm"] + if self.evaluation_params.aggregate_per_algorithm and self.analysis_include_evaluation: + self.analysis_include_evaluation_aggregate_algo = raw_config.analysis.evaluation.aggregate_per_algorithm else: self.analysis_include_evaluation_aggregate_algo = False # Only run Evaluation per algorithm if ML per algorithm is set to True if not self.analysis_include_ml_aggregate_algo: self.analysis_include_evaluation_aggregate_algo = False + + def process_config(self, raw_config: RawConfig): + # Set up a few top-level config variables + self.out_dir = raw_config.reconstruction_settings.locations.reconstruction_dir + + if raw_config.container_framework == ContainerFramework.dsub: + warnings.warn("'dsub' framework integration is experimental and may not be fully supported.") + self.container_framework = raw_config.container_framework + + # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container. + if raw_config.unpack_singularity and self.container_framework != "singularity": + warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. 
This setting will have no effect.") + self.unpack_singularity = raw_config.unpack_singularity + + # Grab registry from the config, and if none is provided default to docker + if raw_config.container_registry and raw_config.container_registry.base_url != "" and raw_config.container_registry.owner != "": + self.container_prefix = raw_config.container_registry.base_url + "/" + raw_config.container_registry.owner + + self.process_datasets(raw_config) + self.process_algorithms(raw_config) + self.process_analysis(raw_config) diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py index 1810bf0e5..76992c8f1 100644 --- a/spras/config/raw_config.py +++ b/spras/config/raw_config.py @@ -6,6 +6,8 @@ from pydantic import BaseModel, ConfigDict, Field from typing import Optional +from spras.config.raw_config_analysis import Analysis + # The default length of the truncated hash used to identify parameter combinations DEFAULT_HASH_LENGTH = 7 @@ -19,15 +21,21 @@ class ContainerRegistry(BaseModel): base_url: str owner: str = Field(description="The owner or project of the registry") + model_config = ConfigDict(extra='forbid') + class AlgorithmParams(BaseModel): include: bool = Field(default=False) directed: Optional[bool] - # TODO + + # TODO: use array of runs instead + model_config = ConfigDict(extra='allow') class Algorithm(BaseModel): name: str params: AlgorithmParams + model_config = ConfigDict(extra='forbid') + class Dataset(BaseModel): label: str node_files: list[str] @@ -35,30 +43,41 @@ class Dataset(BaseModel): other_files: list[str] data_dir: str + model_config = ConfigDict(extra='forbid') + class GoldStandard(BaseModel): label: str node_files: list[str] data_dir: str dataset_labels: list[str] + model_config = ConfigDict(extra='forbid') + class Locations(BaseModel): reconstruction_dir: str + model_config = ConfigDict(extra='forbid') + class ReconstructionSettings(BaseModel): locations: Locations + model_config = ConfigDict(extra='forbid') + class 
RawConfig(BaseModel): - # TODO: move this to nested container key - container_framework: Optional[ContainerFramework] + # TODO: move these container values to a nested container key + container_framework: ContainerFramework = Field(default=ContainerFramework.docker) unpack_singularity: bool = Field(default=False) container_registry: ContainerRegistry - hash_length: Optional[int] = Field( + hash_length: int = Field( description="The length of the hash used to identify a parameter combination", default=DEFAULT_HASH_LENGTH) algorithms: list[Algorithm] datasets: list[Dataset] gold_standards: list[GoldStandard] = Field(default=[]) + analysis: Optional[Analysis] reconstruction_settings: ReconstructionSettings + + model_config = ConfigDict(extra='forbid') diff --git a/spras/config/raw_config_analysis.py b/spras/config/raw_config_analysis.py new file mode 100644 index 000000000..8743f5969 --- /dev/null +++ b/spras/config/raw_config_analysis.py @@ -0,0 +1,33 @@ +from pydantic import BaseModel +from typing import Optional + +class SummaryAnalysis(BaseModel): + include: bool + +class GraphspaceAnalysis(BaseModel): + include: bool + +class CytoscapeAnalysis(BaseModel): + include: bool + +class MlAnalysis(BaseModel): + include: bool + aggregate_per_algorithm: bool + components: int + labels: bool + # TODO: enumify + linkage: str + # TODO: enumify + metric: str + +class EvaluationAnalysis(BaseModel): + include: bool + aggregate_per_algorithm: bool + + +class Analysis(BaseModel): + summary: Optional[SummaryAnalysis] + graphspace: Optional[GraphspaceAnalysis] + cytoscape: Optional[CytoscapeAnalysis] + ml: Optional[MlAnalysis] + evaluation: Optional[EvaluationAnalysis] From b56ecde361c9138cea993472d1f52c0adbf0dff2 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Wed, 25 Jun 2025 20:53:36 +0000 Subject: [PATCH 03/34] feat: add enum variants on ml --- config/config.yaml | 6 ------ spras/config/config.py | 8 ++++---- spras/config/raw_config.py | 3 ++- spras/config/raw_config_analysis.py | 22 +++++++++++++++++----- 4 files changed, 23 insertions(+), 16 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 3179dfedc..68c580683 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -63,11 +63,6 @@ algorithms: - name: "omicsintegrator2" params: include: true - runs: - - b: 4 - g: 0 - - b: 2 - g: 3 run1: b: 4 g: 0 @@ -149,7 +144,6 @@ reconstruction_settings: # TODO move to global reconstruction_dir: "output" - analysis: # Create one summary per pathway file and a single summary table for all pathways for each dataset summary: diff --git a/spras/config/config.py b/spras/config/config.py index 751c774a5..b6be80ef1 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -23,8 +23,8 @@ import numpy as np import yaml -from spras.util import NpHashEncoder, hash_params_sha1_base32 from spras.config.raw_config import ContainerFramework, RawConfig +from spras.util import NpHashEncoder, hash_params_sha1_base32 config = None @@ -110,7 +110,7 @@ def __init__(self, raw_config: dict[str, Any]): # wrapper error first before passing validation to pydantic. if raw_config == {}: raise ValueError("Config file cannot be empty. 
Use --configfile to set a config file.") - + def process_datasets(self, raw_config: RawConfig): """ Parse dataset information @@ -150,7 +150,7 @@ def process_datasets(self, raw_config: RawConfig): # dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)] # Maps from the dataset label to the dataset list index # dataset_dict = {dataset.get('label', f'dataset{index}'): index for index, dataset in enumerate(datasets)} - + def process_algorithms(self, raw_config: RawConfig): """ Parse algorithm information @@ -172,7 +172,7 @@ def process_algorithms(self, raw_config: RawConfig): # Do not parse the rest of the parameters for this algorithm if it is not included continue - if cur_params.directed != None: + if cur_params.directed is not None: warnings.warn("UPDATE: we no longer use the directed key in the config file") cur_params = cur_params.__pydantic_extra__ diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py index 76992c8f1..f60cb47c5 100644 --- a/spras/config/raw_config.py +++ b/spras/config/raw_config.py @@ -3,9 +3,10 @@ """ from enum import Enum -from pydantic import BaseModel, ConfigDict, Field from typing import Optional +from pydantic import BaseModel, ConfigDict, Field + from spras.config.raw_config_analysis import Analysis # The default length of the truncated hash used to identify parameter combinations diff --git a/spras/config/raw_config_analysis.py b/spras/config/raw_config_analysis.py index 8743f5969..7eae2a127 100644 --- a/spras/config/raw_config_analysis.py +++ b/spras/config/raw_config_analysis.py @@ -1,6 +1,9 @@ -from pydantic import BaseModel +from enum import Enum from typing import Optional +from pydantic import BaseModel + + class SummaryAnalysis(BaseModel): include: bool @@ -10,15 +13,24 @@ class GraphspaceAnalysis(BaseModel): class CytoscapeAnalysis(BaseModel): include: bool +class MlLinkage(str, Enum): + ward = 'ward' + complete = 'complete' + average = 'average' + single = 'single' + 
+class MlMetric(str, Enum): + euclidean = 'euclidean' + manhattan = 'manhattan' + cosine = 'cosine' + class MlAnalysis(BaseModel): include: bool aggregate_per_algorithm: bool components: int labels: bool - # TODO: enumify - linkage: str - # TODO: enumify - metric: str + linkage: MlLinkage + metric: MlMetric class EvaluationAnalysis(BaseModel): include: bool From bf95888ccbe00d90f124a998ab128f03b324c33f Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 25 Jun 2025 21:06:53 +0000 Subject: [PATCH 04/34] fix: some defaults --- spras/config/config.py | 6 +++--- spras/config/raw_config.py | 12 ++++++------ spras/config/raw_config_analysis.py | 18 +++++++++--------- test/test_config.py | 1 - 4 files changed, 18 insertions(+), 19 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index b6be80ef1..75c9124c6 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -43,10 +43,10 @@ def init_from_file(filepath): try: with open(filepath, 'r') as yaml_file: config_dict = yaml.safe_load(yaml_file) - except FileNotFoundError: - raise RuntimeError(f"Error: The specified config '{filepath}' could not be found.") + except FileNotFoundError as e: + raise RuntimeError(f"Error: The specified config '{filepath}' could not be found.") from e except yaml.YAMLError as e: - raise RuntimeError(f"Error: Failed to parse config '{filepath}': {e}") + raise RuntimeError(f"Error: Failed to parse config '{filepath}'") from e # And finally, initialize config = Config(config_dict) diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py index f60cb47c5..1580706e2 100644 --- a/spras/config/raw_config.py +++ b/spras/config/raw_config.py @@ -25,8 +25,8 @@ class ContainerRegistry(BaseModel): model_config = ConfigDict(extra='forbid') class AlgorithmParams(BaseModel): - include: bool = Field(default=False) - directed: Optional[bool] + include: bool = False + directed: Optional[bool] = None # TODO: use array of runs instead model_config = 
ConfigDict(extra='allow') @@ -66,8 +66,8 @@ class ReconstructionSettings(BaseModel): class RawConfig(BaseModel): # TODO: move these container values to a nested container key - container_framework: ContainerFramework = Field(default=ContainerFramework.docker) - unpack_singularity: bool = Field(default=False) + container_framework: ContainerFramework = ContainerFramework.docker + unpack_singularity: bool = False container_registry: ContainerRegistry hash_length: int = Field( @@ -76,8 +76,8 @@ class RawConfig(BaseModel): algorithms: list[Algorithm] datasets: list[Dataset] - gold_standards: list[GoldStandard] = Field(default=[]) - analysis: Optional[Analysis] + gold_standards: list[GoldStandard] = [] + analysis: Optional[Analysis] = None reconstruction_settings: ReconstructionSettings diff --git a/spras/config/raw_config_analysis.py b/spras/config/raw_config_analysis.py index 7eae2a127..5682fb6e6 100644 --- a/spras/config/raw_config_analysis.py +++ b/spras/config/raw_config_analysis.py @@ -27,10 +27,10 @@ class MlMetric(str, Enum): class MlAnalysis(BaseModel): include: bool aggregate_per_algorithm: bool - components: int - labels: bool - linkage: MlLinkage - metric: MlMetric + components: int = 2 + labels: bool = True + linkage: MlLinkage = MlLinkage.ward + metric: MlMetric = MlMetric.euclidean class EvaluationAnalysis(BaseModel): include: bool @@ -38,8 +38,8 @@ class EvaluationAnalysis(BaseModel): class Analysis(BaseModel): - summary: Optional[SummaryAnalysis] - graphspace: Optional[GraphspaceAnalysis] - cytoscape: Optional[CytoscapeAnalysis] - ml: Optional[MlAnalysis] - evaluation: Optional[EvaluationAnalysis] + summary: Optional[SummaryAnalysis] = None + graphspace: Optional[GraphspaceAnalysis] = None + cytoscape: Optional[CytoscapeAnalysis] = None + ml: Optional[MlAnalysis] = None + evaluation: Optional[EvaluationAnalysis] = None diff --git a/test/test_config.py b/test/test_config.py index 26b18a4e9..3039e2dc3 100644 --- a/test/test_config.py +++ 
b/test/test_config.py @@ -25,7 +25,6 @@ def get_test_config(): "datasets": [{"label": "alg1"}, {"label": "alg2"}], "gold_standards": [{"label": "gs1", "dataset_labels": []}], "algorithms": [ - {"params": ["param2", "param2"]}, { "name": "strings", "params": { From 51d6a7b1efeee0b97f5ad6e1ca377f91fbd9ce19 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 25 Jun 2025 21:47:26 +0000 Subject: [PATCH 05/34] feat: fully finish config parsing --- spras/config/config.py | 43 ++++++++++------------------- spras/config/raw_config.py | 23 ++++++++++----- spras/config/raw_config_analysis.py | 21 +++++++------- spras/config/util_enum.py | 14 ++++++++++ test/test_config.py | 33 +++++++++++++++++++--- 5 files changed, 83 insertions(+), 51 deletions(-) create mode 100644 spras/config/util_enum.py diff --git a/spras/config/config.py b/spras/config/config.py index 75c9124c6..02ab61ca0 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -54,8 +54,12 @@ def init_from_file(filepath): class Config: def __init__(self, raw_config: dict[str, Any]): + # Since snakemake provides an empty config, we provide this + # wrapper error first before passing validation to pydantic. + if raw_config == {}: + raise ValueError("Config file cannot be empty. Use --configfile to set a config file.") + parsed_raw_config = RawConfig.model_validate(raw_config) - self.process_config(parsed_raw_config) # Member vars populated by process_config. Any values that don't have quick initial values are set to None # before they are populated for __init__ to show exactly what is being configured. @@ -106,10 +110,7 @@ def __init__(self, raw_config: dict[str, Any]): # A Boolean specifying whether to run the evaluation per algorithm analysis self.analysis_include_evaluation_aggregate_algo = None - # Since snakemake provides an empty config, we provide this - # wrapper error first before passing validation to pydantic. - if raw_config == {}: - raise ValueError("Config file cannot be empty. 
Use --configfile to set a config file.") + self.process_config(parsed_raw_config) def process_datasets(self, raw_config: RawConfig): """ @@ -118,7 +119,6 @@ def process_datasets(self, raw_config: RawConfig): Convert the dataset list into a dict where the label is the key and update the config data structure """ # TODO allow labels to be optional and assign default labels - # TODO check for collisions in dataset labels, warn, and make the labels unique # Need to work more on input file naming to make less strict assumptions # about the filename structure # Currently assumes all datasets have a label and the labels are unique @@ -130,12 +130,7 @@ def process_datasets(self, raw_config: RawConfig): if label in self.datasets: raise ValueError(f"Datasets must have unique labels, but the label {label} appears at least twice.") self.datasets[label] = dict(dataset) - - # Validate dataset labels - label_pattern = r'^\w+$' - if not bool(re.match(label_pattern, label)): - raise ValueError(f"Dataset label '{label}' contains invalid values. 
Dataset labels can only contain letters, numbers, or underscores.") - + # parse gold standard information self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards} @@ -173,7 +168,7 @@ def process_algorithms(self, raw_config: RawConfig): continue if cur_params.directed is not None: - warnings.warn("UPDATE: we no longer use the directed key in the config file") + warnings.warn("UPDATE: we no longer use the directed key in the config file", stacklevel=2) cur_params = cur_params.__pydantic_extra__ if not cur_params: @@ -239,20 +234,10 @@ def process_analysis(self, raw_config: RawConfig): return self.analysis_params = raw_config.analysis - self.ml_params = self.analysis_params.ml if self.analysis_params.ml else {} - self.evaluation_params = self.analysis_params.evaluation if self.analysis_params.evaluation else {} - - self.pca_params = {} - if self.ml_params.components: - self.pca_params["components"] = self.ml_params["components"] - if "labels" in self.ml_params: - self.pca_params["labels"] = self.ml_params["labels"] + self.ml_params = self.analysis_params.ml + self.evaluation_params = self.analysis_params.evaluation - self.hac_params = {} - if "linkage" in self.ml_params: - self.hac_params["linkage"] = self.ml_params["linkage"] - if "metric" in self.ml_params: - self.hac_params["metric"] = self.ml_params ["metric"] + self.pca_params = self.ml_params self.analysis_include_summary = raw_config.analysis.summary.include self.analysis_include_graphspace = raw_config.analysis.graphspace.include @@ -262,7 +247,7 @@ def process_analysis(self, raw_config: RawConfig): # Only run ML aggregate per algorithm if analysis include ML is set to True if self.ml_params.aggregate_per_algorithm and self.analysis_include_ml: - self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] + self.analysis_include_ml_aggregate_algo = raw_config.analysis.ml.aggregate_per_algorithm else: 
self.analysis_include_ml_aggregate_algo = False @@ -290,12 +275,12 @@ def process_config(self, raw_config: RawConfig): self.out_dir = raw_config.reconstruction_settings.locations.reconstruction_dir if raw_config.container_framework == ContainerFramework.dsub: - warnings.warn("'dsub' framework integration is experimental and may not be fully supported.") + warnings.warn("'dsub' framework integration is experimental and may not be fully supported.", stacklevel=2) self.container_framework = raw_config.container_framework # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container. if raw_config.unpack_singularity and self.container_framework != "singularity": - warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. This setting will have no effect.") + warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. This setting will have no effect.", stacklevel=2) self.unpack_singularity = raw_config.unpack_singularity # Grab registry from the config, and if none is provided default to docker diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py index 1580706e2..5ab6fed1a 100644 --- a/spras/config/raw_config.py +++ b/spras/config/raw_config.py @@ -2,17 +2,26 @@ Contains the raw pydantic schema for the configuration file. 
""" -from enum import Enum -from typing import Optional +import re +from spras.config.util_enum import CaseInsensitiveEnum +from typing import Annotated, Optional -from pydantic import BaseModel, ConfigDict, Field +from pydantic import AfterValidator, BaseModel, ConfigDict, Field from spras.config.raw_config_analysis import Analysis # The default length of the truncated hash used to identify parameter combinations DEFAULT_HASH_LENGTH = 7 -class ContainerFramework(str, Enum): +def label_validator(name: str): + label_pattern = r'^\w+$' + def validate(label: str): + if not bool(re.match(label_pattern, label)): + raise ValueError(f"{name} label '{label}' contains invalid values. {name} labels can only contain letters, numbers, or underscores.") + return label + return validate + +class ContainerFramework(CaseInsensitiveEnum): docker = 'docker' # TODO: add apptainer variant once #260 gets merged singularity = 'singularity' @@ -26,7 +35,7 @@ class ContainerRegistry(BaseModel): class AlgorithmParams(BaseModel): include: bool = False - directed: Optional[bool] = None + directed: Annotated[Optional[bool], Field(deprecated=True)] = None # TODO: use array of runs instead model_config = ConfigDict(extra='allow') @@ -38,7 +47,7 @@ class Algorithm(BaseModel): model_config = ConfigDict(extra='forbid') class Dataset(BaseModel): - label: str + label: Annotated[str, AfterValidator(label_validator("Dataset"))] node_files: list[str] edge_files: list[str] other_files: list[str] @@ -47,7 +56,7 @@ class Dataset(BaseModel): model_config = ConfigDict(extra='forbid') class GoldStandard(BaseModel): - label: str + label: Annotated[str, AfterValidator(label_validator("Gold Standard"))] node_files: list[str] data_dir: str dataset_labels: list[str] diff --git a/spras/config/raw_config_analysis.py b/spras/config/raw_config_analysis.py index 5682fb6e6..6b5be9bab 100644 --- a/spras/config/raw_config_analysis.py +++ b/spras/config/raw_config_analysis.py @@ -1,5 +1,4 @@ -from enum import Enum -from 
typing import Optional +from spras.config.util_enum import CaseInsensitiveEnum from pydantic import BaseModel @@ -13,20 +12,20 @@ class GraphspaceAnalysis(BaseModel): class CytoscapeAnalysis(BaseModel): include: bool -class MlLinkage(str, Enum): +class MlLinkage(CaseInsensitiveEnum): ward = 'ward' complete = 'complete' average = 'average' single = 'single' -class MlMetric(str, Enum): +class MlMetric(CaseInsensitiveEnum): euclidean = 'euclidean' manhattan = 'manhattan' cosine = 'cosine' class MlAnalysis(BaseModel): include: bool - aggregate_per_algorithm: bool + aggregate_per_algorithm: bool = False components: int = 2 labels: bool = True linkage: MlLinkage = MlLinkage.ward @@ -34,12 +33,12 @@ class MlAnalysis(BaseModel): class EvaluationAnalysis(BaseModel): include: bool - aggregate_per_algorithm: bool + aggregate_per_algorithm: bool = False class Analysis(BaseModel): - summary: Optional[SummaryAnalysis] = None - graphspace: Optional[GraphspaceAnalysis] = None - cytoscape: Optional[CytoscapeAnalysis] = None - ml: Optional[MlAnalysis] = None - evaluation: Optional[EvaluationAnalysis] = None + summary: SummaryAnalysis = SummaryAnalysis(include=False) + graphspace: GraphspaceAnalysis = GraphspaceAnalysis(include=False) + cytoscape: CytoscapeAnalysis = CytoscapeAnalysis(include=False) + ml: MlAnalysis = MlAnalysis(include=False) + evaluation: EvaluationAnalysis = EvaluationAnalysis(include=False) diff --git a/spras/config/util_enum.py b/spras/config/util_enum.py new file mode 100644 index 000000000..ec5700fd3 --- /dev/null +++ b/spras/config/util_enum.py @@ -0,0 +1,14 @@ +from enum import Enum +from typing import Any + +# https://stackoverflow.com/a/76883868/7589775 +class CaseInsensitiveEnum(str, Enum): + @classmethod + def _missing_(cls, value: Any): + if isinstance(value, str): + value = value.lower() + + for member in cls: + if member.lower() == value: + return member + return None diff --git a/test/test_config.py b/test/test_config.py index 3039e2dc3..9a3bf6549 
100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -4,7 +4,14 @@ import pytest import spras.config.config as config +from spras.config.raw_config import DEFAULT_HASH_LENGTH +filler_dataset_data: dict[str, str | list[str]] = { + "data_dir": "fake", + "edge_files": [], + "other_files": [], + "node_files": [] +} # Set up a dummy config for testing. For now, only include things that MUST exist in the dict # in order for the config init to complete. To test particular parts of the config initialization, @@ -22,8 +29,25 @@ def get_test_config(): "reconstruction_dir": "my_dir" } }, - "datasets": [{"label": "alg1"}, {"label": "alg2"}], - "gold_standards": [{"label": "gs1", "dataset_labels": []}], + "datasets": [{ + "label": "alg1", + "data_dir": "fake", + "edge_files": [], + "other_files": [], + "node_files": [] + }, { + "label": "alg2", + "data_dir": "faux", + "edge_files": [], + "other_files": [], + "node_files": [] + }], + "gold_standards": [{ + "label": "gs1", + "dataset_labels": [], + "node_files": [], + "data_dir": "gs-fake" + }], "algorithms": [ { "name": "strings", @@ -125,9 +149,9 @@ def test_config_hash_length(self): config.init_global(test_config) assert (config.config.hash_length == 7) - test_config["hash_length"] = "" + test_config.pop("hash_length", None) config.init_global(test_config) - assert (config.config.hash_length == config.DEFAULT_HASH_LENGTH) + assert (config.config.hash_length == DEFAULT_HASH_LENGTH) # Initialize the configuration test_config["hash_length"] = "12" @@ -193,6 +217,7 @@ def test_correct_dataset_label(self): test_config = get_test_config() correct_test_dicts = [{"label": "test"}, {"label": "123"}, {"label": "test123"}, {"label": "123test"}, {"label": "_"}, {"label": "test_test"}, {"label": "_test"}, {"label": "test_"}] + correct_test_dicts = [dict(list(d.items()) + list(filler_dataset_data.items())) for d in correct_test_dicts] for test_dict in correct_test_dicts: test_config["datasets"] = [test_dict] From 
a27d38decf17c3579753e3b8e7e9b603abc1e538 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 25 Jun 2025 21:49:28 +0000 Subject: [PATCH 06/34] style: fmt --- spras/config/config.py | 2 +- spras/config/raw_config.py | 2 +- spras/config/raw_config_analysis.py | 4 ++-- spras/config/util_enum.py | 1 + 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index 02ab61ca0..b627fcc73 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -130,7 +130,7 @@ def process_datasets(self, raw_config: RawConfig): if label in self.datasets: raise ValueError(f"Datasets must have unique labels, but the label {label} appears at least twice.") self.datasets[label] = dict(dataset) - + # parse gold standard information self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards} diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py index 5ab6fed1a..4c1cc3581 100644 --- a/spras/config/raw_config.py +++ b/spras/config/raw_config.py @@ -3,12 +3,12 @@ """ import re -from spras.config.util_enum import CaseInsensitiveEnum from typing import Annotated, Optional from pydantic import AfterValidator, BaseModel, ConfigDict, Field from spras.config.raw_config_analysis import Analysis +from spras.config.util_enum import CaseInsensitiveEnum # The default length of the truncated hash used to identify parameter combinations DEFAULT_HASH_LENGTH = 7 diff --git a/spras/config/raw_config_analysis.py b/spras/config/raw_config_analysis.py index 6b5be9bab..194fc8f7c 100644 --- a/spras/config/raw_config_analysis.py +++ b/spras/config/raw_config_analysis.py @@ -1,7 +1,7 @@ -from spras.config.util_enum import CaseInsensitiveEnum - from pydantic import BaseModel +from spras.config.util_enum import CaseInsensitiveEnum + class SummaryAnalysis(BaseModel): include: bool diff --git a/spras/config/util_enum.py b/spras/config/util_enum.py index ec5700fd3..3e73eda98 100644 --- 
a/spras/config/util_enum.py +++ b/spras/config/util_enum.py @@ -1,6 +1,7 @@ from enum import Enum from typing import Any + # https://stackoverflow.com/a/76883868/7589775 class CaseInsensitiveEnum(str, Enum): @classmethod From dd4674a22d9af81c210093a8fd9da393074e38db Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 25 Jun 2025 22:03:04 +0000 Subject: [PATCH 07/34] fix: remove dep mark, use strict is None --- spras/config/config.py | 2 +- spras/config/raw_config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index b627fcc73..55d061607 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -171,7 +171,7 @@ def process_algorithms(self, raw_config: RawConfig): warnings.warn("UPDATE: we no longer use the directed key in the config file", stacklevel=2) cur_params = cur_params.__pydantic_extra__ - if not cur_params: + if cur_params is None: raise RuntimeError("An internal error occured: ConfigDict extra should be set on AlgorithmParams.") # The algorithm has no named arguments so create a default placeholder diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py index 4c1cc3581..caff7c690 100644 --- a/spras/config/raw_config.py +++ b/spras/config/raw_config.py @@ -35,7 +35,7 @@ class ContainerRegistry(BaseModel): class AlgorithmParams(BaseModel): include: bool = False - directed: Annotated[Optional[bool], Field(deprecated=True)] = None + directed: Optional[bool] = None # TODO: use array of runs instead model_config = ConfigDict(extra='allow') From 5a8826d24434613a611ccdfef69c81260e4d3129 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Wed, 25 Jun 2025 23:01:30 +0000 Subject: [PATCH 08/34] chore: correct config loc --- Snakefile | 2 +- spras/config/config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index df90f8e4a..f5817650d 100644 --- a/Snakefile +++ b/Snakefile @@ -5,7 +5,7 @@ import yaml from spras.dataset import Dataset from spras.evaluation import Evaluation from spras.analysis import ml, summary, graphspace, cytoscape -import spras.config as _config +import spras.config.config as _config # Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037 # and using the wrong separator prevents Snakemake from matching filenames to the rules that can produce them diff --git a/spras/config/config.py b/spras/config/config.py index 55d061607..5894dd304 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -6,7 +6,7 @@ module that imports this module can access a config option by checking the object's value. For example -import spras.config as config +import spras.config.config as config container_framework = config.config.container_framework will grab the top level registry configuration option as it appears in the config file From a47b0dfbe7128221f2ee378f3188402a83f60ae6 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Wed, 25 Jun 2025 23:45:53 +0000 Subject: [PATCH 09/34] fix: specify hac params --- spras/config/config.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spras/config/config.py b/spras/config/config.py index 5894dd304..0db7bedf1 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -239,6 +239,11 @@ def process_analysis(self, raw_config: RawConfig): self.pca_params = self.ml_params + self.hac_params = { + "linkage": self.ml_params.linkage, + "metric": self.ml_params.metric + } + self.analysis_include_summary = raw_config.analysis.summary.include self.analysis_include_graphspace = raw_config.analysis.graphspace.include self.analysis_include_cytoscape = raw_config.analysis.cytoscape.include From 8d756045da12d48e603c5d6f7a7846d41d24fd53 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 26 Jun 2025 16:04:27 +0000 Subject: [PATCH 10/34] fix: expand class params --- spras/config/config.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index 0db7bedf1..b479c2a53 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -233,9 +233,10 @@ def process_analysis(self, raw_config: RawConfig): if not raw_config.analysis: return - self.analysis_params = raw_config.analysis - self.ml_params = self.analysis_params.ml - self.evaluation_params = self.analysis_params.evaluation + # these params are classes - we need to turn them into var dicts + self.analysis_params = vars(raw_config.analysis) + self.ml_params = vars(self.analysis_params.ml) + self.evaluation_params = vars(self.analysis_params.evaluation) self.pca_params = self.ml_params From afa1de5605fee43de0457aa05c8ee956eb1982df Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Thu, 26 Jun 2025 16:06:36 +0000 Subject: [PATCH 11/34] fix: expand on pca_params --- spras/config/config.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index b479c2a53..515a677c5 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -233,12 +233,12 @@ def process_analysis(self, raw_config: RawConfig): if not raw_config.analysis: return - # these params are classes - we need to turn them into var dicts - self.analysis_params = vars(raw_config.analysis) - self.ml_params = vars(self.analysis_params.ml) - self.evaluation_params = vars(self.analysis_params.evaluation) + self.analysis_params = raw_config.analysis + self.ml_params = self.analysis_params.ml + self.evaluation_params = self.analysis_params.evaluation - self.pca_params = self.ml_params + # self.ml_params is a class, pca_params needs to be a dict. + self.pca_params = vars(self.ml_params) self.hac_params = { "linkage": self.ml_params.linkage, From 5eefc51f79ad38c8c086f0c0e233fa6f91966d36 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Thu, 26 Jun 2025 09:35:20 -0700 Subject: [PATCH 12/34] fix: drop include dict --- spras/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/config.py b/spras/config/config.py index 515a677c5..8d52ebd97 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -238,7 +238,7 @@ def process_analysis(self, raw_config: RawConfig): self.evaluation_params = self.analysis_params.evaluation # self.ml_params is a class, pca_params needs to be a dict. - self.pca_params = vars(self.ml_params) + self.pca_params = {k: v for k, v in vars(self.ml_params).items if k != 'include'} self.hac_params = { "linkage": self.ml_params.linkage, From 52431866627ce081795487c37064403a8045c829 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Thu, 26 Jun 2025 10:28:05 -0700 Subject: [PATCH 13/34] fix: call items --- spras/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/config.py b/spras/config/config.py index 8d52ebd97..23d8c44fd 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -238,7 +238,7 @@ def process_analysis(self, raw_config: RawConfig): self.evaluation_params = self.analysis_params.evaluation # self.ml_params is a class, pca_params needs to be a dict. - self.pca_params = {k: v for k, v in vars(self.ml_params).items if k != 'include'} + self.pca_params = {k: v for k, v in vars(self.ml_params).items() if k != 'include'} self.hac_params = { "linkage": self.ml_params.linkage, From 3b20c48bae7ef46a4d15a9b34dc65c399732f95c Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Thu, 26 Jun 2025 17:51:54 +0000 Subject: [PATCH 14/34] fix: better typing and defaults --- spras/config/config.py | 17 +++++++++-------- spras/config/raw_config.py | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index 23d8c44fd..d944cc976 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -23,7 +23,7 @@ import numpy as np import yaml -from spras.config.raw_config import ContainerFramework, RawConfig +from spras.config.raw_config import ContainerFramework, RawConfig, Analysis from spras.util import NpHashEncoder, hash_params_sha1_base32 config = None @@ -86,9 +86,11 @@ def __init__(self, raw_config: dict[str, Any]): # Deprecated. Previously a dict mapping algorithm names to a Boolean tracking whether they used directed graphs.
self.algorithm_directed = None # A dict with the analysis settings - self.analysis_params = None + self.analysis_params = parsed_raw_config.analysis + # A dict with the evaluation settings + self.evaluation_params = self.analysis_params.evaluation # A dict with the ML settings - self.ml_params = None + self.ml_params = self.analysis_params.ml # A Boolean specifying whether to run ML analysis for individual algorithms self.analysis_include_ml_aggregate_algo = None # A dict with the PCA settings @@ -233,12 +235,11 @@ def process_analysis(self, raw_config: RawConfig): if not raw_config.analysis: return - self.analysis_params = raw_config.analysis - self.ml_params = self.analysis_params.ml - self.evaluation_params = self.analysis_params.evaluation - # self.ml_params is a class, pca_params needs to be a dict. - self.pca_params = {k: v for k, v in vars(self.ml_params).items() if k != 'include'} + self.pca_params = { + "components": self.ml_params.components, + "labels": self.ml_params.labels + } self.hac_params = { "linkage": self.ml_params.linkage, diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py index caff7c690..409b48427 100644 --- a/spras/config/raw_config.py +++ b/spras/config/raw_config.py @@ -86,7 +86,7 @@ class RawConfig(BaseModel): algorithms: list[Algorithm] datasets: list[Dataset] gold_standards: list[GoldStandard] = [] - analysis: Optional[Analysis] = None + analysis: Analysis = Analysis() reconstruction_settings: ReconstructionSettings From 2d4a90f669f77a9d209c658ce5596833a2d43881 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Thu, 26 Jun 2025 11:09:54 -0700 Subject: [PATCH 15/34] style: fmt --- spras/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/config.py b/spras/config/config.py index d944cc976..0c176cbab 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -23,7 +23,7 @@ import numpy as np import yaml -from spras.config.raw_config import ContainerFramework, RawConfig, Analysis +from spras.config.raw_config import Analysis, ContainerFramework, RawConfig from spras.util import NpHashEncoder, hash_params_sha1_base32 config = None From 2a4fb2ec5c923954c9748dc63009002264fcebf9 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Wed, 9 Jul 2025 10:40:44 -0700 Subject: [PATCH 16/34] refactor: add config forbid --- spras/config/raw_config_analysis.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/spras/config/raw_config_analysis.py b/spras/config/raw_config_analysis.py index 194fc8f7c..dbec5f1b9 100644 --- a/spras/config/raw_config_analysis.py +++ b/spras/config/raw_config_analysis.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from spras.config.util_enum import CaseInsensitiveEnum @@ -6,12 +6,13 @@ class SummaryAnalysis(BaseModel): include: bool -class GraphspaceAnalysis(BaseModel): - include: bool + model_config = ConfigDict(extra='forbid') class CytoscapeAnalysis(BaseModel): include: bool + model_config = ConfigDict(extra='forbid') + class MlLinkage(CaseInsensitiveEnum): ward = 'ward' complete = 'complete' @@ -31,14 +32,18 @@ class MlAnalysis(BaseModel): linkage: MlLinkage = MlLinkage.ward metric: MlMetric = MlMetric.euclidean + model_config = ConfigDict(extra='forbid') + class EvaluationAnalysis(BaseModel): include: bool aggregate_per_algorithm: bool = False + model_config = ConfigDict(extra='forbid') class Analysis(BaseModel): summary: SummaryAnalysis = SummaryAnalysis(include=False) - graphspace: GraphspaceAnalysis = 
GraphspaceAnalysis(include=False) cytoscape: CytoscapeAnalysis = CytoscapeAnalysis(include=False) ml: MlAnalysis = MlAnalysis(include=False) evaluation: EvaluationAnalysis = EvaluationAnalysis(include=False) + + model_config = ConfigDict(extra='forbid') From 4ded57eeb490b21a51feec81ed119e400fc363ef Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Wed, 9 Jul 2025 17:45:47 +0000 Subject: [PATCH 17/34] refactor: update config imports --- test/BowTieBuilder/test_btb.py | 2 +- test/RWR/test_RWR.py | 2 +- test/ST_RWR/test_STRWR.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/BowTieBuilder/test_btb.py b/test/BowTieBuilder/test_btb.py index 88b12d0dd..d4a458b3c 100644 --- a/test/BowTieBuilder/test_btb.py +++ b/test/BowTieBuilder/test_btb.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config config.init_from_file("config/config.yaml") diff --git a/test/RWR/test_RWR.py b/test/RWR/test_RWR.py index 4d6ce7864..b0316ded0 100644 --- a/test/RWR/test_RWR.py +++ b/test/RWR/test_RWR.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.rwr import RWR config.init_from_file("config/config.yaml") diff --git a/test/ST_RWR/test_STRWR.py b/test/ST_RWR/test_STRWR.py index a0a5b4ea9..898b24055 100644 --- a/test/ST_RWR/test_STRWR.py +++ b/test/ST_RWR/test_STRWR.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.strwr import ST_RWR config.init_from_file("config/config.yaml") From 22b568645ef19cde8d430c1bf77a1aff5057ee84 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Wed, 9 Jul 2025 19:23:09 +0000 Subject: [PATCH 18/34] refactor: better names to schema files --- spras/config/{raw_config_analysis.py => analysis_schema.py} | 0 spras/config/config.py | 2 +- spras/config/{raw_config.py => schema.py} | 2 +- test/test_config.py | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename spras/config/{raw_config_analysis.py => analysis_schema.py} (100%) rename spras/config/{raw_config.py => schema.py} (98%) diff --git a/spras/config/raw_config_analysis.py b/spras/config/analysis_schema.py similarity index 100% rename from spras/config/raw_config_analysis.py rename to spras/config/analysis_schema.py diff --git a/spras/config/config.py b/spras/config/config.py index 2f4b44efa..293a08ec7 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -23,7 +23,7 @@ import numpy as np import yaml -from spras.config.raw_config import Analysis, ContainerFramework, RawConfig +from spras.config.schema import Analysis, ContainerFramework, RawConfig from spras.util import NpHashEncoder, hash_params_sha1_base32 config = None diff --git a/spras/config/raw_config.py b/spras/config/schema.py similarity index 98% rename from spras/config/raw_config.py rename to spras/config/schema.py index 409b48427..f882e5382 100644 --- a/spras/config/raw_config.py +++ b/spras/config/schema.py @@ -7,7 +7,7 @@ from pydantic import AfterValidator, BaseModel, ConfigDict, Field -from spras.config.raw_config_analysis import Analysis +from spras.config.analysis_schema import Analysis from spras.config.util_enum import CaseInsensitiveEnum # The default length of the truncated hash used to identify parameter combinations diff --git a/test/test_config.py b/test/test_config.py index 84d7d1d54..6095ad145 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -4,7 +4,7 @@ import pytest import spras.config.config as config -from spras.config.raw_config import DEFAULT_HASH_LENGTH +from spras.config.schema import DEFAULT_HASH_LENGTH filler_dataset_data: 
dict[str, str | list[str]] = { "data_dir": "fake", From ea59e4cec7d5197fd394f3704a5406facaa6f1a9 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Fri, 11 Jul 2025 21:56:42 +0000 Subject: [PATCH 19/34] fix: no default include, mention model_config allow reason --- spras/config/schema.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spras/config/schema.py b/spras/config/schema.py index f882e5382..d1b86e06d 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -34,10 +34,12 @@ class ContainerRegistry(BaseModel): model_config = ConfigDict(extra='forbid') class AlgorithmParams(BaseModel): - include: bool = False + include: bool directed: Optional[bool] = None - # TODO: use array of runs instead + # TODO: use array of runs instead. We currently rely on the + # extra parameters here to extract the algorithm parameter information, + # which is why this deviates from the usual ConfigDict(extra='forbid'). model_config = ConfigDict(extra='allow') class Algorithm(BaseModel): From fa7d7c984b1e91454926fb0ffa1b51ecdc2377b5 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Fri, 11 Jul 2025 15:10:01 -0700 Subject: [PATCH 20/34] fix(config): case-insensitive check on labels --- spras/config/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index 293a08ec7..c6ac8f8e0 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -127,8 +127,8 @@ def process_datasets(self, raw_config: RawConfig): self.datasets = {} for dataset in raw_config.datasets: label = dataset.label - if label in self.datasets: - raise ValueError(f"Datasets must have unique labels, but the label {label} appears at least twice.") + if label.lower() in [key.lower() for key in self.datasets.keys()]: + raise ValueError(f"Datasets must have unique case-insensitive labels, but the label {label} appears at least twice.") self.datasets[label] = dict(dataset) # parse gold standard information From 52eab214a9946a58f33ab34847da2119e2be6807 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 16:23:15 +0000 Subject: [PATCH 21/34] refactor: merge config --- spras/config/analysis_schema.py | 49 ----------------------------- spras/config/schema.py | 56 ++++++++++++++++++++++++++++++++- spras/config/util_enum.py | 4 +++ 3 files changed, 59 insertions(+), 50 deletions(-) delete mode 100644 spras/config/analysis_schema.py diff --git a/spras/config/analysis_schema.py b/spras/config/analysis_schema.py deleted file mode 100644 index dbec5f1b9..000000000 --- a/spras/config/analysis_schema.py +++ /dev/null @@ -1,49 +0,0 @@ -from pydantic import BaseModel, ConfigDict - -from spras.config.util_enum import CaseInsensitiveEnum - - -class SummaryAnalysis(BaseModel): - include: bool - - model_config = ConfigDict(extra='forbid') - -class CytoscapeAnalysis(BaseModel): - include: bool - - model_config = ConfigDict(extra='forbid') - -class MlLinkage(CaseInsensitiveEnum): - ward = 'ward' - complete = 'complete' - average = 'average' - single = 'single' - -class MlMetric(CaseInsensitiveEnum): - 
euclidean = 'euclidean' - manhattan = 'manhattan' - cosine = 'cosine' - -class MlAnalysis(BaseModel): - include: bool - aggregate_per_algorithm: bool = False - components: int = 2 - labels: bool = True - linkage: MlLinkage = MlLinkage.ward - metric: MlMetric = MlMetric.euclidean - - model_config = ConfigDict(extra='forbid') - -class EvaluationAnalysis(BaseModel): - include: bool - aggregate_per_algorithm: bool = False - - model_config = ConfigDict(extra='forbid') - -class Analysis(BaseModel): - summary: SummaryAnalysis = SummaryAnalysis(include=False) - cytoscape: CytoscapeAnalysis = CytoscapeAnalysis(include=False) - ml: MlAnalysis = MlAnalysis(include=False) - evaluation: EvaluationAnalysis = EvaluationAnalysis(include=False) - - model_config = ConfigDict(extra='forbid') diff --git a/spras/config/schema.py b/spras/config/schema.py index d1b86e06d..f4cad554c 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -1,5 +1,13 @@ """ Contains the raw pydantic schema for the configuration file. + +Using Pydantic as our backing config parser allows us to declaratively +type our config, giving us more robust user errors with guarantees +that parts of the config exist after parsing it through Pydantic. 
+ +We declare models using two classes here: +- `BaseModel` (docs: https://docs.pydantic.dev/latest/api/base_model/) +- `CaseInsensitiveEnum` (see ./util_enum.py) """ import re @@ -7,9 +15,55 @@ from pydantic import AfterValidator, BaseModel, ConfigDict, Field -from spras.config.analysis_schema import Analysis from spras.config.util_enum import CaseInsensitiveEnum + +class SummaryAnalysis(BaseModel): + include: bool + + model_config = ConfigDict(extra='forbid') + +class CytoscapeAnalysis(BaseModel): + include: bool + + model_config = ConfigDict(extra='forbid') + +class MlLinkage(CaseInsensitiveEnum): + ward = 'ward' + complete = 'complete' + average = 'average' + single = 'single' + +class MlMetric(CaseInsensitiveEnum): + euclidean = 'euclidean' + manhattan = 'manhattan' + cosine = 'cosine' + +class MlAnalysis(BaseModel): + include: bool + aggregate_per_algorithm: bool = False + components: int = 2 + labels: bool = True + linkage: MlLinkage = MlLinkage.ward + metric: MlMetric = MlMetric.euclidean + + model_config = ConfigDict(extra='forbid') + +class EvaluationAnalysis(BaseModel): + include: bool + aggregate_per_algorithm: bool = False + + model_config = ConfigDict(extra='forbid') + +class Analysis(BaseModel): + summary: SummaryAnalysis = SummaryAnalysis(include=False) + cytoscape: CytoscapeAnalysis = CytoscapeAnalysis(include=False) + ml: MlAnalysis = MlAnalysis(include=False) + evaluation: EvaluationAnalysis = EvaluationAnalysis(include=False) + + model_config = ConfigDict(extra='forbid') + + # The default length of the truncated hash used to identify parameter combinations DEFAULT_HASH_LENGTH = 7 diff --git a/spras/config/util_enum.py b/spras/config/util_enum.py index 3e73eda98..b7680222b 100644 --- a/spras/config/util_enum.py +++ b/spras/config/util_enum.py @@ -4,6 +4,10 @@ # https://stackoverflow.com/a/76883868/7589775 class CaseInsensitiveEnum(str, Enum): + """ + We prefer this over Enum to make sure the config parsing + is more relaxed when it comes to 
string enum values. + """ @classmethod def _missing_(cls, value: Any): if isinstance(value, str): From 3c305f4b5b4c69e2eec44eee2bc8d2e6a1a92cfe Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 16:44:39 +0000 Subject: [PATCH 22/34] docs: use concepts link --- spras/config/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/schema.py b/spras/config/schema.py index f4cad554c..63fe1b613 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -6,7 +6,7 @@ that parts of the config exist after parsing it through Pydantic. We declare models using two classes here: -- `BaseModel` (docs: https://docs.pydantic.dev/latest/api/base_model/) +- `BaseModel` (docs: https://docs.pydantic.dev/latest/concepts/models/) - `CaseInsensitiveEnum` (see ./util_enum.py) """ From 49e50a03e85a82949e7913bb66a34c70195cb70a Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 18:13:59 +0000 Subject: [PATCH 23/34] refactor: mv util_enum -> util --- spras/config/schema.py | 2 +- spras/config/{util_enum.py => util.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename spras/config/{util_enum.py => util.py} (100%) diff --git a/spras/config/schema.py b/spras/config/schema.py index 63fe1b613..10991ad2f 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -15,7 +15,7 @@ from pydantic import AfterValidator, BaseModel, ConfigDict, Field -from spras.config.util_enum import CaseInsensitiveEnum +from spras.config.util import CaseInsensitiveEnum class SummaryAnalysis(BaseModel): diff --git a/spras/config/util_enum.py b/spras/config/util.py similarity index 100% rename from spras/config/util_enum.py rename to spras/config/util.py From cb28f61396c90415611ae970f4d7fdd7b924519b Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Mon, 14 Jul 2025 11:14:19 -0700 Subject: [PATCH 24/34] docs: correct util_enum path --- spras/config/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/schema.py b/spras/config/schema.py index 10991ad2f..623c9dd9b 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -7,7 +7,7 @@ We declare models using two classes here: - `BaseModel` (docs: https://docs.pydantic.dev/latest/concepts/models/) -- `CaseInsensitiveEnum` (see ./util_enum.py) +- `CaseInsensitiveEnum` (see ./util.py) """ import re From 2c938ed09708359ebaaf82733646868998df759f Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 16:05:36 -0700 Subject: [PATCH 25/34] fix: add spras.config to pyproject --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index bedbe1628..b18ef12c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,4 +74,4 @@ select = [ # py-modules tells setuptools which directory is our actual module py-modules = ["spras"] # packages tells setuptools what the exported package is called (ie allows import spras) -packages = ["spras", "spras.analysis"] +packages = ["spras", "spras.analysis", "spras.config"] From ebcf6b0e6bc1cf04ad318ac5a1a7305aeffb3686 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 16 Jul 2025 14:43:27 -0700 Subject: [PATCH 26/34] docs: mention `args` in contributing --- CONTRIBUTING.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ffb138c14..8b9fee1f5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -110,6 +110,9 @@ Use `pathlinker.py` as an example. Call the new class within `local_neighborhood.py` `LocalNeighborhood` and set `__all__` so the class can be [imported](https://docs.python.org/3/tutorial/modules.html#importing-from-a-package). Specify the list of `required_input` files to be `network` and `nodes`. 
These entries are used to tell Snakemake what input files should be present before running the Local Neighborhood algorithm. +You also need to implement an `args` class with pydantic. Since `LocalNeighborhood` +takes no additional arguments, use the `Empty` arguments class (see `AllPairs` for an example), and make sure to specify it as a generic when writing a PRM class. This means your +class declaration should look something like `class LocalNeighborhood(PRM[Empty]):` Before implementing the `generate_inputs` function, explore the structure of the `Dataset` class interactively. In an interactive Python session, run the following commands to load the `data0` dataset and explore the nodes and interactome. From 4733b954305e52640053f6e8c94298263a55cded Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 16 Jul 2025 14:46:33 -0700 Subject: [PATCH 27/34] Revert "docs: mention `args` in contributing" This reverts commit ebcf6b0e6bc1cf04ad318ac5a1a7305aeffb3686. --- CONTRIBUTING.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8b9fee1f5..ffb138c14 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -110,9 +110,6 @@ Use `pathlinker.py` as an example. Call the new class within `local_neighborhood.py` `LocalNeighborhood` and set `__all__` so the class can be [imported](https://docs.python.org/3/tutorial/modules.html#importing-from-a-package). Specify the list of `required_input` files to be `network` and `nodes`. These entries are used to tell Snakemake what input files should be present before running the Local Neighborhood algorithm. -You also need to implement an `args` class with pydantic. Since `LocalNeighborhood` -takes no additional arguments, use the `Empty` arguments class (see `AllPairs` for an example), and make sure to specify it as a generic when writing a PRM class. 
This means your -class declaration should look something like `class LocalNeighborhood(PRM[Empty]):` Before implementing the `generate_inputs` function, explore the structure of the `Dataset` class interactively. In an interactive Python session, run the following commands to load the `data0` dataset and explore the nodes and interactome. From fa51d79c683ea8cdc09bfc40ab2c1d5df849919c Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 17 Jul 2025 19:45:00 +0000 Subject: [PATCH 28/34] docs: document some pydantic choices --- spras/config/schema.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/spras/config/schema.py b/spras/config/schema.py index 623c9dd9b..07cf1047d 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -17,10 +17,15 @@ from spras.config.util import CaseInsensitiveEnum +# Most options here have an `include` property, +# which is meant to make disabling parts of the configuration easier. +# When an option does not have a default, it means that it must be set by the user. class SummaryAnalysis(BaseModel): include: bool + # We prefer to never allow extra keys, to prevent + # any user mistypes. model_config = ConfigDict(extra='forbid') class CytoscapeAnalysis(BaseModel): @@ -28,6 +33,9 @@ class CytoscapeAnalysis(BaseModel): model_config = ConfigDict(extra='forbid') +# Note that CaseInsensitiveEnum is not pydantic: pydantic +# has special support for enums, but we avoid the +# pydantic-specific "model_config" key here for this reason. class MlLinkage(CaseInsensitiveEnum): ward = 'ward' complete = 'complete' @@ -68,6 +76,10 @@ class Analysis(BaseModel): DEFAULT_HASH_LENGTH = 7 def label_validator(name: str): + """ + A validator takes in a label + and ensure that it's alphanumeric (with underscores). 
+ """ label_pattern = r'^\w+$' def validate(label: str): if not bool(re.match(label_pattern, label)): @@ -103,6 +115,9 @@ class Algorithm(BaseModel): model_config = ConfigDict(extra='forbid') class Dataset(BaseModel): + # We prefer AfterValidator here to allow pydantic to run its own + # validation & coercion logic before we check it against our own + # requirements label: Annotated[str, AfterValidator(label_validator("Dataset"))] node_files: list[str] edge_files: list[str] @@ -124,6 +139,7 @@ class Locations(BaseModel): model_config = ConfigDict(extra='forbid') +# NOTE: This setting doesn't have any uses past setting the output_dir as of now. class ReconstructionSettings(BaseModel): locations: Locations @@ -135,9 +151,8 @@ class RawConfig(BaseModel): unpack_singularity: bool = False container_registry: ContainerRegistry - hash_length: int = Field( - description="The length of the hash used to identify a parameter combination", - default=DEFAULT_HASH_LENGTH) + hash_length: int = DEFAULT_HASH_LENGTH + "The length of the hash used to identify a parameter combination" algorithms: list[Algorithm] datasets: list[Dataset] @@ -146,4 +161,6 @@ class RawConfig(BaseModel): reconstruction_settings: ReconstructionSettings - model_config = ConfigDict(extra='forbid') + # We include use_attribute_docstrings here to preserve the docstrings + # after attributes at runtime (for future JSON schema generation) + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) From 1bef8c763c8acf8695e0b4a1cd83297b1550f25b Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Thu, 24 Jul 2025 13:09:29 -0700 Subject: [PATCH 29/34] fix: add defaults for kde and remove_empty_pathways --- spras/config/schema.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spras/config/schema.py b/spras/config/schema.py index 07cf1047d..0a03a05f6 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -52,6 +52,8 @@ class MlAnalysis(BaseModel): aggregate_per_algorithm: bool = False components: int = 2 labels: bool = True + kde: bool = False + remove_empty_pathways: bool = False linkage: MlLinkage = MlLinkage.ward metric: MlMetric = MlMetric.euclidean From 593206cbe731f612f453f2e4570667a85e6b25b8 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 24 Jul 2025 13:09:36 -0700 Subject: [PATCH 30/34] style: fmt --- spras/config/config.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index 354b9f676..11208187c 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -15,7 +15,6 @@ import copy as copy import itertools as it import os -import re import warnings from collections.abc import Iterable from typing import Any @@ -23,7 +22,7 @@ import numpy as np import yaml -from spras.config.schema import Analysis, ContainerFramework, RawConfig +from spras.config.schema import ContainerFramework, RawConfig from spras.util import NpHashEncoder, hash_params_sha1_base32 config = None @@ -275,7 +274,7 @@ def process_analysis(self, raw_config: RawConfig): # Only run Evaluation per algorithm if ML per algorithm is set to True if not self.analysis_include_ml_aggregate_algo: self.analysis_include_evaluation_aggregate_algo = False - + # Set kde to True if Evaluation is set to True # When Evaluation is True, PCA is used to pick a single parameter combination for all algorithms with multiple # parameter combinations and KDE is used to choose the parameter combination in the PC space From c465c9ce6595691c53520328215172778fe20ed0 Mon Sep 17 00:00:00 2001 From: "Tristan 
F.-R." Date: Thu, 24 Jul 2025 13:38:19 -0700 Subject: [PATCH 31/34] test: update rn import --- test/ResponseNet/test_rn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/ResponseNet/test_rn.py b/test/ResponseNet/test_rn.py index 6fa09904b..6b9fe05cf 100644 --- a/test/ResponseNet/test_rn.py +++ b/test/ResponseNet/test_rn.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.responsenet import ResponseNet config.init_from_file("config/config.yaml") From 2443735c6e549042476c3225d47b8302fc109514 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 25 Jul 2025 16:14:26 -0700 Subject: [PATCH 32/34] style: typos --- spras/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/config.py b/spras/config/config.py index 11208187c..605faecf4 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -171,7 +171,7 @@ def process_algorithms(self, raw_config: RawConfig): cur_params = cur_params.__pydantic_extra__ if cur_params is None: - raise RuntimeError("An internal error occured: ConfigDict extra should be set on AlgorithmParams.") + raise RuntimeError("An internal error occurred: ConfigDict extra should be set on AlgorithmParams.") # The algorithm has no named arguments so create a default placeholder if len(cur_params.keys()) == 0: From 18a173f9442ef9acc0b325e8417727e73d1c69b6 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 25 Jul 2025 22:26:06 -0700 Subject: [PATCH 33/34] docs: grammar --- spras/config/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/schema.py b/spras/config/schema.py index 0a03a05f6..f3a552694 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -80,7 +80,7 @@ class Analysis(BaseModel): def label_validator(name: str): """ A validator takes in a label - and ensure that it's alphanumeric (with underscores). + and ensures that it's alphanumeric (with underscores). 
""" label_pattern = r'^\w+$' def validate(label: str): From 5225616df80599ff0965632bef51914c9bf7471f Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 28 Jul 2025 08:36:28 -0700 Subject: [PATCH 34/34] docs: use nicer alphanumeric explanation Co-authored-by: Neha Talluri <78840540+ntalluri@users.noreply.github.com> --- spras/config/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/schema.py b/spras/config/schema.py index f3a552694..c84ea4384 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -80,7 +80,7 @@ class Analysis(BaseModel): def label_validator(name: str): """ A validator takes in a label - and ensures that it's alphanumeric (with underscores). + and ensures that it contains only letters, numbers, or underscores. """ label_pattern = r'^\w+$' def validate(label: str):