From c2e64f7eb0c6fbe0ab15b1e8f0707a2c63bbec47 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 25 Jun 2025 12:47:04 -0700 Subject: [PATCH 01/34] feat: begin config refactor --- config/config.yaml | 6 +- config/egfr-param-tuning.yaml | 1 - config/egfr.yaml | 1 - docker-wrappers/SPRAS/example_config.yaml | 2 - environment.yml | 1 + pyproject.toml | 1 + spras/{ => config}/config.py | 169 +++++++++++----------- spras/config/raw_config.py | 64 ++++++++ spras/containers.py | 2 +- test/AllPairs/test_ap.py | 2 +- test/DOMINO/test_domino.py | 2 +- test/LocalNeighborhood/test_ln.py | 2 +- test/MEO/test_meo.py | 2 +- test/MinCostFlow/test_mcf.py | 2 +- test/OmicsIntegrator1/test_oi1.py | 2 +- test/OmicsIntegrator2/test_oi2.py | 2 +- test/PathLinker/test_pathlinker.py | 2 +- test/analysis/input/config.yaml | 1 - test/analysis/input/egfr.yaml | 1 - test/analysis/test_cytoscape.py | 2 +- test/analysis/test_summary.py | 2 +- test/test_config.py | 2 +- test/test_util.py | 2 +- 23 files changed, 169 insertions(+), 104 deletions(-) rename spras/{ => config}/config.py (80%) create mode 100644 spras/config/raw_config.py diff --git a/config/config.yaml b/config/config.yaml index 1f246dd15..3179dfedc 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -63,6 +63,11 @@ algorithms: - name: "omicsintegrator2" params: include: true + runs: + - b: 4 + g: 0 + - b: 2 + g: 3 run1: b: 4 g: 0 @@ -144,7 +149,6 @@ reconstruction_settings: # TODO move to global reconstruction_dir: "output" - run: true analysis: # Create one summary per pathway file and a single summary table for all pathways for each dataset diff --git a/config/egfr-param-tuning.yaml b/config/egfr-param-tuning.yaml index a0a965b70..50a4788a2 100644 --- a/config/egfr-param-tuning.yaml +++ b/config/egfr-param-tuning.yaml @@ -3440,7 +3440,6 @@ gold_standards: reconstruction_settings: locations: reconstruction_dir: output/tps_egfr - run: true analysis: summary: include: true diff --git a/config/egfr.yaml b/config/egfr.yaml 
index b8c5138b8..cea3ad54b 100644 --- a/config/egfr.yaml +++ b/config/egfr.yaml @@ -74,7 +74,6 @@ datasets: reconstruction_settings: locations: reconstruction_dir: output/egfr - run: true analysis: graphspace: include: false diff --git a/docker-wrappers/SPRAS/example_config.yaml b/docker-wrappers/SPRAS/example_config.yaml index f7fd74e98..9791c2f23 100644 --- a/docker-wrappers/SPRAS/example_config.yaml +++ b/docker-wrappers/SPRAS/example_config.yaml @@ -123,8 +123,6 @@ reconstruction_settings: # TODO move to global reconstruction_dir: "output" - run: true - analysis: # Create one summary per pathway file and a single summary table for all pathways for each dataset summary: diff --git a/environment.yml b/environment.yml index 6694b9812..7d14e3ea4 100644 --- a/environment.yml +++ b/environment.yml @@ -8,6 +8,7 @@ dependencies: - matplotlib=3.6 - networkx=2.8 - pandas=1.5 + - pydantic=2.11.7 - numpy=1.26.4 - pre-commit=2.20 # Only required for development - pytest=8.0 # Only required for development diff --git a/pyproject.toml b/pyproject.toml index 3e90f7b1e..27dee2693 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "matplotlib==3.6", "networkx==2.8", "pandas==1.5", + "pydantic==2.11.7", "numpy==1.26.4", "pip==22.1", "requests==2.28", diff --git a/spras/config.py b/spras/config/config.py similarity index 80% rename from spras/config.py rename to spras/config/config.py index 7bbf9cd1b..f6c5ada8b 100644 --- a/spras/config.py +++ b/spras/config/config.py @@ -16,19 +16,19 @@ import itertools as it import os import re +import warnings from collections.abc import Iterable import numpy as np import yaml from spras.util import NpHashEncoder, hash_params_sha1_base32 - -# The default length of the truncated hash used to identify parameter combinations -DEFAULT_HASH_LENGTH = 7 -DEFAULT_CONTAINER_PREFIX = "docker.io/reedcompbio" +from spras.config.raw_config import ContainerFramework, RawConfig, DEFAULT_HASH_LENGTH config = None 
+DEFAULT_CONTAINER_PREFIX = "docker.io/reedcompbio" + # This will get called in the Snakefile, instantiating the singleton with the raw config def init_global(config_dict): global config @@ -43,11 +43,9 @@ def init_from_file(filepath): with open(filepath, 'r') as yaml_file: config_dict = yaml.safe_load(yaml_file) except FileNotFoundError: - print(f"Error: The specified config '{filepath}' could not be found.") - return False + raise RuntimeError(f"Error: The specified config '{filepath}' could not be found.") except yaml.YAMLError as e: - print(f"Error: Failed to parse config '{filepath}': {e}") - return False + raise RuntimeError(f"Error: Failed to parse config '{filepath}': {e}") # And finally, initialize config = Config(config_dict) @@ -55,18 +53,15 @@ def init_from_file(filepath): class Config: def __init__(self, raw_config): - # Since process_config winds up modifying the raw_config passed to it as a side effect, - # we'll make a deep copy here to guarantee we don't break anything. This preserves the - # config as it's given to the Snakefile by Snakemake + # Member vars populated by process_config. Any values that don't have sensible initial values are set to None + # before they are populated for __init__ to show exactly what is being configured. - # Member vars populated by process_config. Set to None before they are populated so that our - # __init__ makes clear exactly what is being configured. # Directory used for storing output self.out_dir = None # Container framework used by PRMs. Valid options are "docker", "dsub", and "singularity" self.container_framework = None # The container prefix (host and organization) to use for images. Default is "docker.io/reedcompbio" - self.container_prefix = DEFAULT_CONTAINER_PREFIX + self.container_prefix: str = DEFAULT_CONTAINER_PREFIX # A Boolean specifying whether to unpack singularity containers. 
Default is False self.unpack_singularity = False # A dictionary to store configured datasets against which SPRAS will be run @@ -74,7 +69,7 @@ def __init__(self, raw_config): # A dictionary to store configured gold standard data against output of SPRAS runs self.gold_standards = None # The hash length SPRAS will use to identify parameter combinations. Default is 7 - self.hash_length = DEFAULT_HASH_LENGTH + self.hash_length: int = DEFAULT_HASH_LENGTH # The list of algorithms to run in the workflow. Each is a dict with 'name' as an expected key. self.algorithms = None # A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations. @@ -107,44 +102,24 @@ def __init__(self, raw_config): # A Boolean specifying whether to run the evaluation per algorithm analysis self.analysis_include_evaluation_aggregate_algo = None - _raw_config = copy.deepcopy(raw_config) - self.process_config(_raw_config) - - def process_config(self, raw_config): + # Since snakemake provides an empty config, we provide this + # wrapper error first before passing validation to pydantic. if raw_config == {}: raise ValueError("Config file cannot be empty. Use --configfile to set a config file.") - # Set up a few top-level config variables - self.out_dir = raw_config["reconstruction_settings"]["locations"]["reconstruction_dir"] - - # We allow the container framework not to be defined in the config. In the case it isn't, default to docker. - # However, if we get a bad value, we raise an exception. - if "container_framework" in raw_config: - container_framework = raw_config["container_framework"].lower() - if container_framework not in ("docker", "singularity", "dsub"): - msg = "SPRAS was configured to run with an unknown container framework: '" + raw_config["container_framework"] + "'. Accepted values are 'docker', 'singularity' or 'dsub'." 
- raise ValueError(msg) - if container_framework == "dsub": - print("Warning: 'dsub' framework integration is experimental and may not be fully supported.") - self.container_framework = container_framework - else: - self.container_framework = "docker" - - # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container. - if "unpack_singularity" in raw_config: - # The value in the config is a string, and we need to convert it to a bool. - unpack_singularity = raw_config["unpack_singularity"] - if unpack_singularity and self.container_framework != "singularity": - print("Warning: unpack_singularity is set to True, but the container framework is not singularity. This setting will have no effect.") - self.unpack_singularity = unpack_singularity - - # Grab registry from the config, and if none is provided default to docker - if "container_registry" in raw_config and raw_config["container_registry"]["base_url"] != "" and raw_config["container_registry"]["owner"] != "": - self.container_prefix = raw_config["container_registry"]["base_url"] + "/" + raw_config["container_registry"]["owner"] - - # Parse dataset information - # Datasets is initially a list, where each list entry has a dataset label and lists of input files - # Convert the dataset list into a dict where the label is the key and update the config data structure + # Since process_config winds up modifying the raw_config passed to it as a side effect, + # we'll make a deep copy here to guarantee we don't break anything. 
This preserves the + # config as it's given to the Snakefile by Snakemake + _raw_config = copy.deepcopy(raw_config) + parsed_raw_config = RawConfig.model_validate_json(_raw_config) + self.process_config(parsed_raw_config) + + def process_datasets(self, raw_config: RawConfig): + """ + Parse dataset information + Datasets is initially a list, where each list entry has a dataset label and lists of input files + Convert the dataset list into a dict where the label is the key and update the config data structure + """ # TODO allow labels to be optional and assign default labels # TODO check for collisions in dataset labels, warn, and make the labels unique # Need to work more on input file naming to make less strict assumptions @@ -152,24 +127,20 @@ def process_config(self, raw_config): # Currently assumes all datasets have a label and the labels are unique # When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts # Convert to dicts to simplify the yaml logging - self.datasets = {dataset["label"]: dict(dataset) for dataset in raw_config["datasets"]} - - for key in self.datasets: - pattern = r'^\w+$' - if not bool(re.match(pattern, key)): - raise ValueError(f"Dataset label \'{key}\' contains invalid values. Dataset labels can only contain letters, numbers, or underscores.") + self.datasets = {} + for dataset in raw_config.datasets: + label = dataset.label + if label in self.datasets: + raise ValueError(f"Datasets must have unique labels, but the label {label} appears at least twice.") + self.datasets[label] = dict(dataset) + + # Validate dataset labels + label_pattern = r'^\w+$' + if not bool(re.match(label_pattern, label)): + raise ValueError(f"Dataset label '{label}' contains invalid values. 
Dataset labels can only contain letters, numbers, or underscores.") # parse gold standard information - try: - self.gold_standards = {gold_standard["label"]: dict(gold_standard) for gold_standard in raw_config["gold_standards"]} - except: - self.gold_standards = {} - - # check that gold_standard labels are formatted correctly - for key in self.gold_standards: - pattern = r'^\w+$' - if not bool(re.match(pattern, key)): - raise ValueError(f"Gold standard label \'{key}\' contains invalid values. Gold standard labels can only contain letters, numbers, or underscores.") + self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards} # check that all the dataset labels in the gold standards are existing datasets labels dataset_labels = set(self.datasets.keys()) @@ -182,33 +153,30 @@ def process_config(self, raw_config): # dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)] # Maps from the dataset label to the dataset list index # dataset_dict = {dataset.get('label', f'dataset{index}'): index for index, dataset in enumerate(datasets)} - - # Override the default parameter hash length if specified in the config file - if "hash_length" in raw_config and raw_config["hash_length"] != "": - self.hash_length = int(raw_config["hash_length"]) - + + def process_algorithms(self, raw_config: RawConfig): + """ + Parse algorithm information + Each algorithm's parameters are provided as a list of dictionaries + Defaults are handled in the Python function or class that wraps + running that algorithm + Keys in the parameter dictionary are strings + """ prior_params_hashes = set() - - # Parse algorithm information - # Each algorithm's parameters are provided as a list of dictionaries - # Defaults are handled in the Python function or class that wraps - # running that algorithm - # Keys in the parameter dictionary are strings self.algorithm_params = dict() self.algorithm_directed = dict() - 
self.algorithms = raw_config["algorithms"] + self.algorithms = raw_config.algorithms for alg in self.algorithms: - cur_params = alg["params"] - if "include" in cur_params and cur_params.pop("include"): + cur_params = alg.params + if cur_params.include: # This dict maps from parameter combinations hashes to parameter combination dictionaries - self.algorithm_params[alg["name"]] = dict() + self.algorithm_params[alg.name] = dict() else: # Do not parse the rest of the parameters for this algorithm if it is not included continue - if "directed" in cur_params: - print("UPDATE: we no longer use the directed key in the config file") - cur_params.pop("directed") + if cur_params.directed != None: + warnings.warn("UPDATE: we no longer use the directed key in the config file") # The algorithm has no named arguments so create a default placeholder if len(cur_params) == 0: @@ -265,6 +233,39 @@ def process_config(self, raw_config): f'(current length {self.hash_length}).') self.algorithm_params[alg["name"]][params_hash] = run_dict + def process_config(self, raw_config: RawConfig): + # Set up a few top-level config variables + self.out_dir = raw_config.reconstruction_settings.locations.reconstruction_dir + + # We allow the container framework not to be defined in the config. In the case it isn't, default to docker. + # However, if we get a bad value, we raise an exception. + if raw_config.container_framework != None: + container_framework = raw_config.container_framework + if container_framework == ContainerFramework.dsub: + warnings.warn("'dsub' framework integration is experimental and may not be fully supported.") + self.container_framework = container_framework + else: + self.container_framework = "docker" + + # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container. + if raw_config.unpack_singularity: + # The value in the config is a string, and we need to convert it to a bool. 
+ unpack_singularity = raw_config["unpack_singularity"] + if unpack_singularity and self.container_framework != "singularity": + warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. This setting will have no effect.") + self.unpack_singularity = unpack_singularity + + # Grab registry from the config, and if none is provided default to docker + if raw_config.container_registry and raw_config["container_registry"]["base_url"] != "" and raw_config["container_registry"]["owner"] != "": + self.container_prefix = raw_config["container_registry"]["base_url"] + "/" + raw_config["container_registry"]["owner"] + + # Override the default parameter hash length if specified in the config file + if "hash_length" in raw_config and raw_config["hash_length"] != "": + self.hash_length = int(raw_config["hash_length"]) + + self.process_datasets(raw_config) + self.process_algorithms(raw_config) + self.analysis_params = raw_config["analysis"] if "analysis" in raw_config else {} self.ml_params = self.analysis_params["ml"] if "ml" in self.analysis_params else {} self.evaluation_params = self.analysis_params["evaluation"] if "evaluation" in self.analysis_params else {} diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py new file mode 100644 index 000000000..1810bf0e5 --- /dev/null +++ b/spras/config/raw_config.py @@ -0,0 +1,64 @@ +""" +Contains the raw pydantic schema for the configuration file. 
+""" + +from enum import Enum +from pydantic import BaseModel, ConfigDict, Field +from typing import Optional + +# The default length of the truncated hash used to identify parameter combinations +DEFAULT_HASH_LENGTH = 7 + +class ContainerFramework(str, Enum): + docker = 'docker' + # TODO: add apptainer variant once #260 gets merged + singularity = 'singularity' + dsub = 'dsub' + +class ContainerRegistry(BaseModel): + base_url: str + owner: str = Field(description="The owner or project of the registry") + +class AlgorithmParams(BaseModel): + include: bool = Field(default=False) + directed: Optional[bool] + # TODO + +class Algorithm(BaseModel): + name: str + params: AlgorithmParams + +class Dataset(BaseModel): + label: str + node_files: list[str] + edge_files: list[str] + other_files: list[str] + data_dir: str + +class GoldStandard(BaseModel): + label: str + node_files: list[str] + data_dir: str + dataset_labels: list[str] + +class Locations(BaseModel): + reconstruction_dir: str + +class ReconstructionSettings(BaseModel): + locations: Locations + +class RawConfig(BaseModel): + # TODO: move this to nested container key + container_framework: Optional[ContainerFramework] + unpack_singularity: bool = Field(default=False) + container_registry: ContainerRegistry + + hash_length: Optional[int] = Field( + description="The length of the hash used to identify a parameter combination", + default=DEFAULT_HASH_LENGTH) + + algorithms: list[Algorithm] + datasets: list[Dataset] + gold_standards: list[GoldStandard] = Field(default=[]) + + reconstruction_settings: ReconstructionSettings diff --git a/spras/containers.py b/spras/containers.py index a1fda05f2..3e6c7c3fc 100644 --- a/spras/containers.py +++ b/spras/containers.py @@ -7,7 +7,7 @@ import docker -import spras.config as config +import spras.config.config as config from spras.logging import indent from spras.util import hash_filename diff --git a/test/AllPairs/test_ap.py b/test/AllPairs/test_ap.py index 442b26a73..31dd612d9 
100644 --- a/test/AllPairs/test_ap.py +++ b/test/AllPairs/test_ap.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.allpairs import AllPairs # Note that we don't directly use the config in the test, but we need the config diff --git a/test/DOMINO/test_domino.py b/test/DOMINO/test_domino.py index 7f09fa975..4323ea4c9 100644 --- a/test/DOMINO/test_domino.py +++ b/test/DOMINO/test_domino.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.domino import DOMINO, post_domino_id_transform, pre_domino_id_transform config.init_from_file("config/config.yaml") diff --git a/test/LocalNeighborhood/test_ln.py b/test/LocalNeighborhood/test_ln.py index fbee54902..9093efc68 100644 --- a/test/LocalNeighborhood/test_ln.py +++ b/test/LocalNeighborhood/test_ln.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config config.init_from_file("config/config.yaml") diff --git a/test/MEO/test_meo.py b/test/MEO/test_meo.py index e2abdb72d..32958be20 100644 --- a/test/MEO/test_meo.py +++ b/test/MEO/test_meo.py @@ -3,7 +3,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.meo import MEO, write_properties config.init_from_file("config/config.yaml") diff --git a/test/MinCostFlow/test_mcf.py b/test/MinCostFlow/test_mcf.py index 89bd61d0b..c777a665d 100644 --- a/test/MinCostFlow/test_mcf.py +++ b/test/MinCostFlow/test_mcf.py @@ -3,7 +3,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.mincostflow import MinCostFlow config.init_from_file("config/config.yaml") diff --git a/test/OmicsIntegrator1/test_oi1.py b/test/OmicsIntegrator1/test_oi1.py index 35b41d428..a484c0af3 100644 --- a/test/OmicsIntegrator1/test_oi1.py +++ b/test/OmicsIntegrator1/test_oi1.py @@ -3,7 +3,7 @@ import pytest -import spras.config as config +import spras.config.config 
as config from spras.omicsintegrator1 import OmicsIntegrator1, write_conf config.init_from_file("config/config.yaml") diff --git a/test/OmicsIntegrator2/test_oi2.py b/test/OmicsIntegrator2/test_oi2.py index 2a0a3e3c1..13f7f30b6 100644 --- a/test/OmicsIntegrator2/test_oi2.py +++ b/test/OmicsIntegrator2/test_oi2.py @@ -3,7 +3,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.omicsintegrator2 import OmicsIntegrator2 config.init_from_file("config/config.yaml") diff --git a/test/PathLinker/test_pathlinker.py b/test/PathLinker/test_pathlinker.py index 3fd6a96bd..ed9f10670 100644 --- a/test/PathLinker/test_pathlinker.py +++ b/test/PathLinker/test_pathlinker.py @@ -3,7 +3,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.pathlinker import PathLinker config.init_from_file("config/config.yaml") diff --git a/test/analysis/input/config.yaml b/test/analysis/input/config.yaml index 833e6c4bb..49879e461 100644 --- a/test/analysis/input/config.yaml +++ b/test/analysis/input/config.yaml @@ -102,7 +102,6 @@ reconstruction_settings: locations: #place the save path here reconstruction_dir: "output" - run: true analysis: # Create one summary per pathway file and a single summary table for all pathways for each dataset diff --git a/test/analysis/input/egfr.yaml b/test/analysis/input/egfr.yaml index 1ddac1cae..281ecb495 100644 --- a/test/analysis/input/egfr.yaml +++ b/test/analysis/input/egfr.yaml @@ -91,7 +91,6 @@ datasets: reconstruction_settings: locations: reconstruction_dir: output/egfr - run: true analysis: graphspace: include: true diff --git a/test/analysis/test_cytoscape.py b/test/analysis/test_cytoscape.py index 7451b9876..68a77cd07 100644 --- a/test/analysis/test_cytoscape.py +++ b/test/analysis/test_cytoscape.py @@ -2,7 +2,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.analysis.cytoscape import run_cytoscape 
config.init_from_file("test/analysis/input/config.yaml") diff --git a/test/analysis/test_summary.py b/test/analysis/test_summary.py index 4ff5396da..0400d1f1b 100644 --- a/test/analysis/test_summary.py +++ b/test/analysis/test_summary.py @@ -3,7 +3,7 @@ import pandas as pd -import spras.config as config +import spras.config.config as config from spras.analysis.summary import summarize_networks from spras.dataset import Dataset diff --git a/test/test_config.py b/test/test_config.py index 6c773ddc0..26b18a4e9 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -3,7 +3,7 @@ import numpy as np import pytest -import spras.config as config +import spras.config.config as config # Set up a dummy config for testing. For now, only include things that MUST exist in the dict diff --git a/test/test_util.py b/test/test_util.py index baf9db0ed..2a25fc0d1 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -2,7 +2,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.containers import convert_docker_path, prepare_path_docker, prepare_volume from spras.util import hash_params_sha1_base32 From 4d1a19c54e22052d8e82f9c10bfdc791e4df5143 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Wed, 25 Jun 2025 20:49:36 +0000 Subject: [PATCH 02/34] feat: mostly structured config --- spras/config/__init__.py | 0 spras/config/config.py | 113 +++++++++++++--------------- spras/config/raw_config.py | 27 ++++++- spras/config/raw_config_analysis.py | 33 ++++++++ 4 files changed, 109 insertions(+), 64 deletions(-) create mode 100644 spras/config/__init__.py create mode 100644 spras/config/raw_config_analysis.py diff --git a/spras/config/__init__.py b/spras/config/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spras/config/config.py b/spras/config/config.py index f6c5ada8b..751c774a5 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -18,12 +18,13 @@ import re import warnings from collections.abc import Iterable +from typing import Any import numpy as np import yaml from spras.util import NpHashEncoder, hash_params_sha1_base32 -from spras.config.raw_config import ContainerFramework, RawConfig, DEFAULT_HASH_LENGTH +from spras.config.raw_config import ContainerFramework, RawConfig config = None @@ -52,12 +53,15 @@ def init_from_file(filepath): class Config: - def __init__(self, raw_config): - # Member vars populated by process_config. Any values that don't have sensible initial values are set to None + def __init__(self, raw_config: dict[str, Any]): + parsed_raw_config = RawConfig.model_validate(raw_config) + self.process_config(parsed_raw_config) + + # Member vars populated by process_config. Any values that don't have quick initial values are set to None # before they are populated for __init__ to show exactly what is being configured. # Directory used for storing output - self.out_dir = None + self.out_dir = parsed_raw_config.reconstruction_settings.locations.reconstruction_dir # Container framework used by PRMs. Valid options are "docker", "dsub", and "singularity" self.container_framework = None # The container prefix (host and organization) to use for images. 
Default is "docker.io/reedcompbio" @@ -68,8 +72,8 @@ def __init__(self, raw_config): self.datasets = None # A dictionary to store configured gold standard data against output of SPRAS runs self.gold_standards = None - # The hash length SPRAS will use to identify parameter combinations. Default is 7 - self.hash_length: int = DEFAULT_HASH_LENGTH + # The hash length SPRAS will use to identify parameter combinations. + self.hash_length = parsed_raw_config.hash_length # The list of algorithms to run in the workflow. Each is a dict with 'name' as an expected key. self.algorithms = None # A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations. @@ -106,13 +110,6 @@ def __init__(self, raw_config): # wrapper error first before passing validation to pydantic. if raw_config == {}: raise ValueError("Config file cannot be empty. Use --configfile to set a config file.") - - # Since process_config winds up modifying the raw_config passed to it as a side effect, - # we'll make a deep copy here to guarantee we don't break anything. 
This preserves the - # config as it's given to the Snakefile by Snakemake - _raw_config = copy.deepcopy(raw_config) - parsed_raw_config = RawConfig.model_validate_json(_raw_config) - self.process_config(parsed_raw_config) def process_datasets(self, raw_config: RawConfig): """ @@ -178,8 +175,12 @@ def process_algorithms(self, raw_config: RawConfig): if cur_params.directed != None: warnings.warn("UPDATE: we no longer use the directed key in the config file") + cur_params = cur_params.__pydantic_extra__ + if not cur_params: + raise RuntimeError("An internal error occured: ConfigDict extra should be set on AlgorithmParams.") + # The algorithm has no named arguments so create a default placeholder - if len(cur_params) == 0: + if len(cur_params.keys()) == 0: cur_params["run1"] = {"spras_placeholder": ["no parameters"]} # Each set of runs should be 1 level down in the config file @@ -210,7 +211,7 @@ def process_algorithms(self, raw_config: RawConfig): # Catch-all for strings obj = [obj] if not isinstance(obj, Iterable): - raise ValueError(f"The object `{obj}` in algorithm {alg['name']} at key '{p}' in run '{run_params}' is not iterable!") from None + raise ValueError(f"The object `{obj}` in algorithm {alg.name} at key '{p}' in run '{run_params}' is not iterable!") from None all_runs.append(obj) run_list_tuples = list(it.product(*all_runs)) param_name_tuple = tuple(param_name_list) @@ -231,47 +232,18 @@ def process_algorithms(self, raw_config: RawConfig): if params_hash in prior_params_hashes: raise ValueError(f'Parameter hash collision detected. Increase the hash_length in the config file ' f'(current length {self.hash_length}).') - self.algorithm_params[alg["name"]][params_hash] = run_dict - - def process_config(self, raw_config: RawConfig): - # Set up a few top-level config variables - self.out_dir = raw_config.reconstruction_settings.locations.reconstruction_dir - - # We allow the container framework not to be defined in the config. 
In the case it isn't, default to docker. - # However, if we get a bad value, we raise an exception. - if raw_config.container_framework != None: - container_framework = raw_config.container_framework - if container_framework == ContainerFramework.dsub: - warnings.warn("'dsub' framework integration is experimental and may not be fully supported.") - self.container_framework = container_framework - else: - self.container_framework = "docker" - - # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container. - if raw_config.unpack_singularity: - # The value in the config is a string, and we need to convert it to a bool. - unpack_singularity = raw_config["unpack_singularity"] - if unpack_singularity and self.container_framework != "singularity": - warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. This setting will have no effect.") - self.unpack_singularity = unpack_singularity - - # Grab registry from the config, and if none is provided default to docker - if raw_config.container_registry and raw_config["container_registry"]["base_url"] != "" and raw_config["container_registry"]["owner"] != "": - self.container_prefix = raw_config["container_registry"]["base_url"] + "/" + raw_config["container_registry"]["owner"] + self.algorithm_params[alg.name][params_hash] = run_dict - # Override the default parameter hash length if specified in the config file - if "hash_length" in raw_config and raw_config["hash_length"] != "": - self.hash_length = int(raw_config["hash_length"]) + def process_analysis(self, raw_config: RawConfig): + if not raw_config.analysis: + return - self.process_datasets(raw_config) - self.process_algorithms(raw_config) - - self.analysis_params = raw_config["analysis"] if "analysis" in raw_config else {} - self.ml_params = self.analysis_params["ml"] if "ml" in self.analysis_params else {} - self.evaluation_params = self.analysis_params["evaluation"] if 
"evaluation" in self.analysis_params else {} + self.analysis_params = raw_config.analysis + self.ml_params = self.analysis_params.ml if self.analysis_params.ml else {} + self.evaluation_params = self.analysis_params.evaluation if self.analysis_params.evaluation else {} self.pca_params = {} - if "components" in self.ml_params: + if self.ml_params.components: self.pca_params["components"] = self.ml_params["components"] if "labels" in self.ml_params: self.pca_params["labels"] = self.ml_params["labels"] @@ -282,14 +254,14 @@ def process_config(self, raw_config: RawConfig): if "metric" in self.ml_params: self.hac_params["metric"] = self.ml_params ["metric"] - self.analysis_include_summary = raw_config["analysis"]["summary"]["include"] - self.analysis_include_graphspace = raw_config["analysis"]["graphspace"]["include"] - self.analysis_include_cytoscape = raw_config["analysis"]["cytoscape"]["include"] - self.analysis_include_ml = raw_config["analysis"]["ml"]["include"] - self.analysis_include_evaluation = raw_config["analysis"]["evaluation"]["include"] + self.analysis_include_summary = raw_config.analysis.summary.include + self.analysis_include_graphspace = raw_config.analysis.graphspace.include + self.analysis_include_cytoscape = raw_config.analysis.cytoscape.include + self.analysis_include_ml = raw_config.analysis.ml.include + self.analysis_include_evaluation = raw_config.analysis.evaluation.include # Only run ML aggregate per algorithm if analysis include ML is set to True - if 'aggregate_per_algorithm' in self.ml_params and self.analysis_include_ml: + if self.ml_params.aggregate_per_algorithm and self.analysis_include_ml: self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] else: self.analysis_include_ml_aggregate_algo = False @@ -304,11 +276,32 @@ def process_config(self, raw_config: RawConfig): self.analysis_include_evaluation = False # Only run Evaluation aggregate per algorithm if analysis include ML is set to True - if 
'aggregate_per_algorithm' in self.evaluation_params and self.analysis_include_evaluation: - self.analysis_include_evaluation_aggregate_algo = raw_config["analysis"]["evaluation"]["aggregate_per_algorithm"] + if self.evaluation_params.aggregate_per_algorithm and self.analysis_include_evaluation: + self.analysis_include_evaluation_aggregate_algo = raw_config.analysis.evaluation.aggregate_per_algorithm else: self.analysis_include_evaluation_aggregate_algo = False # Only run Evaluation per algorithm if ML per algorithm is set to True if not self.analysis_include_ml_aggregate_algo: self.analysis_include_evaluation_aggregate_algo = False + + def process_config(self, raw_config: RawConfig): + # Set up a few top-level config variables + self.out_dir = raw_config.reconstruction_settings.locations.reconstruction_dir + + if raw_config.container_framework == ContainerFramework.dsub: + warnings.warn("'dsub' framework integration is experimental and may not be fully supported.") + self.container_framework = raw_config.container_framework + + # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container. + if raw_config.unpack_singularity and self.container_framework != "singularity": + warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. 
This setting will have no effect.") + self.unpack_singularity = raw_config.unpack_singularity + + # Grab registry from the config, and if none is provided default to docker + if raw_config.container_registry and raw_config.container_registry.base_url != "" and raw_config.container_registry.owner != "": + self.container_prefix = raw_config.container_registry.base_url + "/" + raw_config.container_registry.owner + + self.process_datasets(raw_config) + self.process_algorithms(raw_config) + self.process_analysis(raw_config) diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py index 1810bf0e5..76992c8f1 100644 --- a/spras/config/raw_config.py +++ b/spras/config/raw_config.py @@ -6,6 +6,8 @@ from pydantic import BaseModel, ConfigDict, Field from typing import Optional +from spras.config.raw_config_analysis import Analysis + # The default length of the truncated hash used to identify parameter combinations DEFAULT_HASH_LENGTH = 7 @@ -19,15 +21,21 @@ class ContainerRegistry(BaseModel): base_url: str owner: str = Field(description="The owner or project of the registry") + model_config = ConfigDict(extra='forbid') + class AlgorithmParams(BaseModel): include: bool = Field(default=False) directed: Optional[bool] - # TODO + + # TODO: use array of runs instead + model_config = ConfigDict(extra='allow') class Algorithm(BaseModel): name: str params: AlgorithmParams + model_config = ConfigDict(extra='forbid') + class Dataset(BaseModel): label: str node_files: list[str] @@ -35,30 +43,41 @@ class Dataset(BaseModel): other_files: list[str] data_dir: str + model_config = ConfigDict(extra='forbid') + class GoldStandard(BaseModel): label: str node_files: list[str] data_dir: str dataset_labels: list[str] + model_config = ConfigDict(extra='forbid') + class Locations(BaseModel): reconstruction_dir: str + model_config = ConfigDict(extra='forbid') + class ReconstructionSettings(BaseModel): locations: Locations + model_config = ConfigDict(extra='forbid') + class 
RawConfig(BaseModel): - # TODO: move this to nested container key - container_framework: Optional[ContainerFramework] + # TODO: move these container values to a nested container key + container_framework: ContainerFramework = Field(default=ContainerFramework.docker) unpack_singularity: bool = Field(default=False) container_registry: ContainerRegistry - hash_length: Optional[int] = Field( + hash_length: int = Field( description="The length of the hash used to identify a parameter combination", default=DEFAULT_HASH_LENGTH) algorithms: list[Algorithm] datasets: list[Dataset] gold_standards: list[GoldStandard] = Field(default=[]) + analysis: Optional[Analysis] reconstruction_settings: ReconstructionSettings + + model_config = ConfigDict(extra='forbid') diff --git a/spras/config/raw_config_analysis.py b/spras/config/raw_config_analysis.py new file mode 100644 index 000000000..8743f5969 --- /dev/null +++ b/spras/config/raw_config_analysis.py @@ -0,0 +1,33 @@ +from pydantic import BaseModel +from typing import Optional + +class SummaryAnalysis(BaseModel): + include: bool + +class GraphspaceAnalysis(BaseModel): + include: bool + +class CytoscapeAnalysis(BaseModel): + include: bool + +class MlAnalysis(BaseModel): + include: bool + aggregate_per_algorithm: bool + components: int + labels: bool + # TODO: enumify + linkage: str + # TODO: enumify + metric: str + +class EvaluationAnalysis(BaseModel): + include: bool + aggregate_per_algorithm: bool + + +class Analysis(BaseModel): + summary: Optional[SummaryAnalysis] + graphspace: Optional[GraphspaceAnalysis] + cytoscape: Optional[CytoscapeAnalysis] + ml: Optional[MlAnalysis] + evaluation: Optional[EvaluationAnalysis] From b56ecde361c9138cea993472d1f52c0adbf0dff2 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Wed, 25 Jun 2025 20:53:36 +0000 Subject: [PATCH 03/34] feat: add enum variants on ml --- config/config.yaml | 6 ------ spras/config/config.py | 8 ++++---- spras/config/raw_config.py | 3 ++- spras/config/raw_config_analysis.py | 22 +++++++++++++++++----- 4 files changed, 23 insertions(+), 16 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 3179dfedc..68c580683 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -63,11 +63,6 @@ algorithms: - name: "omicsintegrator2" params: include: true - runs: - - b: 4 - g: 0 - - b: 2 - g: 3 run1: b: 4 g: 0 @@ -149,7 +144,6 @@ reconstruction_settings: # TODO move to global reconstruction_dir: "output" - analysis: # Create one summary per pathway file and a single summary table for all pathways for each dataset summary: diff --git a/spras/config/config.py b/spras/config/config.py index 751c774a5..b6be80ef1 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -23,8 +23,8 @@ import numpy as np import yaml -from spras.util import NpHashEncoder, hash_params_sha1_base32 from spras.config.raw_config import ContainerFramework, RawConfig +from spras.util import NpHashEncoder, hash_params_sha1_base32 config = None @@ -110,7 +110,7 @@ def __init__(self, raw_config: dict[str, Any]): # wrapper error first before passing validation to pydantic. if raw_config == {}: raise ValueError("Config file cannot be empty. 
Use --configfile to set a config file.") - + def process_datasets(self, raw_config: RawConfig): """ Parse dataset information @@ -150,7 +150,7 @@ def process_datasets(self, raw_config: RawConfig): # dataset_labels = [dataset.get('label', f'dataset{index}') for index, dataset in enumerate(datasets)] # Maps from the dataset label to the dataset list index # dataset_dict = {dataset.get('label', f'dataset{index}'): index for index, dataset in enumerate(datasets)} - + def process_algorithms(self, raw_config: RawConfig): """ Parse algorithm information @@ -172,7 +172,7 @@ def process_algorithms(self, raw_config: RawConfig): # Do not parse the rest of the parameters for this algorithm if it is not included continue - if cur_params.directed != None: + if cur_params.directed is not None: warnings.warn("UPDATE: we no longer use the directed key in the config file") cur_params = cur_params.__pydantic_extra__ diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py index 76992c8f1..f60cb47c5 100644 --- a/spras/config/raw_config.py +++ b/spras/config/raw_config.py @@ -3,9 +3,10 @@ """ from enum import Enum -from pydantic import BaseModel, ConfigDict, Field from typing import Optional +from pydantic import BaseModel, ConfigDict, Field + from spras.config.raw_config_analysis import Analysis # The default length of the truncated hash used to identify parameter combinations diff --git a/spras/config/raw_config_analysis.py b/spras/config/raw_config_analysis.py index 8743f5969..7eae2a127 100644 --- a/spras/config/raw_config_analysis.py +++ b/spras/config/raw_config_analysis.py @@ -1,6 +1,9 @@ -from pydantic import BaseModel +from enum import Enum from typing import Optional +from pydantic import BaseModel + + class SummaryAnalysis(BaseModel): include: bool @@ -10,15 +13,24 @@ class GraphspaceAnalysis(BaseModel): class CytoscapeAnalysis(BaseModel): include: bool +class MlLinkage(str, Enum): + ward = 'ward' + complete = 'complete' + average = 'average' + single = 'single' + 
+class MlMetric(str, Enum): + euclidean = 'euclidean' + manhattan = 'manhattan' + cosine = 'cosine' + class MlAnalysis(BaseModel): include: bool aggregate_per_algorithm: bool components: int labels: bool - # TODO: enumify - linkage: str - # TODO: enumify - metric: str + linkage: MlLinkage + metric: MlMetric class EvaluationAnalysis(BaseModel): include: bool From bf95888ccbe00d90f124a998ab128f03b324c33f Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 25 Jun 2025 21:06:53 +0000 Subject: [PATCH 04/34] fix: some defaults --- spras/config/config.py | 6 +++--- spras/config/raw_config.py | 12 ++++++------ spras/config/raw_config_analysis.py | 18 +++++++++--------- test/test_config.py | 1 - 4 files changed, 18 insertions(+), 19 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index b6be80ef1..75c9124c6 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -43,10 +43,10 @@ def init_from_file(filepath): try: with open(filepath, 'r') as yaml_file: config_dict = yaml.safe_load(yaml_file) - except FileNotFoundError: - raise RuntimeError(f"Error: The specified config '{filepath}' could not be found.") + except FileNotFoundError as e: + raise RuntimeError(f"Error: The specified config '{filepath}' could not be found.") from e except yaml.YAMLError as e: - raise RuntimeError(f"Error: Failed to parse config '{filepath}': {e}") + raise RuntimeError(f"Error: Failed to parse config '{filepath}'") from e # And finally, initialize config = Config(config_dict) diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py index f60cb47c5..1580706e2 100644 --- a/spras/config/raw_config.py +++ b/spras/config/raw_config.py @@ -25,8 +25,8 @@ class ContainerRegistry(BaseModel): model_config = ConfigDict(extra='forbid') class AlgorithmParams(BaseModel): - include: bool = Field(default=False) - directed: Optional[bool] + include: bool = False + directed: Optional[bool] = None # TODO: use array of runs instead model_config = 
ConfigDict(extra='allow') @@ -66,8 +66,8 @@ class ReconstructionSettings(BaseModel): class RawConfig(BaseModel): # TODO: move these container values to a nested container key - container_framework: ContainerFramework = Field(default=ContainerFramework.docker) - unpack_singularity: bool = Field(default=False) + container_framework: ContainerFramework = ContainerFramework.docker + unpack_singularity: bool = False container_registry: ContainerRegistry hash_length: int = Field( @@ -76,8 +76,8 @@ class RawConfig(BaseModel): algorithms: list[Algorithm] datasets: list[Dataset] - gold_standards: list[GoldStandard] = Field(default=[]) - analysis: Optional[Analysis] + gold_standards: list[GoldStandard] = [] + analysis: Optional[Analysis] = None reconstruction_settings: ReconstructionSettings diff --git a/spras/config/raw_config_analysis.py b/spras/config/raw_config_analysis.py index 7eae2a127..5682fb6e6 100644 --- a/spras/config/raw_config_analysis.py +++ b/spras/config/raw_config_analysis.py @@ -27,10 +27,10 @@ class MlMetric(str, Enum): class MlAnalysis(BaseModel): include: bool aggregate_per_algorithm: bool - components: int - labels: bool - linkage: MlLinkage - metric: MlMetric + components: int = 2 + labels: bool = True + linkage: MlLinkage = MlLinkage.ward + metric: MlMetric = MlMetric.euclidean class EvaluationAnalysis(BaseModel): include: bool @@ -38,8 +38,8 @@ class EvaluationAnalysis(BaseModel): class Analysis(BaseModel): - summary: Optional[SummaryAnalysis] - graphspace: Optional[GraphspaceAnalysis] - cytoscape: Optional[CytoscapeAnalysis] - ml: Optional[MlAnalysis] - evaluation: Optional[EvaluationAnalysis] + summary: Optional[SummaryAnalysis] = None + graphspace: Optional[GraphspaceAnalysis] = None + cytoscape: Optional[CytoscapeAnalysis] = None + ml: Optional[MlAnalysis] = None + evaluation: Optional[EvaluationAnalysis] = None diff --git a/test/test_config.py b/test/test_config.py index 26b18a4e9..3039e2dc3 100644 --- a/test/test_config.py +++ 
b/test/test_config.py @@ -25,7 +25,6 @@ def get_test_config(): "datasets": [{"label": "alg1"}, {"label": "alg2"}], "gold_standards": [{"label": "gs1", "dataset_labels": []}], "algorithms": [ - {"params": ["param2", "param2"]}, { "name": "strings", "params": { From 51d6a7b1efeee0b97f5ad6e1ca377f91fbd9ce19 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 25 Jun 2025 21:47:26 +0000 Subject: [PATCH 05/34] feat: fully finish config parsing --- spras/config/config.py | 43 ++++++++++------------------- spras/config/raw_config.py | 23 ++++++++++----- spras/config/raw_config_analysis.py | 21 +++++++------- spras/config/util_enum.py | 14 ++++++++++ test/test_config.py | 33 +++++++++++++++++++--- 5 files changed, 83 insertions(+), 51 deletions(-) create mode 100644 spras/config/util_enum.py diff --git a/spras/config/config.py b/spras/config/config.py index 75c9124c6..02ab61ca0 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -54,8 +54,12 @@ def init_from_file(filepath): class Config: def __init__(self, raw_config: dict[str, Any]): + # Since snakemake provides an empty config, we provide this + # wrapper error first before passing validation to pydantic. + if raw_config == {}: + raise ValueError("Config file cannot be empty. Use --configfile to set a config file.") + parsed_raw_config = RawConfig.model_validate(raw_config) - self.process_config(parsed_raw_config) # Member vars populated by process_config. Any values that don't have quick initial values are set to None # before they are populated for __init__ to show exactly what is being configured. @@ -106,10 +110,7 @@ def __init__(self, raw_config: dict[str, Any]): # A Boolean specifying whether to run the evaluation per algorithm analysis self.analysis_include_evaluation_aggregate_algo = None - # Since snakemake provides an empty config, we provide this - # wrapper error first before passing validation to pydantic. - if raw_config == {}: - raise ValueError("Config file cannot be empty. 
Use --configfile to set a config file.") + self.process_config(parsed_raw_config) def process_datasets(self, raw_config: RawConfig): """ @@ -118,7 +119,6 @@ def process_datasets(self, raw_config: RawConfig): Convert the dataset list into a dict where the label is the key and update the config data structure """ # TODO allow labels to be optional and assign default labels - # TODO check for collisions in dataset labels, warn, and make the labels unique # Need to work more on input file naming to make less strict assumptions # about the filename structure # Currently assumes all datasets have a label and the labels are unique @@ -130,12 +130,7 @@ def process_datasets(self, raw_config: RawConfig): if label in self.datasets: raise ValueError(f"Datasets must have unique labels, but the label {label} appears at least twice.") self.datasets[label] = dict(dataset) - - # Validate dataset labels - label_pattern = r'^\w+$' - if not bool(re.match(label_pattern, label)): - raise ValueError(f"Dataset label '{label}' contains invalid values. 
Dataset labels can only contain letters, numbers, or underscores.") - + # parse gold standard information self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards} @@ -173,7 +168,7 @@ def process_algorithms(self, raw_config: RawConfig): continue if cur_params.directed is not None: - warnings.warn("UPDATE: we no longer use the directed key in the config file") + warnings.warn("UPDATE: we no longer use the directed key in the config file", stacklevel=2) cur_params = cur_params.__pydantic_extra__ if not cur_params: @@ -239,20 +234,10 @@ def process_analysis(self, raw_config: RawConfig): return self.analysis_params = raw_config.analysis - self.ml_params = self.analysis_params.ml if self.analysis_params.ml else {} - self.evaluation_params = self.analysis_params.evaluation if self.analysis_params.evaluation else {} - - self.pca_params = {} - if self.ml_params.components: - self.pca_params["components"] = self.ml_params["components"] - if "labels" in self.ml_params: - self.pca_params["labels"] = self.ml_params["labels"] + self.ml_params = self.analysis_params.ml + self.evaluation_params = self.analysis_params.evaluation - self.hac_params = {} - if "linkage" in self.ml_params: - self.hac_params["linkage"] = self.ml_params["linkage"] - if "metric" in self.ml_params: - self.hac_params["metric"] = self.ml_params ["metric"] + self.pca_params = self.ml_params self.analysis_include_summary = raw_config.analysis.summary.include self.analysis_include_graphspace = raw_config.analysis.graphspace.include @@ -262,7 +247,7 @@ def process_analysis(self, raw_config: RawConfig): # Only run ML aggregate per algorithm if analysis include ML is set to True if self.ml_params.aggregate_per_algorithm and self.analysis_include_ml: - self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] + self.analysis_include_ml_aggregate_algo = raw_config.analysis.ml.aggregate_per_algorithm else: 
self.analysis_include_ml_aggregate_algo = False @@ -290,12 +275,12 @@ def process_config(self, raw_config: RawConfig): self.out_dir = raw_config.reconstruction_settings.locations.reconstruction_dir if raw_config.container_framework == ContainerFramework.dsub: - warnings.warn("'dsub' framework integration is experimental and may not be fully supported.") + warnings.warn("'dsub' framework integration is experimental and may not be fully supported.", stacklevel=2) self.container_framework = raw_config.container_framework # Unpack settings for running in singularity mode. Needed when running PRM containers if already in a container. if raw_config.unpack_singularity and self.container_framework != "singularity": - warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. This setting will have no effect.") + warnings.warn("unpack_singularity is set to True, but the container framework is not singularity. This setting will have no effect.", stacklevel=2) self.unpack_singularity = raw_config.unpack_singularity # Grab registry from the config, and if none is provided default to docker diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py index 1580706e2..5ab6fed1a 100644 --- a/spras/config/raw_config.py +++ b/spras/config/raw_config.py @@ -2,17 +2,26 @@ Contains the raw pydantic schema for the configuration file. 
""" -from enum import Enum -from typing import Optional +import re +from spras.config.util_enum import CaseInsensitiveEnum +from typing import Annotated, Optional -from pydantic import BaseModel, ConfigDict, Field +from pydantic import AfterValidator, BaseModel, ConfigDict, Field from spras.config.raw_config_analysis import Analysis # The default length of the truncated hash used to identify parameter combinations DEFAULT_HASH_LENGTH = 7 -class ContainerFramework(str, Enum): +def label_validator(name: str): + label_pattern = r'^\w+$' + def validate(label: str): + if not bool(re.match(label_pattern, label)): + raise ValueError(f"{name} label '{label}' contains invalid values. {name} labels can only contain letters, numbers, or underscores.") + return label + return validate + +class ContainerFramework(CaseInsensitiveEnum): docker = 'docker' # TODO: add apptainer variant once #260 gets merged singularity = 'singularity' @@ -26,7 +35,7 @@ class ContainerRegistry(BaseModel): class AlgorithmParams(BaseModel): include: bool = False - directed: Optional[bool] = None + directed: Annotated[Optional[bool], Field(deprecated=True)] = None # TODO: use array of runs instead model_config = ConfigDict(extra='allow') @@ -38,7 +47,7 @@ class Algorithm(BaseModel): model_config = ConfigDict(extra='forbid') class Dataset(BaseModel): - label: str + label: Annotated[str, AfterValidator(label_validator("Dataset"))] node_files: list[str] edge_files: list[str] other_files: list[str] @@ -47,7 +56,7 @@ class Dataset(BaseModel): model_config = ConfigDict(extra='forbid') class GoldStandard(BaseModel): - label: str + label: Annotated[str, AfterValidator(label_validator("Gold Standard"))] node_files: list[str] data_dir: str dataset_labels: list[str] diff --git a/spras/config/raw_config_analysis.py b/spras/config/raw_config_analysis.py index 5682fb6e6..6b5be9bab 100644 --- a/spras/config/raw_config_analysis.py +++ b/spras/config/raw_config_analysis.py @@ -1,5 +1,4 @@ -from enum import Enum -from 
typing import Optional +from spras.config.util_enum import CaseInsensitiveEnum from pydantic import BaseModel @@ -13,20 +12,20 @@ class GraphspaceAnalysis(BaseModel): class CytoscapeAnalysis(BaseModel): include: bool -class MlLinkage(str, Enum): +class MlLinkage(CaseInsensitiveEnum): ward = 'ward' complete = 'complete' average = 'average' single = 'single' -class MlMetric(str, Enum): +class MlMetric(CaseInsensitiveEnum): euclidean = 'euclidean' manhattan = 'manhattan' cosine = 'cosine' class MlAnalysis(BaseModel): include: bool - aggregate_per_algorithm: bool + aggregate_per_algorithm: bool = False components: int = 2 labels: bool = True linkage: MlLinkage = MlLinkage.ward @@ -34,12 +33,12 @@ class MlAnalysis(BaseModel): class EvaluationAnalysis(BaseModel): include: bool - aggregate_per_algorithm: bool + aggregate_per_algorithm: bool = False class Analysis(BaseModel): - summary: Optional[SummaryAnalysis] = None - graphspace: Optional[GraphspaceAnalysis] = None - cytoscape: Optional[CytoscapeAnalysis] = None - ml: Optional[MlAnalysis] = None - evaluation: Optional[EvaluationAnalysis] = None + summary: SummaryAnalysis = SummaryAnalysis(include=False) + graphspace: GraphspaceAnalysis = GraphspaceAnalysis(include=False) + cytoscape: CytoscapeAnalysis = CytoscapeAnalysis(include=False) + ml: MlAnalysis = MlAnalysis(include=False) + evaluation: EvaluationAnalysis = EvaluationAnalysis(include=False) diff --git a/spras/config/util_enum.py b/spras/config/util_enum.py new file mode 100644 index 000000000..ec5700fd3 --- /dev/null +++ b/spras/config/util_enum.py @@ -0,0 +1,14 @@ +from enum import Enum +from typing import Any + +# https://stackoverflow.com/a/76883868/7589775 +class CaseInsensitiveEnum(str, Enum): + @classmethod + def _missing_(cls, value: Any): + if isinstance(value, str): + value = value.lower() + + for member in cls: + if member.lower() == value: + return member + return None diff --git a/test/test_config.py b/test/test_config.py index 3039e2dc3..9a3bf6549 
100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -4,7 +4,14 @@ import pytest import spras.config.config as config +from spras.config.raw_config import DEFAULT_HASH_LENGTH +filler_dataset_data: dict[str, str | list[str]] = { + "data_dir": "fake", + "edge_files": [], + "other_files": [], + "node_files": [] +} # Set up a dummy config for testing. For now, only include things that MUST exist in the dict # in order for the config init to complete. To test particular parts of the config initialization, @@ -22,8 +29,25 @@ def get_test_config(): "reconstruction_dir": "my_dir" } }, - "datasets": [{"label": "alg1"}, {"label": "alg2"}], - "gold_standards": [{"label": "gs1", "dataset_labels": []}], + "datasets": [{ + "label": "alg1", + "data_dir": "fake", + "edge_files": [], + "other_files": [], + "node_files": [] + }, { + "label": "alg2", + "data_dir": "faux", + "edge_files": [], + "other_files": [], + "node_files": [] + }], + "gold_standards": [{ + "label": "gs1", + "dataset_labels": [], + "node_files": [], + "data_dir": "gs-fake" + }], "algorithms": [ { "name": "strings", @@ -125,9 +149,9 @@ def test_config_hash_length(self): config.init_global(test_config) assert (config.config.hash_length == 7) - test_config["hash_length"] = "" + test_config.pop("hash_length", None) config.init_global(test_config) - assert (config.config.hash_length == config.DEFAULT_HASH_LENGTH) + assert (config.config.hash_length == DEFAULT_HASH_LENGTH) # Initialize the configuration test_config["hash_length"] = "12" @@ -193,6 +217,7 @@ def test_correct_dataset_label(self): test_config = get_test_config() correct_test_dicts = [{"label": "test"}, {"label": "123"}, {"label": "test123"}, {"label": "123test"}, {"label": "_"}, {"label": "test_test"}, {"label": "_test"}, {"label": "test_"}] + correct_test_dicts = [dict(list(d.items()) + list(filler_dataset_data.items())) for d in correct_test_dicts] for test_dict in correct_test_dicts: test_config["datasets"] = [test_dict] From 
a27d38decf17c3579753e3b8e7e9b603abc1e538 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 25 Jun 2025 21:49:28 +0000 Subject: [PATCH 06/34] style: fmt --- spras/config/config.py | 2 +- spras/config/raw_config.py | 2 +- spras/config/raw_config_analysis.py | 4 ++-- spras/config/util_enum.py | 1 + 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index 02ab61ca0..b627fcc73 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -130,7 +130,7 @@ def process_datasets(self, raw_config: RawConfig): if label in self.datasets: raise ValueError(f"Datasets must have unique labels, but the label {label} appears at least twice.") self.datasets[label] = dict(dataset) - + # parse gold standard information self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards} diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py index 5ab6fed1a..4c1cc3581 100644 --- a/spras/config/raw_config.py +++ b/spras/config/raw_config.py @@ -3,12 +3,12 @@ """ import re -from spras.config.util_enum import CaseInsensitiveEnum from typing import Annotated, Optional from pydantic import AfterValidator, BaseModel, ConfigDict, Field from spras.config.raw_config_analysis import Analysis +from spras.config.util_enum import CaseInsensitiveEnum # The default length of the truncated hash used to identify parameter combinations DEFAULT_HASH_LENGTH = 7 diff --git a/spras/config/raw_config_analysis.py b/spras/config/raw_config_analysis.py index 6b5be9bab..194fc8f7c 100644 --- a/spras/config/raw_config_analysis.py +++ b/spras/config/raw_config_analysis.py @@ -1,7 +1,7 @@ -from spras.config.util_enum import CaseInsensitiveEnum - from pydantic import BaseModel +from spras.config.util_enum import CaseInsensitiveEnum + class SummaryAnalysis(BaseModel): include: bool diff --git a/spras/config/util_enum.py b/spras/config/util_enum.py index ec5700fd3..3e73eda98 100644 --- 
a/spras/config/util_enum.py +++ b/spras/config/util_enum.py @@ -1,6 +1,7 @@ from enum import Enum from typing import Any + # https://stackoverflow.com/a/76883868/7589775 class CaseInsensitiveEnum(str, Enum): @classmethod From dd4674a22d9af81c210093a8fd9da393074e38db Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 25 Jun 2025 22:03:04 +0000 Subject: [PATCH 07/34] fix: remove dep mark, use strict is None --- spras/config/config.py | 2 +- spras/config/raw_config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index b627fcc73..55d061607 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -171,7 +171,7 @@ def process_algorithms(self, raw_config: RawConfig): warnings.warn("UPDATE: we no longer use the directed key in the config file", stacklevel=2) cur_params = cur_params.__pydantic_extra__ - if not cur_params: + if cur_params is None: raise RuntimeError("An internal error occured: ConfigDict extra should be set on AlgorithmParams.") # The algorithm has no named arguments so create a default placeholder diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py index 4c1cc3581..caff7c690 100644 --- a/spras/config/raw_config.py +++ b/spras/config/raw_config.py @@ -35,7 +35,7 @@ class ContainerRegistry(BaseModel): class AlgorithmParams(BaseModel): include: bool = False - directed: Annotated[Optional[bool], Field(deprecated=True)] = None + directed: Optional[bool] = None # TODO: use array of runs instead model_config = ConfigDict(extra='allow') From 5a8826d24434613a611ccdfef69c81260e4d3129 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Wed, 25 Jun 2025 23:01:30 +0000 Subject: [PATCH 08/34] chore: correct config loc --- Snakefile | 2 +- spras/config/config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index df90f8e4a..f5817650d 100644 --- a/Snakefile +++ b/Snakefile @@ -5,7 +5,7 @@ import yaml from spras.dataset import Dataset from spras.evaluation import Evaluation from spras.analysis import ml, summary, graphspace, cytoscape -import spras.config as _config +import spras.config.config as _config # Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037 # and using the wrong separator prevents Snakemake from matching filenames to the rules that can produce them diff --git a/spras/config/config.py b/spras/config/config.py index 55d061607..5894dd304 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -6,7 +6,7 @@ module that imports this module can access a config option by checking the object's value. For example -import spras.config as config +import spras.config.config as config container_framework = config.config.container_framework will grab the top level registry configuration option as it appears in the config file From a47b0dfbe7128221f2ee378f3188402a83f60ae6 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Wed, 25 Jun 2025 23:45:53 +0000 Subject: [PATCH 09/34] fix: specify hac params --- spras/config/config.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spras/config/config.py b/spras/config/config.py index 5894dd304..0db7bedf1 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -239,6 +239,11 @@ def process_analysis(self, raw_config: RawConfig): self.pca_params = self.ml_params + self.hac_params = { + "linkage": self.ml_params.linkage, + "metric": self.ml_params.metric + } + self.analysis_include_summary = raw_config.analysis.summary.include self.analysis_include_graphspace = raw_config.analysis.graphspace.include self.analysis_include_cytoscape = raw_config.analysis.cytoscape.include From 8d756045da12d48e603c5d6f7a7846d41d24fd53 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 26 Jun 2025 16:04:27 +0000 Subject: [PATCH 10/34] fix: expand class params --- spras/config/config.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index 0db7bedf1..b479c2a53 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -233,9 +233,10 @@ def process_analysis(self, raw_config: RawConfig): if not raw_config.analysis: return - self.analysis_params = raw_config.analysis - self.ml_params = self.analysis_params.ml - self.evaluation_params = self.analysis_params.evaluation + # these params are classes - we need to turn them into var dicts + self.analysis_params = vars(raw_config.analysis) + self.ml_params = vars(self.analysis_params.ml) + self.evaluation_params = vars(self.analysis_params.evaluation) self.pca_params = self.ml_params From afa1de5605fee43de0457aa05c8ee956eb1982df Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Thu, 26 Jun 2025 16:06:36 +0000 Subject: [PATCH 11/34] fix: expand on pca_params --- spras/config/config.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index b479c2a53..515a677c5 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -233,12 +233,12 @@ def process_analysis(self, raw_config: RawConfig): if not raw_config.analysis: return - # these params are classes - we need to turn them into var dicts - self.analysis_params = vars(raw_config.analysis) - self.ml_params = vars(self.analysis_params.ml) - self.evaluation_params = vars(self.analysis_params.evaluation) + self.analysis_params = raw_config.analysis + self.ml_params = self.analysis_params.ml + self.evaluation_params = self.analysis_params.evaluation - self.pca_params = self.ml_params + # self.ml_params is a class, pca_params needs to be a dict. + self.pca_params = vars(self.ml_params) self.hac_params = { "linkage": self.ml_params.linkage, From 5eefc51f79ad38c8c086f0c0e233fa6f91966d36 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Thu, 26 Jun 2025 09:35:20 -0700 Subject: [PATCH 12/34] fix: drop include dict --- spras/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/config.py b/spras/config/config.py index 515a677c5..8d52ebd97 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -238,7 +238,7 @@ def process_analysis(self, raw_config: RawConfig): self.evaluation_params = self.analysis_params.evaluation # self.ml_params is a class, pca_params needs to be a dict. - self.pca_params = vars(self.ml_params) + self.pca_params = {k: v for k, v in vars(self.ml_params).items if k != 'include'} self.hac_params = { "linkage": self.ml_params.linkage, From 52431866627ce081795487c37064403a8045c829 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Thu, 26 Jun 2025 10:28:05 -0700 Subject: [PATCH 13/34] fix: call items --- spras/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/config.py b/spras/config/config.py index 8d52ebd97..23d8c44fd 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -238,7 +238,7 @@ def process_analysis(self, raw_config: RawConfig): self.evaluation_params = self.analysis_params.evaluation # self.ml_params is a class, pca_params needs to be a dict. - self.pca_params = {k: v for k, v in vars(self.ml_params).items if k != 'include'} + self.pca_params = {k: v for k, v in vars(self.ml_params).items() if k != 'include'} self.hac_params = { "linkage": self.ml_params.linkage, From 3b20c48bae7ef46a4d15a9b34dc65c399732f95c Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Thu, 26 Jun 2025 17:51:54 +0000 Subject: [PATCH 14/34] fix: better typing and defaults --- spras/config/config.py | 17 +++++++++-------- spras/config/raw_config.py | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index 23d8c44fd..d944cc976 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -23,7 +23,7 @@ import numpy as np import yaml -from spras.config.raw_config import ContainerFramework, RawConfig +from spras.config.raw_config import ContainerFramework, RawConfig, Analysis from spras.util import NpHashEncoder, hash_params_sha1_base32 config = None @@ -86,9 +86,11 @@ def __init__(self, raw_config: dict[str, Any]): # Deprecated. Previously a dict mapping algorithm names to a Boolean tracking whether they used directed graphs.
self.algorithm_directed = None # A dict with the analysis settings - self.analysis_params = None + self.analysis_params = parsed_raw_config.analysis + # A dict with the evaluation settings + self.evaluation_params = self.analysis_params.evaluation # A dict with the ML settings - self.ml_params = None + self.ml_params = self.analysis_params.ml # A Boolean specifying whether to run ML analysis for individual algorithms self.analysis_include_ml_aggregate_algo = None # A dict with the PCA settings @@ -233,12 +235,11 @@ def process_analysis(self, raw_config: RawConfig): if not raw_config.analysis: return - self.analysis_params = raw_config.analysis - self.ml_params = self.analysis_params.ml - self.evaluation_params = self.analysis_params.evaluation - # self.ml_params is a class, pca_params needs to be a dict. - self.pca_params = {k: v for k, v in vars(self.ml_params).items() if k != 'include'} + self.pca_params = { + "components": self.ml_params.components, + "labels": self.ml_params.labels + } self.hac_params = { "linkage": self.ml_params.linkage, diff --git a/spras/config/raw_config.py b/spras/config/raw_config.py index caff7c690..409b48427 100644 --- a/spras/config/raw_config.py +++ b/spras/config/raw_config.py @@ -86,7 +86,7 @@ class RawConfig(BaseModel): algorithms: list[Algorithm] datasets: list[Dataset] gold_standards: list[GoldStandard] = [] - analysis: Optional[Analysis] = None + analysis: Analysis = Analysis() reconstruction_settings: ReconstructionSettings From 2d4a90f669f77a9d209c658ce5596833a2d43881 Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Thu, 26 Jun 2025 11:09:54 -0700 Subject: [PATCH 15/34] style: fmt --- spras/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/config.py b/spras/config/config.py index d944cc976..0c176cbab 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -23,7 +23,7 @@ import numpy as np import yaml -from spras.config.raw_config import ContainerFramework, RawConfig, Analysis +from spras.config.raw_config import Analysis, ContainerFramework, RawConfig from spras.util import NpHashEncoder, hash_params_sha1_base32 config = None From 2a4fb2ec5c923954c9748dc63009002264fcebf9 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Wed, 9 Jul 2025 10:40:44 -0700 Subject: [PATCH 16/34] refactor: add config forbid --- spras/config/raw_config_analysis.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/spras/config/raw_config_analysis.py b/spras/config/raw_config_analysis.py index 194fc8f7c..dbec5f1b9 100644 --- a/spras/config/raw_config_analysis.py +++ b/spras/config/raw_config_analysis.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from spras.config.util_enum import CaseInsensitiveEnum @@ -6,12 +6,13 @@ class SummaryAnalysis(BaseModel): include: bool -class GraphspaceAnalysis(BaseModel): - include: bool + model_config = ConfigDict(extra='forbid') class CytoscapeAnalysis(BaseModel): include: bool + model_config = ConfigDict(extra='forbid') + class MlLinkage(CaseInsensitiveEnum): ward = 'ward' complete = 'complete' @@ -31,14 +32,18 @@ class MlAnalysis(BaseModel): linkage: MlLinkage = MlLinkage.ward metric: MlMetric = MlMetric.euclidean + model_config = ConfigDict(extra='forbid') + class EvaluationAnalysis(BaseModel): include: bool aggregate_per_algorithm: bool = False + model_config = ConfigDict(extra='forbid') class Analysis(BaseModel): summary: SummaryAnalysis = SummaryAnalysis(include=False) - graphspace: GraphspaceAnalysis = 
GraphspaceAnalysis(include=False) cytoscape: CytoscapeAnalysis = CytoscapeAnalysis(include=False) ml: MlAnalysis = MlAnalysis(include=False) evaluation: EvaluationAnalysis = EvaluationAnalysis(include=False) + + model_config = ConfigDict(extra='forbid') From 4ded57eeb490b21a51feec81ed119e400fc363ef Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Wed, 9 Jul 2025 17:45:47 +0000 Subject: [PATCH 17/34] refactor: update config imports --- test/BowTieBuilder/test_btb.py | 2 +- test/RWR/test_RWR.py | 2 +- test/ST_RWR/test_STRWR.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/BowTieBuilder/test_btb.py b/test/BowTieBuilder/test_btb.py index 88b12d0dd..d4a458b3c 100644 --- a/test/BowTieBuilder/test_btb.py +++ b/test/BowTieBuilder/test_btb.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config config.init_from_file("config/config.yaml") diff --git a/test/RWR/test_RWR.py b/test/RWR/test_RWR.py index 4d6ce7864..b0316ded0 100644 --- a/test/RWR/test_RWR.py +++ b/test/RWR/test_RWR.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.rwr import RWR config.init_from_file("config/config.yaml") diff --git a/test/ST_RWR/test_STRWR.py b/test/ST_RWR/test_STRWR.py index a0a5b4ea9..898b24055 100644 --- a/test/ST_RWR/test_STRWR.py +++ b/test/ST_RWR/test_STRWR.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.strwr import ST_RWR config.init_from_file("config/config.yaml") From 22b568645ef19cde8d430c1bf77a1aff5057ee84 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Wed, 9 Jul 2025 19:23:09 +0000 Subject: [PATCH 18/34] refactor: better names to schema files --- spras/config/{raw_config_analysis.py => analysis_schema.py} | 0 spras/config/config.py | 2 +- spras/config/{raw_config.py => schema.py} | 2 +- test/test_config.py | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename spras/config/{raw_config_analysis.py => analysis_schema.py} (100%) rename spras/config/{raw_config.py => schema.py} (98%) diff --git a/spras/config/raw_config_analysis.py b/spras/config/analysis_schema.py similarity index 100% rename from spras/config/raw_config_analysis.py rename to spras/config/analysis_schema.py diff --git a/spras/config/config.py b/spras/config/config.py index 2f4b44efa..293a08ec7 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -23,7 +23,7 @@ import numpy as np import yaml -from spras.config.raw_config import Analysis, ContainerFramework, RawConfig +from spras.config.schema import Analysis, ContainerFramework, RawConfig from spras.util import NpHashEncoder, hash_params_sha1_base32 config = None diff --git a/spras/config/raw_config.py b/spras/config/schema.py similarity index 98% rename from spras/config/raw_config.py rename to spras/config/schema.py index 409b48427..f882e5382 100644 --- a/spras/config/raw_config.py +++ b/spras/config/schema.py @@ -7,7 +7,7 @@ from pydantic import AfterValidator, BaseModel, ConfigDict, Field -from spras.config.raw_config_analysis import Analysis +from spras.config.analysis_schema import Analysis from spras.config.util_enum import CaseInsensitiveEnum # The default length of the truncated hash used to identify parameter combinations diff --git a/test/test_config.py b/test/test_config.py index 84d7d1d54..6095ad145 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -4,7 +4,7 @@ import pytest import spras.config.config as config -from spras.config.raw_config import DEFAULT_HASH_LENGTH +from spras.config.schema import DEFAULT_HASH_LENGTH filler_dataset_data: 
dict[str, str | list[str]] = { "data_dir": "fake", From ea59e4cec7d5197fd394f3704a5406facaa6f1a9 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Fri, 11 Jul 2025 21:56:42 +0000 Subject: [PATCH 19/34] fix: no default include, mention model_config allow reason --- spras/config/schema.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spras/config/schema.py b/spras/config/schema.py index f882e5382..d1b86e06d 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -34,10 +34,12 @@ class ContainerRegistry(BaseModel): model_config = ConfigDict(extra='forbid') class AlgorithmParams(BaseModel): - include: bool = False + include: bool directed: Optional[bool] = None - # TODO: use array of runs instead + # TODO: use array of runs instead. We currently rely on the + # extra parameters here to extract the algorithm parameter information, + # which is why this deviates from the usual ConfigDict(extra='forbid'). model_config = ConfigDict(extra='allow') class Algorithm(BaseModel): From fa7d7c984b1e91454926fb0ffa1b51ecdc2377b5 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Fri, 11 Jul 2025 15:10:01 -0700 Subject: [PATCH 20/34] fix(config): case-insensitive check on labels --- spras/config/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index 293a08ec7..c6ac8f8e0 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -127,8 +127,8 @@ def process_datasets(self, raw_config: RawConfig): self.datasets = {} for dataset in raw_config.datasets: label = dataset.label - if label in self.datasets: - raise ValueError(f"Datasets must have unique labels, but the label {label} appears at least twice.") + if label.lower() in [key.lower() for key in self.datasets.keys()]: + raise ValueError(f"Datasets must have unique case-insensitive labels, but the label {label} appears at least twice.") self.datasets[label] = dict(dataset) # parse gold standard information From 52eab214a9946a58f33ab34847da2119e2be6807 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 16:23:15 +0000 Subject: [PATCH 21/34] refactor: merge config --- spras/config/analysis_schema.py | 49 ----------------------------- spras/config/schema.py | 56 ++++++++++++++++++++++++++++++++- spras/config/util_enum.py | 4 +++ 3 files changed, 59 insertions(+), 50 deletions(-) delete mode 100644 spras/config/analysis_schema.py diff --git a/spras/config/analysis_schema.py b/spras/config/analysis_schema.py deleted file mode 100644 index dbec5f1b9..000000000 --- a/spras/config/analysis_schema.py +++ /dev/null @@ -1,49 +0,0 @@ -from pydantic import BaseModel, ConfigDict - -from spras.config.util_enum import CaseInsensitiveEnum - - -class SummaryAnalysis(BaseModel): - include: bool - - model_config = ConfigDict(extra='forbid') - -class CytoscapeAnalysis(BaseModel): - include: bool - - model_config = ConfigDict(extra='forbid') - -class MlLinkage(CaseInsensitiveEnum): - ward = 'ward' - complete = 'complete' - average = 'average' - single = 'single' - -class MlMetric(CaseInsensitiveEnum): - 
euclidean = 'euclidean' - manhattan = 'manhattan' - cosine = 'cosine' - -class MlAnalysis(BaseModel): - include: bool - aggregate_per_algorithm: bool = False - components: int = 2 - labels: bool = True - linkage: MlLinkage = MlLinkage.ward - metric: MlMetric = MlMetric.euclidean - - model_config = ConfigDict(extra='forbid') - -class EvaluationAnalysis(BaseModel): - include: bool - aggregate_per_algorithm: bool = False - - model_config = ConfigDict(extra='forbid') - -class Analysis(BaseModel): - summary: SummaryAnalysis = SummaryAnalysis(include=False) - cytoscape: CytoscapeAnalysis = CytoscapeAnalysis(include=False) - ml: MlAnalysis = MlAnalysis(include=False) - evaluation: EvaluationAnalysis = EvaluationAnalysis(include=False) - - model_config = ConfigDict(extra='forbid') diff --git a/spras/config/schema.py b/spras/config/schema.py index d1b86e06d..f4cad554c 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -1,5 +1,13 @@ """ Contains the raw pydantic schema for the configuration file. + +Using Pydantic as our backing config parser allows us to declaratively +type our config, giving us more robust user errors with guarantees +that parts of the config exist after parsing it through Pydantic. 
+ +We declare models using two classes here: +- `BaseModel` (docs: https://docs.pydantic.dev/latest/api/base_model/) +- `CaseInsensitiveEnum` (see ./util_enum.py) """ import re @@ -7,9 +15,55 @@ from pydantic import AfterValidator, BaseModel, ConfigDict, Field -from spras.config.analysis_schema import Analysis from spras.config.util_enum import CaseInsensitiveEnum + +class SummaryAnalysis(BaseModel): + include: bool + + model_config = ConfigDict(extra='forbid') + +class CytoscapeAnalysis(BaseModel): + include: bool + + model_config = ConfigDict(extra='forbid') + +class MlLinkage(CaseInsensitiveEnum): + ward = 'ward' + complete = 'complete' + average = 'average' + single = 'single' + +class MlMetric(CaseInsensitiveEnum): + euclidean = 'euclidean' + manhattan = 'manhattan' + cosine = 'cosine' + +class MlAnalysis(BaseModel): + include: bool + aggregate_per_algorithm: bool = False + components: int = 2 + labels: bool = True + linkage: MlLinkage = MlLinkage.ward + metric: MlMetric = MlMetric.euclidean + + model_config = ConfigDict(extra='forbid') + +class EvaluationAnalysis(BaseModel): + include: bool + aggregate_per_algorithm: bool = False + + model_config = ConfigDict(extra='forbid') + +class Analysis(BaseModel): + summary: SummaryAnalysis = SummaryAnalysis(include=False) + cytoscape: CytoscapeAnalysis = CytoscapeAnalysis(include=False) + ml: MlAnalysis = MlAnalysis(include=False) + evaluation: EvaluationAnalysis = EvaluationAnalysis(include=False) + + model_config = ConfigDict(extra='forbid') + + # The default length of the truncated hash used to identify parameter combinations DEFAULT_HASH_LENGTH = 7 diff --git a/spras/config/util_enum.py b/spras/config/util_enum.py index 3e73eda98..b7680222b 100644 --- a/spras/config/util_enum.py +++ b/spras/config/util_enum.py @@ -4,6 +4,10 @@ # https://stackoverflow.com/a/76883868/7589775 class CaseInsensitiveEnum(str, Enum): + """ + We prefer this over Enum to make sure the config parsing + is more relaxed when it comes to 
string enum values. + """ @classmethod def _missing_(cls, value: Any): if isinstance(value, str): From 3c305f4b5b4c69e2eec44eee2bc8d2e6a1a92cfe Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 16:44:39 +0000 Subject: [PATCH 22/34] docs: use concepts link --- spras/config/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/schema.py b/spras/config/schema.py index f4cad554c..63fe1b613 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -6,7 +6,7 @@ that parts of the config exist after parsing it through Pydantic. We declare models using two classes here: -- `BaseModel` (docs: https://docs.pydantic.dev/latest/api/base_model/) +- `BaseModel` (docs: https://docs.pydantic.dev/latest/concepts/models/) - `CaseInsensitiveEnum` (see ./util_enum.py) """ From 49e50a03e85a82949e7913bb66a34c70195cb70a Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 18:13:59 +0000 Subject: [PATCH 23/34] refactor: mv util_enum -> util --- spras/config/schema.py | 2 +- spras/config/{util_enum.py => util.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename spras/config/{util_enum.py => util.py} (100%) diff --git a/spras/config/schema.py b/spras/config/schema.py index 63fe1b613..10991ad2f 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -15,7 +15,7 @@ from pydantic import AfterValidator, BaseModel, ConfigDict, Field -from spras.config.util_enum import CaseInsensitiveEnum +from spras.config.util import CaseInsensitiveEnum class SummaryAnalysis(BaseModel): diff --git a/spras/config/util_enum.py b/spras/config/util.py similarity index 100% rename from spras/config/util_enum.py rename to spras/config/util.py From cb28f61396c90415611ae970f4d7fdd7b924519b Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Mon, 14 Jul 2025 11:14:19 -0700 Subject: [PATCH 24/34] docs: correct util_enum path --- spras/config/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/schema.py b/spras/config/schema.py index 10991ad2f..623c9dd9b 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -7,7 +7,7 @@ We declare models using two classes here: - `BaseModel` (docs: https://docs.pydantic.dev/latest/concepts/models/) -- `CaseInsensitiveEnum` (see ./util_enum.py) +- `CaseInsensitiveEnum` (see ./util.py) """ import re From 2c938ed09708359ebaaf82733646868998df759f Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 14 Jul 2025 16:05:36 -0700 Subject: [PATCH 25/34] fix: add spras.config to pyproject --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index bedbe1628..b18ef12c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,4 +74,4 @@ select = [ # py-modules tells setuptools which directory is our actual module py-modules = ["spras"] # packages tells setuptools what the exported package is called (ie allows import spras) -packages = ["spras", "spras.analysis"] +packages = ["spras", "spras.analysis", "spras.config"] From ebcf6b0e6bc1cf04ad318ac5a1a7305aeffb3686 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 16 Jul 2025 14:43:27 -0700 Subject: [PATCH 26/34] docs: mention `args` in contributing --- CONTRIBUTING.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ffb138c14..8b9fee1f5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -110,6 +110,9 @@ Use `pathlinker.py` as an example. Call the new class within `local_neighborhood.py` `LocalNeighborhood` and set `__all__` so the class can be [imported](https://docs.python.org/3/tutorial/modules.html#importing-from-a-package). Specify the list of `required_input` files to be `network` and `nodes`. 
These entries are used to tell Snakemake what input files should be present before running the Local Neighborhood algorithm. +You also need to implement an `args` class with pydantic. Since `LocalNeighborhood` +takes no additional arguments, use the `Empty` arguments class (see `AllPairs` for an example), and make sure to specify it as a generic when writing a PRM class. This means your +class declaration should look something like `class LocalNeighborhood(PRM[Empty]):` Before implementing the `generate_inputs` function, explore the structure of the `Dataset` class interactively. In an interactive Python session, run the following commands to load the `data0` dataset and explore the nodes and interactome. From 4733b954305e52640053f6e8c94298263a55cded Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 16 Jul 2025 14:46:33 -0700 Subject: [PATCH 27/34] Revert "docs: mention `args` in contributing" This reverts commit ebcf6b0e6bc1cf04ad318ac5a1a7305aeffb3686. --- CONTRIBUTING.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8b9fee1f5..ffb138c14 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -110,9 +110,6 @@ Use `pathlinker.py` as an example. Call the new class within `local_neighborhood.py` `LocalNeighborhood` and set `__all__` so the class can be [imported](https://docs.python.org/3/tutorial/modules.html#importing-from-a-package). Specify the list of `required_input` files to be `network` and `nodes`. These entries are used to tell Snakemake what input files should be present before running the Local Neighborhood algorithm. -You also need to implement an `args` class with pydantic. Since `LocalNeighborhood` -takes no additional arguments, use the `Empty` arguments class (see `AllPairs` for an example), and make sure to specify it as a generic when writing a PRM class. 
This means your -class declaration should look something like `class LocalNeighborhood(PRM[Empty]):` Before implementing the `generate_inputs` function, explore the structure of the `Dataset` class interactively. In an interactive Python session, run the following commands to load the `data0` dataset and explore the nodes and interactome. From fa51d79c683ea8cdc09bfc40ab2c1d5df849919c Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 17 Jul 2025 19:45:00 +0000 Subject: [PATCH 28/34] docs: document some pydantic choices --- spras/config/schema.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/spras/config/schema.py b/spras/config/schema.py index 623c9dd9b..07cf1047d 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -17,10 +17,15 @@ from spras.config.util import CaseInsensitiveEnum +# Most options here have an `include` property, +# which is meant to make disabling parts of the configuration easier. +# When an option does not have a default, it means that it must be set by the user. class SummaryAnalysis(BaseModel): include: bool + # We prefer to never allow extra keys, to prevent + # any user mistypes. model_config = ConfigDict(extra='forbid') class CytoscapeAnalysis(BaseModel): @@ -28,6 +33,9 @@ class CytoscapeAnalysis(BaseModel): model_config = ConfigDict(extra='forbid') +# Note that CaseInsensitiveEnum is not pydantic: pydantic +# has special support for enums, but we avoid the +# pydantic-specific "model_config" key here for this reason. class MlLinkage(CaseInsensitiveEnum): ward = 'ward' complete = 'complete' @@ -68,6 +76,10 @@ class Analysis(BaseModel): DEFAULT_HASH_LENGTH = 7 def label_validator(name: str): + """ + A validator takes in a label + and ensure that it's alphanumeric (with underscores). 
+ """ label_pattern = r'^\w+$' def validate(label: str): if not bool(re.match(label_pattern, label)): @@ -103,6 +115,9 @@ class Algorithm(BaseModel): model_config = ConfigDict(extra='forbid') class Dataset(BaseModel): + # We prefer AfterValidator here to allow pydantic to run its own + # validation & coercion logic before we check it against our own + # requirements label: Annotated[str, AfterValidator(label_validator("Dataset"))] node_files: list[str] edge_files: list[str] @@ -124,6 +139,7 @@ class Locations(BaseModel): model_config = ConfigDict(extra='forbid') +# NOTE: This setting doesn't have any uses past setting the output_dir as of now. class ReconstructionSettings(BaseModel): locations: Locations @@ -135,9 +151,8 @@ class RawConfig(BaseModel): unpack_singularity: bool = False container_registry: ContainerRegistry - hash_length: int = Field( - description="The length of the hash used to identify a parameter combination", - default=DEFAULT_HASH_LENGTH) + hash_length: int = DEFAULT_HASH_LENGTH + "The length of the hash used to identify a parameter combination" algorithms: list[Algorithm] datasets: list[Dataset] @@ -146,4 +161,6 @@ class RawConfig(BaseModel): reconstruction_settings: ReconstructionSettings - model_config = ConfigDict(extra='forbid') + # We include use_attribute_docstrings here to preserve the docstrings + # after attributes at runtime (for future JSON schema generation) + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) From 1bef8c763c8acf8695e0b4a1cd83297b1550f25b Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Thu, 24 Jul 2025 13:09:29 -0700 Subject: [PATCH 29/34] fix: add defaults for kde and remove_empty_pathways --- spras/config/schema.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spras/config/schema.py b/spras/config/schema.py index 07cf1047d..0a03a05f6 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -52,6 +52,8 @@ class MlAnalysis(BaseModel): aggregate_per_algorithm: bool = False components: int = 2 labels: bool = True + kde: bool = False + remove_empty_pathways: bool = False linkage: MlLinkage = MlLinkage.ward metric: MlMetric = MlMetric.euclidean From 593206cbe731f612f453f2e4570667a85e6b25b8 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 24 Jul 2025 13:09:36 -0700 Subject: [PATCH 30/34] style: fmt --- spras/config/config.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/spras/config/config.py b/spras/config/config.py index 354b9f676..11208187c 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -15,7 +15,6 @@ import copy as copy import itertools as it import os -import re import warnings from collections.abc import Iterable from typing import Any @@ -23,7 +22,7 @@ import numpy as np import yaml -from spras.config.schema import Analysis, ContainerFramework, RawConfig +from spras.config.schema import ContainerFramework, RawConfig from spras.util import NpHashEncoder, hash_params_sha1_base32 config = None @@ -275,7 +274,7 @@ def process_analysis(self, raw_config: RawConfig): # Only run Evaluation per algorithm if ML per algorithm is set to True if not self.analysis_include_ml_aggregate_algo: self.analysis_include_evaluation_aggregate_algo = False - + # Set kde to True if Evaluation is set to True # When Evaluation is True, PCA is used to pick a single parameter combination for all algorithms with multiple # parameter combinations and KDE is used to choose the parameter combination in the PC space From c465c9ce6595691c53520328215172778fe20ed0 Mon Sep 17 00:00:00 2001 From: "Tristan 
F.-R." Date: Thu, 24 Jul 2025 13:38:19 -0700 Subject: [PATCH 31/34] test: update rn import --- test/ResponseNet/test_rn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/ResponseNet/test_rn.py b/test/ResponseNet/test_rn.py index 6fa09904b..6b9fe05cf 100644 --- a/test/ResponseNet/test_rn.py +++ b/test/ResponseNet/test_rn.py @@ -4,7 +4,7 @@ import pytest -import spras.config as config +import spras.config.config as config from spras.responsenet import ResponseNet config.init_from_file("config/config.yaml") From 2443735c6e549042476c3225d47b8302fc109514 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 25 Jul 2025 16:14:26 -0700 Subject: [PATCH 32/34] style: typos --- spras/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/config.py b/spras/config/config.py index 11208187c..605faecf4 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -171,7 +171,7 @@ def process_algorithms(self, raw_config: RawConfig): cur_params = cur_params.__pydantic_extra__ if cur_params is None: - raise RuntimeError("An internal error occured: ConfigDict extra should be set on AlgorithmParams.") + raise RuntimeError("An internal error occurred: ConfigDict extra should be set on AlgorithmParams.") # The algorithm has no named arguments so create a default placeholder if len(cur_params.keys()) == 0: From 18a173f9442ef9acc0b325e8417727e73d1c69b6 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 25 Jul 2025 22:26:06 -0700 Subject: [PATCH 33/34] docs: grammar --- spras/config/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/schema.py b/spras/config/schema.py index 0a03a05f6..f3a552694 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -80,7 +80,7 @@ class Analysis(BaseModel): def label_validator(name: str): """ A validator takes in a label - and ensure that it's alphanumeric (with underscores). + and ensures that it's alphanumeric (with underscores). 
""" label_pattern = r'^\w+$' def validate(label: str): From 5225616df80599ff0965632bef51914c9bf7471f Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 28 Jul 2025 08:36:28 -0700 Subject: [PATCH 34/34] docs: use nicer alphanumeric explanation Co-authored-by: Neha Talluri <78840540+ntalluri@users.noreply.github.com> --- spras/config/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/schema.py b/spras/config/schema.py index f3a552694..c84ea4384 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -80,7 +80,7 @@ class Analysis(BaseModel): def label_validator(name: str): """ A validator takes in a label - and ensures that it's alphanumeric (with underscores). + and ensures that it contains only letters, numbers, or underscores. """ label_pattern = r'^\w+$' def validate(label: str):