From 43d746be1e739d0a8b7844d1c827938516dc72af Mon Sep 17 00:00:00 2001
From: "Tristan F.-R." <pub.tristanf@gmail.com>
Date: Mon, 6 Oct 2025 20:36:52 +0000
Subject: [PATCH 1/5] feat: Dataset#category

---
 spras/config/schema.py | 4 +++-
 spras/dataset.py       | 5 ++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/spras/config/schema.py b/spras/config/schema.py
index f99bbe2d7..76c42c5db 100644
--- a/spras/config/schema.py
+++ b/spras/config/schema.py
@@ -121,12 +121,14 @@ class Dataset(BaseModel):
     # validation & coercion logic before we check it against our own
     # requirements
     label: Annotated[str, AfterValidator(label_validator("Dataset"))]
+    category: Optional[str]
+    "The dataset category, for working with multiple datasets at once in the configuration."
     node_files: list[str]
     edge_files: list[str]
     other_files: list[str]
     data_dir: str
 
-    model_config = ConfigDict(extra='forbid')
+    model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True)
 
 class GoldStandard(BaseModel):
     label: Annotated[str, AfterValidator(label_validator("Gold Standard"))]
diff --git a/spras/dataset.py b/spras/dataset.py
index 95f409a80..6282d1b47 100644
--- a/spras/dataset.py
+++ b/spras/dataset.py
@@ -1,7 +1,7 @@
 import os
 import pickle as pkl
 import warnings
-from typing import TypedDict
+from typing import NotRequired, TypedDict
 
 import pandas as pd
 
@@ -18,6 +18,7 @@ class DatasetDict(TypedDict):
     object. See spras/config/schema.py's `Dataset` class for the pydantic formation of `DatasetDict`.
     """
     label: str
+    category: NotRequired[str]
     node_files: list[str | os.PathLike]
     edge_files: list[str | os.PathLike]
     other_files: list[str | os.PathLike]
@@ -30,6 +31,7 @@ class Dataset:
 
     def __init__(self, dataset_dict: DatasetDict):
         self.label = None
+        self.category = None
         self.interactome = None
         self.node_table = None
         self.node_set = set()
@@ -78,6 +80,7 @@ def load_files_from_dict(self, dataset_dict: DatasetDict):
         """
 
         self.label = dataset_dict["label"]
+        self.category = dataset_dict["category"] if "category" in dataset_dict else None
 
         # Get file paths from config
         # TODO support multiple edge files

From df9c29593cd4f7d67597ce772bb1e7eaee918081 Mon Sep 17 00:00:00 2001
From: "Tristan F.-R." <pub.tristanf@gmail.com>
Date: Mon, 6 Oct 2025 20:40:36 +0000
Subject: [PATCH 2/5] chore: make Dataset#category optional in schema

---
 spras/config/schema.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spras/config/schema.py b/spras/config/schema.py
index 76c42c5db..dcec8f27a 100644
--- a/spras/config/schema.py
+++ b/spras/config/schema.py
@@ -121,7 +121,7 @@ class Dataset(BaseModel):
     # validation & coercion logic before we check it against our own
     # requirements
     label: Annotated[str, AfterValidator(label_validator("Dataset"))]
-    category: Optional[str]
+    category: Optional[str] = None
     "The dataset category, for working with multiple datasets at once in the configuration."
     node_files: list[str]
     edge_files: list[str]

From edb25c4a7fa496c9a09d911b5f222917ba4a86e2 Mon Sep 17 00:00:00 2001
From: "Tristan F.-R." <pub.tristanf@gmail.com>
Date: Mon, 6 Oct 2025 20:53:25 +0000
Subject: [PATCH 3/5] feat: store dataset categories

---
 spras/config/config.py | 14 +++++++++++++-
 test/test_config.py    | 24 ++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/spras/config/config.py b/spras/config/config.py
index 346682f53..2fce035b2 100644
--- a/spras/config/config.py
+++ b/spras/config/config.py
@@ -73,6 +73,8 @@ def __init__(self, raw_config: dict[str, Any]):
         self.unpack_singularity = False
         # A dictionary to store configured datasets against which SPRAS will be run
         self.datasets = None
+        # A dictionary to store dataset categories with their associated dataset labels
+        self.dataset_categories = None
         # A dictionary to store configured gold standard data against output of SPRAS runs
         self.gold_standards = None
         # The hash length SPRAS will use to identify parameter combinations.
@@ -124,12 +126,22 @@ def process_datasets(self, raw_config: RawConfig):
         # When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts
         # Convert to dicts to simplify the yaml logging
         self.datasets = {}
+        self.dataset_categories = {}
         for dataset in raw_config.datasets:
             label = dataset.label
-            if label.lower() in [key.lower() for key in self.datasets.keys()]:
+            if label.casefold() in [key.casefold() for key in self.datasets.keys()]:
                 raise ValueError(f"Datasets must have unique case-insensitive labels, but the label {label} appears at least twice.")
             self.datasets[label] = dict(dataset)
 
+            # Reject a category that matches (case-insensitively) any dataset label registered so far, then record this dataset under it.
+            category = dataset.category
+            if category:
+                if category.casefold() in [key.casefold() for key in self.datasets.keys()]:
+                    raise ValueError(f"Dataset categories can not appear as (case-insensitive) labels, yet category {category} appears as a label.")
+
+                category_dataset_labels = self.dataset_categories.setdefault(category, [])
+                category_dataset_labels.append(dataset.label)
+
         # parse gold standard information
         self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards}
 
diff --git a/test/test_config.py b/test/test_config.py
index f5ec454b7..0d89574e6 100644
--- a/test/test_config.py
+++ b/test/test_config.py
@@ -31,12 +31,14 @@ def get_test_config():
         },
         "datasets": [{
             "label": "alg1",
+            "category": "category1",
             "data_dir": "fake",
             "edge_files": [],
             "other_files": [],
             "node_files": []
         }, {
             "label": "alg2",
+            "category": "category2",
             "data_dir": "faux",
             "edge_files": [],
             "other_files": [],
@@ -220,6 +222,28 @@ def test_correct_dataset_label(self):
             test_config["datasets"] = [test_dict]
             config.init_global(test_config)  # no error should be raised
 
+    def test_correct_dataset_category(self):
+        test_config = get_test_config()
+        config.init_global(test_config)
+        assert config.config.dataset_categories
+        assert len(config.config.dataset_categories["category1"]) == 1
+        assert len(config.config.dataset_categories["category2"]) == 1
+
+    def test_multiple_dataset_category(self):
+        test_config = get_test_config()
+        for dataset in test_config["datasets"]:
+            dataset["category"] = "category1"
+        config.init_global(test_config)
+        assert config.config.dataset_categories
+        assert len(config.config.dataset_categories["category1"]) == 2
+
+    def test_bad_dataset_category(self):
+        test_config = get_test_config()
+        for dataset in test_config["datasets"]:
+            dataset["category"] = "alg2"
+        with pytest.raises(ValueError): # categories can not match dataset labels
+            config.init_global(test_config)
+
     def test_error_gs_label(self):
         test_config = get_test_config()
         error_labels = ["test$", "@test'"]

From 965f87c5470585f313f9916330df78afc1036ccf Mon Sep 17 00:00:00 2001
From: "Tristan F.-R." <pub.tristanf@gmail.com>
Date: Tue, 7 Oct 2025 21:45:01 -0700
Subject: [PATCH 4/5] Update spras/config/schema.py

---
 spras/config/schema.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spras/config/schema.py b/spras/config/schema.py
index dcec8f27a..9f1cea933 100644
--- a/spras/config/schema.py
+++ b/spras/config/schema.py
@@ -122,7 +122,7 @@ class Dataset(BaseModel):
     # requirements
     label: Annotated[str, AfterValidator(label_validator("Dataset"))]
     category: Optional[str] = None
-    "The dataset category, for working with multiple datasets at once in the configuration."
+    "The dataset category, for working with dataset collections in the configuration."
     node_files: list[str]
     edge_files: list[str]
     other_files: list[str]

From dc40e7f7d3b9433bc6603a615d2f6a80335ba617 Mon Sep 17 00:00:00 2001
From: "Tristan F." <pub.tristanf@gmail.com>
Date: Fri, 30 Jan 2026 20:43:05 -0800
Subject: [PATCH 5/5] chore: update dataset schema with categories

---
 spras/config/dataset.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/spras/config/dataset.py b/spras/config/dataset.py
index 9af413385..6fb03ff56 100644
--- a/spras/config/dataset.py
+++ b/spras/config/dataset.py
@@ -1,4 +1,4 @@
-from typing import Annotated
+from typing import Annotated, Optional
 
 from pydantic import AfterValidator, BaseModel, ConfigDict
 
@@ -19,5 +19,7 @@ class DatasetSchema(BaseModel):
     edge_files: list[LoosePathLike]
     other_files: list[LoosePathLike]
     data_dir: LoosePathLike
+    category: Optional[str] = None
+    "The dataset category, for working with dataset collections in the configuration."
 
     model_config = ConfigDict(extra='forbid')