From 43d746be1e739d0a8b7844d1c827938516dc72af Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 6 Oct 2025 20:36:52 +0000 Subject: [PATCH 1/5] feat: Dataset#category --- spras/config/schema.py | 4 +++- spras/dataset.py | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/spras/config/schema.py b/spras/config/schema.py index f99bbe2d7..76c42c5db 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -121,12 +121,14 @@ class Dataset(BaseModel): # validation & coercion logic before we check it against our own # requirements label: Annotated[str, AfterValidator(label_validator("Dataset"))] + category: Optional[str] + "The dataset category, for working with multiple datasets at once in the configuration." node_files: list[str] edge_files: list[str] other_files: list[str] data_dir: str - model_config = ConfigDict(extra='forbid') + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) class GoldStandard(BaseModel): label: Annotated[str, AfterValidator(label_validator("Gold Standard"))] diff --git a/spras/dataset.py b/spras/dataset.py index 95f409a80..6282d1b47 100644 --- a/spras/dataset.py +++ b/spras/dataset.py @@ -1,7 +1,7 @@ import os import pickle as pkl import warnings -from typing import TypedDict +from typing import NotRequired, TypedDict import pandas as pd @@ -18,6 +18,7 @@ class DatasetDict(TypedDict): object. See spras/config/schema.py's `Dataset` class for the pydantic formation of `DatasetDict`. """ label: str + category: NotRequired[str] node_files: list[str | os.PathLike] edge_files: list[str | os.PathLike] other_files: list[str | os.PathLike] @@ -30,6 +31,7 @@ class Dataset: def __init__(self, dataset_dict: DatasetDict): self.label = None + self.category = None self.interactome = None self.node_table = None self.node_set = set() @@ -78,6 +80,7 @@ def load_files_from_dict(self, dataset_dict: DatasetDict): """ self.label = dataset_dict["label"] + self.category = dataset_dict["category"] if "category" in dataset_dict else None # Get file paths from config # TODO support multiple edge files From df9c29593cd4f7d67597ce772bb1e7eaee918081 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 6 Oct 2025 20:40:36 +0000 Subject: [PATCH 2/5] chore: make Datset#category optional in schema --- spras/config/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/schema.py b/spras/config/schema.py index 76c42c5db..dcec8f27a 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -121,7 +121,7 @@ class Dataset(BaseModel): # validation & coercion logic before we check it against our own # requirements label: Annotated[str, AfterValidator(label_validator("Dataset"))] - category: Optional[str] + category: Optional[str] = None "The dataset category, for working with multiple datasets at once in the configuration." node_files: list[str] edge_files: list[str] From edb25c4a7fa496c9a09d911b5f222917ba4a86e2 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 6 Oct 2025 20:53:25 +0000 Subject: [PATCH 3/5] feat: store dataset categories --- spras/config/config.py | 14 +++++++++++++- test/test_config.py | 24 ++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/spras/config/config.py b/spras/config/config.py index 346682f53..2fce035b2 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -73,6 +73,8 @@ def __init__(self, raw_config: dict[str, Any]): self.unpack_singularity = False # A dictionary to store configured datasets against which SPRAS will be run self.datasets = None + # A dictionary to store dataset categories with their associated dataset labels + self.dataset_categories = None # A dictionary to store configured gold standard data against output of SPRAS runs self.gold_standards = None # The hash length SPRAS will use to identify parameter combinations. @@ -124,12 +126,22 @@ def process_datasets(self, raw_config: RawConfig): # When Snakemake parses the config file it loads the datasets as OrderedDicts not dicts # Convert to dicts to simplify the yaml logging self.datasets = {} + self.dataset_categories = {} for dataset in raw_config.datasets: label = dataset.label - if label.lower() in [key.lower() for key in self.datasets.keys()]: + if label.casefold() in [key.casefold() for key in self.datasets.keys()]: raise ValueError(f"Datasets must have unique case-insensitive labels, but the label {label} appears at least twice.") self.datasets[label] = dict(dataset) + # Extra check for conflicting categories which we don't store, yet. + category = dataset.category + if category: + if category.casefold() in [key.casefold() for key in self.datasets.keys()]: + raise ValueError(f"Dataset categories can not appear as (case-insensitive) labels, yet category {category} appears as a label.") + + category_dataset_labels = self.dataset_categories.setdefault(category, []) + category_dataset_labels.append(dataset.label) + # parse gold standard information self.gold_standards = {gold_standard.label: dict(gold_standard) for gold_standard in raw_config.gold_standards} diff --git a/test/test_config.py b/test/test_config.py index f5ec454b7..0d89574e6 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -31,12 +31,14 @@ def get_test_config(): }, "datasets": [{ "label": "alg1", + "category": "category1", "data_dir": "fake", "edge_files": [], "other_files": [], "node_files": [] }, { "label": "alg2", + "category": "category2", "data_dir": "faux", "edge_files": [], "other_files": [], @@ -220,6 +222,28 @@ def test_correct_dataset_label(self): test_config["datasets"] = [test_dict] config.init_global(test_config) # no error should be raised + def test_correct_dataset_category(self): + test_config = get_test_config() + config.init_global(test_config) + assert config.config.dataset_categories + assert len(config.config.dataset_categories["category1"]) == 1 + assert len(config.config.dataset_categories["category2"]) == 1 + + def test_multiple_dataset_category(self): + test_config = get_test_config() + for dataset in test_config["datasets"]: + dataset["category"] = "category1" + config.init_global(test_config) + assert config.config.dataset_categories + assert len(config.config.dataset_categories["category1"]) == 2 + + def test_bad_dataset_category(self): + test_config = get_test_config() + for dataset in test_config["datasets"]: + dataset["category"] = "alg2" + with pytest.raises(ValueError): # categories can not match dataset labels + config.init_global(test_config) + def test_error_gs_label(self): test_config = get_test_config() error_labels = ["test$", "@test'"] From 965f87c5470585f313f9916330df78afc1036ccf Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Tue, 7 Oct 2025 21:45:01 -0700 Subject: [PATCH 4/5] Update spras/config/schema.py --- spras/config/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config/schema.py b/spras/config/schema.py index dcec8f27a..9f1cea933 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -122,7 +122,7 @@ class Dataset(BaseModel): # requirements label: Annotated[str, AfterValidator(label_validator("Dataset"))] category: Optional[str] = None - "The dataset category, for working with multiple datasets at once in the configuration." + "The dataset category, for working with dataset collections in the configuration." node_files: list[str] edge_files: list[str] other_files: list[str] From dc40e7f7d3b9433bc6603a615d2f6a80335ba617 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 30 Jan 2026 20:43:05 -0800 Subject: [PATCH 5/5] chore: update dataset schema with categories --- spras/config/dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spras/config/dataset.py b/spras/config/dataset.py index 9af413385..6fb03ff56 100644 --- a/spras/config/dataset.py +++ b/spras/config/dataset.py @@ -1,4 +1,4 @@ -from typing import Annotated +from typing import Annotated, Optional from pydantic import AfterValidator, BaseModel, ConfigDict @@ -19,5 +19,7 @@ class DatasetSchema(BaseModel): edge_files: list[LoosePathLike] other_files: list[LoosePathLike] data_dir: LoosePathLike + category: Optional[str] = None + "The dataset category, for working with dataset collections in the configuration." model_config = ConfigDict(extra='forbid')