From 3629b2ccb782e18e4eaa0e7955c67d0d123b6b49 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 23 Jan 2026 09:08:32 +0000 Subject: [PATCH 1/5] feat: raw_generation.py --- datasets/contributing/.gitignore | 4 ++ datasets/contributing/README.md | 10 ++++ datasets/contributing/raw_generation.py | 78 +++++++++++++++++++++++++ pyproject.toml | 1 + uv.lock | 11 ++++ 5 files changed, 104 insertions(+) create mode 100644 datasets/contributing/.gitignore create mode 100644 datasets/contributing/README.md create mode 100644 datasets/contributing/raw_generation.py diff --git a/datasets/contributing/.gitignore b/datasets/contributing/.gitignore new file mode 100644 index 0000000..56204f5 --- /dev/null +++ b/datasets/contributing/.gitignore @@ -0,0 +1,4 @@ +gold-standard.tsv +interactome.tsv +sources.txt +targets.txt diff --git a/datasets/contributing/README.md b/datasets/contributing/README.md new file mode 100644 index 0000000..01519e2 --- /dev/null +++ b/datasets/contributing/README.md @@ -0,0 +1,10 @@ +# Contributing Guide dataset + +**This is an artificial dataset** for how to make datasets. + +This comes with a `raw_generation.py` script, which produces the associated raw data, where the gold standard is `k` paths of length `n` with +Erdős-Rényi edges, such that the sources and targets come from the start and ends of each path. The background interactome is the gold standard with +more edge and node noise. This is not a topologically-accurate emulation of (signaling) pathways, but it suffices to trick most pathway reconstruction +algorithms. + +This does not cover the (very common!) task of ID mapping, as this can vary constantly between datasets. diff --git a/datasets/contributing/raw_generation.py b/datasets/contributing/raw_generation.py new file mode 100644 index 0000000..acbca21 --- /dev/null +++ b/datasets/contributing/raw_generation.py @@ -0,0 +1,78 @@ +import argparse +import itertools +from pathlib import Path +import random +import networkx +import uuid +import pandas + +def random_id() -> str: + return uuid.uuid4().hex + +def assign_ids(graph: networkx.DiGraph) -> networkx.DiGraph: + """Assigns new IDs to a graph based on `random_id`""" + mapping = {node: random_id() for node in graph} + return networkx.relabel_nodes(graph, mapping) + +def gnp_noise(graph: networkx.DiGraph, p: float): + """ + The mutative equivalent to networkx.gnp_random_graph, + whose original implementation does not consume a graph. 
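+    Every ordered pair of existing nodes independently gains an edge with probability `p`.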
+ """ + for e in itertools.permutations(graph.nodes, 2): + if random.random() < p: + graph.add_edge(*e) + +def generate_parser(): + parser = argparse.ArgumentParser(prog='Pathway generator') + parser.add_argument("--path-count", type=int, default=10) + parser.add_argument("--path-length", type=int, default=7) + + parser.add_argument("--sources-output", type=str, default="sources.txt") + parser.add_argument("--targets-output", type=str, default="targets.txt") + + parser.add_argument("--gold-standard-noise", type=float, default=0.03) + parser.add_argument("--gold-standard-output", type=str, default="gold-standard.tsv") + + parser.add_argument("--interactome-extra-nodes", type=int, default=400) + parser.add_argument("--interactome-noise", type=float, default=0.01) + parser.add_argument("--interactome-output", type=str, default="interactome.tsv") + return parser + +def main(): + args = generate_parser().parse_args() + + graph = networkx.DiGraph() + sources: list[str] = [] + targets: list[str] = [] + + # Add the path graphs to form the base of the pathway, while getting sources and targets as well. + for _ in range(args.path_count): + path_graph = networkx.path_graph(args.path_length, create_using=networkx.DiGraph()) + path_graph = assign_ids(path_graph) + + topological_sort = list(networkx.topological_sort(path_graph)) + first_node, last_node = (topological_sort[0], topological_sort[-1]) + sources.append(first_node) + targets.append(last_node) + + graph = networkx.union(graph, path_graph) + + Path(args.sources_output).write_text("\n".join(sources)) + Path(args.targets_output).write_text("\n".join(targets)) + + # Then, we'll add some noise: this will be our gold standard. + gnp_noise(graph, args.gold_standard_noise) + gold_standard = pandas.DataFrame(((a, b) for a, b, _data in networkx.to_edgelist(graph)), columns=["Source", "Target"]) + # We make the gold standard output a little annoying to force some post-processing with pandas. + gold_standard.insert(1, "Interaction-Type", "pp") + gold_standard.to_csv(args.gold_standard_output, index=False, sep='\t') + + # and we'll follow along similarly to above to build our interactome. 
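+    # Extra, initially isolated nodes plus a second round of edge noise make the interactome a noisy superset of the gold standard.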
+ graph.add_nodes_from((random_id() for _ in range(args.interactome_extra_nodes))) + gnp_noise(graph, args.interactome_noise) + interactome = pandas.DataFrame(((a, b) for a, b, _data in networkx.to_edgelist(graph)), columns=["Source", "Target"]) + interactome.to_csv(args.interactome_output, index=False, sep='\t') + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 3d2f3ab..00cc2a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ dependencies = [ "bioservices>=1.12.1", "gdown>=5.2.0", "more-itertools>=10.7.0", + "networkx>=3.6.1", "pandas>=2.3.0", ] diff --git a/uv.lock b/uv.lock index 2fb28dd..f186be8 100644 --- a/uv.lock +++ b/uv.lock @@ -790,6 +790,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a9/82/0340caa499416c78e5d8f5f05947ae4bc3cba53c9f038ab6e9ed964e22f1/nbformat-5.10.4-py3-none-any.whl", hash = "sha256:3b48d6c8fbca4b299bf3982ea7db1af21580e4fec269ad087b9e81588891200b", size = 78454, upload-time = "2024-04-04T11:20:34.895Z" }, ] +[[package]] +name = "networkx" +version = "3.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" }, +] + [[package]] name = "nodeenv" version = "1.9.1" @@ -1423,6 +1432,7 @@ dependencies = [ { name = "bioservices" }, { name = "gdown" }, { name = "more-itertools" }, + { name = "networkx" }, { name = "pandas" }, ] @@ -1437,6 +1447,7 @@ requires-dist = [ { name = "bioservices", specifier = ">=1.12.1" }, { name = "gdown", specifier = ">=5.2.0" }, { name = "more-itertools", specifier = ">=10.7.0" }, + { name = "networkx", specifier = ">=3.6.1" }, { name = "pandas", specifier = ">=2.3.0" }, ] From 21c857aa58604604166f41a87b0986813b14ac30 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 23 Jan 2026 09:29:08 +0000 Subject: [PATCH 2/5] docs: contributing --- CONTRIBUTING.md | 125 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 114 insertions(+), 11 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 38b13d9..bd4da9a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -7,17 +7,120 @@ There are `TODOs` that better enhance the reproducability and accuracy of datase ## Adding a dataset -See `datasets/diseases` as an example of a dataset. Datasets take some form of raw data from an online service and convert it into usable datasets -with associated gold standards for SPRAS to run on. - -To add a dataset: -1. Check that your dataset provider isn't already added (some of these datasets act as providers for multiple datasets) -1. Create a new folder under `datasets/` -1. Add an attached Snakefile that converts your `raw` data to `processed` data. - - Make sure to use `uv` here. See `diseases`'s Snakefile for an example. -1. Add your Snakefile to the top-level `run_snakemake.sh` file. -1. Add your datasets to the appropiate `configs` - - If your dataset has gold standards, make sure to include them here. 
+**Check that your data provider isn't already a dataset in `datasets`.** Some datasets serve more data than they currently use:
+these datasets can be extended to cover your needs.
+
+The goal of a dataset is to take raw data and produce data that can be fed to SPRAS.
+We'll follow along with `datasets/contributing`. This mini-tutorial assumes that you are already familiar with SPRAS
+[as per its contributing guide](https://spras.readthedocs.io/en/latest/contributing/index.html).
+
+### Uploading raw data
+
+This is a fake dataset: the data can be generated by running `datasets/contributing/raw_generation.py`, which outputs the following artifacts:
+- `sources.txt`
+- `targets.txt`
+- `gold-standard.tsv`
+- `interactome.tsv`
+
+Unlike in this example, the data used in other datasets comes from external sources (whether that's supplementary information in a paper or a
+biological database like UniProt). These artifacts can be large and change occasionally, so we store them in Google Drive as a cache and download
+them whenever we want to reconstruct a dataset.
+
+Note that the four artifacts above change every time `raw_generation.py` is run. Upload those artifacts to Google Drive in a folder of your choice.
+Share each file and allow _Anyone with the link_ to _View_ it.
+
+Once shared, the copied URL should look something like:
+
+> https://drive.google.com/file/d/1Agte0Aezext-8jLhGP4GmaF3tS7gHX-h/view?usp=sharing
+
+We always drop the `/view?usp=sharing` suffix and replace `/file/d/` with `uc?id=`, which turns the URL into a direct download link that is
+downloaded internally with [gdown](https://github.com/wkentaro/gdown).
+
+Now, add a directive to `cache/directory.py` under `Contributing`. Since this data doesn't have an online URL, it should use `CacheItem.cache_only`
+to indicate that no online database serves it.
+
+Your new directive under the `directory` dictionary should look something like this, with one entry for every artifact:
+
+```python
+...,
+"Contributing": {
+    "interactome.tsv": CacheItem.cache_only(
+        name="Randomly-generated contributing interactome",
+        cached="https://drive.google.com/uc?id=..."
+    ),
+    ...
+}
+```
+
+### Setting up a workflow
+
+Now we need to make these files SPRAS-compatible. To do this, we'll set up a `Snakefile`, which will handle:
+- Artifact downloading
+- Script running
+
+`sources.txt` and `targets.txt` are already in a SPRAS-ready format, but we need to process `gold-standard.tsv` and `interactome.tsv`.
+
+Create a `Snakefile` under your dataset with these top-level directives:
+
+```python
+# This allows us to automatically fetch the Google Drive data.
+include: "../../cache/Snakefile"
+
+rule all:
+    input:
+        # The two files we will be passing to SPRAS
+        "raw/sources.txt",
+        "raw/targets.txt",
+        # The two files we will be processing
+        "processed/gold-standard.tsv",
+        "processed/interactome.tsv"
+```
+
+We'll generate four `fetch` rules, that is, rules that tell Snakemake to download the data we uploaded to Google Drive earlier.
+
+```python
+produce_fetch_rules({
+    # The value array is a path into the dictionary from `cache/directory.py`.
+    "raw/sources.txt": ["Contributing", "sources.txt"],
+    # and so on for targets, gold-standard, and interactome: note that excluding these two stops the Snakefile from working, by design!
+    ...
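+    # For example (hypothetical raw paths; match them to your own layout):
+    #   "raw/gold-standard.tsv": ["Contributing", "gold-standard.tsv"],
+    #   "raw/interactome.tsv": ["Contributing", "interactome.tsv"],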
+})
+```
+
+Create two scripts that make `gold-standard.tsv` and `interactome.tsv` SPRAS-ready, consulting
+the [SPRAS file format documentation](https://spras.readthedocs.io/en/latest/output.html). You can use any dependency declared in the top-level
+`pyproject.toml`, and you can test out your scripts with `uv run