From 3629b2ccb782e18e4eaa0e7955c67d0d123b6b49 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 23 Jan 2026 09:08:32 +0000 Subject: [PATCH 1/5] feat: raw_generation.py --- datasets/contributing/.gitignore | 4 ++ datasets/contributing/README.md | 10 ++++ datasets/contributing/raw_generation.py | 78 +++++++++++++++++++++++++ pyproject.toml | 1 + uv.lock | 11 ++++ 5 files changed, 104 insertions(+) create mode 100644 datasets/contributing/.gitignore create mode 100644 datasets/contributing/README.md create mode 100644 datasets/contributing/raw_generation.py diff --git a/datasets/contributing/.gitignore b/datasets/contributing/.gitignore new file mode 100644 index 0000000..56204f5 --- /dev/null +++ b/datasets/contributing/.gitignore @@ -0,0 +1,4 @@ +gold-standard.tsv +interactome.tsv +sources.txt +targets.txt diff --git a/datasets/contributing/README.md b/datasets/contributing/README.md new file mode 100644 index 0000000..01519e2 --- /dev/null +++ b/datasets/contributing/README.md @@ -0,0 +1,10 @@ +# Contributing Guide dataset + +**This is an artificial dataset** for how to make datasets. + +This comes with a `raw_generation.py` script, which produces the associated raw data, where the gold standard is `k` paths of length `n` with +Erdős-Rényi edges, such that the sources and targets come from the start and ends of each path. The background interactome is the gold standard with +more edge and node noise. This is not a topologically-accurate emulation of (signaling) pathways, but it suffices to trick most pathway reconstruction +algorithms. + +This does not cover the (very common!) task of ID mapping, as this can vary constantly between datasets. diff --git a/datasets/contributing/raw_generation.py b/datasets/contributing/raw_generation.py new file mode 100644 index 0000000..acbca21 --- /dev/null +++ b/datasets/contributing/raw_generation.py @@ -0,0 +1,78 @@ +import argparse +import itertools +from pathlib import Path +import random +import networkx +import uuid +import pandas + +def random_id() -> str: + return uuid.uuid4().hex + +def assign_ids(graph: networkx.DiGraph) -> networkx.DiGraph: + """Assigns new IDs to a graph based on `random_id`""" + mapping = {node: random_id() for node in graph} + return networkx.relabel_nodes(graph, mapping) + +def gnp_noise(graph: networkx.DiGraph, p: float): + """ + The mutative equivalent to networkx.gnp_random_graph, + whose original implementation does not consume a graph. 
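+    Every ordered pair of existing nodes independently gains an edge with probability `p`.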
+ """ + for e in itertools.permutations(graph.nodes, 2): + if random.random() < p: + graph.add_edge(*e) + +def generate_parser(): + parser = argparse.ArgumentParser(prog='Pathway generator') + parser.add_argument("--path-count", type=int, default=10) + parser.add_argument("--path-length", type=int, default=7) + + parser.add_argument("--sources-output", type=str, default="sources.txt") + parser.add_argument("--targets-output", type=str, default="targets.txt") + + parser.add_argument("--gold-standard-noise", type=float, default=0.03) + parser.add_argument("--gold-standard-output", type=str, default="gold-standard.tsv") + + parser.add_argument("--interactome-extra-nodes", type=int, default=400) + parser.add_argument("--interactome-noise", type=float, default=0.01) + parser.add_argument("--interactome-output", type=str, default="interactome.tsv") + return parser + +def main(): + args = generate_parser().parse_args() + + graph = networkx.DiGraph() + sources: list[str] = [] + targets: list[str] = [] + + # Add the path graphs to form the base of the pathway, while getting sources and targets as well. + for _ in range(args.path_count): + path_graph = networkx.path_graph(args.path_length, create_using=networkx.DiGraph()) + path_graph = assign_ids(path_graph) + + topological_sort = list(networkx.topological_sort(path_graph)) + first_node, last_node = (topological_sort[0], topological_sort[-1]) + sources.append(first_node) + targets.append(last_node) + + graph = networkx.union(graph, path_graph) + + Path(args.sources_output).write_text("\n".join(sources)) + Path(args.targets_output).write_text("\n".join(targets)) + + # Then, we'll add some noise: this will be our gold standard. + gnp_noise(graph, args.gold_standard_noise) + gold_standard = pandas.DataFrame(((a, b) for a, b, _data in networkx.to_edgelist(graph)), columns=["Source", "Target"]) + # We make the gold standard output a little annoying to force some post-processing with pandas. + gold_standard.insert(1, "Interaction-Type", "pp") + gold_standard.to_csv(args.gold_standard_output, index=False, sep='\t') + + # and we'll follow along similarly to above to build our interactome. 
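+    # Extra, initially isolated nodes plus a second round of edge noise make the interactome a noisy superset of the gold standard.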
+ graph.add_nodes_from((random_id() for _ in range(args.interactome_extra_nodes))) + gnp_noise(graph, args.interactome_noise) + interactome = pandas.DataFrame(((a, b) for a, b, _data in networkx.to_edgelist(graph)), columns=["Source", "Target"]) + interactome.to_csv(args.interactome_output, index=False, sep='\t') + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 3d2f3ab..00cc2a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ dependencies = [ "bioservices>=1.12.1", "gdown>=5.2.0", "more-itertools>=10.7.0", + "networkx>=3.6.1", "pandas>=2.3.0", ] diff --git a/uv.lock b/uv.lock index 2fb28dd..f186be8 100644 --- a/uv.lock +++ b/uv.lock @@ -790,6 +790,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a9/82/0340caa499416c78e5d8f5f05947ae4bc3cba53c9f038ab6e9ed964e22f1/nbformat-5.10.4-py3-none-any.whl", hash = "sha256:3b48d6c8fbca4b299bf3982ea7db1af21580e4fec269ad087b9e81588891200b", size = 78454, upload-time = "2024-04-04T11:20:34.895Z" }, ] +[[package]] +name = "networkx" +version = "3.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" }, +] + [[package]] name = "nodeenv" version = "1.9.1" @@ -1423,6 +1432,7 @@ dependencies = [ { name = "bioservices" }, { name = "gdown" }, { name = "more-itertools" }, + { name = "networkx" }, { name = "pandas" }, ] @@ -1437,6 +1447,7 @@ requires-dist = [ { name = "bioservices", specifier = ">=1.12.1" }, { name = "gdown", specifier = ">=5.2.0" }, { name = "more-itertools", specifier = ">=10.7.0" }, + { name = "networkx", specifier = ">=3.6.1" }, { name = "pandas", specifier = ">=2.3.0" }, ] From 21c857aa58604604166f41a87b0986813b14ac30 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 23 Jan 2026 09:29:08 +0000 Subject: [PATCH 2/5] docs: contributing --- CONTRIBUTING.md | 125 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 114 insertions(+), 11 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 38b13d9..bd4da9a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -7,17 +7,120 @@ There are `TODOs` that better enhance the reproducability and accuracy of datase ## Adding a dataset -See `datasets/diseases` as an example of a dataset. Datasets take some form of raw data from an online service and convert it into usable datasets -with associated gold standards for SPRAS to run on. - -To add a dataset: -1. Check that your dataset provider isn't already added (some of these datasets act as providers for multiple datasets) -1. Create a new folder under `datasets/` -1. Add an attached Snakefile that converts your `raw` data to `processed` data. - - Make sure to use `uv` here. See `diseases`'s Snakefile for an example. -1. Add your Snakefile to the top-level `run_snakemake.sh` file. -1. Add your datasets to the appropiate `configs` - - If your dataset has gold standards, make sure to include them here. 
+**Check that your data provider isn't already a dataset in `datasets`.** Some datasets serve more data than they currently use:
+these datasets can be extended to cover your needs.
+
+The goal of a dataset is to take raw data and produce data that can be fed to SPRAS.
+We'll follow along with `datasets/contributing`. This mini-tutorial assumes that you are already familiar with SPRAS
+[as per its contributing guide](https://spras.readthedocs.io/en/latest/contributing/index.html).
+
+### Uploading raw data
+
+This is a fake dataset: the data can be generated by running `datasets/contributing/raw_generation.py`, which outputs the following artifacts:
+- `sources.txt`
+- `targets.txt`
+- `gold-standard.tsv`
+- `interactome.tsv`
+
+Unlike in this example, the data used in other datasets comes from external sources (whether that's supplementary information in a paper or a
+biological database like UniProt). These artifacts can be large and change occasionally, so we store them in Google Drive as a cache and download
+them whenever we want to reconstruct a dataset.
+
+Note that the four artifacts above change every time `raw_generation.py` is run. Upload those artifacts to Google Drive in a folder of your choice.
+Share each file and allow _Anyone with the link_ to _View_ it.
+
+Once shared, the copied URL should look something like:
+
+> https://drive.google.com/file/d/1Agte0Aezext-8jLhGP4GmaF3tS7gHX-h/view?usp=sharing
+
+We always drop the `/view?usp=sharing` suffix and replace `/file/d/` with `uc?id=`, which turns the URL into a direct download link that is
+downloaded internally with [gdown](https://github.com/wkentaro/gdown).
+
+Now, add a directive to `cache/directory.py` under `Contributing`. Since this data doesn't have an online URL, it should use `CacheItem.cache_only`
+to indicate that no online database serves it.
+
+Your new directive under the `directory` dictionary should look something like this, with one entry for every artifact:
+
+```python
+...,
+"Contributing": {
+    "interactome.tsv": CacheItem.cache_only(
+        name="Randomly-generated contributing interactome",
+        cached="https://drive.google.com/uc?id=..."
+    ),
+    ...
+}
+```
+
+### Setting up a workflow
+
+Now we need to make these files SPRAS-compatible. To do this, we'll set up a `Snakefile`, which will handle:
+- Artifact downloading
+- Script running
+
+`sources.txt` and `targets.txt` are already in a SPRAS-ready format, but we need to process `gold-standard.tsv` and `interactome.tsv`.
+
+Create a `Snakefile` under your dataset with these top-level directives:
+
+```python
+# This allows us to automatically fetch the Google Drive data.
+include: "../../cache/Snakefile"
+
+rule all:
+    input:
+        # The two files we will be passing to SPRAS
+        "raw/sources.txt",
+        "raw/targets.txt",
+        # The two files we will be processing
+        "processed/gold-standard.tsv",
+        "processed/interactome.tsv"
+```
+
+We'll generate four `fetch` rules, that is, rules that tell Snakemake to download the data we uploaded to Google Drive earlier.
+
+```python
+produce_fetch_rules({
+    # The value array is a path into the dictionary from `cache/directory.py`.
+    "raw/sources.txt": ["Contributing", "sources.txt"],
+    # and so on for targets, gold-standard, and interactome: note that excluding these two stops the Snakefile from working, by design!
+    ...
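+    # For example (hypothetical raw paths; match them to your own layout):
+    #   "raw/gold-standard.tsv": ["Contributing", "gold-standard.tsv"],
+    #   "raw/interactome.tsv": ["Contributing", "interactome.tsv"],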
+})
+```
+
+Create two scripts that make `gold-standard.tsv` and `interactome.tsv` SPRAS-ready, consulting
+the [SPRAS file format documentation](https://spras.readthedocs.io/en/latest/output.html). You can use any dependency declared in the top-level
+`pyproject.toml`, and you can test out your scripts with `uv run