Reed-CompBio · tristan-f-r · Jul 1, 2025 · Jul 28, 2025 · Jan 6, 2026 · Jan 6, 2026
diff --git a/cache/directory.py b/cache/directory.py
@@ -78,10 +78,10 @@ def download(self, output: str | PathLike):
 directory: CacheDirectory = {
     "STRING": {
         "9606": {
-            "9606.protein.links.txt.gz": CacheItem(
-                name="STRING 9606 protein links",
-                cached="https://drive.google.com/uc?id=1fvjdIbgzbgJrdJxWRRRwwS1zuegf6DOj",
-                online="http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz",
+            "9606.protein.links.full.txt.gz": CacheItem(
+                name="STRING 9606 full links",
+                cached="https://drive.google.com/uc?id=13tE_-A6g7McZs_lZGz9As7iE-5cBFvqE",
+                online="http://stringdb-downloads.org/download/protein.links.full.v12.0/9606.protein.links.full.v12.0.txt.gz",
             ),
             "9606.protein.aliases.txt.gz": CacheItem(
                 name="STRING 9606 protein aliases",

diff --git a/datasets/diseases/Snakefile b/datasets/diseases/Snakefile
@@ -13,7 +13,7 @@ produce_fetch_rules({
     "raw/HumanDO.tsv": ["DISEASES", "HumanDO.tsv"],
     "raw/tiga_gene-trait_stats.tsv": ["DISEASES", "tiga_gene-trait_stats.tsv"],
     "raw/ensg-ensp.tsv": ["BioMart", "ensg-ensp.tsv"],
-    "raw/9606.protein.links.txt": FetchConfig(["STRING", "9606", "9606.protein.links.txt.gz"], uncompress=True),
+    "raw/9606.protein.links.full.txt": FetchConfig(["STRING", "9606", "9606.protein.links.full.txt.gz"], uncompress=True),
     "raw/9606.protein.aliases.txt": FetchConfig(["STRING", "9606", "9606.protein.aliases.txt.gz"], uncompress=True),
 })
 
@@ -42,7 +42,7 @@ rule files:
     input:
         "data/inputs.csv",
         "data/gold_standard.csv",
-        "raw/9606.protein.links.txt"
+        "raw/9606.protein.links.full.txt"
     output:
         # These are the two we use for the SPRAS run for now
         "GS_files/Alopecia_areata_GS.txt",

diff --git a/datasets/diseases/scripts/files.py b/datasets/diseases/scripts/files.py
@@ -42,7 +42,12 @@ def main():
 
     # See /cache/directory.py for information on how this was grabbed.
     # 9606 is the organism code for homo sapiens and the required background interactome of DISEASES.
-    string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.txt", sep=" ", skiprows=[0], header=None)
+    # The "full" links file has 16 space-separated columns: protein1, protein2, thirteen per-channel
+    # subscores, and combined_score last. Keep only the two protein IDs and combined_score, relabelled
+    # 0..2, so the frame has the same shape as the plain links file and column 2 is still the score.
+    string = pd.read_csv(
+        diseases_path / "raw" / "9606.protein.links.full.txt", sep=" ", skiprows=[0], header=None, usecols=[0, 1, 15]
+    ).set_axis([0, 1, 2], axis=1)
 
     # Threshold anything above a confidence score of 900 to trim down the background interactome
     string = string[string.iloc[:, 2] > 900]

diff --git a/datasets/synthetic-data/.gitignore b/datasets/synthetic-data/.gitignore
@@ -0,0 +1,3 @@
+intermediate
+processed
+raw/9606.protein.links.full.v12.0.txt
diff --git a/datasets/synthetic-data/README.md b/datasets/synthetic-data/README.md
@@ -0,0 +1,63 @@
+# Synthetic Data
+
+## Download STRING Human Interactome
+1. Download the STRING *Homo sapiens* `9606.protein.links.full.v12.0.txt.gz` database file from [STRING](https://string-db.org/cgi/download?sessionId=bL9sRTdIaUEt&species_text=Homo+sapiens&settings_expanded=0&min_download_score=0&filter_redundant_pairs=0&delimiter_type=txt).
+2. Move the downloaded file into the `raw/human-interactome/` folder.
+3. From the `raw/` directory, extract the file using:
+
+   ```sh
+   gunzip human-interactome/9606.protein.links.full.v12.0.txt.gz
+   ```
+
+## Download New PANTHER Pathways
+1. Visit [Pathway Commons](https://www.pathwaycommons.org/).
+2. Search for the desired pathway (e.g., "signaling") and filter the results by the **PANTHER pathway** data source.  
+   Example: [Search for "Signaling" filtered by PANTHER pathway](https://apps.pathwaycommons.org/search?datasource=panther&q=Signaling&type=Pathway)
+3. Click on the desired pathway and download the **Extended SIF** version of the pathway.
+4. In the `raw/pathway-data/` folder, create a new subfolder named after the pathway you downloaded.
+5. Move the downloaded Extended SIF file to this new folder (as a `.txt` file). Rename the file to match the subfolder name exactly.
+
+## Sources and Targets
+
+[Sources](https://www.pnas.org/doi/full/10.1073/pnas.1808790115) are receptors from the *in silico* human surfaceome.
+
+[Targets](https://academic.oup.com/nar/article/51/D1/D39/6765312) are human transcription factors.
+
+## Steps to Generate SPRAS-Compatible Pathways
+
+### 1. Process PANTHER Pathways
+
+1. Open `Snakefile` and add the name of any new pathways to the `pathways` entry.
+2. Run the command:
+   ```
+   uv run scripts/process_panther_pathway.py <pathway>
+   ```
+3. This will create five new files in the `intermediate/<pathway>/` folder:
+- `EDGES.txt`
+- `NODES.txt`
+- `PRIZES.txt`
+- `SOURCES.txt`
+- `TARGETS.txt`
+
+### 2. Convert Pathways to SPRAS-Compatible Format
+1. In `panther_spras_formatting.py`, add the name of any new pathways to the `pathway_dirs` list on **line 8**.
+2. From the `synthetic-data/` directory, run the command:
+```
+python scripts/panther_spras_formatting.py
+```
+3. This will create a new folder named `processed`, containing one subfolder per PANTHER pathway in SPRAS-compatible format.  
+Each subfolder will include the following three files:
+- `<pathway_name>_gs_edges.txt`
+- `<pathway_name>_gs_nodes.txt`
+- `<pathway_name>_node_prizes.txt`
+
+# Pilot Data
+For the pilot data, use the list `["Wnt_signaling", "JAK_STAT_signaling", "Interferon_gamma_signaling", "FGF_signaling", "Ras"]` in both:
+- the list in `combine.py`
+- the list in `overlap_analytics.py`
+
+Make sure the same pathways (`["Wnt_signaling", "JAK_STAT_signaling", "Interferon_gamma_signaling", "FGF_signaling", "Ras"]`) are also added to:
+- the `pathways` list in `Snakefile`
+- the list in `panther_spras_formatting.py`
+
+**Once you’ve updated the pathway lists in all relevant scripts, run all the steps above to generate the Pilot dataset.**
diff --git a/datasets/synthetic-data/Snakefile b/datasets/synthetic-data/Snakefile
@@ -0,0 +1,49 @@
+include: "../../cache/Snakefile"
+
+pathways = ["Apoptosis_signaling", "B_cell_activation",
+            "Beta3_adrenergic_rec", "Cadherin_signaling",
+            "Hedgehog_signaling", "Insulin_IGF",
+            "Interleukin_signaling", "Notch_signaling",
+            "PDGF_signaling", "Ras", "T_cell_activation",
+            "Toll_signaling", "Wnt_signaling", "p38_MAPK",
+            "Nicotinic_acetylchol", "Fas_signaling",
+            "FGF_signaling", "Interferon_gamma_signaling",
+            "JAK_STAT_signaling", "VEGF_signaling"]
+
+rule all:
+    input:
+        "raw/9606.protein.links.full.v12.0.txt",
+        expand([
+            "processed/{pathway}/{pathway}_node_prizes.txt",
+            "processed/{pathway}/{pathway}_gs_edges.txt",
+            "processed/{pathway}/{pathway}_gs_nodes.txt"
+        ], pathway=pathways)
+
+produce_fetch_rules({
+    "raw/9606.protein.links.full.v12.0.txt": FetchConfig(["STRING", "9606", "9606.protein.links.full.txt.gz"], uncompress=True),
+})
+
+rule process_panther_pathway:
+    input: "raw/pathway-data/{pathway}.txt"
+    output:
+        "intermediate/{pathway}/EDGES.txt",
+        "intermediate/{pathway}/NODES.txt",
+        "intermediate/{pathway}/TARGETS.txt",
+        "intermediate/{pathway}/SOURCES.txt",
+        "intermediate/{pathway}/PRIZES.txt"
+    shell:
+        "uv run scripts/process_panther_pathway.py {wildcards.pathway}"
+
+rule make_spras_compatible:
+    input:
+        "intermediate/{pathway}/EDGES.txt",
+        "intermediate/{pathway}/NODES.txt",
+        "intermediate/{pathway}/TARGETS.txt",
+        "intermediate/{pathway}/SOURCES.txt",
+        "intermediate/{pathway}/PRIZES.txt"
+    output:
+        "processed/{pathway}/{pathway}_node_prizes.txt",
+        "processed/{pathway}/{pathway}_gs_edges.txt",
+        "processed/{pathway}/{pathway}_gs_nodes.txt"
+    shell:
+        "uv run scripts/panther_spras_formatting.py {wildcards.pathway}"