Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
ba25546
refactor: begin
tristan-f-r Jul 30, 2025
c80c561
updated documentation
AMINOexe Jul 30, 2025
a38177b
chore: remove other readme
tristan-f-r Jul 30, 2025
e29e737
fix: use correct file names
tristan-f-r Jul 30, 2025
70500b3
fix: more file path changes
tristan-f-r Jul 30, 2025
d1d3cf7
fix: typo in prize 060
tristan-f-r Jul 30, 2025
d72db21
docs: correct exceptions to copied code
tristan-f-r Jul 31, 2025
e07d6d4
docs: be clearer about paper
tristan-f-r Aug 8, 2025
97d3cfa
remove redundant docs cmt
tristan-f-r Aug 8, 2025
494510b
Merge branch 'main' into hiv-clean
tristan-f-r Jan 22, 2026
23cfcfa
refactor: move hiv to new fetching system
tristan-f-r Jan 22, 2026
0108037
feat: better name mapping
tristan-f-r Jan 23, 2026
2592dbd
chore: drop databases, clean hiv
tristan-f-r Jan 23, 2026
5959e71
docs: cleanup, add Bio
tristan-f-r Jan 23, 2026
c8fa928
chore: clean rest
tristan-f-r Jan 23, 2026
2ee9cd1
docs: more concrete
tristan-f-r Jan 23, 2026
e268654
fix: correct prize file naming
tristan-f-r Jan 23, 2026
667ddf5
fix: correct raw location for depmap
tristan-f-r Jan 23, 2026
36e607d
ci: cache cache
tristan-f-r Jan 23, 2026
9603eac
Merge branch 'main' into hiv-clean
tristan-f-r Jan 23, 2026
b3c43f1
drop kegg orthology
tristan-f-r Jan 23, 2026
b815b47
ci: bump cache, set id
tristan-f-r Jan 23, 2026
45aa4ec
chore: drop bio, bioservices
tristan-f-r Jan 23, 2026
463ccd2
mv to old
tristan-f-r Jan 23, 2026
fdd56ab
add nl, irefindex
tristan-f-r Jan 23, 2026
9363e9b
ci: cache when artifacts made
tristan-f-r Jan 23, 2026
c49d247
fix: use correct dataset paths
tristan-f-r Jan 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,20 @@ jobs:
- name: Log conda environment
shell: bash --login {0}
run: conda list
- name: Fetch Artifact Cache
id: fetch-cache
uses: actions/cache/restore@v4
with:
path: cache/artifacts
key: cache-artifacts
- name: Process raw data through Snakemake
run: sh run_snakemake.sh
- name: Cache Artifact Cache
id: cache-cache
uses: actions/cache/save@v4
with:
path: cache/artifacts
key: cache-artifacts
- name: Run Snakemake workflow for DMMMs
shell: bash --login {0}
run: snakemake --cores 4 --configfile configs/dmmm.yaml --show-failed-logs -s spras/Snakefile
Expand Down
7 changes: 6 additions & 1 deletion cache/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# cache

Handles artifact fetching and cache. This folder has a `Snakefile` which only contains a single function used for producing fetching rules.
Handles artifact fetching and cache. This folder has:

- `Snakefile` which only contains a function used for producing fetching rules.
- `directory.py`, the actual location of file URLs and their cached counterparts.
- `cli.py`, a utility for manually fetching specific URLs from `directory.py`.
- `util.py`, an internal file for use by the other files above.
29 changes: 29 additions & 0 deletions cache/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""
Downloads the online variants of cache items.

This may be expanded in the future, so only depend on this file as a debugging utility.

For example, `python cache/cli.py KEGG/ko03250.xml ko03250.xml` allows running the KEGG query
for ko03250.xml, which cannot normally be accessed automatically in the browser.
"""

import argparse
from cache.directory import get_cache_item

def parse_args():
    """Parse the CLI arguments for the cache-fetching utility.

    Returns:
        argparse.Namespace with ``path`` (a slash-separated key into the
        cache directory, e.g. ``KEGG/ko03250.xml``) and ``output`` (the
        local file path to write the download to).
    """
    parser = argparse.ArgumentParser(
        prog='Cache',
        description='CLI utility for directory.py')
    # help= strings make `python cache/cli.py --help` self-documenting.
    parser.add_argument('path',
                        help='slash-separated cache path, e.g. KEGG/ko03250.xml')
    parser.add_argument('output',
                        help='local file path to write the downloaded item to')

    return parser.parse_args()

def main():
    """Entry point: resolve the requested cache item and download it."""
    arguments = parse_args()
    # Paths into the cache directory are slash-separated, e.g. "KEGG/ko03250.xml".
    segments = arguments.path.split("/")
    item = get_cache_item(segments)
    item.download_online(arguments.output)

if __name__ == "__main__":
    main()
35 changes: 32 additions & 3 deletions cache/directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Union
from os import PathLike
from tempfile import NamedTemporaryFile
from typing import Optional
import urllib.request
import filecmp
import urllib.parse
Expand All @@ -24,18 +25,34 @@ def fetch_biomart_url(xml: str) -> str:

@dataclass
class CacheItem:
"""Class for differentriating between offline and online items in a cache."""
"""
Class for differentiating between offline and online items in a cache.

NOTE: If cached is "", we assume that online is a Google Drive URL (for cases where there is no
remaining online data source.)
"""

name: str
"""The display name of the artifact, used for human-printing."""
cached: str
online: str
online_headers: Optional[list[tuple[str, str]]] = None

@classmethod
def cache_only(cls, name: str, cached: str) -> "CacheItem":
    """Construct a CacheItem that exists only in the cache.

    By the class convention, an empty ``cached`` field marks ``online`` as
    holding the Google Drive cache URL, so the cache URL is stored there.
    """
    return cls(name=name, cached="", online=cached)

def download_online(self, output: str | PathLike):
# https://stackoverflow.com/a/45313194/7589775: this is to add optional headers to requests.
# We remove the opener at the end by re-installing the default opener.
opener = urllib.request.build_opener()
if self.online_headers:
opener.addheaders = self.online_headers
urllib.request.install_opener(opener)
urllib.request.urlretrieve(self.online, output)
urllib.request.install_opener(urllib.request.build_opener())

def download(self, output: str | PathLike):
print(f"Fetching {self.name}...")
print(f"Downloading {self.online}...")
Expand All @@ -46,7 +63,7 @@ def download(self, output: str | PathLike):
gdown.download(self.online, str(output))
return

urllib.request.urlretrieve(self.online, output)
self.download_online(output)

with NamedTemporaryFile() as cached_file:
print(f"Downloading cache {self.cached}...")
Expand Down Expand Up @@ -75,13 +92,16 @@ def download(self, output: str | PathLike):
},
"UniProt": {
# We use FTP when possible, but we delegate to the UniProt REST API in cases that would save significant bandwidth.
# See https://ftp.uniprot.org/pub/databases/uniprot/current_release/README for the FTP README.
"9606": {
# We prefer manually curated genes.
# We prefer manually curated, or SwissProt, genes. This URL selects these genes using the REST API.
"SwissProt_9606.tsv": CacheItem(
name="UniProt 9606 SwissProt genes",
cached="https://drive.google.com/uc?id=1h2Cl-60qcKse-djcsqlRXm_n60mVY7lk",
online="https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29",
),
# idmapping FTP files. See the associated README:
# https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README
"HUMAN_9606_idmapping_selected.tab.gz": CacheItem(
name="UniProt 9606 ID external database mapping",
cached="https://drive.google.com/uc?id=1Oysa5COq31H771rVeyrs-6KFhE3VJqoX",
Expand Down Expand Up @@ -152,6 +172,15 @@ def download(self, output: str | PathLike):
online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads",
),
},
"iRefIndex": {
# This can also be obtained from the SPRAS repo
# (https://github.com/Reed-CompBio/spras/blob/b5d7a2499afa8eab14c60ce0f99fa7e8a23a2c64/input/phosphosite-irefindex13.0-uniprot.txt).
# iRefIndex has been down for quite some time, so this is only from the cache.
"phosphosite-irefindex13.0-uniprot.txt": CacheItem.cache_only(
name="iRefIndex v13.0 UniProt interactome",
cached="https://drive.google.com/uc?id=1fQ8Z3FjEwUseEtsExO723zj7mAAtdomo"
)
},
"OsmoticStress": {
"yeast_pcsf_network.sif": CacheItem.cache_only(
# In the paper https://doi.org/10.1016/j.celrep.2018.08.085
Expand Down
6 changes: 3 additions & 3 deletions configs/dmmm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,12 @@ datasets:
# HIV: https://github.com/Reed-CompBio/spras-benchmarking/blob/0293ae4dc0be59502fac06b42cfd9796a4b4413e/hiv-benchmarking/spras-config/config.yaml
- label: dmmmhiv_060
node_files: ["processed_prize_060.txt"]
edge_files: ["../../../databases/irefindex/phosphosite-irefindex13.0-uniprot.txt"]
edge_files: ["../raw/phosphosite-irefindex13.0-uniprot.txt"]
other_files: []
data_dir: "datasets/hiv/processed"
- label: dmmmhiv_05
node_files: ["processed_prize_05.txt"]
edge_files: ["../../../databases/irefindex/phosphosite-irefindex13.0-uniprot.txt"]
edge_files: ["../raw/phosphosite-irefindex13.0-uniprot.txt"]
other_files: []
data_dir: "datasets/hiv/processed"
# Yeast: https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast-osmotic-stress/config.yaml
Expand All @@ -74,7 +74,7 @@ datasets:
other_files: []
- label: dmmmdepmap_cellline_fadu
data_dir: datasets/depmap
edge_files: ["../../databases/irefindex/phosphosite-irefindex13.0-uniprot.txt"]
edge_files: ["raw/phosphosite-irefindex13.0-uniprot.txt"]
node_files: ["processed/FADU_cell_line_prizes_input_nonzero.txt"]
other_files: []
gold_standards:
Expand Down
1 change: 0 additions & 1 deletion databases/.gitignore

This file was deleted.

1 change: 0 additions & 1 deletion databases/irefindex/README.md

This file was deleted.

Loading
Loading