3 changes: 2 additions & 1 deletion .gitignore
@@ -166,5 +166,6 @@ cython_debug/
#.idea/


#Ignore vscode AI rules
# codacy stuff
.github/instructions/codacy.instructions.md
.codacy
17 changes: 12 additions & 5 deletions pyproject.toml
@@ -12,10 +12,7 @@ dependencies = [
"berdl-notebook-utils",
"biopython>=1.86",
"click>=8.3.1",
"xmlschema>=4.2.0",
"xsdata[cli,lxml]>=25.7",
"lxml>=6.0.2",
"pytest-asyncio>=1.3.0",
"ruff>=0.14.13",
]

@@ -24,9 +21,18 @@ dev = [
"hypothesis>=6.148.9",
"mutmut>=3.4.0",
"pytest>=9.0.2",
"pytest-asyncio>=1.3.0",
"pytest-cov>=7.0.0",
"pytest-env>=1.2.0",
]
models = [
"genson>=1.3.0",
"json2python-models>=0.3.1",
]
xml = [
"xmlschema>=4.3.1",
"xsdata[cli,lxml]>=26.1",
]

[tool.ruff]
line-length = 120
@@ -146,7 +152,7 @@ ignore = [

[tool.ruff.lint.per-file-ignores]
"*.ipynb" = ["T201"] # ignore printing in notebooks
"tests/**/*.py" = ["S101", "T201"] # use of assert
"tests/**/*.py" = ["S101", "T201", "FBT001", "FBT002"] # use of assert
"**/__init__.py" = ["D104"]

[tool.ruff.lint.mccabe]
@@ -164,8 +170,9 @@ build-backend = "uv_build"
pythonpath = ["src"]
log_cli = true
log_cli_level = "INFO"
log_level = "INFO"
addopts = ["-v"]
markers = ["requires_spark: must be run in an environment where spark is available"]
markers = ["requires_spark: must be run in an environment where spark is available", "slow_test: does what it says on the tin"]

# environment settings for running tests
[tool.pytest_env]
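Note: these pyproject changes register a `slow_test` pytest marker alongside `requires_spark` and split the XML and model-generation dependencies into optional `xml` and `models` groups. A minimal sketch of how tests might opt into the new markers follows; the test names and bodies are hypothetical and not part of this PR:

```python
import pytest


@pytest.mark.slow_test
def test_full_idmapping_ingest() -> None:
    """Hypothetical long-running test; deselect with `pytest -m "not slow_test"`."""
    assert True


@pytest.mark.requires_spark
@pytest.mark.slow_test
def test_spark_roundtrip() -> None:
    """Hypothetical test that should only run where a Spark session is available."""
    assert True
```

Because both markers are declared under `[tool.pytest.ini_options]`, pytest will not warn about them being unknown.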
Empty file.
152 changes: 152 additions & 0 deletions src/cdm_data_loader_utils/parsers/uniprot/idmapping.py
@@ -0,0 +1,152 @@
"""Parser for UniProt ID Mapping file.

UniProt provides comprehensive mappings between their protein IDs and many other databases. Mappings are extracted from the UniProt records where possible.

Source file: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz

Legacy mappings (pre-UniProt proteome redundancy reduction drive): https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.2015_03.gz

Docs: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README

Metalink file: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/RELEASE.metalink

Retrieve the list of databases referenced from the UniProt API: https://rest.uniprot.org/database/stream?format=json&query=%28*%29

1) idmapping.dat
This file has three columns, delimited by tab:
1. UniProtKB-AC
2. ID_type
3. ID
where ID_type is the database name as it appears in UniProtKB cross-references
(and as supported by the ID mapping tool on the UniProt web site,
http://www.uniprot.org/mapping), and ID is the identifier in
that cross-referenced database.
"""

import datetime
from uuid import uuid4

import click
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import functions as sf
from pyspark.sql.types import StringType, StructField

from cdm_data_loader_utils.core.constants import INVALID_DATA_FIELD_NAME
from cdm_data_loader_utils.core.pipeline_run import PipelineRun
from cdm_data_loader_utils.readers.dsv import read
from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger
from cdm_data_loader_utils.utils.minio import list_remote_dir_contents
from cdm_data_loader_utils.utils.spark_delta import APPEND, set_up_workspace, write_delta
from cdm_data_loader_utils.validation.dataframe_validator import DataFrameValidator, Validator
from cdm_data_loader_utils.validation.df_nullable_fields import validate as check_nullable_fields

APP_NAME = "uniprot_idmapping"
NOW = datetime.datetime.now(tz=datetime.UTC)
DB = "db"
XREF = "xref"
ID = "id"
COLUMNS = [ID, DB, XREF]


logger = get_cdm_logger()

ID_MAPPING_SCHEMA = [StructField(n, StringType(), nullable=False) for n in COLUMNS]


def ingest(spark: SparkSession, run: PipelineRun, id_mapping_tsv: str) -> DataFrame:
"""Parse the ID mapping file and convert it to a dataframe.

:param spark: Spark session
:type spark: SparkSession
:param run: pipeline run metadata, used for validation reporting
:type run: PipelineRun
:param id_mapping_tsv: path to the ID mapping TSV file
:type id_mapping_tsv: str
:return: dataframe containing the ID mapping data
:rtype: DataFrame
"""
options = {
"delimiter": "\t",
"header": False,
"ignoreLeadingWhiteSpace": True,
"ignoreTrailingWhiteSpace": True,
"enforceSchema": True,
"inferSchema": False,
}

df = read(spark, id_mapping_tsv, ID_MAPPING_SCHEMA, options)
id_map_parse_result = DataFrameValidator(spark).validate_dataframe(
data_to_validate=df,
schema=ID_MAPPING_SCHEMA,
run=run,
validator=Validator(check_nullable_fields, {"invalid_col": INVALID_DATA_FIELD_NAME}),
invalid_col=INVALID_DATA_FIELD_NAME,
)
id_map_df = id_map_parse_result.valid_df

# destination format:
# "entity_id", "identifier", "description", "source", "relationship"
return id_map_df.select(
# prefix with UniProt
sf.concat(sf.lit("UniProt:"), sf.col("id")).alias("uniprot_id"),
sf.col(DB),
sf.col(XREF),
sf.lit(None).cast("string").alias("description"),
sf.lit("UniProt ID mapping").alias("source"),
sf.lit(None).cast("string").alias("relationship"),
)


def read_and_write(spark: SparkSession, pipeline_run: PipelineRun, id_mapping_tsv: str) -> None:
"""Read in the UniProt ID mapping and write it out as a uniprot_identifier table.

:param spark: Spark session
:type spark: SparkSession
:param pipeline_run: pipeline run metadata, including the namespace to write to
:type pipeline_run: PipelineRun
:param id_mapping_tsv: path to the ID mapping file
:type id_mapping_tsv: str
"""
# get the metalink XML and retrieve data source info
write_delta(
spark, ingest(spark, pipeline_run, id_mapping_tsv), pipeline_run.namespace, "uniprot_identifier", APPEND
)


@click.command()
@click.option(
"--source",
required=True,
help="Full path to the source directory containing ID mapping file(s). Does not need to specify the Bucket (i.e. cdm-lake) but should specify everything else.",
)
@click.option(
"--namespace",
default="uniprot",
show_default=True,
help="Delta Lake database name",
)
@click.option(
"--tenant-name",
default=None,
help="Tenant warehouse to save processed data to; defaults to saving data to the user warehouse if a tenant is not specified",
)
def main(source: str, namespace: str, tenant_name: str | None) -> None:
"""Run the UniProt ID Mapping importer.

:param source: full path to the source directory containing ID mapping file(s)
:type source: str
:param namespace: Delta Lake database name
:type namespace: str
:param tenant_name: Tenant warehouse to save processed data to; defaults to saving data to the user warehouse if a tenant is not specified
:type tenant_name: str | None
"""
(spark, delta_ns) = set_up_workspace(APP_NAME, namespace, tenant_name)
for file in list_remote_dir_contents(source):
# file names are in the 'Key' value
# 'tenant-general-warehouse/kbase/datasets/uniprot/id_mapping/id_mapping_part_001.tsv.gz'
pipeline_run = PipelineRun(str(uuid4()), APP_NAME, file["Key"], delta_ns)
read_and_write(spark, pipeline_run, file["Key"])



if __name__ == "__main__":
main()

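For orientation, here is a sketch of the column mapping that `ingest` applies, run against a couple of fabricated idmapping.dat-style rows rather than the project's `read`/`DataFrameValidator` stack; the accessions, cross-references, and local Spark session are invented for illustration only:

```python
# Illustrative sketch: reproduces the select() in ingest() on made-up rows,
# bypassing read() and the dataframe validation step.
from pyspark.sql import SparkSession
from pyspark.sql import functions as sf

spark = SparkSession.builder.appName("idmapping_sketch").getOrCreate()

# hypothetical idmapping.dat rows: UniProtKB-AC, ID_type, ID
rows = [
    ("P00001", "RefSeq", "NP_000001.1"),
    ("P00001", "GeneID", "100001"),
]
df = spark.createDataFrame(rows, ["id", "db", "xref"])

identifier_df = df.select(
    sf.concat(sf.lit("UniProt:"), sf.col("id")).alias("uniprot_id"),
    sf.col("db"),
    sf.col("xref"),
    sf.lit(None).cast("string").alias("description"),
    sf.lit("UniProt ID mapping").alias("source"),
    sf.lit(None).cast("string").alias("relationship"),
)
identifier_df.show(truncate=False)
```

These are the same columns that `read_and_write` appends to the `uniprot_identifier` Delta table via `write_delta`.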
96 changes: 96 additions & 0 deletions src/cdm_data_loader_utils/parsers/uniprot/metalink.py
@@ -0,0 +1,96 @@
"""Parser for UniProt metalink XML files.

These metadata files provide information and links for UniProt and related downloads.


"""

import datetime
from pathlib import Path
from typing import Any
from xml.etree.ElementTree import Element

from defusedxml.ElementTree import parse

from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger

NS = {"": "http://www.metalinker.org/"}
NOW = datetime.datetime.now(tz=datetime.UTC)
COLUMNS = ["id", "db", "xref"]

logger = get_cdm_logger()


def parse_metalink(metalink_xml_path: Path | str) -> Element | None:
"""Parse the metalink file and return the root node."""
document = parse(str(metalink_xml_path))
root = document.getroot()
if root is not None:
return root
msg = f"Could not find root in metalink file: {metalink_xml_path!s}"
logger.error(msg)
raise RuntimeError(msg)



def generate_data_source_table(metalink_xml_path: Path | str) -> dict[str, Any]:
"""Generate the data source information for the ID Mapping data."""
root = parse_metalink(metalink_xml_path)
if root is None:
return {}


data_source = {
"license": root.findtext("./license/name", namespaces=NS),
"publisher": root.findtext("./publisher/name", namespaces=NS),
"resource_type": "dataset",
"version": root.findtext("./version", namespaces=NS),
}
missing = [k for k in data_source if not data_source[k]]
if missing:
msg = f"Missing required elements from metalink file: {', '.join(missing)}"
logger.error(msg)
raise RuntimeError(msg)

return data_source


def get_files(metalink_xml_path: Path | str, files_to_find: list[str] | None = None) -> dict[str, Any]:
"""Generate the data source information for the ID Mapping data."""
root = parse_metalink(metalink_xml_path)
assert root is not None

if files_to_find is not None and files_to_find == []:
logger.warning("Empty file list supplied to get_files: aborting.")
return {}

files = {}
for f in root.findall("./files/file", NS):
# get the name, size, any verification info
name = f.get("name")
# skip now if the file is not of interest
if files_to_find and name not in files_to_find:
continue

size = f.findtext("./size", namespaces=NS)
checksum = f.find("./verification/hash", NS)
if checksum is not None:
checksum_fn = checksum.get("type")
checksum_value = checksum.text
else:
checksum_fn = checksum_value = None

dl_url = f.findtext("./resources/url[@location='us']", namespaces=NS)
files[name] = {
"name": name,
"size": size,
"checksum": checksum_value,
"checksum_fn": checksum_fn,
"url": dl_url,
}

# report on unfound files
if files_to_find:
not_found = {f for f in files_to_find if f not in files}
if not_found:
msg = "The following files were not found: " + ", ".join(sorted(not_found))
logger.warning(msg)

return files
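A hedged usage sketch for the metalink helpers above, exercised against a tiny fabricated metalink document; the XML content, version string, and file entry are invented and only mirror the elements the parser reads:

```python
# Illustrative only: writes a minimal, made-up metalink file to a temporary
# directory and runs the parser helpers against it.
import tempfile
from pathlib import Path

from cdm_data_loader_utils.parsers.uniprot.metalink import generate_data_source_table, get_files

SAMPLE = """<?xml version="1.0" encoding="UTF-8"?>
<metalink xmlns="http://www.metalinker.org/">
  <version>2099_01</version>
  <publisher><name>Example Publisher</name></publisher>
  <license><name>Example License</name></license>
  <files>
    <file name="idmapping.dat.gz">
      <size>12345</size>
      <verification><hash type="md5">0123456789abcdef</hash></verification>
      <resources><url location="us">https://example.org/idmapping.dat.gz</url></resources>
    </file>
  </files>
</metalink>
"""

with tempfile.TemporaryDirectory() as tmp:
    metalink_path = Path(tmp) / "RELEASE.metalink"
    metalink_path.write_text(SAMPLE)
    print(generate_data_source_table(metalink_path))  # license, publisher, resource_type, version
    print(get_files(metalink_path, files_to_find=["idmapping.dat.gz"]))  # name, size, checksum, checksum_fn, url
```

`get_files` also logs a warning for any requested file name that is absent from the metalink document.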