3 changes: 2 additions & 1 deletion .gitignore
@@ -166,5 +166,6 @@ cython_debug/
#.idea/


#Ignore vscode AI rules
# codacy stuff
.github/instructions/codacy.instructions.md
.codacy
17 changes: 12 additions & 5 deletions pyproject.toml
@@ -12,10 +12,7 @@ dependencies = [
"berdl-notebook-utils",
"biopython>=1.86",
"click>=8.3.1",
"xmlschema>=4.2.0",
"xsdata[cli,lxml]>=25.7",
"lxml>=6.0.2",
"pytest-asyncio>=1.3.0",
"ruff>=0.14.13",
]

@@ -24,9 +21,18 @@ dev = [
"hypothesis>=6.148.9",
"mutmut>=3.4.0",
"pytest>=9.0.2",
"pytest-asyncio>=1.3.0",
"pytest-cov>=7.0.0",
"pytest-env>=1.2.0",
]
models = [
"genson>=1.3.0",
"json2python-models>=0.3.1",
]
xml = [
"xmlschema>=4.3.1",
"xsdata[cli,lxml]>=26.1",
]

[tool.ruff]
line-length = 120
@@ -146,7 +152,7 @@ ignore = [

[tool.ruff.lint.per-file-ignores]
"*.ipynb" = ["T201"] # ignore printing in notebooks
"tests/**/*.py" = ["S101", "T201"] # use of assert
"tests/**/*.py" = ["S101", "T201", "FBT001", "FBT002"] # use of assert
"**/__init__.py" = ["D104"]

[tool.ruff.lint.mccabe]
@@ -164,8 +170,9 @@ build-backend = "uv_build"
pythonpath = ["src"]
log_cli = true
log_cli_level = "INFO"
log_level = "INFO"
addopts = ["-v"]
markers = ["requires_spark: must be run in an environment where spark is available"]
markers = ["requires_spark: must be run in an environment where spark is available", "slow_test: does what it says on the tin"]

# environment settings for running tests
[tool.pytest_env]
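Note: these pyproject changes register a `slow_test` pytest marker alongside `requires_spark` and split the XML and model-generation dependencies into optional `xml` and `models` groups. A minimal sketch of how tests might opt into the new markers follows; the test names and bodies are hypothetical and not part of this PR:

```python
import pytest


@pytest.mark.slow_test
def test_full_idmapping_ingest() -> None:
    """Hypothetical long-running test; deselect with `pytest -m "not slow_test"`."""
    assert True


@pytest.mark.requires_spark
@pytest.mark.slow_test
def test_spark_roundtrip() -> None:
    """Hypothetical test that should only run where a Spark session is available."""
    assert True
```

Because both markers are declared under `[tool.pytest.ini_options]`, pytest will not warn about them being unknown.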
Empty file.
152 changes: 152 additions & 0 deletions src/cdm_data_loader_utils/parsers/uniprot/idmapping.py
@@ -0,0 +1,152 @@
"""Parser for UniProt ID Mapping file.

UniProt provides comprehensive mappings between their protein IDs and many other databases. Mappings are extracted from the UniProt records where possible.

Source file: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz

Legacy mappings (pre-UniProt proteome redundancy reduction drive): https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.2015_03.gz

Docs: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README

Metalink file: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/RELEASE.metalink

Retrieve the list of databases referenced from the UniProt API: https://rest.uniprot.org/database/stream?format=json&query=%28*%29

1) idmapping.dat
This file has three columns, delimited by tab:
1. UniProtKB-AC
2. ID_type
3. ID
where ID_type is the database name as it appears in UniProtKB cross-references
(and as supported by the ID mapping tool on the UniProt web site,
http://www.uniprot.org/mapping), and ID is the identifier in
that cross-referenced database.
"""

import datetime
from uuid import uuid4

import click
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import functions as sf
from pyspark.sql.types import StringType, StructField

from cdm_data_loader_utils.core.constants import INVALID_DATA_FIELD_NAME
from cdm_data_loader_utils.core.pipeline_run import PipelineRun
from cdm_data_loader_utils.readers.dsv import read
from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger
from cdm_data_loader_utils.utils.minio import list_remote_dir_contents
from cdm_data_loader_utils.utils.spark_delta import APPEND, set_up_workspace, write_delta
from cdm_data_loader_utils.validation.dataframe_validator import DataFrameValidator, Validator
from cdm_data_loader_utils.validation.df_nullable_fields import validate as check_nullable_fields

APP_NAME = "uniprot_idmapping"
NOW = datetime.datetime.now(tz=datetime.UTC)
DB = "db"
XREF = "xref"
ID = "id"
COLUMNS = [ID, DB, XREF]


logger = get_cdm_logger()

ID_MAPPING_SCHEMA = [StructField(n, StringType(), nullable=False) for n in COLUMNS]


def ingest(spark: SparkSession, run: PipelineRun, id_mapping_tsv: str) -> DataFrame:
"""Parse the ID mapping file and convert it to a dataframe.

:param spark: Spark session
:type spark: SparkSession
:param run: pipeline run metadata, used for validation reporting
:type run: PipelineRun
:param id_mapping_tsv: path to the ID mapping TSV file
:type id_mapping_tsv: str
:return: dataframe containing the ID mapping data
:rtype: DataFrame
"""
options = {
"delimiter": "\t",
"header": False,
"ignoreLeadingWhiteSpace": True,
"ignoreTrailingWhiteSpace": True,
"enforceSchema": True,
"inferSchema": False,
}

df = read(spark, id_mapping_tsv, ID_MAPPING_SCHEMA, options)
id_map_parse_result = DataFrameValidator(spark).validate_dataframe(
data_to_validate=df,
schema=ID_MAPPING_SCHEMA,
run=run,
validator=Validator(check_nullable_fields, {"invalid_col": INVALID_DATA_FIELD_NAME}),
invalid_col=INVALID_DATA_FIELD_NAME,
)
id_map_df = id_map_parse_result.valid_df

# destination format:
# "entity_id", "identifier", "description", "source", "relationship"
return id_map_df.select(
# prefix with UniProt
sf.concat(sf.lit("UniProt:"), sf.col("id")).alias("uniprot_id"),
sf.col(DB),
sf.col(XREF),
sf.lit(None).cast("string").alias("description"),
sf.lit("UniProt ID mapping").alias("source"),
sf.lit(None).cast("string").alias("relationship"),
)


def read_and_write(spark: SparkSession, pipeline_run: PipelineRun, id_mapping_tsv: str) -> None:
"""Read in the UniProt ID mapping and write it out as a uniprot_identifier table.

:param spark: Spark session
:type spark: SparkSession
:param pipeline_run: pipeline run metadata, including the namespace to write to
:type pipeline_run: PipelineRun
:param id_mapping_tsv: path to the ID mapping file
:type id_mapping_tsv: str
"""
# get the metalink XML and retrieve data source info
write_delta(
spark, ingest(spark, pipeline_run, id_mapping_tsv), pipeline_run.namespace, "uniprot_identifier", APPEND
)


@click.command()
@click.option(
"--source",
required=True,
help="Full path to the source directory containing ID mapping file(s). Does not need to specify the Bucket (i.e. cdm-lake) but should specify everything else.",
)
@click.option(
"--namespace",
default="uniprot",
show_default=True,
help="Delta Lake database name",
)
@click.option(
"--tenant-name",
default=None,
help="Tenant warehouse to save processed data to; defaults to saving data to the user warehouse if a tenant is not specified",
)
def main(source: str, namespace: str, tenant_name: str | None) -> None:
"""Run the UniProt ID Mapping importer.

:param source: full path to the source directory containing ID mapping file(s)
:type source: str
:param namespace: Delta Lake database name
:type namespace: str
:param tenant_name: Tenant warehouse to save processed data to; defaults to saving data to the user warehouse if a tenant is not specified
:type tenant_name: str | None
"""
(spark, delta_ns) = set_up_workspace(APP_NAME, namespace, tenant_name)
for file in list_remote_dir_contents(source):
# file names are in the 'Key' value
# 'tenant-general-warehouse/kbase/datasets/uniprot/id_mapping/id_mapping_part_001.tsv.gz'
pipeline_run = PipelineRun(str(uuid4()), APP_NAME, file["Key"], delta_ns)
read_and_write(spark, pipeline_run, file["Key"])



if __name__ == "__main__":
main()

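For orientation, here is a sketch of the column mapping that `ingest` applies, run against a couple of fabricated idmapping.dat-style rows rather than the project's `read`/`DataFrameValidator` stack; the accessions, cross-references, and local Spark session are invented for illustration only:

```python
# Illustrative sketch: reproduces the select() in ingest() on made-up rows,
# bypassing read() and the dataframe validation step.
from pyspark.sql import SparkSession
from pyspark.sql import functions as sf

spark = SparkSession.builder.appName("idmapping_sketch").getOrCreate()

# hypothetical idmapping.dat rows: UniProtKB-AC, ID_type, ID
rows = [
    ("P00001", "RefSeq", "NP_000001.1"),
    ("P00001", "GeneID", "100001"),
]
df = spark.createDataFrame(rows, ["id", "db", "xref"])

identifier_df = df.select(
    sf.concat(sf.lit("UniProt:"), sf.col("id")).alias("uniprot_id"),
    sf.col("db"),
    sf.col("xref"),
    sf.lit(None).cast("string").alias("description"),
    sf.lit("UniProt ID mapping").alias("source"),
    sf.lit(None).cast("string").alias("relationship"),
)
identifier_df.show(truncate=False)
```

These are the same columns that `read_and_write` appends to the `uniprot_identifier` Delta table via `write_delta`.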
96 changes: 96 additions & 0 deletions src/cdm_data_loader_utils/parsers/uniprot/metalink.py
@@ -0,0 +1,96 @@
"""Parser for UniProt metalink XML files.

These metadata files provide information and links for UniProt and related downloads.


"""

import datetime
from pathlib import Path
from typing import Any
from xml.etree.ElementTree import Element

from defusedxml.ElementTree import parse

from cdm_data_loader_utils.utils.cdm_logger import get_cdm_logger

NS = {"": "http://www.metalinker.org/"}
NOW = datetime.datetime.now(tz=datetime.UTC)
COLUMNS = ["id", "db", "xref"]

logger = get_cdm_logger()


def parse_metalink(metalink_xml_path: Path | str) -> Element | None:
"""Parse the metalink file and return the root node."""
document = parse(str(metalink_xml_path))
root = document.getroot()
if root is not None:
return root
msg = f"Could not find root in metalink file: {metalink_xml_path!s}"
logger.error(msg)
raise RuntimeError(msg)



def generate_data_source_table(metalink_xml_path: Path | str) -> dict[str, Any]:
"""Generate the data source information for the ID Mapping data."""
root = parse_metalink(metalink_xml_path)
if root is None:
return {}


data_source = {
"license": root.findtext("./license/name", namespaces=NS),
"publisher": root.findtext("./publisher/name", namespaces=NS),
"resource_type": "dataset",
"version": root.findtext("./version", namespaces=NS),
}
missing = [k for k in data_source if not data_source[k]]
if missing:
msg = f"Missing required elements from metalink file: {', '.join(missing)}"
logger.error(msg)
raise RuntimeError(msg)

return data_source


def get_files(metalink_xml_path: Path | str, files_to_find: list[str] | None = None) -> dict[str, Any]:
"""Generate the data source information for the ID Mapping data."""
root = parse_metalink(metalink_xml_path)
assert root is not None

if files_to_find is not None and files_to_find == []:
logger.warning("Empty file list supplied to get_files: aborting.")
return {}

files = {}
for f in root.findall("./files/file", NS):
# get the name, size, any verification info
name = f.get("name")
# skip now if the file is not of interest
if files_to_find and name not in files_to_find:
continue

size = f.findtext("./size", namespaces=NS)
checksum = f.find("./verification/hash", NS)
if checksum is not None:
checksum_fn = checksum.get("type")
checksum_value = checksum.text
else:
checksum_fn = checksum_value = None

dl_url = f.findtext("./resources/url[@location='us']", namespaces=NS)
files[name] = {
"name": name,
"size": size,
"checksum": checksum_value,
"checksum_fn": checksum_fn,
"url": dl_url,
}

# report on unfound files
if files_to_find:
not_found = {f for f in files_to_find if f not in files}
if not_found:
msg = "The following files were not found: " + ", ".join(sorted(not_found))
logger.warning(msg)

return files
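A hedged usage sketch for the metalink helpers above, exercised against a tiny fabricated metalink document; the XML content, version string, and file entry are invented and only mirror the elements the parser reads:

```python
# Illustrative only: writes a minimal, made-up metalink file to a temporary
# directory and runs the parser helpers against it.
import tempfile
from pathlib import Path

from cdm_data_loader_utils.parsers.uniprot.metalink import generate_data_source_table, get_files

SAMPLE = """<?xml version="1.0" encoding="UTF-8"?>
<metalink xmlns="http://www.metalinker.org/">
  <version>2099_01</version>
  <publisher><name>Example Publisher</name></publisher>
  <license><name>Example License</name></license>
  <files>
    <file name="idmapping.dat.gz">
      <size>12345</size>
      <verification><hash type="md5">0123456789abcdef</hash></verification>
      <resources><url location="us">https://example.org/idmapping.dat.gz</url></resources>
    </file>
  </files>
</metalink>
"""

with tempfile.TemporaryDirectory() as tmp:
    metalink_path = Path(tmp) / "RELEASE.metalink"
    metalink_path.write_text(SAMPLE)
    print(generate_data_source_table(metalink_path))  # license, publisher, resource_type, version
    print(get_files(metalink_path, files_to_find=["idmapping.dat.gz"]))  # name, size, checksum, checksum_fn, url
```

`get_files` also logs a warning for any requested file name that is absent from the metalink document.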