"""Automated conversion of cdm_schema to PySpark.

Each entry in ``CDM_SCHEMA`` maps a CDM table name to the PySpark
``StructType`` used when writing that table as a Delta table.  The
schemas are declared compactly as field-spec strings and expanded once
at import time, replacing several hundred hand-repeated ``StructField``
lines with one declaration per table.
"""

from pyspark.sql.types import BooleanType, DateType, FloatType, IntegerType, StringType, StructField, StructType

# Short type codes used in the field-spec strings below.  PySpark atomic
# type instances are stateless, so sharing one instance per code is safe.
_TYPES = {
    "str": StringType(),
    "bool": BooleanType(),
    "date": DateType(),
    "float": FloatType(),
    "int": IntegerType(),
}


def _struct(spec: str) -> StructType:
    """Expand a field-spec string into a ``StructType``.

    ``spec`` is a comma-separated list of ``name:type`` entries, where
    ``type`` is a key of ``_TYPES``.  A trailing ``!`` marks the field as
    non-nullable (``nullable=False``); all other fields are nullable.
    """
    fields = []
    for entry in spec.split(","):
        entry = entry.strip()
        nullable = not entry.endswith("!")
        name, _, type_code = entry.rstrip("!").partition(":")
        fields.append(StructField(name, _TYPES[type_code], nullable=nullable))
    return StructType(fields)


# Column group shared by the attribute-value tables (ControlledTermValue,
# DateTimeValue, Geolocation, Quantity*Value, TextValue, ...).
_ATTR_VALUE_TAIL = (
    "raw_value:str, type:str, attribute_cv_id:str, attribute_cv_label:str, "
    "attribute_string:str, entity_id:str!"
)

CDM_SCHEMA = {
    "Association": _struct(
        "association_id:str!, subject:str!, object:str!, predicate:str!, negated:bool, "
        "evidence_type:str, primary_knowledge_source:str, aggregator_knowledge_source:str, "
        "annotation_date:date, comments:str"
    ),
    "Association_x_SupportingObject": _struct("association_id:str!, entity_id:str!"),
    "Cluster": _struct("cluster_id:str!, description:str, name:str, entity_type:str!, protocol_id:str"),
    "ClusterMember": _struct(
        "cluster_id:str!, entity_id:str!, is_representative:bool, is_seed:bool, score:float"
    ),
    "Contig": _struct("contig_id:str!, hash:str, gc_content:float, length:int"),
    "ContigCollection": _struct(
        "contig_collection_id:str!, hash:str, asm_score:float, checkm_completeness:float, "
        "checkm_contamination:float, checkm_version:str, contig_bp:int, contig_collection_type:str, "
        "contig_l50:int, contig_l90:int, contig_n50:int, contig_n90:int, contig_logsum:float, "
        "contig_max:int, contig_powersum:float, gap_percent:float, gc_average:float, gc_std:float, "
        "gtdb_taxon_id:str, n_chromosomes:int, n_contigs:int, n_scaffolds:int, ncbi_taxon_id:str, "
        "scaffold_l50:int, scaffold_l90:int, scaffold_n50:int, scaffold_n90:int, scaffold_bp:int, "
        "scaffold_logsum:float, scaffold_maximum_length:int, scaffold_powersum:float, "
        "scaffolds_n_over_50K:int, scaffolds_percent_over_50K:float, scaffolds_total_length_over_50k:int"
    ),
    "ContigCollection_x_EncodedFeature": _struct("contig_collection_id:str!, encoded_feature_id:str!"),
    "ContigCollection_x_Feature": _struct("contig_collection_id:str!, feature_id:str!"),
    "ContigCollection_x_Protein": _struct("contig_collection_id:str!, protein_id:str!"),
    "Contig_x_ContigCollection": _struct("contig_id:str!, contig_collection_id:str!"),
    "Contig_x_EncodedFeature": _struct("contig_id:str!, encoded_feature_id:str!"),
    "Contig_x_Feature": _struct("contig_id:str!, feature_id:str!"),
    "Contig_x_Protein": _struct("contig_id:str!, protein_id:str!"),
    "Contributor": _struct(
        "contributor_id:str!, contributor_type:str, name:str, given_name:str, family_name:str"
    ),
    "ContributorAffiliation": _struct("contributor_id:str!, affiliation_id:str"),
    "Contributor_x_DataSource": _struct("contributor_id:str!, data_source_id:str!, contributor_role:str"),
    "Contributor_x_Role_x_Project": _struct("contributor_id:str!, project_id:str!, contributor_role:str"),
    "ControlledTermValue": _struct("value_cv_label:str!, " + _ATTR_VALUE_TAIL),
    "ControlledVocabularyTermValue": _struct("value_cv_label:str, value_cv_id:str, " + _ATTR_VALUE_TAIL),
    "DataSource": _struct("data_source_id:str!, name:str"),
    "DataSourceNew": _struct(
        "data_source_id:str!, name:str, comments:str, date_accessed:date!, date_published:date, "
        "date_updated:date, license:str, publisher:str, resource_type:str!, url:str, version:str"
    ),
    "DataSource_x_Description": _struct("data_source_id:str!, resource_description_id:str!"),
    "DataSource_x_FundingReference": _struct("data_source_id:str!, funding_reference_id:str!"),
    "DataSource_x_License": _struct("data_source_id:str!, license_id:str!"),
    "DataSource_x_Title": _struct("data_source_id:str!, resource_title_id:str!"),
    "DateTimeValue": _struct("date_time:date!, " + _ATTR_VALUE_TAIL),
    "EncodedFeature": _struct("encoded_feature_id:str!, hash:str, has_stop_codon:bool, type:str"),
    "EncodedFeature_x_Feature": _struct("encoded_feature_id:str!, feature_id:str!"),
    "EncodedFeature_x_Protein": _struct("encoded_feature_id:str!, protein_id:str!"),
    "EntailedEdge": _struct("subject:str, predicate:str, object:str"),
    "Entity": _struct(
        "entity_id:str!, entity_type:str!, data_source_id:str, data_source_entity_id:str, "
        "data_source_created:date!, data_source_updated:date, created:date!, updated:date!"
    ),
    "Event": _struct("event_id:str!, created_at:date, description:str, name:str, location:str"),
    "Experiment": _struct(
        "experiment_id:str!, protocol_id:str!, name:str, description:str, created_at:date"
    ),
    "ExperimentCondition": _struct(
        "experiment_condition_id:str!, experiment_id:str!, variable_id:str!, value:str"
    ),
    "ExperimentConditionSet": _struct("experiment_condition_set_id:str!, experiment_condition_id:str!"),
    "Feature": _struct(
        "feature_id:str!, hash:str, cds_phase:str, e_value:float, end:int, p_value:float, start:int, "
        "strand:str, source_database:str, protocol_id:str, type:str"
    ),
    "Feature_x_Protein": _struct("feature_id:str!, protein_id:str!"),
    "FundingReference": _struct(
        "funding_reference_id:str!, funder:str, grant_id:str, grant_title:str, grant_url:str"
    ),
    "Geolocation": _struct("latitude:float!, longitude:float!, " + _ATTR_VALUE_TAIL),
    "GoldEnvironmentalContext": _struct(
        "gold_environmental_context_id:str!, ecosystem:str, ecosystem_category:str, "
        "ecosystem_subtype:str, ecosystem_type:str, specific_ecosystem:str"
    ),
    "Identifier": _struct(
        "entity_id:str!, identifier:str!, description:str, source:str, relationship:str"
    ),
    "License": _struct("license_id:str!, id:str, name:str, url:str"),
    "Measurement": _struct(
        "measurement_id:str!, measurement_set_id:str!, experiment_condition_set_id:str!, value:str"
    ),
    "MeasurementSet": _struct("measurement_set_id:str!, variable_id:str!, quality:str, created_at:date"),
    "MixsEnvironmentalContext": _struct(
        "mixs_environmental_context_id:str!, env_broad_scale:str, env_local_scale:str, env_medium:str"
    ),
    "Name": _struct("entity_id:str!, name:str!, description:str, source:str"),
    "OrderedProtocolStep": _struct("protocol_id:str!, protocol_step_id:str!, step_index:int!"),
    "Parameter": _struct(
        "parameter_id:str!, name:str, description:str, value_type:str, required:bool, "
        "cardinality:str, default:str, parameter_type:str"
    ),
    "Prefix": _struct("prefix:str, base:str"),
    "Project": _struct("project_id:str!, description:str"),
    "Protein": _struct(
        "protein_id:str!, hash:str, description:str, evidence_for_existence:str, length:int, sequence:str"
    ),
    "Protocol": _struct("protocol_id:str!, name:str, description:str, doi:str, url:str, version:str"),
    "ProtocolExecution": _struct(
        "protocol_execution_id:str!, protocol_id:str!, name:str, description:str, created_at:date"
    ),
    "ProtocolInput": _struct(
        "parameter_id:str!, protocol_input_id:str!, protocol_execution_id:str!, value:str!"
    ),
    "ProtocolInputSet": _struct("protocol_input_id:str!, protocol_input_set_id:str!"),
    "ProtocolOutput": _struct("protocol_output_id:str!, protocol_input_set_id:str!, value:str!"),
    "ProtocolStep": _struct("protocol_step_id:str!, step:str"),
    "ProtocolVariable": _struct("protocol_id:str!, variable_id:str!"),
    "Publication": _struct("publication_id:str!"),
    "QuantityRangeValue": _struct(
        "maximum_numeric_value:float!, minimum_numeric_value:float!, unit_cv_id:str, "
        "unit_cv_label:str, unit_string:str, " + _ATTR_VALUE_TAIL
    ),
    "QuantityValue": _struct(
        "numeric_value:float!, unit_cv_id:str, unit_cv_label:str, unit_string:str, " + _ATTR_VALUE_TAIL
    ),
    "ResourceDescription": _struct(
        "resource_description_id:str!, description_text:str!, description_type:str, language:str"
    ),
    "ResourceTitle": _struct("resource_title_id:str!, language:str, title:str!, title_type:str"),
    "Sample": _struct("sample_id:str!, description:str, type:str"),
    "Sequence": _struct("sequence_id:str!, entity_id:str!, type:str, length:int, checksum:str"),
    "Statement": _struct("subject:str, predicate:str, object:str, value:str, datatype:str, language:str"),
    "TextValue": _struct("text_value:str!, language:str, " + _ATTR_VALUE_TAIL),
    "Variable": _struct(
        "variable_id:str!, name:str, description:str, name_cv_id:str, unit:str, value_type:str!"
    ),
    "VariableValue": _struct("variable_value_id:str!, variable_id:str!, value_type:str!"),
}
"""RefSeq annotation parser for transforming NCBI Datasets API JSON into CDM-formatted Delta Lake tables.

Usage:
uv run python src/cdm_data_loader_utils/parsers/refseq/api/annotation_parse.py \
    --accession GCF_000869125.1 \
    --namespace refseq_api \
    --query

"""

import argparse
import json
from collections.abc import Iterator
from pathlib import Path

import requests
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession

from cdm_data_loader_utils.model.kbase_cdm_schema import CDM_SCHEMA


# ---------------------------------------------------------------------
# Accession-based annotation fetch
# ---------------------------------------------------------------------
def fetch_annotation_json(accession: str) -> dict:
    """Fetch annotation JSON from NCBI Datasets API.

    Raises ``requests.HTTPError`` on a non-2xx response.
    NOTE(review): a single GET is made; the endpoint supports paging via
    ``page_token`` — confirm whether large genomes need pagination here.
    """
    url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{accession}/annotation_report"
    resp = requests.get(url, headers={"Accept": "application/json"}, timeout=60)
    resp.raise_for_status()
    return resp.json()


# ---------------------------------------------------------------------
# Spark initialization with Delta support
# ---------------------------------------------------------------------
def build_spark_session(app_name: str = "RefSeqAnnotationToCDM") -> SparkSession:
    """Create (or reuse) a Hive-enabled Spark session configured for Delta Lake."""
    builder = (
        SparkSession.builder.appName(app_name)
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .enableHiveSupport()
    )
    return configure_spark_with_delta_pip(builder).getOrCreate()


def init_spark_and_db(app_name: str, database: str) -> SparkSession:
    """Build a Spark session, ensure *database* exists, and make it current."""
    spark = build_spark_session(app_name)
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {database}")
    spark.sql(f"USE {database}")
    return spark


# ---------------------------------------------------------------------
# CDM PREFIX NORMALIZATION
# ---------------------------------------------------------------------
def apply_prefix(identifier: str | None) -> str | None:
    """Normalize an NCBI identifier to its CDM CURIE form.

    ``GeneID:`` -> ``ncbigene:``; RefSeq accessions (YP_/XP_/WP_/NP_/NC_)
    gain a ``refseq:`` prefix; assemblies (GCF_) gain ``insdc.gcf:``.
    Anything else (including None/empty) is returned unchanged.
    """
    if not identifier:
        return None

    if identifier.startswith("GeneID:"):
        return identifier.replace("GeneID:", "ncbigene:")

    if identifier.startswith(("YP_", "XP_", "WP_", "NP_", "NC_")):
        return f"refseq:{identifier}"

    if identifier.startswith("GCF_"):
        return f"insdc.gcf:{identifier}"

    return identifier


# ---------------------------------------------------------------------
# Safe integer conversion
# ---------------------------------------------------------------------
def to_int(val: str | None) -> int | None:
    """Convert *val* to int, returning None for missing or non-numeric input."""
    try:
        return int(val)
    # int(None) raises TypeError, int("abc") raises ValueError; anything
    # else should propagate rather than be silently swallowed.
    except (TypeError, ValueError):
        return None


# ---------------------------------------------------------------------
# For repeat section markers
# ---------------------------------------------------------------------
def unique_annotations(data: dict) -> Iterator[tuple[str, dict]]:
    """Yield (gene_id, annotation) once per distinct gene_id in *data*.

    Reports lacking a gene_id are skipped; later duplicates are ignored.
    """
    seen: set[str] = set()
    for report in data.get("reports", []):
        ann = report.get("annotation", {})
        gene_id = ann.get("gene_id")
        if gene_id and gene_id not in seen:
            seen.add(gene_id)
            yield gene_id, ann


# ---------------------------------------------------------------------
# IDENTIFIERS
# ---------------------------------------------------------------------
def load_identifiers(data: dict) -> list[tuple[str, str, str, str, str | None]]:
    """Extract Identifier table records (entity_id, identifier, description, source, relationship)."""
    rows = [
        (f"ncbigene:{gene_id}", gene_id, ann.get("name"), "RefSeq", ann.get("relationship"))
        for gene_id, ann in unique_annotations(data)
    ]
    # Deduplicate; set-based so row order is unspecified, as before.
    return list(set(rows))


# ---------------------------------------------------------------------
# NAME EXTRACTION
# ---------------------------------------------------------------------
def load_names(data: dict) -> list[tuple[str, str, str, str]]:
    """Extract Name table records (entity_id, name, description, source)."""
    name_fields = (
        ("symbol", "RefSeq gene symbol"),
        ("name", "RefSeq gene name"),
        ("locus_tag", "RefSeq locus tag"),
    )
    rows = []
    for gene_id, ann in unique_annotations(data):
        entity_id = f"ncbigene:{gene_id}"
        for label, desc in name_fields:
            val = ann.get(label)
            if val:
                rows.append((entity_id, val, desc, "RefSeq"))
    return list(set(rows))


# ---------------------------------------------------------------------
# FEATURE LOCATIONS
# ---------------------------------------------------------------------
def load_feature_records(data: dict) -> list[tuple]:
    """Extract Feature table records, one per genomic range of each gene.

    Tuple layout matches the ``Feature`` schema: (feature_id, hash,
    cds_phase, e_value, end, p_value, start, strand, source_database,
    protocol_id, type).
    """
    strand_names = {"plus": "positive", "minus": "negative", "unstranded": "unstranded"}
    features = []
    for gene_id, ann in unique_annotations(data):
        feature_id = f"ncbigene:{gene_id}"
        for region in ann.get("genomic_regions", []):
            for r in region.get("gene_range", {}).get("range", []):
                features.append(
                    (
                        feature_id,
                        None,  # hash
                        None,  # cds_phase
                        None,  # e_value
                        to_int(r.get("end")),
                        None,  # p_value
                        to_int(r.get("begin")),
                        strand_names.get(r.get("orientation"), "unknown"),
                        "RefSeq",
                        None,  # protocol_id
                        "gene",
                    )
                )
    return list(set(features))


# ---------------------------------------------------------------------
# PARSE CONTIG_COLLECTION <-> FEATURE
# ---------------------------------------------------------------------
def load_contig_collection_x_feature(data: dict) -> list[tuple[str, str]]:
    """Parse ContigCollection Feature links from the first genomic region of each gene."""
    links = set()
    for gene_id, ann in unique_annotations(data):
        regions = ann.get("genomic_regions", [])
        if not regions:
            continue
        # Only the first region's accession is used, as in the original design.
        acc = regions[0].get("gene_range", {}).get("accession_version")
        if acc:
            links.add((apply_prefix(acc), f"ncbigene:{gene_id}"))
    return list(links)


# ---------------------------------------------------------------------
# PARSE CONTIG_COLLECTION <-> PROTEIN
# ---------------------------------------------------------------------
def load_contig_collection_x_protein(data: dict) -> list[tuple[str, str]]:
    """Link each report's assembly accession to every protein it annotates."""
    links = set()
    for report in data.get("reports", []):
        ann = report.get("annotation", {})
        assembly = ann.get("annotations", [{}])[0].get("assembly_accession")
        if not assembly:
            continue
        contig_id = apply_prefix(assembly)
        for p in ann.get("proteins", []):
            pid = p.get("accession_version")
            if pid:
                links.add((contig_id, apply_prefix(pid)))
    return list(links)


# ---------------------------------------------------------------------
# PARSE FEATURE <-> PROTEIN
# ---------------------------------------------------------------------
def load_feature_x_protein(data: dict) -> list[tuple[str, str]]:
    """Link each gene feature to the proteins it encodes."""
    links = set()
    for gene_id, ann in unique_annotations(data):
        feature_id = f"ncbigene:{gene_id}"
        for p in ann.get("proteins", []):
            pid = p.get("accession_version")
            if pid:
                links.add((feature_id, apply_prefix(pid)))
    return list(links)


# ---------------------------------------------------------------------
# PARSE CONTIGS
# ---------------------------------------------------------------------
def load_contigs(data: dict) -> list[tuple[str, str | None, float | None, int | None]]:
    """Extract Contig records; hash/gc_content/length are not in this API payload, so None."""
    contigs: dict[str, dict] = {}
    for report in data.get("reports", []):
        for region in report.get("annotation", {}).get("genomic_regions", []):
            acc = region.get("gene_range", {}).get("accession_version")
            if acc:
                # Only track first occurrence of each contig.
                contigs.setdefault(apply_prefix(acc), {"hash": None, "gc_content": None, "length": None})
    return [(cid, meta["hash"], meta["gc_content"], meta["length"]) for cid, meta in contigs.items()]


# ---------------------------------------------------------------------
# PARSE CONTIG <-> CONTIG_COLLECTION
# ---------------------------------------------------------------------
def load_contig_x_contig_collection(data: dict) -> list[tuple[str, str]]:
    """Link each report's first contig to its assembly (contig collection)."""
    links = set()
    for report in data.get("reports", []):
        ann = report.get("annotation", {})
        regions = ann.get("genomic_regions", [])
        annotations = ann.get("annotations", [])
        if not regions or not annotations:
            continue
        contig = regions[0].get("gene_range", {}).get("accession_version")
        assembly = annotations[0].get("assembly_accession")
        if contig and assembly:
            # NOTE(review): contigs are hard-prefixed "refseq:" here while
            # load_contigs uses apply_prefix(); identical for NC_* accessions
            # but divergent otherwise — confirm which is intended.
            links.add((f"refseq:{contig}", apply_prefix(assembly)))
    return list(links)


# ---------------------------------------------------------------------
# DELTA TABLE
# ---------------------------------------------------------------------
def write_to_table(
    spark: SparkSession,
    records: list[tuple],
    table_name: str,
    database: str = "default",
) -> None:
    """Overwrite ``database.table_name`` with *records* using the CDM schema.

    Empty record lists are skipped entirely (the table is not created).
    """
    if records:
        spark.createDataFrame(records, CDM_SCHEMA[table_name]).write.format("delta").mode("overwrite").option(
            "overwriteSchema", "true"
        ).saveAsTable(f"{database}.{table_name}")


# ---------------------------------------------------------------------
# SQL PREVIEW
# ---------------------------------------------------------------------

# (loader, table name) pairs in write order; also the source of CDM_TABLES.
_TABLE_LOADERS = [
    (load_identifiers, "Identifier"),
    (load_names, "Name"),
    (load_feature_records, "Feature"),
    (load_contig_collection_x_feature, "ContigCollection_x_Feature"),
    (load_contig_collection_x_protein, "ContigCollection_x_Protein"),
    (load_feature_x_protein, "Feature_x_Protein"),
    (load_contigs, "Contig"),
    (load_contig_x_contig_collection, "Contig_x_ContigCollection"),
]

CDM_TABLES = [table for _, table in _TABLE_LOADERS]


def run_sql_query(spark: SparkSession, database: str = "default") -> None:
    """Print the first 20 rows of every CDM table in *database*."""
    spark.sql(f"USE {database}")
    for table in CDM_TABLES:
        print(f"\n[SQL Preview] {table}")
        spark.sql(f"SELECT * FROM {table} LIMIT 20").show(truncate=False)


def parse_annotation_data(spark: SparkSession, datasets: list[dict], namespace: str) -> None:
    """Parse each dataset and write all CDM tables into *namespace*."""
    for data in datasets:
        for loader, table in _TABLE_LOADERS:
            write_to_table(spark, loader(data), table, namespace)


# ---------------------------------------------------------------------
# CLI ENTRY
# ---------------------------------------------------------------------
def main():
    """CLI entry point: fetch/load annotation JSON and write CDM Delta tables."""
    parser = argparse.ArgumentParser(description="RefSeq Annotation Parser to CDM")

    # -------------------------
    # Input options
    # -------------------------
    parser.add_argument("--accession", type=str, help="RefSeq genome accession (e.g. GCF_000869125.1)")
    parser.add_argument("--input_file", type=str, help="Path to a RefSeq annotation JSON file.")
    parser.add_argument("--input_dir", type=str, help="Directory containing RefSeq annotation JSON files.")

    # -------------------------
    # Output / runtime options
    # -------------------------
    parser.add_argument(
        "--namespace",
        default="refseq_api",
        help="Database to write Delta tables.",
    )
    parser.add_argument(
        "--tenant",
        default=None,
        help="Tenant SQL warehouse to use.",
    )
    parser.add_argument(
        "--query",
        action="store_true",
        help="Preview SQL output after writing.",
    )

    args = parser.parse_args()

    # -----------------------------------------
    # Input validation
    # -----------------------------------------
    if not args.accession and not args.input_file and not args.input_dir:
        raise ValueError("provide --accession, --input_file, or --input_dir.")

    # -----------------------------------------
    # Initialize Spark
    # -----------------------------------------
    spark = init_spark_and_db("RefSeq Annotation Parser", args.namespace)

    if args.tenant:
        spark.sql(f"USE CATALOG {args.tenant}")

    # 
----------------------------------------- + # Load annotation data + # ----------------------------------------- + datasets: list[dict] = [] + + if args.accession: + # Fetch from NCBI Datasets API + data = fetch_annotation_json(args.accession) + datasets.append(data) + + if args.input_file: + with open(args.input_file) as f: + datasets.append(json.load(f)) + + if args.input_dir: + for path in Path(args.input_dir).rglob("*.json"): + with open(path) as f: + datasets.append(json.load(f)) + + parse_annotation_data(spark, datasets, args.namespace) + + # ----------------------------------------- + # SQL preview + # ----------------------------------------- + if args.query: + run_sql_query(spark, args.namespace) + + spark.stop() + + +if __name__ == "__main__": + main() diff --git a/tests/data/refseq/annotation_report.json b/tests/data/refseq/annotation_report.json new file mode 100644 index 0000000..53cd0d6 --- /dev/null +++ b/tests/data/refseq/annotation_report.json @@ -0,0 +1,105 @@ +{ + "reports": [ + { + "annotation": { + "gene_id": "4156250", + "name": "hypothetical protein", + "gene_type": "protein-coding", + "locus_tag": "MIV001R", + "genomic_regions": [ + { + "gene_range": { + "accession_version": "NC_008187.1", + "range": [ + { + "begin": "2620", + "end": "3066", + "orientation": "plus" + } + ] + } + } + ], + "proteins": [ + { + "accession_version": "YP_654573.1", + "name": "hypothetical protein", + "length": 148 + } + ], + "annotations": [ + { + "assembly_accession": "GCF_000869125.1" + } + ] + }, + "row_id": "1" + }, + { + "annotation": { + "gene_id": "4156251", + "name": "hypodermical protein", + "gene_type": "protein-coding", + "locus_tag": "MIV002R", + "genomic_regions": [ + { + "gene_range": { + "accession_version": "NC_008187.1", + "range": [ + { + "begin": "3603", + "end": "4979", + "orientation": "plus" + } + ] + } + } + ], + "proteins": [ + { + "accession_version": "YP_654574.1", + "name": "hypothetical protein", + "length": 458 + } + ], + 
"annotations": [ + { + "assembly_accession": "GCF_000869125.1" + } + ] + }, + "row_id": "2" + }, + { + "annotation": { + "gene_id": "4156252", + "name": "very hypothetical protein", + "gene_type": "protein-coding", + "locus_tag": "MIV003R", + "symbol": "kappa-delta-phi", + "genomic_regions": [ + { + "gene_range": { + "accession_version": "NC_008187.1", + "range": [ + { + "begin": "5168", + "end": "5638", + "orientation": "minus" + } + ] + } + } + ], + "proteins": [], + "annotations": [ + { + "assembly_accession": "GCF_000869125.1" + } + ] + }, + "row_id": "3" + } + ], + "total_count": 3 +} diff --git a/tests/data/refseq/annotation_report.parsed.json b/tests/data/refseq/annotation_report.parsed.json new file mode 100644 index 0000000..178e040 --- /dev/null +++ b/tests/data/refseq/annotation_report.parsed.json @@ -0,0 +1,239 @@ +{ + "contig": [ + { + "contig_id": "refseq:NC_008187.1", + "hash": null, + "gc_content": null, + "length": null + } + ], + "contig_x_contigcollection": [ + { + "contig_id": "refseq:NC_008187.1", + "contig_collection_id": "insdc.gcf:GCF_000869125.1" + } + ], + "contig_x_feature": [ + { + "contig_id": "refseq:NC_008187.1", + "feature_id": "ncbigene:4156250" + }, + { + "contig_id": "refseq:NC_008187.1", + "feature_id": "ncbigene:4156251" + }, + { + "contig_id": "refseq:NC_008187.1", + "feature_id": "ncbigene:4156252" + } + ], + "contig_x_protein": [ + { + "contig_id": "refseq:NC_008187.1", + "protein_id": "refseq:YP_654573.1" + }, + { + "contig_id": "refseq:NC_008187.1", + "protein_id": "refseq:YP_654574.1" + } + ], + "contigcollection": [ + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "hash": null + } + ], + "contigcollection_x_feature": [ + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "feature_id": "ncbigene:4156250" + }, + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "feature_id": "ncbigene:4156251" + }, + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "feature_id": "ncbigene:4156252" + } 
+ ], + "contigcollection_x_protein": [ + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "protein_id": "refseq:YP_654573.1" + }, + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "protein_id": "refseq:YP_654574.1" + } + ], + "feature_x_protein": [ + { + "feature_id": "ncbigene:4156250", + "protein_id": "refseq:YP_654573.1" + }, + { + "feature_id": "ncbigene:4156251", + "protein_id": "refseq:YP_654574.1" + } + ], + "feature": [ + { + "feature_id": "ncbigene:4156250", + "hash": null, + "cds_phase": null, + "e_value": null, + "end": 3066, + "p_value": null, + "start": 2620, + "strand": "positive", + "source_database": "ncbigene", + "protocol_id": null, + "type": "protein-coding" + }, + { + "feature_id": "ncbigene:4156251", + "hash": null, + "cds_phase": null, + "e_value": null, + "end": 4979, + "p_value": null, + "start": 3603, + "strand": "positive", + "source_database": "ncbigene", + "protocol_id": null, + "type": "protein-coding" + }, + { + "feature_id": "ncbigene:4156251", + "hash": null, + "cds_phase": null, + "e_value": null, + "end": 5638, + "p_value": null, + "start": 5168, + "strand": "negative", + "source_database": "ncbigene", + "protocol_id": null, + "type": "protein-coding" + } + ], + "identifier": [ + { + "entity_id": "insdc.gcf:GCF_000869125", + "identifier": "insdc.gcf:GCF_000869125", + "description": "RefSeq genome ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "refseq:NC_008187.1", + "identifier": "refseq:NC_008187.1", + "description": "RefSeq assembly ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "ncbigene:4156250", + "identifier": "ncbigene:4156250", + "description": "NCBI gene ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "ncbigene:4156251", + "identifier": "ncbigene:4156251", + "description": "NCBI gene ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "ncbigene:4156252", + "identifier": "ncbigene:4156252", + 
"description": "NCBI gene ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "refseq:YP_654573.1", + "identifier": "refseq:YP_654573.1", + "description": "RefSeq protein ID", + "source": "RefSeq", + "relationship": null + } + ], + "name": [ + { + "entity_id": "ncbigene:4156250", + "name": "hypothetical protein", + "description": "RefSeq gene name", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156251", + "name": "hypodermical protein", + "description": "RefSeq gene name", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156252", + "name": "very hypothetical protein", + "description": "RefSeq gene name", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156250", + "name": "MIV001R", + "description": "RefSeq locus tag", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156251", + "name": "MIV002R", + "description": "RefSeq locus tag", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156252", + "name": "MIV003R", + "description": "RefSeq locus tag", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156252", + "name": "kappa-delta-phi", + "description": "RefSeq symbol", + "source": "RefSeq" + }, + { + "entity_id": "refseq:YP_654573.1", + "name": "hypothetical protein", + "description": "RefSeq protein name", + "source": "RefSeq" + }, + { + "entity_id": "refseq:YP_654574.1", + "name": "hypothetical protein", + "description": "RefSeq protein name", + "source": "RefSeq" + } + ], + "protein": [ + { + "protein_id": "refseq:YP_654573.1", + "hash": null, + "description": null, + "evidence_for_existence": null, + "length": null, + "sequence": null + }, + { + "protein_id": "refseq:YP_654574.1", + "hash": null, + "description": null, + "evidence_for_existence": null, + "length": null, + "sequence": null + } + ] +} diff --git a/tests/parsers/refseq/__init__.py b/tests/parsers/refseq/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/parsers/refseq/api/__init__.py 
b/tests/parsers/refseq/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/parsers/refseq/api/test_annotation_report.py b/tests/parsers/refseq/api/test_annotation_report.py new file mode 100644 index 0000000..434583a --- /dev/null +++ b/tests/parsers/refseq/api/test_annotation_report.py @@ -0,0 +1,757 @@ +### uv run pytest tests/parsers/test_annotation_parse.py + +import json +from pathlib import Path + +import pytest +from cdm_data_loader_utils.parsers.refseq.api.annotation_report import ( + apply_prefix, + load_contig_collection_x_feature, + load_contig_collection_x_protein, + load_contig_x_contig_collection, + load_contigs, + load_feature_records, + load_feature_x_protein, + load_identifiers, + load_names, + parse_annotation_data, + to_int, +) +from pyspark.sql import SparkSession +from pyspark.testing import assertDataFrameEqual, assertSchemaEqual + +from cdm_data_loader_utils.model.kbase_cdm_schema import CDM_SCHEMA +from tests.conftest import TEST_NS + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1234", + "name": "hypothetical protein", + "relationship": "RefSeq gene symbol", + } + } + ] + }, + [ + ( + "ncbigene:1234", + "1234", + "hypothetical protein", + "RefSeq", + "RefSeq gene symbol", + ) + ], + ), + ( + {"reports": [{"annotation": {"gene_id": "5678", "name": "some protein"}}]}, + [("ncbigene:5678", "5678", "some protein", "RefSeq", None)], + ), + ( + { + "reports": [ + { + "annotation": { + "name": "no gene id here", + "relationship": "RefSeq locus tag", + } + } + ] + }, + [], + ), + ], +) +def test_load_identifiers(input_data, expected_output): + result = load_identifiers(input_data) + assert result == expected_output + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: all name fields present + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1234", + "symbol": "abc", + "name": "ABC protein", + "locus_tag": 
"LTG_1234", + } + } + ] + }, + [ + ("ncbigene:1234", "abc", "RefSeq gene symbol", "RefSeq"), + ("ncbigene:1234", "ABC protein", "RefSeq gene name", "RefSeq"), + ("ncbigene:1234", "LTG_1234", "RefSeq locus tag", "RefSeq"), + ], + ), + # Case 2: only gene_name present + ( + {"reports": [{"annotation": {"gene_id": "5678", "name": "Hypothetical protein"}}]}, + [ + ( + "ncbigene:5678", + "Hypothetical protein", + "RefSeq gene name", + "RefSeq", + ) + ], + ), + # Case 3: no gene_id + ( + {"reports": [{"annotation": {"name": "Unnamed", "symbol": "XYZ"}}]}, + [], + ), + # Case 4: only locus_tag present + ( + {"reports": [{"annotation": {"gene_id": "8888", "locus_tag": "LTG_8888"}}]}, + [("ncbigene:8888", "LTG_8888", "RefSeq locus tag", "RefSeq")], + ), + # Case 5: multiple reports + ( + { + "reports": [ + {"annotation": {"gene_id": "1001", "symbol": "DEF"}}, + {"annotation": {"gene_id": "1002", "name": "DEF protein"}}, + ] + }, + [ + ("ncbigene:1001", "DEF", "RefSeq gene symbol", "RefSeq"), + ("ncbigene:1002", "DEF protein", "RefSeq gene name", "RefSeq"), + ], + ), + ], +) +def test_load_names(input_data, expected_output): + result = load_names(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: basic valid input with plus strand + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1234", + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "100", + "end": "200", + "orientation": "plus", + } + ] + } + } + ], + } + } + ] + }, + [ + ( + "ncbigene:1234", + None, + None, + None, + 200, + None, + 100, + "positive", + "RefSeq", + None, + "gene", + ) + ], + ), + # Case 2: multiple ranges, different strands + ( + { + "reports": [ + { + "annotation": { + "gene_id": "5678", + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "300", + "end": "500", + "orientation": "minus", + }, + { + "begin": "600", + "end": "800", + "orientation": "plus", + }, + 
] + } + } + ], + } + } + ] + }, + [ + ( + "ncbigene:5678", + None, + None, + None, + 500, + None, + 300, + "negative", + "RefSeq", + None, + "gene", + ), + ( + "ncbigene:5678", + None, + None, + None, + 800, + None, + 600, + "positive", + "RefSeq", + None, + "gene", + ), + ], + ), + # Case 3: missing orientation + ( + { + "reports": [ + { + "annotation": { + "gene_id": "9999", + "genomic_regions": [{"gene_range": {"range": [{"begin": "1", "end": "2"}]}}], + } + } + ] + }, + [ + ( + "ncbigene:9999", + None, + None, + None, + 2, + None, + 1, + "unknown", + "RefSeq", + None, + "gene", + ) + ], + ), + # Case 4: no gene_id + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "100", + "end": "200", + "orientation": "plus", + } + ] + } + } + ] + } + } + ] + }, + [], + ), + # Case 5: non-integer start/end + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1111", + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "abc", + "end": "xyz", + "orientation": "plus", + } + ] + } + } + ], + } + } + ] + }, + [ + ( + "ncbigene:1111", + None, + None, + None, + None, + None, + None, + "positive", + "RefSeq", + None, + "gene", + ) + ], + ), + ], +) +def test_load_feature_records(input_data, expected_output): + result = load_feature_records(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: valid mapping + ( + { + "reports": [ + { + "annotation": { + "gene_id": "12345", + "genomic_regions": [{"gene_range": {"accession_version": "NC_000001.11"}}], + } + } + ] + }, + [("refseq:NC_000001.11", "ncbigene:12345")], + ), + # Case 2: no gene_id + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {"accession_version": "NC_000002.11"}}]}}]}, + [], + ), + # Case 3: no genomic_regions + ( + {"reports": [{"annotation": {"gene_id": "67890"}}]}, + [], + ), + # Case 4: empty genomic_regions list + ( + 
{"reports": [{"annotation": {"gene_id": "99999", "genomic_regions": []}}]}, + [], + ), + # Case 5: missing accession_version + ( + { + "reports": [ + { + "annotation": { + "gene_id": "13579", + "genomic_regions": [{"gene_range": {}}], + } + } + ] + }, + [], + ), + ], +) +def test_load_contig_collection_x_feature(input_data, expected_output): + result = load_contig_collection_x_feature(input_data) + assert result == expected_output + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: Valid report with multiple proteins + ( + { + "reports": [ + { + "annotation": { + "proteins": [ + {"accession_version": "XP_123"}, + {"accession_version": "XP_456"}, + ], + "annotations": [{"assembly_accession": "GCF_000001"}], + } + } + ] + }, + [ + ("insdc.gcf:GCF_000001", "refseq:XP_123"), + ("insdc.gcf:GCF_000001", "refseq:XP_456"), + ], + ), + # Case 2: No proteins + ( + { + "reports": [ + { + "annotation": { + "proteins": [], + "annotations": [{"assembly_accession": "GCF_000002"}], + } + } + ] + }, + [], + ), + # Case 3: No annotations + ( + {"reports": [{"annotation": {"proteins": [{"accession_version": "XP_789"}]}}]}, + [], + ), + # Case 4: Missing assembly_accession + ( + { + "reports": [ + { + "annotation": { + "proteins": [{"accession_version": "XP_789"}], + "annotations": [{}], + } + } + ] + }, + [], + ), + # Case 5: Some proteins missing accession_version + ( + { + "reports": [ + { + "annotation": { + "proteins": [ + {"accession_version": "XP_111"}, + {}, + {"accession_version": "XP_222"}, + ], + "annotations": [{"assembly_accession": "GCF_000003"}], + } + } + ] + }, + [ + ("insdc.gcf:GCF_000003", "refseq:XP_111"), + ("insdc.gcf:GCF_000003", "refseq:XP_222"), + ], + ), + ], +) +def test_load_contig_collection_x_protein(input_data, expected_output): + result = load_contig_collection_x_protein(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: valid gene 
with multiple proteins + ( + { + "reports": [ + { + "annotation": { + "gene_id": "4156311", + "proteins": [ + {"accession_version": "XP_001"}, + {"accession_version": "XP_002"}, + ], + } + } + ] + }, + [ + ("ncbigene:4156311", "refseq:XP_001"), + ("ncbigene:4156311", "refseq:XP_002"), + ], + ), + # Case 2: no gene_id + ( + {"reports": [{"annotation": {"proteins": [{"accession_version": "XP_999"}]}}]}, + [], + ), + # Case 3: gene with no proteins + ( + {"reports": [{"annotation": {"gene_id": "4156312"}}]}, + [], + ), + # Case 4: some proteins missing accession_version + ( + { + "reports": [ + { + "annotation": { + "gene_id": "4156313", + "proteins": [ + {"accession_version": "XP_777"}, + {}, + {"accession_version": "XP_888"}, + ], + } + } + ] + }, + [ + ("ncbigene:4156313", "refseq:XP_777"), + ("ncbigene:4156313", "refseq:XP_888"), + ], + ), + # Case 5: empty report list + ({"reports": []}, []), + ], +) +def test_load_feature_x_protein(input_data, expected_output): + result = load_feature_x_protein(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: Valid contig and assembly + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000001.11"}}], + "annotations": [{"assembly_accession": "GCF_000001.1"}], + } + } + ] + }, + [("refseq:NC_000001.11", "insdc.gcf:GCF_000001.1")], + ), + # Case 2: Missing genomic_regions + ( + {"reports": [{"annotation": {"annotations": [{"assembly_accession": "GCF_000002.1"}]}}]}, + [], + ), + # Case 3: Missing annotations + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {"accession_version": "NC_000003.11"}}]}}]}, + [], + ), + # Case 4: Missing accession_version in region + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {}}], + "annotations": [{"assembly_accession": "GCF_000004.1"}], + } + } + ] + }, + [], + ), + # Case 5: Missing 
assembly_accession in annotations + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000005.11"}}], + "annotations": [{}], + } + } + ] + }, + [], + ), + # Case 6: Multiple reports, one valid + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000006.11"}}], + "annotations": [{"assembly_accession": "GCF_000006.1"}], + } + }, + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000007.11"}}], + "annotations": [{}], + } + }, + ] + }, + [("refseq:NC_000006.11", "insdc.gcf:GCF_000006.1")], + ), + ], +) +def test_load_contig_x_contig_collection(input_data, expected_output): + result = load_contig_x_contig_collection(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: Valid contig with accession_version + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {"accession_version": "NC_000001.11"}}]}}]}, + [("refseq:NC_000001.11", None, None, None)], + ), + # Case 2: Multiple contigs, different accession_versions + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [ + {"gene_range": {"accession_version": "NC_000001.11"}}, + {"gene_range": {"accession_version": "NC_000002.12"}}, + ] + } + } + ] + }, + [ + ("refseq:NC_000001.11", None, None, None), + ("refseq:NC_000002.12", None, None, None), + ], + ), + # Case 3: Duplicate accession versions + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [ + {"gene_range": {"accession_version": "NC_000003.13"}}, + {"gene_range": {"accession_version": "NC_000003.13"}}, + ] + } + } + ] + }, + [("refseq:NC_000003.13", None, None, None)], + ), + # Case 4: Missing accession_version + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {}}]}}]}, + [], + ), + # Case 5: Empty reports + ( + {"reports": []}, + [], + ), + ], +) +def 
test_load_contigs(input_data, expected_output): + result = load_contigs(input_data) + assert sorted(result) == sorted(expected_output) + + +### add new test: to_int +@pytest.mark.parametrize( + "input_id, expected", + [ + ("GeneID:123", "ncbigene:123"), + ("YP_009725307.1", "refseq:YP_009725307.1"), + ("GCF_000001405.39", "insdc.gcf:GCF_000001405.39"), + ("random", "random"), + ], +) +def test_apply_prefix(input_id, expected): + assert apply_prefix(input_id) == expected + + +@pytest.mark.parametrize("val, expected", [("123", 123), ("abc", None), ("", None)]) +def test_to_int(val, expected): + assert to_int(val) == expected + + +TABLE_NAME_MAP = { + "contig": "Contig", + "feature": "Feature", + "identifier": "Identifier", + "name": "Name", + "contig_x_contigcollection": "Contig_x_ContigCollection", + "contigcollection_x_feature": "ContigCollection_x_Feature", + "contigcollection_x_protein": "ContigCollection_x_Protein", + "feature_x_protein": "Feature_x_Protein", +} + + +@pytest.mark.requires_spark +def test_parse_annotation_data(spark: SparkSession, test_data_dir: Path) -> None: + """Test the parsing of the annotation data.""" + spark.sql(f"CREATE DATABASE IF NOT EXISTS {TEST_NS}") + spark.catalog.setCurrentDatabase(TEST_NS) + + # Load NCBI dataset from NCBI API + sample_api_response = test_data_dir / "refseq" / "annotation_report.json" + dataset = json.load(sample_api_response.open()) + + # Run parse function + parse_annotation_data(spark, [dataset], TEST_NS) + + # Expected tables to validate from output + expected_tables = [ + "contig", + "contig_x_contigcollection", + "contigcollection_x_feature", + "contigcollection_x_protein", + "feature", + "feature_x_protein", + "identifier", + "name", + ] + + for table_name in expected_tables: + result_df = spark.table(f"{TEST_NS}.{table_name}") + schema_key = TABLE_NAME_MAP[table_name] + + # Construct expected_df just for schema comparison + rows = [r.asDict() for r in result_df.collect()] + expected_df = 
spark.createDataFrame(rows, schema=CDM_SCHEMA[schema_key]) + + # Assert schema match + assertSchemaEqual( + expected_df.schema, + result_df.schema, + ) + # Assert content match + assertDataFrameEqual( + expected_df, + result_df, + ) From 6039dc66486696488d3bcdee257ef73abf9e230c Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Fri, 30 Jan 2026 08:55:48 -0800 Subject: [PATCH 2/3] Restoring deleted test --- ...notation_parse.py => annotation_report.py} | 0 .../refseq/api/test_annotation_report.py | 100 +++++++----------- 2 files changed, 41 insertions(+), 59 deletions(-) rename src/cdm_data_loader_utils/parsers/refseq/api/{annotation_parse.py => annotation_report.py} (100%) diff --git a/src/cdm_data_loader_utils/parsers/refseq/api/annotation_parse.py b/src/cdm_data_loader_utils/parsers/refseq/api/annotation_report.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq/api/annotation_parse.py rename to src/cdm_data_loader_utils/parsers/refseq/api/annotation_report.py diff --git a/tests/parsers/refseq/api/test_annotation_report.py b/tests/parsers/refseq/api/test_annotation_report.py index 434583a..567c7bb 100644 --- a/tests/parsers/refseq/api/test_annotation_report.py +++ b/tests/parsers/refseq/api/test_annotation_report.py @@ -2,8 +2,13 @@ import json from pathlib import Path +from typing import Any import pytest +from pyspark.sql import SparkSession +from pyspark.testing import assertDataFrameEqual, assertSchemaEqual + +from cdm_data_loader_utils.model.kbase_cdm_schema import CDM_SCHEMA from cdm_data_loader_utils.parsers.refseq.api.annotation_report import ( apply_prefix, load_contig_collection_x_feature, @@ -17,15 +22,13 @@ parse_annotation_data, to_int, ) -from pyspark.sql import SparkSession -from pyspark.testing import assertDataFrameEqual, assertSchemaEqual - -from cdm_data_loader_utils.model.kbase_cdm_schema import CDM_SCHEMA from tests.conftest import TEST_NS +CDM_SCHEMA_LC = {k.lower(): v for k, v in CDM_SCHEMA.items()} + 
@pytest.mark.parametrize( - "input_data, expected_output", + ("input_data", "expected_output"), [ ( { @@ -68,13 +71,13 @@ ), ], ) -def test_load_identifiers(input_data, expected_output): +def test_load_identifiers(input_data: dict[str, Any], expected_output: list[tuple]) -> None: result = load_identifiers(input_data) assert result == expected_output @pytest.mark.parametrize( - "input_data, expected_output", + ("input_data", "expected_output"), [ # Case 1: all name fields present ( @@ -133,13 +136,13 @@ def test_load_identifiers(input_data, expected_output): ), ], ) -def test_load_names(input_data, expected_output): +def test_load_names(input_data: dict[str, Any], expected_output: list[tuple]) -> None: result = load_names(input_data) assert sorted(result) == sorted(expected_output) @pytest.mark.parametrize( - "input_data, expected_output", + ("input_data", "expected_output"), [ # Case 1: basic valid input with plus strand ( @@ -334,13 +337,13 @@ def test_load_names(input_data, expected_output): ), ], ) -def test_load_feature_records(input_data, expected_output): +def test_load_feature_records(input_data: dict[str, Any], expected_output: list[tuple]): result = load_feature_records(input_data) assert sorted(result) == sorted(expected_output) @pytest.mark.parametrize( - "input_data, expected_output", + ("input_data", "expected_output"), [ # Case 1: valid mapping ( @@ -387,13 +390,13 @@ def test_load_feature_records(input_data, expected_output): ), ], ) -def test_load_contig_collection_x_feature(input_data, expected_output): +def test_load_contig_collection_x_feature(input_data: dict[str, Any], expected_output) -> None: result = load_contig_collection_x_feature(input_data) assert result == expected_output @pytest.mark.parametrize( - "input_data, expected_output", + ("input_data", "expected_output"), [ # Case 1: Valid report with multiple proteins ( @@ -471,13 +474,13 @@ def test_load_contig_collection_x_feature(input_data, expected_output): ), ], ) -def 
test_load_contig_collection_x_protein(input_data, expected_output): +def test_load_contig_collection_x_protein(input_data: dict[str, Any], expected_output: list[tuple]) -> None: result = load_contig_collection_x_protein(input_data) assert sorted(result) == sorted(expected_output) @pytest.mark.parametrize( - "input_data, expected_output", + ("input_data", "expected_output"), [ # Case 1: valid gene with multiple proteins ( @@ -534,13 +537,13 @@ def test_load_contig_collection_x_protein(input_data, expected_output): ({"reports": []}, []), ], ) -def test_load_feature_x_protein(input_data, expected_output): +def test_load_feature_x_protein(input_data: dict[str, Any], expected_output: list[tuple[str, str]]) -> None: result = load_feature_x_protein(input_data) assert sorted(result) == sorted(expected_output) @pytest.mark.parametrize( - "input_data, expected_output", + ("input_data", "expected_output"), [ # Case 1: Valid contig and assembly ( @@ -616,13 +619,13 @@ def test_load_feature_x_protein(input_data, expected_output): ), ], ) -def test_load_contig_x_contig_collection(input_data, expected_output): +def test_load_contig_x_contig_collection(input_data: dict[str, Any], expected_output: list[tuple[str, str]]) -> None: result = load_contig_x_contig_collection(input_data) assert sorted(result) == sorted(expected_output) @pytest.mark.parametrize( - "input_data, expected_output", + ("input_data", "expected_output"), [ # Case 1: Valid contig with accession_version ( @@ -676,14 +679,14 @@ def test_load_contig_x_contig_collection(input_data, expected_output): ), ], ) -def test_load_contigs(input_data, expected_output): +def test_load_contigs(input_data: dict[str, Any], expected_output: list[tuple]) -> None: result = load_contigs(input_data) assert sorted(result) == sorted(expected_output) ### add new test: to_int @pytest.mark.parametrize( - "input_id, expected", + ("input_id", "expected"), [ ("GeneID:123", "ncbigene:123"), ("YP_009725307.1", "refseq:YP_009725307.1"), @@ -691,59 
+694,35 @@ def test_load_contigs(input_data, expected_output): ("random", "random"), ], ) -def test_apply_prefix(input_id, expected): +def test_apply_prefix(input_id: str, expected: str) -> None: assert apply_prefix(input_id) == expected -@pytest.mark.parametrize("val, expected", [("123", 123), ("abc", None), ("", None)]) -def test_to_int(val, expected): +@pytest.mark.parametrize(("val", "expected"), [("123", 123), ("abc", None), ("", None)]) +def test_to_int(val: str, expected: int | None) -> None: assert to_int(val) == expected -TABLE_NAME_MAP = { - "contig": "Contig", - "feature": "Feature", - "identifier": "Identifier", - "name": "Name", - "contig_x_contigcollection": "Contig_x_ContigCollection", - "contigcollection_x_feature": "ContigCollection_x_Feature", - "contigcollection_x_protein": "ContigCollection_x_Protein", - "feature_x_protein": "Feature_x_Protein", -} - - @pytest.mark.requires_spark def test_parse_annotation_data(spark: SparkSession, test_data_dir: Path) -> None: - """Test the parsing of the annotation data.""" + """ + Test that parse_annotation_data produces expected tables with correct schemas and non-empty output. 
+ """ spark.sql(f"CREATE DATABASE IF NOT EXISTS {TEST_NS}") - spark.catalog.setCurrentDatabase(TEST_NS) - - # Load NCBI dataset from NCBI API + # Load and parse test JSON sample_api_response = test_data_dir / "refseq" / "annotation_report.json" dataset = json.load(sample_api_response.open()) - - # Run parse function + # Run the parser parse_annotation_data(spark, [dataset], TEST_NS) - # Expected tables to validate from output - expected_tables = [ - "contig", - "contig_x_contigcollection", - "contigcollection_x_feature", - "contigcollection_x_protein", - "feature", - "feature_x_protein", - "identifier", - "name", - ] + # Load expected results JSON + sample_api_response = test_data_dir / "refseq" / "annotation_report.parsed.json" + expected_tables = json.load(sample_api_response.open()) + result_df = {table.name: spark.table(f"{TEST_NS}.{table.name}") for table in spark.catalog.listTables(TEST_NS)} - for table_name in expected_tables: + for table_name in result_df: result_df = spark.table(f"{TEST_NS}.{table_name}") - schema_key = TABLE_NAME_MAP[table_name] - - # Construct expected_df just for schema comparison - rows = [r.asDict() for r in result_df.collect()] - expected_df = spark.createDataFrame(rows, schema=CDM_SCHEMA[schema_key]) + expected_df = spark.createDataFrame(expected_tables[table_name], schema=CDM_SCHEMA_LC[table_name]) # Assert schema match assertSchemaEqual( @@ -755,3 +734,6 @@ def test_parse_annotation_data(spark: SparkSession, test_data_dir: Path) -> None expected_df, result_df, ) + + # make sure that all expected tables are present + assert set(expected_tables) == set(result_df) From b0cef542bb2fbe35e87cc1ac42872702a6b02a0e Mon Sep 17 00:00:00 2001 From: YueWang Date: Fri, 30 Jan 2026 17:30:00 -0800 Subject: [PATCH 3/3] Add extra schemas --- pyproject.toml | 5 +- .../parsers/refseq/api/annotation_report.py | 173 +++++++++++++++--- uv.lock | 39 ++-- 3 files changed, 165 insertions(+), 52 deletions(-) diff --git a/pyproject.toml b/pyproject.toml 
index a2cc5ab..28a0239 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,8 @@ dependencies = [ "click>=8.3.1", "lxml>=6.0.2", "ruff>=0.14.13", + "requests>=2.31.0", + "delta-spark>=2.4.0" ] [project.scripts] @@ -21,7 +23,8 @@ uniprot = "cdm_data_loader_utils.parsers.uniprot.uniprot_kb:cli" uniref = "cdm_data_loader_utils.pasers.uniprot.uniref:cli" -[dependency-groups] +##[dependency-groups] +[project.optional-dependencies] dev = [ "pytest>=9.0.2", "pytest-asyncio>=1.3.0", diff --git a/src/cdm_data_loader_utils/parsers/refseq/api/annotation_report.py b/src/cdm_data_loader_utils/parsers/refseq/api/annotation_report.py index 47c6882..96885e5 100644 --- a/src/cdm_data_loader_utils/parsers/refseq/api/annotation_report.py +++ b/src/cdm_data_loader_utils/parsers/refseq/api/annotation_report.py @@ -3,7 +3,7 @@ RefSeq annotation parser for transforming NCBI Datasets API JSON into CDM-formatted Delta Lake tables. Usage: -uv run python src/cdm_data_loader_utils/parsers/annotation_parse.py \ +uv run python src/cdm_data_loader_utils/parsers/refseq/api/annotation_report.py \ --accession GCF_000869125.1 \ --namespace refseq_api \ --query @@ -143,21 +143,19 @@ def load_feature_records(data: dict) -> list[tuple]: "minus": "negative", "unstranded": "unstranded", }.get(r.get("orientation"), "unknown") - features.append( - ( - feature_id, - None, - None, - None, - to_int(r.get("end")), - None, - to_int(r.get("begin")), - strand, - "RefSeq", - None, - "gene", - ) - ) + features.append(( + feature_id, + None, + None, + None, + to_int(r.get("end")), + None, + to_int(r.get("begin")), + strand, + "RefSeq", + None, + "gene", + )) return list({tuple(row) for row in features}) @@ -182,7 +180,7 @@ def load_contig_collection_x_feature(data: dict) -> list[tuple[str, str]]: # --------------------------------------------------------------------- -# PARSE CONTIG_COLLECTION <-> PROTEIN +# PARSE CONTIG_COLLECTION <-> PROTEIN %%% # 
--------------------------------------------------------------------- def load_contig_collection_x_protein(data: dict) -> list[tuple[str, str]]: links = [] @@ -255,16 +253,81 @@ def load_contig_x_contig_collection(data: dict) -> list[tuple[str, str]]: assembly = annotations[0].get("assembly_accession") if contig and assembly: - links.append( - ( - f"refseq:{contig}", - apply_prefix(assembly), - ) - ) + links.append(( + f"refseq:{contig}", + apply_prefix(assembly), + )) return list(set(links)) +def load_contig_x_feature(data: dict) -> list[tuple[str, str]]: + """Extract (contig_id, feature_id) pairs.""" + links = [] + + for gene_id, ann in unique_annotations(data): + feature_id = f"ncbigene:{gene_id}" + + for region in ann.get("genomic_regions", []): + acc = region.get("gene_range", {}).get("accession_version") + if acc: + contig_id = apply_prefix(acc) + links.append((contig_id, feature_id)) + + return list(set(links)) + + +def load_contig_x_protein(data: dict) -> list[tuple[str, str]]: + links = [] + + for _, ann in unique_annotations(data): + contig_id = None + + for region in ann.get("genomic_regions", []): + acc = region.get("gene_range", {}).get("accession_version") + if acc: + contig_id = apply_prefix(acc) + break # only take first + + if contig_id: + for p in ann.get("proteins", []): + pid = p.get("accession_version") + if pid: + links.append((contig_id, apply_prefix(pid))) + + return list({tuple(row) for row in links}) + + +### contig collection has 34 rows + + +def load_contig_collections(data: dict) -> list[tuple]: + schema = CDM_SCHEMA["ContigCollection"] + + records = [] + + for report in data.get("reports", []): + for item in report.get("contigcollection", []): + row = tuple(item.get(f) for f in schema.fieldNames()) + records.append(row) + + return records + + +def load_protein(data: dict) -> list[tuple[str, str | None, str | None, str | None, int | None, str | None]]: + """Extract Protein table rows.""" + out = [] + + for _, ann in 
unique_annotations(data): + for p in ann.get("proteins", []): + pid = apply_prefix(p.get("accession_version")) + name = p.get("name") + length = p.get("length") + out.append((pid, None, name, None, length, None)) + + return list({tuple(row) for row in out}) + + # --------------------------------------------------------------------- # DELTA TABLE # --------------------------------------------------------------------- @@ -274,10 +337,26 @@ def write_to_table( table_name: str, database: str = "default", ) -> None: - if records: - spark.createDataFrame(records, CDM_SCHEMA[table_name]).write.format("delta").mode("overwrite").option( - "overwriteSchema", "true" - ).saveAsTable(f"{database}.{table_name}") + if not records: + print(f"[DEBUG] {table_name}: no records") + return + + # print(f"\n[DEBUG] Writing table: {table_name}") + # print(f"[DEBUG] Record sample: {records[0]}") + # print(f"[DEBUG] Record field count: {len(records[0])}") + # print(f"[DEBUG] Schema field count: {len(CDM_SCHEMA[table_name])}") + # print(f"[DEBUG] Schema fields: {[f.name for f in CDM_SCHEMA[table_name]]}") + + df = spark.createDataFrame(records, CDM_SCHEMA[table_name]) + # print(f"[DEBUG] DataFrame row count: {df.count()}") + + df.printSchema() + df.show(truncate=False) + + df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(f"{database}.{table_name}") + print(f"[DEBUG] Writing to {database}.{table_name}, number of records: {len(records)}") + + df.createOrReplaceTempView(table_name) # --------------------------------------------------------------------- @@ -293,14 +372,22 @@ def write_to_table( "Feature_x_Protein", "Contig", "Contig_x_ContigCollection", + "ContigCollection", + "Protein", + "Contig_x_Feature", + "Contig_x_Protein", ] def run_sql_query(spark: SparkSession, database: str = "default") -> None: spark.sql(f"USE {database}") for table in CDM_TABLES: - print(f"\n[SQL Preview] {table}") - spark.sql(f"SELECT * FROM {table} LIMIT 
20").show(truncate=False) + full_table_name = f"{database}.{table}" + print(f"\n[SQL Preview] {full_table_name}") + try: + spark.sql(f"SELECT * FROM {full_table_name} LIMIT 20").show(truncate=False) + except Exception as e: + print(f"[WARNING] Could not query table {full_table_name}: {e}") def parse_annotation_data(spark: SparkSession, datasets: list[dict], namespace: str) -> None: @@ -364,6 +451,34 @@ def parse_annotation_data(spark: SparkSession, datasets: list[dict], namespace: namespace, ) + write_to_table( + spark, + load_contig_x_feature(data), + "Contig_x_Feature", + namespace, + ) + + write_to_table( + spark, + load_contig_x_protein(data), + "Contig_x_Protein", + namespace, + ) + + write_to_table( + spark, + load_contig_collections(data), + "ContigCollection", + namespace, + ) + + write_to_table( + spark, + load_protein(data), + "Protein", + namespace, + ) + # --------------------------------------------------------------------- # CLI ENTRY diff --git a/uv.lock b/uv.lock index 9e5a82b..5999601 100644 --- a/uv.lock +++ b/uv.lock @@ -376,11 +376,13 @@ source = { editable = "." 
} dependencies = [ { name = "biopython" }, { name = "click" }, + { name = "delta-spark" }, { name = "lxml" }, + { name = "requests" }, { name = "ruff" }, ] -[package.dev-dependencies] +[package.optional-dependencies] dev = [ { name = "pytest" }, { name = "pytest-asyncio" }, @@ -405,32 +407,25 @@ xml = [ [package.metadata] requires-dist = [ + { name = "berdl-notebook-utils", marker = "extra == 'local'", git = "https://github.com/BERDataLakehouse/spark_notebook.git?subdirectory=notebook_utils" }, { name = "biopython", specifier = ">=1.86" }, { name = "click", specifier = ">=8.3.1" }, + { name = "delta-spark", specifier = ">=2.4.0" }, + { name = "genson", marker = "extra == 'models'", specifier = ">=1.3.0" }, + { name = "hypothesis", marker = "extra == 'experimental'", specifier = ">=6.150.2" }, + { name = "json2python-models", marker = "extra == 'models'", specifier = ">=0.3.1" }, { name = "lxml", specifier = ">=6.0.2" }, + { name = "mutmut", marker = "extra == 'experimental'", specifier = ">=3.4.0" }, + { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" }, + { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=1.3.0" }, + { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=7.0.0" }, + { name = "pytest-env", marker = "extra == 'dev'", specifier = ">=1.2.0" }, + { name = "requests", specifier = ">=2.31.0" }, { name = "ruff", specifier = ">=0.14.13" }, + { name = "xmlschema", marker = "extra == 'xml'", specifier = ">=4.3.1" }, + { name = "xsdata", extras = ["cli", "lxml"], marker = "extra == 'xml'", specifier = ">=26.1" }, ] - -[package.metadata.requires-dev] -dev = [ - { name = "pytest", specifier = ">=9.0.2" }, - { name = "pytest-asyncio", specifier = ">=1.3.0" }, - { name = "pytest-cov", specifier = ">=7.0.0" }, - { name = "pytest-env", specifier = ">=1.2.0" }, -] -experimental = [ - { name = "hypothesis", specifier = ">=6.150.2" }, - { name = "mutmut", specifier = ">=3.4.0" }, -] -local = [{ name = 
"berdl-notebook-utils", git = "https://github.com/BERDataLakehouse/spark_notebook.git?subdirectory=notebook_utils" }] -models = [ - { name = "genson", specifier = ">=1.3.0" }, - { name = "json2python-models", specifier = ">=0.3.1" }, -] -xml = [ - { name = "xmlschema", specifier = ">=4.3.1" }, - { name = "xsdata", extras = ["cli", "lxml"], specifier = ">=26.1" }, -] +provides-extras = ["dev", "experimental", "local", "models", "xml"] [[package]] name = "cdm-jupyterlab-brand-extension"