"""Automated conversion of cdm_schema to PySpark.

Each entry in ``CDM_SCHEMA`` maps a CDM table name to the PySpark
``StructType`` used when writing that table as a Delta table.  The
schemas are declared compactly as field-spec strings and expanded once
at import time, replacing several hundred hand-repeated ``StructField``
lines with one declaration per table.
"""

from pyspark.sql.types import BooleanType, DateType, FloatType, IntegerType, StringType, StructField, StructType

# Short type codes used in the field-spec strings below.  PySpark atomic
# type instances are stateless, so sharing one instance per code is safe.
_TYPES = {
    "str": StringType(),
    "bool": BooleanType(),
    "date": DateType(),
    "float": FloatType(),
    "int": IntegerType(),
}


def _struct(spec: str) -> StructType:
    """Expand a field-spec string into a ``StructType``.

    ``spec`` is a comma-separated list of ``name:type`` entries, where
    ``type`` is a key of ``_TYPES``.  A trailing ``!`` marks the field as
    non-nullable (``nullable=False``); all other fields are nullable.
    """
    fields = []
    for entry in spec.split(","):
        entry = entry.strip()
        nullable = not entry.endswith("!")
        name, _, type_code = entry.rstrip("!").partition(":")
        fields.append(StructField(name, _TYPES[type_code], nullable=nullable))
    return StructType(fields)


# Column group shared by the attribute-value tables (ControlledTermValue,
# DateTimeValue, Geolocation, Quantity*Value, TextValue, ...).
_ATTR_VALUE_TAIL = (
    "raw_value:str, type:str, attribute_cv_id:str, attribute_cv_label:str, "
    "attribute_string:str, entity_id:str!"
)

CDM_SCHEMA = {
    "Association": _struct(
        "association_id:str!, subject:str!, object:str!, predicate:str!, negated:bool, "
        "evidence_type:str, primary_knowledge_source:str, aggregator_knowledge_source:str, "
        "annotation_date:date, comments:str"
    ),
    "Association_x_SupportingObject": _struct("association_id:str!, entity_id:str!"),
    "Cluster": _struct("cluster_id:str!, description:str, name:str, entity_type:str!, protocol_id:str"),
    "ClusterMember": _struct(
        "cluster_id:str!, entity_id:str!, is_representative:bool, is_seed:bool, score:float"
    ),
    "Contig": _struct("contig_id:str!, hash:str, gc_content:float, length:int"),
    "ContigCollection": _struct(
        "contig_collection_id:str!, hash:str, asm_score:float, checkm_completeness:float, "
        "checkm_contamination:float, checkm_version:str, contig_bp:int, contig_collection_type:str, "
        "contig_l50:int, contig_l90:int, contig_n50:int, contig_n90:int, contig_logsum:float, "
        "contig_max:int, contig_powersum:float, gap_percent:float, gc_average:float, gc_std:float, "
        "gtdb_taxon_id:str, n_chromosomes:int, n_contigs:int, n_scaffolds:int, ncbi_taxon_id:str, "
        "scaffold_l50:int, scaffold_l90:int, scaffold_n50:int, scaffold_n90:int, scaffold_bp:int, "
        "scaffold_logsum:float, scaffold_maximum_length:int, scaffold_powersum:float, "
        "scaffolds_n_over_50K:int, scaffolds_percent_over_50K:float, scaffolds_total_length_over_50k:int"
    ),
    "ContigCollection_x_EncodedFeature": _struct("contig_collection_id:str!, encoded_feature_id:str!"),
    "ContigCollection_x_Feature": _struct("contig_collection_id:str!, feature_id:str!"),
    "ContigCollection_x_Protein": _struct("contig_collection_id:str!, protein_id:str!"),
    "Contig_x_ContigCollection": _struct("contig_id:str!, contig_collection_id:str!"),
    "Contig_x_EncodedFeature": _struct("contig_id:str!, encoded_feature_id:str!"),
    "Contig_x_Feature": _struct("contig_id:str!, feature_id:str!"),
    "Contig_x_Protein": _struct("contig_id:str!, protein_id:str!"),
    "Contributor": _struct(
        "contributor_id:str!, contributor_type:str, name:str, given_name:str, family_name:str"
    ),
    "ContributorAffiliation": _struct("contributor_id:str!, affiliation_id:str"),
    "Contributor_x_DataSource": _struct("contributor_id:str!, data_source_id:str!, contributor_role:str"),
    "Contributor_x_Role_x_Project": _struct("contributor_id:str!, project_id:str!, contributor_role:str"),
    "ControlledTermValue": _struct("value_cv_label:str!, " + _ATTR_VALUE_TAIL),
    "ControlledVocabularyTermValue": _struct("value_cv_label:str, value_cv_id:str, " + _ATTR_VALUE_TAIL),
    "DataSource": _struct("data_source_id:str!, name:str"),
    "DataSourceNew": _struct(
        "data_source_id:str!, name:str, comments:str, date_accessed:date!, date_published:date, "
        "date_updated:date, license:str, publisher:str, resource_type:str!, url:str, version:str"
    ),
    "DataSource_x_Description": _struct("data_source_id:str!, resource_description_id:str!"),
    "DataSource_x_FundingReference": _struct("data_source_id:str!, funding_reference_id:str!"),
    "DataSource_x_License": _struct("data_source_id:str!, license_id:str!"),
    "DataSource_x_Title": _struct("data_source_id:str!, resource_title_id:str!"),
    "DateTimeValue": _struct("date_time:date!, " + _ATTR_VALUE_TAIL),
    "EncodedFeature": _struct("encoded_feature_id:str!, hash:str, has_stop_codon:bool, type:str"),
    "EncodedFeature_x_Feature": _struct("encoded_feature_id:str!, feature_id:str!"),
    "EncodedFeature_x_Protein": _struct("encoded_feature_id:str!, protein_id:str!"),
    "EntailedEdge": _struct("subject:str, predicate:str, object:str"),
    "Entity": _struct(
        "entity_id:str!, entity_type:str!, data_source_id:str, data_source_entity_id:str, "
        "data_source_created:date!, data_source_updated:date, created:date!, updated:date!"
    ),
    "Event": _struct("event_id:str!, created_at:date, description:str, name:str, location:str"),
    "Experiment": _struct(
        "experiment_id:str!, protocol_id:str!, name:str, description:str, created_at:date"
    ),
    "ExperimentCondition": _struct(
        "experiment_condition_id:str!, experiment_id:str!, variable_id:str!, value:str"
    ),
    "ExperimentConditionSet": _struct("experiment_condition_set_id:str!, experiment_condition_id:str!"),
    "Feature": _struct(
        "feature_id:str!, hash:str, cds_phase:str, e_value:float, end:int, p_value:float, start:int, "
        "strand:str, source_database:str, protocol_id:str, type:str"
    ),
    "Feature_x_Protein": _struct("feature_id:str!, protein_id:str!"),
    "FundingReference": _struct(
        "funding_reference_id:str!, funder:str, grant_id:str, grant_title:str, grant_url:str"
    ),
    "Geolocation": _struct("latitude:float!, longitude:float!, " + _ATTR_VALUE_TAIL),
    "GoldEnvironmentalContext": _struct(
        "gold_environmental_context_id:str!, ecosystem:str, ecosystem_category:str, "
        "ecosystem_subtype:str, ecosystem_type:str, specific_ecosystem:str"
    ),
    "Identifier": _struct(
        "entity_id:str!, identifier:str!, description:str, source:str, relationship:str"
    ),
    "License": _struct("license_id:str!, id:str, name:str, url:str"),
    "Measurement": _struct(
        "measurement_id:str!, measurement_set_id:str!, experiment_condition_set_id:str!, value:str"
    ),
    "MeasurementSet": _struct("measurement_set_id:str!, variable_id:str!, quality:str, created_at:date"),
    "MixsEnvironmentalContext": _struct(
        "mixs_environmental_context_id:str!, env_broad_scale:str, env_local_scale:str, env_medium:str"
    ),
    "Name": _struct("entity_id:str!, name:str!, description:str, source:str"),
    "OrderedProtocolStep": _struct("protocol_id:str!, protocol_step_id:str!, step_index:int!"),
    "Parameter": _struct(
        "parameter_id:str!, name:str, description:str, value_type:str, required:bool, "
        "cardinality:str, default:str, parameter_type:str"
    ),
    "Prefix": _struct("prefix:str, base:str"),
    "Project": _struct("project_id:str!, description:str"),
    "Protein": _struct(
        "protein_id:str!, hash:str, description:str, evidence_for_existence:str, length:int, sequence:str"
    ),
    "Protocol": _struct("protocol_id:str!, name:str, description:str, doi:str, url:str, version:str"),
    "ProtocolExecution": _struct(
        "protocol_execution_id:str!, protocol_id:str!, name:str, description:str, created_at:date"
    ),
    "ProtocolInput": _struct(
        "parameter_id:str!, protocol_input_id:str!, protocol_execution_id:str!, value:str!"
    ),
    "ProtocolInputSet": _struct("protocol_input_id:str!, protocol_input_set_id:str!"),
    "ProtocolOutput": _struct("protocol_output_id:str!, protocol_input_set_id:str!, value:str!"),
    "ProtocolStep": _struct("protocol_step_id:str!, step:str"),
    "ProtocolVariable": _struct("protocol_id:str!, variable_id:str!"),
    "Publication": _struct("publication_id:str!"),
    "QuantityRangeValue": _struct(
        "maximum_numeric_value:float!, minimum_numeric_value:float!, unit_cv_id:str, "
        "unit_cv_label:str, unit_string:str, " + _ATTR_VALUE_TAIL
    ),
    "QuantityValue": _struct(
        "numeric_value:float!, unit_cv_id:str, unit_cv_label:str, unit_string:str, " + _ATTR_VALUE_TAIL
    ),
    "ResourceDescription": _struct(
        "resource_description_id:str!, description_text:str!, description_type:str, language:str"
    ),
    "ResourceTitle": _struct("resource_title_id:str!, language:str, title:str!, title_type:str"),
    "Sample": _struct("sample_id:str!, description:str, type:str"),
    "Sequence": _struct("sequence_id:str!, entity_id:str!, type:str, length:int, checksum:str"),
    "Statement": _struct("subject:str, predicate:str, object:str, value:str, datatype:str, language:str"),
    "TextValue": _struct("text_value:str!, language:str, " + _ATTR_VALUE_TAIL),
    "Variable": _struct(
        "variable_id:str!, name:str, description:str, name_cv_id:str, unit:str, value_type:str!"
    ),
    "VariableValue": _struct("variable_value_id:str!, variable_id:str!, value_type:str!"),
}
"""RefSeq annotation parser for transforming NCBI Datasets API JSON into CDM-formatted Delta Lake tables.

Usage:
uv run python src/cdm_data_loader_utils/parsers/refseq/api/annotation_parse.py \
    --accession GCF_000869125.1 \
    --namespace refseq_api \
    --query

"""

import argparse
import json
from collections.abc import Iterator
from pathlib import Path

import requests
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession

from cdm_data_loader_utils.model.kbase_cdm_schema import CDM_SCHEMA


# ---------------------------------------------------------------------
# Accession-based annotation fetch
# ---------------------------------------------------------------------
def fetch_annotation_json(accession: str) -> dict:
    """Fetch annotation JSON from NCBI Datasets API.

    Raises ``requests.HTTPError`` on a non-2xx response.
    NOTE(review): a single GET is made; the endpoint supports paging via
    ``page_token`` — confirm whether large genomes need pagination here.
    """
    url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{accession}/annotation_report"
    resp = requests.get(url, headers={"Accept": "application/json"}, timeout=60)
    resp.raise_for_status()
    return resp.json()


# ---------------------------------------------------------------------
# Spark initialization with Delta support
# ---------------------------------------------------------------------
def build_spark_session(app_name: str = "RefSeqAnnotationToCDM") -> SparkSession:
    """Create (or reuse) a Hive-enabled Spark session configured for Delta Lake."""
    builder = (
        SparkSession.builder.appName(app_name)
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .enableHiveSupport()
    )
    return configure_spark_with_delta_pip(builder).getOrCreate()


def init_spark_and_db(app_name: str, database: str) -> SparkSession:
    """Build a Spark session, ensure *database* exists, and make it current."""
    spark = build_spark_session(app_name)
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {database}")
    spark.sql(f"USE {database}")
    return spark


# ---------------------------------------------------------------------
# CDM PREFIX NORMALIZATION
# ---------------------------------------------------------------------
def apply_prefix(identifier: str | None) -> str | None:
    """Normalize an NCBI identifier to its CDM CURIE form.

    ``GeneID:`` -> ``ncbigene:``; RefSeq accessions (YP_/XP_/WP_/NP_/NC_)
    gain a ``refseq:`` prefix; assemblies (GCF_) gain ``insdc.gcf:``.
    Anything else (including None/empty) is returned unchanged.
    """
    if not identifier:
        return None

    if identifier.startswith("GeneID:"):
        return identifier.replace("GeneID:", "ncbigene:")

    if identifier.startswith(("YP_", "XP_", "WP_", "NP_", "NC_")):
        return f"refseq:{identifier}"

    if identifier.startswith("GCF_"):
        return f"insdc.gcf:{identifier}"

    return identifier


# ---------------------------------------------------------------------
# Safe integer conversion
# ---------------------------------------------------------------------
def to_int(val: str | None) -> int | None:
    """Convert *val* to int, returning None for missing or non-numeric input."""
    try:
        return int(val)
    # int(None) raises TypeError, int("abc") raises ValueError; anything
    # else should propagate rather than be silently swallowed.
    except (TypeError, ValueError):
        return None


# ---------------------------------------------------------------------
# For repeat section markers
# ---------------------------------------------------------------------
def unique_annotations(data: dict) -> Iterator[tuple[str, dict]]:
    """Yield (gene_id, annotation) once per distinct gene_id in *data*.

    Reports lacking a gene_id are skipped; later duplicates are ignored.
    """
    seen: set[str] = set()
    for report in data.get("reports", []):
        ann = report.get("annotation", {})
        gene_id = ann.get("gene_id")
        if gene_id and gene_id not in seen:
            seen.add(gene_id)
            yield gene_id, ann


# ---------------------------------------------------------------------
# IDENTIFIERS
# ---------------------------------------------------------------------
def load_identifiers(data: dict) -> list[tuple[str, str, str, str, str | None]]:
    """Extract Identifier table records (entity_id, identifier, description, source, relationship)."""
    rows = [
        (f"ncbigene:{gene_id}", gene_id, ann.get("name"), "RefSeq", ann.get("relationship"))
        for gene_id, ann in unique_annotations(data)
    ]
    # Deduplicate; set-based so row order is unspecified, as before.
    return list(set(rows))


# ---------------------------------------------------------------------
# NAME EXTRACTION
# ---------------------------------------------------------------------
def load_names(data: dict) -> list[tuple[str, str, str, str]]:
    """Extract Name table records (entity_id, name, description, source)."""
    name_fields = (
        ("symbol", "RefSeq gene symbol"),
        ("name", "RefSeq gene name"),
        ("locus_tag", "RefSeq locus tag"),
    )
    rows = []
    for gene_id, ann in unique_annotations(data):
        entity_id = f"ncbigene:{gene_id}"
        for label, desc in name_fields:
            val = ann.get(label)
            if val:
                rows.append((entity_id, val, desc, "RefSeq"))
    return list(set(rows))


# ---------------------------------------------------------------------
# FEATURE LOCATIONS
# ---------------------------------------------------------------------
def load_feature_records(data: dict) -> list[tuple]:
    """Extract Feature table records, one per genomic range of each gene.

    Tuple layout matches the ``Feature`` schema: (feature_id, hash,
    cds_phase, e_value, end, p_value, start, strand, source_database,
    protocol_id, type).
    """
    strand_names = {"plus": "positive", "minus": "negative", "unstranded": "unstranded"}
    features = []
    for gene_id, ann in unique_annotations(data):
        feature_id = f"ncbigene:{gene_id}"
        for region in ann.get("genomic_regions", []):
            for r in region.get("gene_range", {}).get("range", []):
                features.append(
                    (
                        feature_id,
                        None,  # hash
                        None,  # cds_phase
                        None,  # e_value
                        to_int(r.get("end")),
                        None,  # p_value
                        to_int(r.get("begin")),
                        strand_names.get(r.get("orientation"), "unknown"),
                        "RefSeq",
                        None,  # protocol_id
                        "gene",
                    )
                )
    return list(set(features))


# ---------------------------------------------------------------------
# PARSE CONTIG_COLLECTION <-> FEATURE
# ---------------------------------------------------------------------
def load_contig_collection_x_feature(data: dict) -> list[tuple[str, str]]:
    """Parse ContigCollection Feature links from the first genomic region of each gene."""
    links = set()
    for gene_id, ann in unique_annotations(data):
        regions = ann.get("genomic_regions", [])
        if not regions:
            continue
        # Only the first region's accession is used, as in the original design.
        acc = regions[0].get("gene_range", {}).get("accession_version")
        if acc:
            links.add((apply_prefix(acc), f"ncbigene:{gene_id}"))
    return list(links)


# ---------------------------------------------------------------------
# PARSE CONTIG_COLLECTION <-> PROTEIN
# ---------------------------------------------------------------------
def load_contig_collection_x_protein(data: dict) -> list[tuple[str, str]]:
    """Link each report's assembly accession to every protein it annotates."""
    links = set()
    for report in data.get("reports", []):
        ann = report.get("annotation", {})
        assembly = ann.get("annotations", [{}])[0].get("assembly_accession")
        if not assembly:
            continue
        contig_id = apply_prefix(assembly)
        for p in ann.get("proteins", []):
            pid = p.get("accession_version")
            if pid:
                links.add((contig_id, apply_prefix(pid)))
    return list(links)


# ---------------------------------------------------------------------
# PARSE FEATURE <-> PROTEIN
# ---------------------------------------------------------------------
def load_feature_x_protein(data: dict) -> list[tuple[str, str]]:
    """Link each gene feature to the proteins it encodes."""
    links = set()
    for gene_id, ann in unique_annotations(data):
        feature_id = f"ncbigene:{gene_id}"
        for p in ann.get("proteins", []):
            pid = p.get("accession_version")
            if pid:
                links.add((feature_id, apply_prefix(pid)))
    return list(links)


# ---------------------------------------------------------------------
# PARSE CONTIGS
# ---------------------------------------------------------------------
def load_contigs(data: dict) -> list[tuple[str, str | None, float | None, int | None]]:
    """Extract Contig records; hash/gc_content/length are not in this API payload, so None."""
    contigs: dict[str, dict] = {}
    for report in data.get("reports", []):
        for region in report.get("annotation", {}).get("genomic_regions", []):
            acc = region.get("gene_range", {}).get("accession_version")
            if acc:
                # Only track first occurrence of each contig.
                contigs.setdefault(apply_prefix(acc), {"hash": None, "gc_content": None, "length": None})
    return [(cid, meta["hash"], meta["gc_content"], meta["length"]) for cid, meta in contigs.items()]


# ---------------------------------------------------------------------
# PARSE CONTIG <-> CONTIG_COLLECTION
# ---------------------------------------------------------------------
def load_contig_x_contig_collection(data: dict) -> list[tuple[str, str]]:
    """Link each report's first contig to its assembly (contig collection)."""
    links = set()
    for report in data.get("reports", []):
        ann = report.get("annotation", {})
        regions = ann.get("genomic_regions", [])
        annotations = ann.get("annotations", [])
        if not regions or not annotations:
            continue
        contig = regions[0].get("gene_range", {}).get("accession_version")
        assembly = annotations[0].get("assembly_accession")
        if contig and assembly:
            # NOTE(review): contigs are hard-prefixed "refseq:" here while
            # load_contigs uses apply_prefix(); identical for NC_* accessions
            # but divergent otherwise — confirm which is intended.
            links.add((f"refseq:{contig}", apply_prefix(assembly)))
    return list(links)


# ---------------------------------------------------------------------
# DELTA TABLE
# ---------------------------------------------------------------------
def write_to_table(
    spark: SparkSession,
    records: list[tuple],
    table_name: str,
    database: str = "default",
) -> None:
    """Overwrite ``database.table_name`` with *records* using the CDM schema.

    Empty record lists are skipped entirely (the table is not created).
    """
    if records:
        spark.createDataFrame(records, CDM_SCHEMA[table_name]).write.format("delta").mode("overwrite").option(
            "overwriteSchema", "true"
        ).saveAsTable(f"{database}.{table_name}")


# ---------------------------------------------------------------------
# SQL PREVIEW
# ---------------------------------------------------------------------

# (loader, table name) pairs in write order; also the source of CDM_TABLES.
_TABLE_LOADERS = [
    (load_identifiers, "Identifier"),
    (load_names, "Name"),
    (load_feature_records, "Feature"),
    (load_contig_collection_x_feature, "ContigCollection_x_Feature"),
    (load_contig_collection_x_protein, "ContigCollection_x_Protein"),
    (load_feature_x_protein, "Feature_x_Protein"),
    (load_contigs, "Contig"),
    (load_contig_x_contig_collection, "Contig_x_ContigCollection"),
]

CDM_TABLES = [table for _, table in _TABLE_LOADERS]


def run_sql_query(spark: SparkSession, database: str = "default") -> None:
    """Print the first 20 rows of every CDM table in *database*."""
    spark.sql(f"USE {database}")
    for table in CDM_TABLES:
        print(f"\n[SQL Preview] {table}")
        spark.sql(f"SELECT * FROM {table} LIMIT 20").show(truncate=False)


def parse_annotation_data(spark: SparkSession, datasets: list[dict], namespace: str) -> None:
    """Parse each dataset and write all CDM tables into *namespace*."""
    for data in datasets:
        for loader, table in _TABLE_LOADERS:
            write_to_table(spark, loader(data), table, namespace)


# ---------------------------------------------------------------------
# CLI ENTRY
# ---------------------------------------------------------------------
def main():
    """CLI entry point: fetch/load annotation JSON and write CDM Delta tables."""
    parser = argparse.ArgumentParser(description="RefSeq Annotation Parser to CDM")

    # -------------------------
    # Input options
    # -------------------------
    parser.add_argument("--accession", type=str, help="RefSeq genome accession (e.g. GCF_000869125.1)")
    parser.add_argument("--input_file", type=str, help="Path to a RefSeq annotation JSON file.")
    parser.add_argument("--input_dir", type=str, help="Directory containing RefSeq annotation JSON files.")

    # -------------------------
    # Output / runtime options
    # -------------------------
    parser.add_argument(
        "--namespace",
        default="refseq_api",
        help="Database to write Delta tables.",
    )
    parser.add_argument(
        "--tenant",
        default=None,
        help="Tenant SQL warehouse to use.",
    )
    parser.add_argument(
        "--query",
        action="store_true",
        help="Preview SQL output after writing.",
    )

    args = parser.parse_args()

    # -----------------------------------------
    # Input validation
    # -----------------------------------------
    if not args.accession and not args.input_file and not args.input_dir:
        raise ValueError("provide --accession, --input_file, or --input_dir.")

    # -----------------------------------------
    # Initialize Spark
    # -----------------------------------------
    spark = init_spark_and_db("RefSeq Annotation Parser", args.namespace)

    if args.tenant:
        spark.sql(f"USE CATALOG {args.tenant}")

    # 
----------------------------------------- + # Load annotation data + # ----------------------------------------- + datasets: list[dict] = [] + + if args.accession: + # Fetch from NCBI Datasets API + data = fetch_annotation_json(args.accession) + datasets.append(data) + + if args.input_file: + with open(args.input_file) as f: + datasets.append(json.load(f)) + + if args.input_dir: + for path in Path(args.input_dir).rglob("*.json"): + with open(path) as f: + datasets.append(json.load(f)) + + parse_annotation_data(spark, datasets, args.namespace) + + # ----------------------------------------- + # SQL preview + # ----------------------------------------- + if args.query: + run_sql_query(spark, args.namespace) + + spark.stop() + + +if __name__ == "__main__": + main() diff --git a/tests/data/refseq/annotation_report.json b/tests/data/refseq/annotation_report.json new file mode 100644 index 0000000..53cd0d6 --- /dev/null +++ b/tests/data/refseq/annotation_report.json @@ -0,0 +1,105 @@ +{ + "reports": [ + { + "annotation": { + "gene_id": "4156250", + "name": "hypothetical protein", + "gene_type": "protein-coding", + "locus_tag": "MIV001R", + "genomic_regions": [ + { + "gene_range": { + "accession_version": "NC_008187.1", + "range": [ + { + "begin": "2620", + "end": "3066", + "orientation": "plus" + } + ] + } + } + ], + "proteins": [ + { + "accession_version": "YP_654573.1", + "name": "hypothetical protein", + "length": 148 + } + ], + "annotations": [ + { + "assembly_accession": "GCF_000869125.1" + } + ] + }, + "row_id": "1" + }, + { + "annotation": { + "gene_id": "4156251", + "name": "hypodermical protein", + "gene_type": "protein-coding", + "locus_tag": "MIV002R", + "genomic_regions": [ + { + "gene_range": { + "accession_version": "NC_008187.1", + "range": [ + { + "begin": "3603", + "end": "4979", + "orientation": "plus" + } + ] + } + } + ], + "proteins": [ + { + "accession_version": "YP_654574.1", + "name": "hypothetical protein", + "length": 458 + } + ], + 
"annotations": [ + { + "assembly_accession": "GCF_000869125.1" + } + ] + }, + "row_id": "2" + }, + { + "annotation": { + "gene_id": "4156252", + "name": "very hypothetical protein", + "gene_type": "protein-coding", + "locus_tag": "MIV003R", + "symbol": "kappa-delta-phi", + "genomic_regions": [ + { + "gene_range": { + "accession_version": "NC_008187.1", + "range": [ + { + "begin": "5168", + "end": "5638", + "orientation": "minus" + } + ] + } + } + ], + "proteins": [], + "annotations": [ + { + "assembly_accession": "GCF_000869125.1" + } + ] + }, + "row_id": "3" + } + ], + "total_count": 3 +} diff --git a/tests/data/refseq/annotation_report.parsed.json b/tests/data/refseq/annotation_report.parsed.json new file mode 100644 index 0000000..178e040 --- /dev/null +++ b/tests/data/refseq/annotation_report.parsed.json @@ -0,0 +1,239 @@ +{ + "contig": [ + { + "contig_id": "refseq:NC_008187.1", + "hash": null, + "gc_content": null, + "length": null + } + ], + "contig_x_contigcollection": [ + { + "contig_id": "refseq:NC_008187.1", + "contig_collection_id": "insdc.gcf:GCF_000869125.1" + } + ], + "contig_x_feature": [ + { + "contig_id": "refseq:NC_008187.1", + "feature_id": "ncbigene:4156250" + }, + { + "contig_id": "refseq:NC_008187.1", + "feature_id": "ncbigene:4156251" + }, + { + "contig_id": "refseq:NC_008187.1", + "feature_id": "ncbigene:4156252" + } + ], + "contig_x_protein": [ + { + "contig_id": "refseq:NC_008187.1", + "protein_id": "refseq:YP_654573.1" + }, + { + "contig_id": "refseq:NC_008187.1", + "protein_id": "refseq:YP_654574.1" + } + ], + "contigcollection": [ + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "hash": null + } + ], + "contigcollection_x_feature": [ + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "feature_id": "ncbigene:4156250" + }, + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "feature_id": "ncbigene:4156251" + }, + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "feature_id": "ncbigene:4156252" + } 
+ ], + "contigcollection_x_protein": [ + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "protein_id": "refseq:YP_654573.1" + }, + { + "contig_collection_id": "insdc.gcf:GCF_000869125.1", + "protein_id": "refseq:YP_654574.1" + } + ], + "feature_x_protein": [ + { + "feature_id": "ncbigene:4156250", + "protein_id": "refseq:YP_654573.1" + }, + { + "feature_id": "ncbigene:4156251", + "protein_id": "refseq:YP_654574.1" + } + ], + "feature": [ + { + "feature_id": "ncbigene:4156250", + "hash": null, + "cds_phase": null, + "e_value": null, + "end": 3066, + "p_value": null, + "start": 2620, + "strand": "positive", + "source_database": "ncbigene", + "protocol_id": null, + "type": "protein-coding" + }, + { + "feature_id": "ncbigene:4156251", + "hash": null, + "cds_phase": null, + "e_value": null, + "end": 4979, + "p_value": null, + "start": 3603, + "strand": "positive", + "source_database": "ncbigene", + "protocol_id": null, + "type": "protein-coding" + }, + { + "feature_id": "ncbigene:4156251", + "hash": null, + "cds_phase": null, + "e_value": null, + "end": 5638, + "p_value": null, + "start": 5168, + "strand": "negative", + "source_database": "ncbigene", + "protocol_id": null, + "type": "protein-coding" + } + ], + "identifier": [ + { + "entity_id": "insdc.gcf:GCF_000869125", + "identifier": "insdc.gcf:GCF_000869125", + "description": "RefSeq genome ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "refseq:NC_008187.1", + "identifier": "refseq:NC_008187.1", + "description": "RefSeq assembly ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "ncbigene:4156250", + "identifier": "ncbigene:4156250", + "description": "NCBI gene ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "ncbigene:4156251", + "identifier": "ncbigene:4156251", + "description": "NCBI gene ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "ncbigene:4156252", + "identifier": "ncbigene:4156252", + 
"description": "NCBI gene ID", + "source": "RefSeq", + "relationship": null + }, + { + "entity_id": "refseq:YP_654573.1", + "identifier": "refseq:YP_654573.1", + "description": "RefSeq protein ID", + "source": "RefSeq", + "relationship": null + } + ], + "name": [ + { + "entity_id": "ncbigene:4156250", + "name": "hypothetical protein", + "description": "RefSeq gene name", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156251", + "name": "hypodermical protein", + "description": "RefSeq gene name", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156252", + "name": "very hypothetical protein", + "description": "RefSeq gene name", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156250", + "name": "MIV001R", + "description": "RefSeq locus tag", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156251", + "name": "MIV002R", + "description": "RefSeq locus tag", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156252", + "name": "MIV003R", + "description": "RefSeq locus tag", + "source": "RefSeq" + }, + { + "entity_id": "ncbigene:4156252", + "name": "kappa-delta-phi", + "description": "RefSeq symbol", + "source": "RefSeq" + }, + { + "entity_id": "refseq:YP_654573.1", + "name": "hypothetical protein", + "description": "RefSeq protein name", + "source": "RefSeq" + }, + { + "entity_id": "refseq:YP_654574.1", + "name": "hypothetical protein", + "description": "RefSeq protein name", + "source": "RefSeq" + } + ], + "protein": [ + { + "protein_id": "refseq:YP_654573.1", + "hash": null, + "description": null, + "evidence_for_existence": null, + "length": null, + "sequence": null + }, + { + "protein_id": "refseq:YP_654574.1", + "hash": null, + "description": null, + "evidence_for_existence": null, + "length": null, + "sequence": null + } + ] +} diff --git a/tests/parsers/refseq/__init__.py b/tests/parsers/refseq/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/parsers/refseq/api/__init__.py 
b/tests/parsers/refseq/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/parsers/refseq/api/test_annotation_report.py b/tests/parsers/refseq/api/test_annotation_report.py new file mode 100644 index 0000000..434583a --- /dev/null +++ b/tests/parsers/refseq/api/test_annotation_report.py @@ -0,0 +1,757 @@ +### uv run pytest tests/parsers/test_annotation_parse.py + +import json +from pathlib import Path + +import pytest +from cdm_data_loader_utils.parsers.refseq.api.annotation_report import ( + apply_prefix, + load_contig_collection_x_feature, + load_contig_collection_x_protein, + load_contig_x_contig_collection, + load_contigs, + load_feature_records, + load_feature_x_protein, + load_identifiers, + load_names, + parse_annotation_data, + to_int, +) +from pyspark.sql import SparkSession +from pyspark.testing import assertDataFrameEqual, assertSchemaEqual + +from cdm_data_loader_utils.model.kbase_cdm_schema import CDM_SCHEMA +from tests.conftest import TEST_NS + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1234", + "name": "hypothetical protein", + "relationship": "RefSeq gene symbol", + } + } + ] + }, + [ + ( + "ncbigene:1234", + "1234", + "hypothetical protein", + "RefSeq", + "RefSeq gene symbol", + ) + ], + ), + ( + {"reports": [{"annotation": {"gene_id": "5678", "name": "some protein"}}]}, + [("ncbigene:5678", "5678", "some protein", "RefSeq", None)], + ), + ( + { + "reports": [ + { + "annotation": { + "name": "no gene id here", + "relationship": "RefSeq locus tag", + } + } + ] + }, + [], + ), + ], +) +def test_load_identifiers(input_data, expected_output): + result = load_identifiers(input_data) + assert result == expected_output + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: all name fields present + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1234", + "symbol": "abc", + "name": "ABC protein", + "locus_tag": 
"LTG_1234", + } + } + ] + }, + [ + ("ncbigene:1234", "abc", "RefSeq gene symbol", "RefSeq"), + ("ncbigene:1234", "ABC protein", "RefSeq gene name", "RefSeq"), + ("ncbigene:1234", "LTG_1234", "RefSeq locus tag", "RefSeq"), + ], + ), + # Case 2: only gene_name present + ( + {"reports": [{"annotation": {"gene_id": "5678", "name": "Hypothetical protein"}}]}, + [ + ( + "ncbigene:5678", + "Hypothetical protein", + "RefSeq gene name", + "RefSeq", + ) + ], + ), + # Case 3: no gene_id + ( + {"reports": [{"annotation": {"name": "Unnamed", "symbol": "XYZ"}}]}, + [], + ), + # Case 4: only locus_tag present + ( + {"reports": [{"annotation": {"gene_id": "8888", "locus_tag": "LTG_8888"}}]}, + [("ncbigene:8888", "LTG_8888", "RefSeq locus tag", "RefSeq")], + ), + # Case 5: multiple reports + ( + { + "reports": [ + {"annotation": {"gene_id": "1001", "symbol": "DEF"}}, + {"annotation": {"gene_id": "1002", "name": "DEF protein"}}, + ] + }, + [ + ("ncbigene:1001", "DEF", "RefSeq gene symbol", "RefSeq"), + ("ncbigene:1002", "DEF protein", "RefSeq gene name", "RefSeq"), + ], + ), + ], +) +def test_load_names(input_data, expected_output): + result = load_names(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: basic valid input with plus strand + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1234", + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "100", + "end": "200", + "orientation": "plus", + } + ] + } + } + ], + } + } + ] + }, + [ + ( + "ncbigene:1234", + None, + None, + None, + 200, + None, + 100, + "positive", + "RefSeq", + None, + "gene", + ) + ], + ), + # Case 2: multiple ranges, different strands + ( + { + "reports": [ + { + "annotation": { + "gene_id": "5678", + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "300", + "end": "500", + "orientation": "minus", + }, + { + "begin": "600", + "end": "800", + "orientation": "plus", + }, + 
] + } + } + ], + } + } + ] + }, + [ + ( + "ncbigene:5678", + None, + None, + None, + 500, + None, + 300, + "negative", + "RefSeq", + None, + "gene", + ), + ( + "ncbigene:5678", + None, + None, + None, + 800, + None, + 600, + "positive", + "RefSeq", + None, + "gene", + ), + ], + ), + # Case 3: missing orientation + ( + { + "reports": [ + { + "annotation": { + "gene_id": "9999", + "genomic_regions": [{"gene_range": {"range": [{"begin": "1", "end": "2"}]}}], + } + } + ] + }, + [ + ( + "ncbigene:9999", + None, + None, + None, + 2, + None, + 1, + "unknown", + "RefSeq", + None, + "gene", + ) + ], + ), + # Case 4: no gene_id + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "100", + "end": "200", + "orientation": "plus", + } + ] + } + } + ] + } + } + ] + }, + [], + ), + # Case 5: non-integer start/end + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1111", + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "abc", + "end": "xyz", + "orientation": "plus", + } + ] + } + } + ], + } + } + ] + }, + [ + ( + "ncbigene:1111", + None, + None, + None, + None, + None, + None, + "positive", + "RefSeq", + None, + "gene", + ) + ], + ), + ], +) +def test_load_feature_records(input_data, expected_output): + result = load_feature_records(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: valid mapping + ( + { + "reports": [ + { + "annotation": { + "gene_id": "12345", + "genomic_regions": [{"gene_range": {"accession_version": "NC_000001.11"}}], + } + } + ] + }, + [("refseq:NC_000001.11", "ncbigene:12345")], + ), + # Case 2: no gene_id + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {"accession_version": "NC_000002.11"}}]}}]}, + [], + ), + # Case 3: no genomic_regions + ( + {"reports": [{"annotation": {"gene_id": "67890"}}]}, + [], + ), + # Case 4: empty genomic_regions list + ( + 
{"reports": [{"annotation": {"gene_id": "99999", "genomic_regions": []}}]}, + [], + ), + # Case 5: missing accession_version + ( + { + "reports": [ + { + "annotation": { + "gene_id": "13579", + "genomic_regions": [{"gene_range": {}}], + } + } + ] + }, + [], + ), + ], +) +def test_load_contig_collection_x_feature(input_data, expected_output): + result = load_contig_collection_x_feature(input_data) + assert result == expected_output + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: Valid report with multiple proteins + ( + { + "reports": [ + { + "annotation": { + "proteins": [ + {"accession_version": "XP_123"}, + {"accession_version": "XP_456"}, + ], + "annotations": [{"assembly_accession": "GCF_000001"}], + } + } + ] + }, + [ + ("insdc.gcf:GCF_000001", "refseq:XP_123"), + ("insdc.gcf:GCF_000001", "refseq:XP_456"), + ], + ), + # Case 2: No proteins + ( + { + "reports": [ + { + "annotation": { + "proteins": [], + "annotations": [{"assembly_accession": "GCF_000002"}], + } + } + ] + }, + [], + ), + # Case 3: No annotations + ( + {"reports": [{"annotation": {"proteins": [{"accession_version": "XP_789"}]}}]}, + [], + ), + # Case 4: Missing assembly_accession + ( + { + "reports": [ + { + "annotation": { + "proteins": [{"accession_version": "XP_789"}], + "annotations": [{}], + } + } + ] + }, + [], + ), + # Case 5: Some proteins missing accession_version + ( + { + "reports": [ + { + "annotation": { + "proteins": [ + {"accession_version": "XP_111"}, + {}, + {"accession_version": "XP_222"}, + ], + "annotations": [{"assembly_accession": "GCF_000003"}], + } + } + ] + }, + [ + ("insdc.gcf:GCF_000003", "refseq:XP_111"), + ("insdc.gcf:GCF_000003", "refseq:XP_222"), + ], + ), + ], +) +def test_load_contig_collection_x_protein(input_data, expected_output): + result = load_contig_collection_x_protein(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: valid gene 
with multiple proteins + ( + { + "reports": [ + { + "annotation": { + "gene_id": "4156311", + "proteins": [ + {"accession_version": "XP_001"}, + {"accession_version": "XP_002"}, + ], + } + } + ] + }, + [ + ("ncbigene:4156311", "refseq:XP_001"), + ("ncbigene:4156311", "refseq:XP_002"), + ], + ), + # Case 2: no gene_id + ( + {"reports": [{"annotation": {"proteins": [{"accession_version": "XP_999"}]}}]}, + [], + ), + # Case 3: gene with no proteins + ( + {"reports": [{"annotation": {"gene_id": "4156312"}}]}, + [], + ), + # Case 4: some proteins missing accession_version + ( + { + "reports": [ + { + "annotation": { + "gene_id": "4156313", + "proteins": [ + {"accession_version": "XP_777"}, + {}, + {"accession_version": "XP_888"}, + ], + } + } + ] + }, + [ + ("ncbigene:4156313", "refseq:XP_777"), + ("ncbigene:4156313", "refseq:XP_888"), + ], + ), + # Case 5: empty report list + ({"reports": []}, []), + ], +) +def test_load_feature_x_protein(input_data, expected_output): + result = load_feature_x_protein(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: Valid contig and assembly + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000001.11"}}], + "annotations": [{"assembly_accession": "GCF_000001.1"}], + } + } + ] + }, + [("refseq:NC_000001.11", "insdc.gcf:GCF_000001.1")], + ), + # Case 2: Missing genomic_regions + ( + {"reports": [{"annotation": {"annotations": [{"assembly_accession": "GCF_000002.1"}]}}]}, + [], + ), + # Case 3: Missing annotations + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {"accession_version": "NC_000003.11"}}]}}]}, + [], + ), + # Case 4: Missing accession_version in region + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {}}], + "annotations": [{"assembly_accession": "GCF_000004.1"}], + } + } + ] + }, + [], + ), + # Case 5: Missing 
assembly_accession in annotations + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000005.11"}}], + "annotations": [{}], + } + } + ] + }, + [], + ), + # Case 6: Multiple reports, one valid + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000006.11"}}], + "annotations": [{"assembly_accession": "GCF_000006.1"}], + } + }, + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000007.11"}}], + "annotations": [{}], + } + }, + ] + }, + [("refseq:NC_000006.11", "insdc.gcf:GCF_000006.1")], + ), + ], +) +def test_load_contig_x_contig_collection(input_data, expected_output): + result = load_contig_x_contig_collection(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: Valid contig with accession_version + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {"accession_version": "NC_000001.11"}}]}}]}, + [("refseq:NC_000001.11", None, None, None)], + ), + # Case 2: Multiple contigs, different accession_versions + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [ + {"gene_range": {"accession_version": "NC_000001.11"}}, + {"gene_range": {"accession_version": "NC_000002.12"}}, + ] + } + } + ] + }, + [ + ("refseq:NC_000001.11", None, None, None), + ("refseq:NC_000002.12", None, None, None), + ], + ), + # Case 3: Duplicate accession versions + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [ + {"gene_range": {"accession_version": "NC_000003.13"}}, + {"gene_range": {"accession_version": "NC_000003.13"}}, + ] + } + } + ] + }, + [("refseq:NC_000003.13", None, None, None)], + ), + # Case 4: Missing accession_version + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {}}]}}]}, + [], + ), + # Case 5: Empty reports + ( + {"reports": []}, + [], + ), + ], +) +def 
test_load_contigs(input_data, expected_output): + result = load_contigs(input_data) + assert sorted(result) == sorted(expected_output) + + +### add new test: to_int +@pytest.mark.parametrize( + "input_id, expected", + [ + ("GeneID:123", "ncbigene:123"), + ("YP_009725307.1", "refseq:YP_009725307.1"), + ("GCF_000001405.39", "insdc.gcf:GCF_000001405.39"), + ("random", "random"), + ], +) +def test_apply_prefix(input_id, expected): + assert apply_prefix(input_id) == expected + + +@pytest.mark.parametrize("val, expected", [("123", 123), ("abc", None), ("", None)]) +def test_to_int(val, expected): + assert to_int(val) == expected + + +TABLE_NAME_MAP = { + "contig": "Contig", + "feature": "Feature", + "identifier": "Identifier", + "name": "Name", + "contig_x_contigcollection": "Contig_x_ContigCollection", + "contigcollection_x_feature": "ContigCollection_x_Feature", + "contigcollection_x_protein": "ContigCollection_x_Protein", + "feature_x_protein": "Feature_x_Protein", +} + + +@pytest.mark.requires_spark +def test_parse_annotation_data(spark: SparkSession, test_data_dir: Path) -> None: + """Test the parsing of the annotation data.""" + spark.sql(f"CREATE DATABASE IF NOT EXISTS {TEST_NS}") + spark.catalog.setCurrentDatabase(TEST_NS) + + # Load NCBI dataset from NCBI API + sample_api_response = test_data_dir / "refseq" / "annotation_report.json" + dataset = json.load(sample_api_response.open()) + + # Run parse function + parse_annotation_data(spark, [dataset], TEST_NS) + + # Expected tables to validate from output + expected_tables = [ + "contig", + "contig_x_contigcollection", + "contigcollection_x_feature", + "contigcollection_x_protein", + "feature", + "feature_x_protein", + "identifier", + "name", + ] + + for table_name in expected_tables: + result_df = spark.table(f"{TEST_NS}.{table_name}") + schema_key = TABLE_NAME_MAP[table_name] + + # Construct expected_df just for schema comparison + rows = [r.asDict() for r in result_df.collect()] + expected_df = 
spark.createDataFrame(rows, schema=CDM_SCHEMA[schema_key]) + + # Assert schema match + assertSchemaEqual( + expected_df.schema, + result_df.schema, + ) + # Assert content match + assertDataFrameEqual( + expected_df, + result_df, + ) From 6039dc66486696488d3bcdee257ef73abf9e230c Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Fri, 30 Jan 2026 08:55:48 -0800 Subject: [PATCH 2/3] Restoring deleted test --- ...notation_parse.py => annotation_report.py} | 0 .../refseq/api/test_annotation_report.py | 100 +++++++----------- 2 files changed, 41 insertions(+), 59 deletions(-) rename src/cdm_data_loader_utils/parsers/refseq/api/{annotation_parse.py => annotation_report.py} (100%) diff --git a/src/cdm_data_loader_utils/parsers/refseq/api/annotation_parse.py b/src/cdm_data_loader_utils/parsers/refseq/api/annotation_report.py similarity index 100% rename from src/cdm_data_loader_utils/parsers/refseq/api/annotation_parse.py rename to src/cdm_data_loader_utils/parsers/refseq/api/annotation_report.py diff --git a/tests/parsers/refseq/api/test_annotation_report.py b/tests/parsers/refseq/api/test_annotation_report.py index 434583a..567c7bb 100644 --- a/tests/parsers/refseq/api/test_annotation_report.py +++ b/tests/parsers/refseq/api/test_annotation_report.py @@ -2,8 +2,13 @@ import json from pathlib import Path +from typing import Any import pytest +from pyspark.sql import SparkSession +from pyspark.testing import assertDataFrameEqual, assertSchemaEqual + +from cdm_data_loader_utils.model.kbase_cdm_schema import CDM_SCHEMA from cdm_data_loader_utils.parsers.refseq.api.annotation_report import ( apply_prefix, load_contig_collection_x_feature, @@ -17,15 +22,13 @@ parse_annotation_data, to_int, ) -from pyspark.sql import SparkSession -from pyspark.testing import assertDataFrameEqual, assertSchemaEqual - -from cdm_data_loader_utils.model.kbase_cdm_schema import CDM_SCHEMA from tests.conftest import TEST_NS +CDM_SCHEMA_LC = {k.lower(): v for k, v in CDM_SCHEMA.items()} + 
@pytest.mark.parametrize( - "input_data, expected_output", + ("input_data", "expected_output"), [ ( { @@ -68,13 +71,13 @@ ), ], ) -def test_load_identifiers(input_data, expected_output): +def test_load_identifiers(input_data: dict[str, Any], expected_output: list[tuple]) -> None: result = load_identifiers(input_data) assert result == expected_output @pytest.mark.parametrize( - "input_data, expected_output", + ("input_data", "expected_output"), [ # Case 1: all name fields present ( @@ -133,13 +136,13 @@ def test_load_identifiers(input_data, expected_output): ), ], ) -def test_load_names(input_data, expected_output): +def test_load_names(input_data: dict[str, Any], expected_output: list[tuple]) -> None: result = load_names(input_data) assert sorted(result) == sorted(expected_output) @pytest.mark.parametrize( - "input_data, expected_output", + ("input_data", "expected_output"), [ # Case 1: basic valid input with plus strand ( @@ -334,13 +337,13 @@ def test_load_names(input_data, expected_output): ), ], ) -def test_load_feature_records(input_data, expected_output): +def test_load_feature_records(input_data: dict[str, Any], expected_output: list[tuple]): result = load_feature_records(input_data) assert sorted(result) == sorted(expected_output) @pytest.mark.parametrize( - "input_data, expected_output", + ("input_data", "expected_output"), [ # Case 1: valid mapping ( @@ -387,13 +390,13 @@ def test_load_feature_records(input_data, expected_output): ), ], ) -def test_load_contig_collection_x_feature(input_data, expected_output): +def test_load_contig_collection_x_feature(input_data: dict[str, Any], expected_output) -> None: result = load_contig_collection_x_feature(input_data) assert result == expected_output @pytest.mark.parametrize( - "input_data, expected_output", + ("input_data", "expected_output"), [ # Case 1: Valid report with multiple proteins ( @@ -471,13 +474,13 @@ def test_load_contig_collection_x_feature(input_data, expected_output): ), ], ) -def 
test_load_contig_collection_x_protein(input_data, expected_output): +def test_load_contig_collection_x_protein(input_data: dict[str, Any], expected_output: list[tuple]) -> None: result = load_contig_collection_x_protein(input_data) assert sorted(result) == sorted(expected_output) @pytest.mark.parametrize( - "input_data, expected_output", + ("input_data", "expected_output"), [ # Case 1: valid gene with multiple proteins ( @@ -534,13 +537,13 @@ def test_load_contig_collection_x_protein(input_data, expected_output): ({"reports": []}, []), ], ) -def test_load_feature_x_protein(input_data, expected_output): +def test_load_feature_x_protein(input_data: dict[str, Any], expected_output: list[tuple[str, str]]) -> None: result = load_feature_x_protein(input_data) assert sorted(result) == sorted(expected_output) @pytest.mark.parametrize( - "input_data, expected_output", + ("input_data", "expected_output"), [ # Case 1: Valid contig and assembly ( @@ -616,13 +619,13 @@ def test_load_feature_x_protein(input_data, expected_output): ), ], ) -def test_load_contig_x_contig_collection(input_data, expected_output): +def test_load_contig_x_contig_collection(input_data: dict[str, Any], expected_output: list[tuple[str, str]]) -> None: result = load_contig_x_contig_collection(input_data) assert sorted(result) == sorted(expected_output) @pytest.mark.parametrize( - "input_data, expected_output", + ("input_data", "expected_output"), [ # Case 1: Valid contig with accession_version ( @@ -676,14 +679,14 @@ def test_load_contig_x_contig_collection(input_data, expected_output): ), ], ) -def test_load_contigs(input_data, expected_output): +def test_load_contigs(input_data: dict[str, Any], expected_output: list[tuple]) -> None: result = load_contigs(input_data) assert sorted(result) == sorted(expected_output) ### add new test: to_int @pytest.mark.parametrize( - "input_id, expected", + ("input_id", "expected"), [ ("GeneID:123", "ncbigene:123"), ("YP_009725307.1", "refseq:YP_009725307.1"), @@ -691,59 
+694,35 @@ def test_load_contigs(input_data, expected_output): ("random", "random"), ], ) -def test_apply_prefix(input_id, expected): +def test_apply_prefix(input_id: str, expected: str) -> None: assert apply_prefix(input_id) == expected -@pytest.mark.parametrize("val, expected", [("123", 123), ("abc", None), ("", None)]) -def test_to_int(val, expected): +@pytest.mark.parametrize(("val", "expected"), [("123", 123), ("abc", None), ("", None)]) +def test_to_int(val: str, expected: int | None) -> None: assert to_int(val) == expected -TABLE_NAME_MAP = { - "contig": "Contig", - "feature": "Feature", - "identifier": "Identifier", - "name": "Name", - "contig_x_contigcollection": "Contig_x_ContigCollection", - "contigcollection_x_feature": "ContigCollection_x_Feature", - "contigcollection_x_protein": "ContigCollection_x_Protein", - "feature_x_protein": "Feature_x_Protein", -} - - @pytest.mark.requires_spark def test_parse_annotation_data(spark: SparkSession, test_data_dir: Path) -> None: - """Test the parsing of the annotation data.""" + """ + Test that parse_annotation_data produces expected tables with correct schemas and non-empty output. 
+ """ spark.sql(f"CREATE DATABASE IF NOT EXISTS {TEST_NS}") - spark.catalog.setCurrentDatabase(TEST_NS) - - # Load NCBI dataset from NCBI API + # Load and parse test JSON sample_api_response = test_data_dir / "refseq" / "annotation_report.json" dataset = json.load(sample_api_response.open()) - - # Run parse function + # Run the parser parse_annotation_data(spark, [dataset], TEST_NS) - # Expected tables to validate from output - expected_tables = [ - "contig", - "contig_x_contigcollection", - "contigcollection_x_feature", - "contigcollection_x_protein", - "feature", - "feature_x_protein", - "identifier", - "name", - ] + # Load expected results JSON + sample_api_response = test_data_dir / "refseq" / "annotation_report.parsed.json" + expected_tables = json.load(sample_api_response.open()) + result_df = {table.name: spark.table(f"{TEST_NS}.{table.name}") for table in spark.catalog.listTables(TEST_NS)} - for table_name in expected_tables: + for table_name in result_df: result_df = spark.table(f"{TEST_NS}.{table_name}") - schema_key = TABLE_NAME_MAP[table_name] - - # Construct expected_df just for schema comparison - rows = [r.asDict() for r in result_df.collect()] - expected_df = spark.createDataFrame(rows, schema=CDM_SCHEMA[schema_key]) + expected_df = spark.createDataFrame(expected_tables[table_name], schema=CDM_SCHEMA_LC[table_name]) # Assert schema match assertSchemaEqual( @@ -755,3 +734,6 @@ def test_parse_annotation_data(spark: SparkSession, test_data_dir: Path) -> None expected_df, result_df, ) + + # make sure that all expected tables are present + assert set(expected_tables) == set(result_df) From b0cef542bb2fbe35e87cc1ac42872702a6b02a0e Mon Sep 17 00:00:00 2001 From: YueWang Date: Fri, 30 Jan 2026 17:30:00 -0800 Subject: [PATCH 3/3] Add extra schemas --- pyproject.toml | 5 +- .../parsers/refseq/api/annotation_report.py | 173 +++++++++++++++--- uv.lock | 39 ++-- 3 files changed, 165 insertions(+), 52 deletions(-) diff --git a/pyproject.toml b/pyproject.toml 
index a2cc5ab..28a0239 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,8 @@ dependencies = [ "click>=8.3.1", "lxml>=6.0.2", "ruff>=0.14.13", + "requests>=2.31.0", + "delta-spark>=2.4.0" ] [project.scripts] @@ -21,7 +23,8 @@ uniprot = "cdm_data_loader_utils.parsers.uniprot.uniprot_kb:cli" uniref = "cdm_data_loader_utils.pasers.uniprot.uniref:cli" -[dependency-groups] +##[dependency-groups] +[project.optional-dependencies] dev = [ "pytest>=9.0.2", "pytest-asyncio>=1.3.0", diff --git a/src/cdm_data_loader_utils/parsers/refseq/api/annotation_report.py b/src/cdm_data_loader_utils/parsers/refseq/api/annotation_report.py index 47c6882..96885e5 100644 --- a/src/cdm_data_loader_utils/parsers/refseq/api/annotation_report.py +++ b/src/cdm_data_loader_utils/parsers/refseq/api/annotation_report.py @@ -3,7 +3,7 @@ RefSeq annotation parser for transforming NCBI Datasets API JSON into CDM-formatted Delta Lake tables. Usage: -uv run python src/cdm_data_loader_utils/parsers/annotation_parse.py \ +uv run python src/cdm_data_loader_utils/parsers/refseq/api/annotation_report.py \ --accession GCF_000869125.1 \ --namespace refseq_api \ --query @@ -143,21 +143,19 @@ def load_feature_records(data: dict) -> list[tuple]: "minus": "negative", "unstranded": "unstranded", }.get(r.get("orientation"), "unknown") - features.append( - ( - feature_id, - None, - None, - None, - to_int(r.get("end")), - None, - to_int(r.get("begin")), - strand, - "RefSeq", - None, - "gene", - ) - ) + features.append(( + feature_id, + None, + None, + None, + to_int(r.get("end")), + None, + to_int(r.get("begin")), + strand, + "RefSeq", + None, + "gene", + )) return list({tuple(row) for row in features}) @@ -182,7 +180,7 @@ def load_contig_collection_x_feature(data: dict) -> list[tuple[str, str]]: # --------------------------------------------------------------------- -# PARSE CONTIG_COLLECTION <-> PROTEIN +# PARSE CONTIG_COLLECTION <-> PROTEIN %%% # 
--------------------------------------------------------------------- def load_contig_collection_x_protein(data: dict) -> list[tuple[str, str]]: links = [] @@ -255,16 +253,81 @@ def load_contig_x_contig_collection(data: dict) -> list[tuple[str, str]]: assembly = annotations[0].get("assembly_accession") if contig and assembly: - links.append( - ( - f"refseq:{contig}", - apply_prefix(assembly), - ) - ) + links.append(( + f"refseq:{contig}", + apply_prefix(assembly), + )) return list(set(links)) +def load_contig_x_feature(data: dict) -> list[tuple[str, str]]: + """Extract (contig_id, feature_id) pairs.""" + links = [] + + for gene_id, ann in unique_annotations(data): + feature_id = f"ncbigene:{gene_id}" + + for region in ann.get("genomic_regions", []): + acc = region.get("gene_range", {}).get("accession_version") + if acc: + contig_id = apply_prefix(acc) + links.append((contig_id, feature_id)) + + return list(set(links)) + + +def load_contig_x_protein(data: dict) -> list[tuple[str, str]]: + links = [] + + for _, ann in unique_annotations(data): + contig_id = None + + for region in ann.get("genomic_regions", []): + acc = region.get("gene_range", {}).get("accession_version") + if acc: + contig_id = apply_prefix(acc) + break # only take first + + if contig_id: + for p in ann.get("proteins", []): + pid = p.get("accession_version") + if pid: + links.append((contig_id, apply_prefix(pid))) + + return list({tuple(row) for row in links}) + + +### contig collection has 34 rows + + +def load_contig_collections(data: dict) -> list[tuple]: + schema = CDM_SCHEMA["ContigCollection"] + + records = [] + + for report in data.get("reports", []): + for item in report.get("contigcollection", []): + row = tuple(item.get(f) for f in schema.fieldNames()) + records.append(row) + + return records + + +def load_protein(data: dict) -> list[tuple[str, str | None, str | None, str | None, int | None, str | None]]: + """Extract Protein table rows.""" + out = [] + + for _, ann in 
unique_annotations(data): + for p in ann.get("proteins", []): + pid = apply_prefix(p.get("accession_version")) + name = p.get("name") + length = p.get("length") + out.append((pid, None, name, None, length, None)) + + return list({tuple(row) for row in out}) + + # --------------------------------------------------------------------- # DELTA TABLE # --------------------------------------------------------------------- @@ -274,10 +337,26 @@ def write_to_table( table_name: str, database: str = "default", ) -> None: - if records: - spark.createDataFrame(records, CDM_SCHEMA[table_name]).write.format("delta").mode("overwrite").option( - "overwriteSchema", "true" - ).saveAsTable(f"{database}.{table_name}") + if not records: + print(f"[DEBUG] {table_name}: no records") + return + + # print(f"\n[DEBUG] Writing table: {table_name}") + # print(f"[DEBUG] Record sample: {records[0]}") + # print(f"[DEBUG] Record field count: {len(records[0])}") + # print(f"[DEBUG] Schema field count: {len(CDM_SCHEMA[table_name])}") + # print(f"[DEBUG] Schema fields: {[f.name for f in CDM_SCHEMA[table_name]]}") + + df = spark.createDataFrame(records, CDM_SCHEMA[table_name]) + # print(f"[DEBUG] DataFrame row count: {df.count()}") + + df.printSchema() + df.show(truncate=False) + + df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(f"{database}.{table_name}") + print(f"[DEBUG] Writing to {database}.{table_name}, number of records: {len(records)}") + + df.createOrReplaceTempView(table_name) # --------------------------------------------------------------------- @@ -293,14 +372,22 @@ def write_to_table( "Feature_x_Protein", "Contig", "Contig_x_ContigCollection", + "ContigCollection", + "Protein", + "Contig_x_Feature", + "Contig_x_Protein", ] def run_sql_query(spark: SparkSession, database: str = "default") -> None: spark.sql(f"USE {database}") for table in CDM_TABLES: - print(f"\n[SQL Preview] {table}") - spark.sql(f"SELECT * FROM {table} LIMIT 
20").show(truncate=False) + full_table_name = f"{database}.{table}" + print(f"\n[SQL Preview] {full_table_name}") + try: + spark.sql(f"SELECT * FROM {full_table_name} LIMIT 20").show(truncate=False) + except Exception as e: + print(f"[WARNING] Could not query table {full_table_name}: {e}") def parse_annotation_data(spark: SparkSession, datasets: list[dict], namespace: str) -> None: @@ -364,6 +451,34 @@ def parse_annotation_data(spark: SparkSession, datasets: list[dict], namespace: namespace, ) + write_to_table( + spark, + load_contig_x_feature(data), + "Contig_x_Feature", + namespace, + ) + + write_to_table( + spark, + load_contig_x_protein(data), + "Contig_x_Protein", + namespace, + ) + + write_to_table( + spark, + load_contig_collections(data), + "ContigCollection", + namespace, + ) + + write_to_table( + spark, + load_protein(data), + "Protein", + namespace, + ) + # --------------------------------------------------------------------- # CLI ENTRY diff --git a/uv.lock b/uv.lock index 9e5a82b..5999601 100644 --- a/uv.lock +++ b/uv.lock @@ -376,11 +376,13 @@ source = { editable = "." 
} dependencies = [ { name = "biopython" }, { name = "click" }, + { name = "delta-spark" }, { name = "lxml" }, + { name = "requests" }, { name = "ruff" }, ] -[package.dev-dependencies] +[package.optional-dependencies] dev = [ { name = "pytest" }, { name = "pytest-asyncio" }, @@ -405,32 +407,25 @@ xml = [ [package.metadata] requires-dist = [ + { name = "berdl-notebook-utils", marker = "extra == 'local'", git = "https://github.com/BERDataLakehouse/spark_notebook.git?subdirectory=notebook_utils" }, { name = "biopython", specifier = ">=1.86" }, { name = "click", specifier = ">=8.3.1" }, + { name = "delta-spark", specifier = ">=2.4.0" }, + { name = "genson", marker = "extra == 'models'", specifier = ">=1.3.0" }, + { name = "hypothesis", marker = "extra == 'experimental'", specifier = ">=6.150.2" }, + { name = "json2python-models", marker = "extra == 'models'", specifier = ">=0.3.1" }, { name = "lxml", specifier = ">=6.0.2" }, + { name = "mutmut", marker = "extra == 'experimental'", specifier = ">=3.4.0" }, + { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" }, + { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=1.3.0" }, + { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=7.0.0" }, + { name = "pytest-env", marker = "extra == 'dev'", specifier = ">=1.2.0" }, + { name = "requests", specifier = ">=2.31.0" }, { name = "ruff", specifier = ">=0.14.13" }, + { name = "xmlschema", marker = "extra == 'xml'", specifier = ">=4.3.1" }, + { name = "xsdata", extras = ["cli", "lxml"], marker = "extra == 'xml'", specifier = ">=26.1" }, ] - -[package.metadata.requires-dev] -dev = [ - { name = "pytest", specifier = ">=9.0.2" }, - { name = "pytest-asyncio", specifier = ">=1.3.0" }, - { name = "pytest-cov", specifier = ">=7.0.0" }, - { name = "pytest-env", specifier = ">=1.2.0" }, -] -experimental = [ - { name = "hypothesis", specifier = ">=6.150.2" }, - { name = "mutmut", specifier = ">=3.4.0" }, -] -local = [{ name = 
"berdl-notebook-utils", git = "https://github.com/BERDataLakehouse/spark_notebook.git?subdirectory=notebook_utils" }] -models = [ - { name = "genson", specifier = ">=1.3.0" }, - { name = "json2python-models", specifier = ">=0.3.1" }, -] -xml = [ - { name = "xmlschema", specifier = ">=4.3.1" }, - { name = "xsdata", extras = ["cli", "lxml"], specifier = ">=26.1" }, -] +provides-extras = ["dev", "experimental", "local", "models", "xml"] [[package]] name = "cdm-jupyterlab-brand-extension"