From 6d54a392c53c6e410470c07a98e3a022f28dec15 Mon Sep 17 00:00:00 2001 From: "Basli, Adel" Date: Sat, 9 Nov 2024 14:08:45 +0100 Subject: [PATCH 1/2] changing the format --- openfda/adae/annotate.py | 224 +- openfda/adae/pipeline.py | 919 +++--- openfda/adae/tests/pipeline_test.py | 407 +-- .../annotation_table/combine_harmonization.py | 199 +- openfda/annotation_table/extract_unii.py | 120 +- openfda/annotation_table/pipeline.py | 1222 +++---- .../tests/extract_unii_test.py | 102 +- .../annotation_table/tests/pipeline_test.py | 185 +- .../annotation_table/unii_harmonization.py | 196 +- openfda/app.py | 36 +- openfda/caers/parse.py | 27 +- openfda/caers/pipeline.py | 268 +- openfda/classification/pipeline.py | 206 +- openfda/common.py | 499 +-- openfda/config.py | 70 +- openfda/covid19serology/pipeline.py | 108 +- .../covid19serology/tests/pipeline_test.py | 118 +- openfda/deploy/tests/adae/test_endpoint.py | 240 +- .../deploy/tests/device510k/test_endpoint.py | 50 +- .../deploy/tests/deviceclass/test_endpoint.py | 46 +- .../deploy/tests/deviceevent/test_endpoint.py | 2900 +++++++++-------- .../deploy/tests/devicepma/test_endpoint.py | 21 +- .../tests/devicerecall/test_endpoint.py | 213 +- .../tests/devicereglist/test_endpoint.py | 37 +- .../deploy/tests/deviceudi/test_endpoint.py | 546 ++-- .../tests/downloadstats/test_endpoint.py | 53 +- .../deploy/tests/drugevent/test_endpoint.py | 175 +- .../deploy/tests/druglabel/test_endpoint.py | 185 +- .../deploy/tests/drugsfda/test_endpoint.py | 407 +-- .../deploy/tests/enforcement/test_endpoint.py | 115 +- .../deploy/tests/foodevent/test_endpoint.py | 144 +- openfda/deploy/tests/general/test_endpoint.py | 127 +- openfda/deploy/tests/issues/test_endpoint.py | 62 +- openfda/deploy/tests/ndc/test_endpoint.py | 256 +- .../deploy/tests/othernsde/test_endpoint.py | 104 +- .../tests/othersubstance/test_endpoint.py | 575 ++-- .../deploy/tests/serology/test_endpoint.py | 127 +- openfda/deploy/tests/status/test_endpoint.py | 82 +- openfda/device_clearance/pipeline.py | 162 +- openfda/device_clearance/transform.py | 120 +- openfda/device_common.py | 46 +- openfda/device_harmonization/pipeline.py | 478 +-- openfda/device_pma/pipeline.py | 171 +- openfda/device_pma/transform.py | 72 +- openfda/device_recall/pipeline.py | 772 +++-- openfda/device_udi/pipeline.py | 565 ++-- openfda/device_udi/tests/pipeline_test.py | 366 ++- openfda/download_util.py | 29 +- openfda/downloadstats/pipeline.py | 389 ++- openfda/drugsfda/annotate.py | 185 +- openfda/drugsfda/pipeline.py | 1205 +++---- openfda/drugsfda/tests/pipeline_test.py | 104 +- openfda/elasticsearch_requests.py | 112 +- openfda/export/pipeline.py | 800 +++-- openfda/faers/annotate.py | 242 +- openfda/faers/pipeline.py | 342 +- openfda/faers/tests/annotate_test.py | 50 +- openfda/faers/xml_to_json.py | 278 +- openfda/index_util.py | 990 +++--- openfda/maude/generate_maude_mapping.py | 126 +- openfda/maude/pipeline.py | 1477 +++++---- openfda/ndc/annotate.py | 160 +- openfda/ndc/pipeline.py | 601 ++-- openfda/ndc/tests/pipeline_test.py | 961 +++--- openfda/nsde/pipeline.py | 132 +- openfda/parallel/__init__.py | 24 +- openfda/parallel/inputs.py | 449 +-- openfda/parallel/luigi_support.py | 64 +- openfda/parallel/mapper.py | 32 +- openfda/parallel/mapreduce.py | 483 +-- openfda/parallel/outputs.py | 216 +- openfda/parallel/reducer.py | 205 +- openfda/parallel/sharded_db.py | 108 +- openfda/parallel/watchdog.py | 75 +- openfda/registration/pipeline.py | 1049 +++--- openfda/res/annotate.py | 326 +- 
openfda/res/ean.py | 124 +- openfda/res/extract.py | 86 +- openfda/res/pipeline.py | 450 +-- openfda/res/tests/extract_test.py | 152 +- openfda/spl/annotate.py | 216 +- openfda/spl/extract.py | 115 +- openfda/spl/fix_date.py | 62 +- openfda/spl/pipeline.py | 492 +-- openfda/spl/process_barcodes.py | 164 +- openfda/spl/tests/date_fix_test.py | 42 +- openfda/spl/tests/extract_test.py | 331 +- openfda/spl/tests/process_barcodes_test.py | 105 +- openfda/substance_data/pipeline.py | 1410 ++++---- openfda/tasks.py | 112 +- openfda/test_common.py | 17 +- openfda/tests/api_test.py | 100 +- openfda/tests/api_test_helpers.py | 196 +- openfda/tests/common_test.py | 55 +- openfda/tests/index_util_test.py | 109 +- openfda/tests/luigi_test.py | 59 +- openfda/tests/parallel_test.py | 338 +- openfda/tobacco_problem/pipeline.py | 183 +- scripts/convert_json_to_yaml.py | 10 +- scripts/dbreader.py | 86 +- scripts/generate_fields_yaml.py | 400 +-- scripts/generate_mapping_from_sections.py | 78 +- scripts/generate_schema.py | 92 +- scripts/generate_tests.py | 77 +- setup.py | 82 +- 105 files changed, 16579 insertions(+), 14501 deletions(-) diff --git a/openfda/adae/annotate.py b/openfda/adae/annotate.py index 1815837..842cb4d 100644 --- a/openfda/adae/annotate.py +++ b/openfda/adae/annotate.py @@ -6,150 +6,158 @@ import simplejson as json from openfda import parallel -STRIP_PUNCT_RE = re.compile('[^a-zA-Z -]+') +STRIP_PUNCT_RE = re.compile("[^a-zA-Z -]+") def normalize_product_name(product_name): - 'Simple drugname normalization: strip punctuation and whitespace and lowercase.' - product_name = product_name.strip().lower() - return STRIP_PUNCT_RE.sub('', product_name) + "Simple drugname normalization: strip punctuation and whitespace and lowercase." + product_name = product_name.strip().lower() + return STRIP_PUNCT_RE.sub("", product_name) + def normalize_product_ndc(product_ndc): - 'Simple ndc normalization: strip letters, whitespace, and trim 10 digit ndcs.' - if any(i.isdigit() for i in product_ndc): - product_ndc = re.sub("[^\d-]", "", product_ndc.strip()) - if len(product_ndc) > 9: - product_ndc = product_ndc[:9] - return product_ndc + "Simple ndc normalization: strip letters, whitespace, and trim 10 digit ndcs." + if any(i.isdigit() for i in product_ndc): + product_ndc = re.sub("[^\d-]", "", product_ndc.strip()) + if len(product_ndc) > 9: + product_ndc = product_ndc[:9] + return product_ndc def read_json_file(json_file): - for line in json_file: - yield json.loads(line) + for line in json_file: + yield json.loads(line) def read_harmonized_file(harmonized_file): - ''' - Create a dictionary that is keyed by NDC package codes as well as drug names. First brand_name, then - generic_name. Used to join against the animal event drug names. - ''' - return_dict = collections.defaultdict(list) - for row in read_json_file(harmonized_file): - if "animal" in row['product_type'].lower(): + """ + Create a dictionary that is keyed by NDC package codes as well as drug names. First brand_name, then + generic_name. Used to join against the animal event drug names. 
+ """ + return_dict = collections.defaultdict(list) + for row in read_json_file(harmonized_file): + if "animal" in row["product_type"].lower(): - drug = row['brand_name'].strip().lower() if 'brand_name' in row else None - ndc = row['product_ndc'].strip() if 'product_ndc' in row else None + drug = row["brand_name"].strip().lower() if "brand_name" in row else None + ndc = row["product_ndc"].strip() if "product_ndc" in row else None - if drug: - return_dict[normalize_product_name(drug)].append(row) - if ndc: - return_dict[ndc].append(row) + if drug: + return_dict[normalize_product_name(drug)].append(row) + if ndc: + return_dict[ndc].append(row) - return return_dict + return return_dict def _add_field(openfda, field, value): - if not isinstance(value, list): - value = [value] + if not isinstance(value, list): + value = [value] - for v in value: - if not v: - continue + for v in value: + if not v: + continue - if field not in openfda: - openfda[field] = {} - openfda[field][v] = True - return + if field not in openfda: + openfda[field] = {} + openfda[field][v] = True + return def AddHarmonizedRowToOpenfda(openfda, row): - if not row['is_original_packager']: - return - - if row['product_ndc'] in row['spl_product_ndc']: - _add_field(openfda, 'application_number', row['application_number']) - # Using the most precise possible name for brand_name - if row['brand_name_suffix']: - brand_name = row['brand_name'] + ' ' + row['brand_name_suffix'] - else: - brand_name = row['brand_name'] - - _add_field(openfda, 'brand_name', brand_name.rstrip().upper()) - _add_field(openfda, 'generic_name', row['generic_name'].upper()) - _add_field(openfda, 'manufacturer_name', row['manufacturer_name']) - _add_field(openfda, 'product_ndc', row['product_ndc']) - _add_field(openfda, 'product_ndc', row['spl_product_ndc']) - _add_field(openfda, 'product_type', row['product_type']) - - for route in row['route'].split(';'): - route = route.strip() - _add_field(openfda, 'route', route) - - for substance in row['substance_name'].split(';'): - substance = substance.strip() - _add_field(openfda, 'substance_name', substance) - - for rxnorm in row['rxnorm']: - _add_field(openfda, 'rxcui', rxnorm['rxcui']) - - _add_field(openfda, 'spl_id', row['id']) - _add_field(openfda, 'spl_set_id', row['spl_set_id']) - _add_field(openfda, 'package_ndc', row['package_ndc']) - - if row['unii_indexing'] != []: - for key, value in row['unii_indexing'].items(): - if key == 'unii': - _add_field(openfda, 'unii', value) - if key == 'va': - for this_item in value: - for va_key, va_value in this_item.items(): - if va_key == 'name': - if va_value.find('[MoA]') != -1: - _add_field(openfda, 'pharm_class_moa', va_value) - if va_value.find('[Chemical/Ingredient]') != -1 or va_value.find('[CS]') != -1: - _add_field(openfda, 'pharm_class_cs', va_value) - if va_value.find('[PE]') != -1: - _add_field(openfda, 'pharm_class_pe', va_value) - if va_value.find('[EPC]') != -1: - _add_field(openfda, 'pharm_class_epc', va_value) - if va_key == 'number': - _add_field(openfda, 'nui', va_value) + if not row["is_original_packager"]: + return + + if row["product_ndc"] in row["spl_product_ndc"]: + _add_field(openfda, "application_number", row["application_number"]) + # Using the most precise possible name for brand_name + if row["brand_name_suffix"]: + brand_name = row["brand_name"] + " " + row["brand_name_suffix"] + else: + brand_name = row["brand_name"] + + _add_field(openfda, "brand_name", brand_name.rstrip().upper()) + _add_field(openfda, "generic_name", 
row["generic_name"].upper()) + _add_field(openfda, "manufacturer_name", row["manufacturer_name"]) + _add_field(openfda, "product_ndc", row["product_ndc"]) + _add_field(openfda, "product_ndc", row["spl_product_ndc"]) + _add_field(openfda, "product_type", row["product_type"]) + + for route in row["route"].split(";"): + route = route.strip() + _add_field(openfda, "route", route) + + for substance in row["substance_name"].split(";"): + substance = substance.strip() + _add_field(openfda, "substance_name", substance) + + for rxnorm in row["rxnorm"]: + _add_field(openfda, "rxcui", rxnorm["rxcui"]) + + _add_field(openfda, "spl_id", row["id"]) + _add_field(openfda, "spl_set_id", row["spl_set_id"]) + _add_field(openfda, "package_ndc", row["package_ndc"]) + + if row["unii_indexing"] != []: + for key, value in row["unii_indexing"].items(): + if key == "unii": + _add_field(openfda, "unii", value) + if key == "va": + for this_item in value: + for va_key, va_value in this_item.items(): + if va_key == "name": + if va_value.find("[MoA]") != -1: + _add_field(openfda, "pharm_class_moa", va_value) + if ( + va_value.find("[Chemical/Ingredient]") != -1 + or va_value.find("[CS]") != -1 + ): + _add_field(openfda, "pharm_class_cs", va_value) + if va_value.find("[PE]") != -1: + _add_field(openfda, "pharm_class_pe", va_value) + if va_value.find("[EPC]") != -1: + _add_field(openfda, "pharm_class_epc", va_value) + if va_key == "number": + _add_field(openfda, "nui", va_value) def AnnotateDrugByKey(drug, key, harmonized_dict): - if key in harmonized_dict: - openfda = {} + if key in harmonized_dict: + openfda = {} - for row in harmonized_dict[key]: - AddHarmonizedRowToOpenfda(openfda, row) + for row in harmonized_dict[key]: + AddHarmonizedRowToOpenfda(openfda, row) - openfda_lists = {} - for field, value in openfda.items(): - openfda_lists[field] = [s for s in value.keys()] + openfda_lists = {} + for field, value in openfda.items(): + openfda_lists[field] = [s for s in value.keys()] - drug['openfda'] = openfda_lists + drug["openfda"] = openfda_lists def AnnotateDrug(drug, harmonized_dict): - if 'product_ndc' in drug and drug['product_ndc']: - AnnotateDrugByKey(drug, normalize_product_ndc(drug['product_ndc']), harmonized_dict) - if 'brand_name' in drug and drug['brand_name']: - AnnotateDrugByKey(drug, normalize_product_name(drug['brand_name']), harmonized_dict) + if "product_ndc" in drug and drug["product_ndc"]: + AnnotateDrugByKey( + drug, normalize_product_ndc(drug["product_ndc"]), harmonized_dict + ) + if "brand_name" in drug and drug["brand_name"]: + AnnotateDrugByKey( + drug, normalize_product_name(drug["brand_name"]), harmonized_dict + ) def AnnotateEvent(event, harmonized_dict): - if 'drug' not in event: - return + if "drug" not in event: + return - for drug in event['drug']: - AnnotateDrug(drug, harmonized_dict) + for drug in event["drug"]: + AnnotateDrug(drug, harmonized_dict) class AnnotateMapper(parallel.Mapper): - def __init__(self, harmonized_dict): - self.harmonized_dict = harmonized_dict + def __init__(self, harmonized_dict): + self.harmonized_dict = harmonized_dict - def map_shard(self, map_input, map_output): - for id, event in map_input: - AnnotateEvent(event, self.harmonized_dict) - map_output.add(id, event) + def map_shard(self, map_input, map_output): + for id, event in map_input: + AnnotateEvent(event, self.harmonized_dict) + map_output.add(id, event) diff --git a/openfda/adae/pipeline.py b/openfda/adae/pipeline.py index f79a0b8..61a07db 100644 --- a/openfda/adae/pipeline.py +++ 
b/openfda/adae/pipeline.py @@ -20,440 +20,549 @@ from openfda.common import newest_file_timestamp from openfda.parallel import NullOutput -ADAE_BUCKET = 's3://openfda-data-adae' -ADAE_LOCAL_DIR = config.data_dir('adae/s3_sync') -BATCH = arrow.utcnow().floor('day').format('YYYYMMDD') -AWS_CLI = 'aws' +ADAE_BUCKET = "s3://openfda-data-adae" +ADAE_LOCAL_DIR = config.data_dir("adae/s3_sync") +BATCH = arrow.utcnow().floor("day").format("YYYYMMDD") +AWS_CLI = "aws" # TODO: move to an external file once the list grows unmanageable. -NULLIFIED = ['US-FDACVM-2018-US-045311.xml', 'US-FDACVM-2018-US-048571.xml', 'US-FDACVM-2018-US-046672.xml', - 'US-FDACVM-2017-US-042492.xml', 'US-FDACVM-2018-US-044065.xml', 'US-FDACVM-2017-US-070108.xml', - 'US-FDACVM-2017-US-002864.xml', - 'US-FDACVM-2017-US-002866.xml', 'US-FDACVM-2017-US-052458.xml', 'US-FDACVM-2017-US-055193.xml', - 'US-FDACVM-2017-US-043931.xml', 'US-FDACVM-2018-US-002321.xml', - 'US-FDACVM-2018-US-063536.xml', 'US-FDACVM-2015-US-221044.xml', 'US-FDACVM2019-US-016263.xml', - 'US-FDACVM-2016-US-062923.xml', 'US-FDACVM-2017-US-001483.xml', 'US-FDACVM-2017-US-009155.xml', - 'US-FDACVM-2017-US-028125.xml', 'US-FDACVM-2017-US-033030.xml', - 'US-FDACVM-2019-US-007584.xml', 'US-FDACVM-2019-US-000655.xml', 'US-FDACVM-2019-US-009469.xml', - 'US-FDACVM-2019-US-021451.xml', 'US-FDACVM-2012-US-036039.xml', - 'US-FDACVM-2019-US-021383.xml', 'US-FDACVM-2019-US-036949.xml', 'US-FDACVM-2019-US-056874.xml', - 'US-FDACVM-2020-US-000506.xml', 'US-FDACVM-2020-US-012144.xml', - 'US-FDACVM-2019-US-008020.xml', 'US-FDACVM-2020-US-055134.xml', 'US-FDACVM-2020-US-042720.xml', - 'US-FDACVM-2020-US-045814.xml', 'US-FDACVM-2020-US-045816.xml', - 'US-FDACVM-2020-US-045820.xml', 'US-FDACVM-2020-US-045824.xml', 'US-FDACVM-2016-US-054799.xml', - 'US-FDACVM-2017-US-060740.xml', - 'US-FDACVM-2016-US-062923.xml', 'US-FDACVM-2017-US-001483.xml', 'US-FDACVM-2017-US-009155.xml', - 'US-FDACVM-2017-US-028125.xml', 'US-FDACVM-2017-US-033030.xml', 'US-FDACVM-2012-US-031040.xml', - 'US-FDACVM-2012-US-031016.xml', - 'US-FDACVM-2014-US-011762.xml', - 'US-FDACVM-2012-US-045510.xml', - 'US-FDACVM-2012-US-031080.xml', - 'US-FDACVM-2012-US-030987.xml', - 'US-FDACVM-2012-US-031012.xml', - 'US-FDACVM-2018-US-026106.xml', - 'US-FDACVM-2019-US-063566.xml', - 'US-FDACVM-2020-US-035313.xml', - 'US-FDACVM-2021-US-024914.xml'] +NULLIFIED = [ + "US-FDACVM-2018-US-045311.xml", + "US-FDACVM-2018-US-048571.xml", + "US-FDACVM-2018-US-046672.xml", + "US-FDACVM-2017-US-042492.xml", + "US-FDACVM-2018-US-044065.xml", + "US-FDACVM-2017-US-070108.xml", + "US-FDACVM-2017-US-002864.xml", + "US-FDACVM-2017-US-002866.xml", + "US-FDACVM-2017-US-052458.xml", + "US-FDACVM-2017-US-055193.xml", + "US-FDACVM-2017-US-043931.xml", + "US-FDACVM-2018-US-002321.xml", + "US-FDACVM-2018-US-063536.xml", + "US-FDACVM-2015-US-221044.xml", + "US-FDACVM2019-US-016263.xml", + "US-FDACVM-2016-US-062923.xml", + "US-FDACVM-2017-US-001483.xml", + "US-FDACVM-2017-US-009155.xml", + "US-FDACVM-2017-US-028125.xml", + "US-FDACVM-2017-US-033030.xml", + "US-FDACVM-2019-US-007584.xml", + "US-FDACVM-2019-US-000655.xml", + "US-FDACVM-2019-US-009469.xml", + "US-FDACVM-2019-US-021451.xml", + "US-FDACVM-2012-US-036039.xml", + "US-FDACVM-2019-US-021383.xml", + "US-FDACVM-2019-US-036949.xml", + "US-FDACVM-2019-US-056874.xml", + "US-FDACVM-2020-US-000506.xml", + "US-FDACVM-2020-US-012144.xml", + "US-FDACVM-2019-US-008020.xml", + "US-FDACVM-2020-US-055134.xml", + "US-FDACVM-2020-US-042720.xml", + "US-FDACVM-2020-US-045814.xml", + 
"US-FDACVM-2020-US-045816.xml", + "US-FDACVM-2020-US-045820.xml", + "US-FDACVM-2020-US-045824.xml", + "US-FDACVM-2016-US-054799.xml", + "US-FDACVM-2017-US-060740.xml", + "US-FDACVM-2016-US-062923.xml", + "US-FDACVM-2017-US-001483.xml", + "US-FDACVM-2017-US-009155.xml", + "US-FDACVM-2017-US-028125.xml", + "US-FDACVM-2017-US-033030.xml", + "US-FDACVM-2012-US-031040.xml", + "US-FDACVM-2012-US-031016.xml", + "US-FDACVM-2014-US-011762.xml", + "US-FDACVM-2012-US-045510.xml", + "US-FDACVM-2012-US-031080.xml", + "US-FDACVM-2012-US-030987.xml", + "US-FDACVM-2012-US-031012.xml", + "US-FDACVM-2018-US-026106.xml", + "US-FDACVM-2019-US-063566.xml", + "US-FDACVM-2020-US-035313.xml", + "US-FDACVM-2021-US-024914.xml", +] class SyncS3(luigi.Task): - bucket = ADAE_BUCKET - local_dir = ADAE_LOCAL_DIR - aws = AWS_CLI - - def output(self): - return luigi.LocalTarget(ADAE_LOCAL_DIR) - - def run(self): - common.quiet_cmd([self.aws, - '--profile=' + config.aws_profile(), - 's3', 'sync', + bucket = ADAE_BUCKET + local_dir = ADAE_LOCAL_DIR + aws = AWS_CLI + + def output(self): + return luigi.LocalTarget(ADAE_LOCAL_DIR) + + def run(self): + common.quiet_cmd( + [ + self.aws, + "--profile=" + config.aws_profile(), + "s3", + "sync", self.bucket, - self.local_dir]) + self.local_dir, + ] + ) class ExtractXML(parallel.MRTask): - local_dir = ADAE_LOCAL_DIR - - def requires(self): - return SyncS3() - - def output(self): - return luigi.LocalTarget(os.path.join(config.data_dir('adae/extracted'), BATCH)) - - def mapreduce_inputs(self): - return parallel.Collection.from_glob(os.path.join(self.local_dir, '*.zip')) - - def map(self, zip_file, value, output): - output_dir = self.output().path - common.shell_cmd_quiet('mkdir -p %s', output_dir) - common.shell_cmd_quiet('7z x "%s" -aoa -bd -y -o%s', zip_file, output_dir) - - def output_format(self): - return NullOutput() + local_dir = ADAE_LOCAL_DIR -class XML2JSONMapper(parallel.Mapper): - UNIT_OF_MEASUREMENTS_MAP = { - "a": "Year", - "s": "Second", - "min": "Minute", - "h": "Hour", - "d": "Day", - "wk": "Week", - "mo": "Month", - "g": "Gram", - "mg": "Milligram", - "ug": "Microgram", - "ng": "Nanogram", - "kg": "Kilogram" - } - - VALUE_DECODE_MAP = { - "unit": UNIT_OF_MEASUREMENTS_MAP, - "numerator_unit": UNIT_OF_MEASUREMENTS_MAP, - "denominator_unit": UNIT_OF_MEASUREMENTS_MAP - } - - NS = {'ns': 'urn:hl7-org:v3'} - - DATE_FIELDS = ['original_receive_date', 'onset_date', 'first_exposure_date', 'last_exposure_date', - 'manufacturing_date', 'lot_expiration'] - - def map_shard(self, map_input, map_output): - - XPATH_MAP = { - "//ns:investigationEvent/ns:id[@root='1.2.3.4']/@extension": ["@id", "unique_aer_id_number"], - "//ns:attentionLine/ns:keyWordText[text()='Report Identifier']/following-sibling::ns:value": ["report_id"], - "//ns:outboundRelationship/ns:priorityNumber[@value='1']/following-sibling::ns:relatedInvestigation/ns:effectiveTime/@value": [ - "original_receive_date"], - - "//ns:investigationEvent//ns:primaryInformationRecipient/ns:assignedEntity/ns:representedOrganization/ns:name": - ["receiver.organization"], - "//ns:investigationEvent//ns:primaryInformationRecipient/ns:assignedEntity/ns:addr/ns:streetAddressLine": - ["receiver.street_address"], - "//ns:investigationEvent//ns:primaryInformationRecipient/ns:assignedEntity/ns:addr/ns:city": - ["receiver.city"], - "//ns:investigationEvent//ns:primaryInformationRecipient/ns:assignedEntity/ns:addr/ns:state": - ["receiver.state"], - 
"//ns:investigationEvent//ns:primaryInformationRecipient/ns:assignedEntity/ns:addr/ns:postalCode": - ["receiver.postal_code"], - "//ns:investigationEvent//ns:primaryInformationRecipient/ns:assignedEntity/ns:addr/ns:country": - ["receiver.country"], - - "//ns:outboundRelationship/ns:priorityNumber[@value='1']/following-sibling::ns:relatedInvestigation//ns:code[@codeSystem='2.16.840.1.113883.13.194']/@displayName": [ - "primary_reporter"], - "//ns:outboundRelationship/ns:priorityNumber[@value='2']/following-sibling::ns:relatedInvestigation//ns:code[@codeSystem='2.16.840.1.113883.13.194']/@displayName": [ - "secondary_reporter"], - "//ns:investigationCharacteristic/ns:code[@code='T95004']/following-sibling::ns:value/@displayName": [ - "type_of_information"], - "//ns:investigationCharacteristic/ns:code[@code='T95022']/following-sibling::ns:value/@value": [ - "serious_ae"], - "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:player2[@classCode='ANM']/ns:quantity/@value": [ - "number_of_animals_treated"], - "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2//ns:code[@code='T95005']/following-sibling::ns:value/@value": [ - "number_of_animals_affected"], - "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:player2[@classCode='ANM']/ns:code[@codeSystem='2.16.840.1.113883.4.341']/@displayName": [ - "animal.species"], - "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:player2[@classCode='ANM']/ns:administrativeGenderCode/@displayName": [ - "animal.gender"], - "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:player2[@classCode='ANM']/ns:genderStatusCode/@displayName": [ - "animal.reproductive_status"], - "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95010']/following-sibling::ns:value/@displayName": [ - "animal.female_animal_physiological_status"], - "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95012']/following-sibling::ns:value/ns:low/@value": [ - "animal.age.min"], - "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95012']/following-sibling::ns:value/ns:high/@value": [ - "animal.age.max"], - "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95012']/following-sibling::ns:value/ns:low/@unit": [ - "animal.age.unit"], - "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95012']/following-sibling::ns:value/following-sibling::ns:methodCode[@codeSystem='2.16.840.1.113883.13.200']/@displayName": [ - "animal.age.qualifier"], - "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95011']/following-sibling::ns:value/ns:low/@value": [ - "animal.weight.min"], - "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95011']/following-sibling::ns:value/ns:high/@value": [ - "animal.weight.max"], - 
"//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95011']/following-sibling::ns:value/ns:low/@unit": [ - "animal.weight.unit"], - "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95011']/following-sibling::ns:value/following-sibling::ns:methodCode[@codeSystem='2.16.840.1.113883.13.200']/@displayName": [ - "animal.weight.qualifier"], - "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95007']/following-sibling::ns:value/@value": [ - "animal.breed.is_crossbred"], - "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95008']/following-sibling::ns:value[@codeSystem='2.16.840.1.113883.4.342']/@displayName": [ - "animal.breed.breed_component"], - "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='C53279' or @code='C82467' or @code='C49495' or @code='C28554' or @code='C21115' or @code='C17998']/following-sibling::ns:value[@value>0]/..": - self.outcome, - "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95006']/following-sibling::ns:value/@displayName": [ - "health_assessment_prior_to_exposure.condition"], - "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95006']/following-sibling::ns:author//ns:code/@displayName": [ - "health_assessment_prior_to_exposure.assessed_by"], - - "//ns:adverseEventAssessment//ns:subjectOf2[@typeCode='SBJ']/ns:substanceAdministration": self.drug, - - "//ns:adverseEventAssessment//ns:subjectOf2[@typeCode='SBJ']/ns:observation/ns:code[@code='T95020']/following-sibling::ns:effectiveTime/ns:low/@value": [ - "onset_date"], - "//ns:adverseEventAssessment//ns:subjectOf2[@typeCode='SBJ']/ns:observation/ns:code[@code='T95020']/following-sibling::ns:effectiveTime/ns:width/@value": [ - "duration.value"], - "//ns:adverseEventAssessment//ns:subjectOf2[@typeCode='SBJ']/ns:observation/ns:code[@code='T95020']/following-sibling::ns:effectiveTime/ns:width/@unit": [ - "duration.unit"], - - "//ns:adverseEventAssessment//ns:subjectOf2[@typeCode='SBJ']/ns:observation/ns:code[@code='T95020']/following-sibling::ns:value[@codeSystem='2.16.840.1.113883.4.358' or @codeSystem='2.16.840.1.113883.13.226' or @codeSystem='2.16.840.1.113883.13.227']/..": - self.reaction, - - "//ns:adverseEventAssessment//ns:subjectOf2[@typeCode='SBJ']/ns:observation/ns:code[@code='T95021']/following-sibling::ns:value/@displayName": - ["time_between_exposure_and_onset"], - "//ns:adverseEventAssessment//ns:subjectOf2[@typeCode='SBJ']/ns:observation/ns:code[@code='T95023']/following-sibling::ns:value/@value": - ["treated_for_ae"] + def requires(self): + return SyncS3() - } + def output(self): + return luigi.LocalTarget(os.path.join(config.data_dir("adae/extracted"), BATCH)) - ae = {} - try: - if os.path.getsize(map_input.filename) > 0: - tree = etree.parse(open(map_input.filename)) - self.process_xpath_map(XPATH_MAP, tree, ae) - map_output.add(ae["unique_aer_id_number"], ae) - else: - logging.warning("Zero length input file: " + map_input.filename) - except lxml.etree.XMLSyntaxError as err: 
- logging.warning("Malformed input file: " + map_input.filename + ". Error was " + str(err)) - except Exception: - traceback.print_exc() - logging.error(sys.exc_info()[0]) - raise - - def process_xpath_map(self, xpath_map, root_node, json): - for xpath, json_fields in iter(xpath_map.items()): - nodeset = root_node.xpath(xpath, - namespaces=self.NS) - if type(nodeset) == list and len(nodeset) > 0: - self.set_value_in_json(nodeset, json_fields, json) - - def set_value_in_json(self, nodeset, json_fields, json): - - if callable(json_fields): - json_fields(nodeset, json) - return - - val = [] - for el in nodeset: - if type(el) == etree._Element: - val.append(el.text) - else: - val.append(str(el)) - - val = val if len(val) > 1 else val[0] - - for field in json_fields: - if type(field) == str: - self.set_field_dot_notation(val, field, json) - - def set_field_dot_notation(self, val, field, json): - # Decode value if needed - if field in self.VALUE_DECODE_MAP: - if val in self.VALUE_DECODE_MAP[field]: - val = self.VALUE_DECODE_MAP[field][val] - - parts = field.split('.') - if (len(parts) > 1): - first = parts[0] - if (json.get(first) == None): - json[first] = {} - self.set_field_dot_notation(val, '.'.join(parts[1:]), json[first]) - elif val is not None: - if not (field in self.DATE_FIELDS and not val): - json[field] = val - - def isdigit(self, s): - try: - float(s) - return True - except ValueError: - return False - - def drug(self, nodeset, json): - - XPATH_MAP = { - "./ns:effectiveTime/ns:comp/ns:low/@value": [ - "first_exposure_date"], - "./ns:effectiveTime/ns:comp/ns:high/@value": [ - "last_exposure_date"], - "./ns:effectiveTime/ns:comp/ns:period/@value": [ - "frequency_of_administration.value"], - "./ns:effectiveTime/ns:comp/ns:period/@unit": [ - "frequency_of_administration.unit"], - "./ns:performer/ns:assignedEntity/ns:code/@displayName": [ - "administered_by"], - "./ns:routeCode/@displayName": [ - "route"], - "./ns:doseCheckQuantity/ns:numerator/@value": [ - "dose.numerator"], - "./ns:doseCheckQuantity/ns:numerator/ns:translation/@displayName": [ - "dose.numerator_unit"], - "./ns:doseCheckQuantity/ns:denominator/@value": [ - "dose.denominator"], - "./ns:doseCheckQuantity/ns:denominator/ns:translation/@displayName": [ - "dose.denominator_unit"], - ".//ns:code[@code='T95015']/following-sibling::ns:value/@value": [ - "used_according_to_label"], - ".//ns:code[@code='T95016']/following-sibling::ns:outboundRelationship2//ns:value[@value='true']/preceding-sibling::ns:code/@displayName": [ - "off_label_use"], - ".//ns:code[@code='T95024']/following-sibling::ns:value/@value": [ - "previous_exposure_to_drug"], - ".//ns:code[@code='T95025']/following-sibling::ns:value/@value": [ - "previous_ae_to_drug"], - ".//ns:code[@code='T95026']/following-sibling::ns:value/@value": [ - "ae_abated_after_stopping_drug"], - ".//ns:code[@code='T95027']/following-sibling::ns:value/@value": [ - "ae_reappeared_after_resuming_drug"], - "./ns:consumable/ns:instanceOfKind/ns:productInstanceInstance/ns:existenceTime/ns:low/@value": [ - "manufacturing_date"], - "./ns:consumable/ns:instanceOfKind/ns:productInstanceInstance/ns:lotNumberText": [ - "lot_number"], - "./ns:consumable/ns:instanceOfKind/ns:productInstanceInstance/ns:lotNumberText/following-sibling::ns:expirationTime/@value": [ - "lot_expiration"], - "./ns:consumable/ns:instanceOfKind/ns:kindOfProduct/ns:code/@code": [ - "product_ndc"], - "./ns:consumable/ns:instanceOfKind/ns:kindOfProduct/ns:code/following-sibling::ns:name": [ - "brand_name"], - 
"./ns:consumable/ns:instanceOfKind/ns:kindOfProduct/ns:code/following-sibling::ns:formCode/@displayName": [ - "dosage_form"], - "./ns:consumable/ns:instanceOfKind/ns:kindOfProduct/ns:asManufacturedProduct/ns:manufacturerOrganization/ns:name": [ - "manufacturer.name"], - "./ns:consumable/ns:instanceOfKind/ns:kindOfProduct/ns:asManufacturedProduct/ns:subjectOf/ns:approval/ns:id/@extension": [ - "manufacturer.registration_number"], - "./ns:consumable/ns:instanceOfKind/ns:kindOfProduct/ns:instanceOfKind//ns:code[@code='T95017']/following-sibling::ns:value/@value": [ - "number_of_defective_items"], - "./ns:consumable/ns:instanceOfKind/ns:kindOfProduct/ns:instanceOfKind//ns:code[@code='T95018']/following-sibling::ns:value/@value": [ - "number_of_items_returned"], - "./ns:consumable/ns:instanceOfKind/ns:kindOfProduct/ns:asSpecializedKind//ns:code[@code='T95013']/following-sibling::ns:name": [ - "atc_vet_code"], - "./ns:consumable/ns:instanceOfKind/ns:kindOfProduct/ns:ingredient": - self.ingredient - } + def mapreduce_inputs(self): + return parallel.Collection.from_glob(os.path.join(self.local_dir, "*.zip")) - json["drug"] = [] - for node in nodeset: - drug = {} - self.process_xpath_map(XPATH_MAP, node, drug) - # Convert yyyymm to yyyy-mm that ES can understand. - if drug.get('lot_expiration') is not None: - drug['lot_expiration'] = re.sub(r'^(\d{4})(\d{2})$', r'\1-\2', drug['lot_expiration']) - - # A corner-case scenario (US-FDACVM-2014-US-065247.xml) - if drug.get('dose') is not None and drug['dose'].get('denominator') is not None and not self.isdigit(drug['dose'][ - 'denominator']): - logging.warning("Non-numeric denominator: " + drug['dose'][ - 'denominator']) - drug['dose']['denominator'] = '0' - - - json["drug"].append(drug) - - def ingredient(self, nodeset, json): - - XPATH_MAP = { - "./ns:ingredientSubstance/ns:name": ["name"], - "./ns:quantity/ns:numerator/@value": [ - "dose.numerator"], - "./ns:quantity/ns:numerator/ns:translation/@displayName": [ - "dose.numerator_unit"], - "./ns:quantity/ns:denominator/@value": [ - "dose.denominator"], - "./ns:quantity/ns:denominator/ns:translation/@displayName": [ - "dose.denominator_unit"] - } + def map(self, zip_file, value, output): + output_dir = self.output().path + common.shell_cmd_quiet("mkdir -p %s", output_dir) + common.shell_cmd_quiet('7z x "%s" -aoa -bd -y -o%s', zip_file, output_dir) - json["active_ingredients"] = [] - for node in nodeset: - ingredient = {} - self.process_xpath_map(XPATH_MAP, node, ingredient) - if "name" in list(ingredient.keys()): - ingredient["name"] = ingredient["name"].title() - json["active_ingredients"].append(ingredient) + def output_format(self): + return NullOutput() - def outcome(self, nodeset, json): - XPATH_MAP = { - "./ns:code/@displayName": ["medical_status"], - "./ns:value/@value": ["number_of_animals_affected"] +class XML2JSONMapper(parallel.Mapper): + UNIT_OF_MEASUREMENTS_MAP = { + "a": "Year", + "s": "Second", + "min": "Minute", + "h": "Hour", + "d": "Day", + "wk": "Week", + "mo": "Month", + "g": "Gram", + "mg": "Milligram", + "ug": "Microgram", + "ng": "Nanogram", + "kg": "Kilogram", } - json["outcome"] = [] - for node in nodeset: - outcome = {} - self.process_xpath_map(XPATH_MAP, node, outcome) - json["outcome"].append(outcome) - - def reaction(self, nodeset, json): - - XPATH_MAP = { - "./ns:value[@codeSystem='2.16.840.1.113883.4.358' or @codeSystem='2.16.840.1.113883.13.226' or @codeSystem='2.16.840.1.113883.13.227']/@codeSystemVersion": [ - "veddra_version"], - 
"./ns:value[@codeSystem='2.16.840.1.113883.4.358' or @codeSystem='2.16.840.1.113883.13.226' or @codeSystem='2.16.840.1.113883.13.227']/@code": [ - "veddra_term_code"], - "./ns:value[@codeSystem='2.16.840.1.113883.4.358' or @codeSystem='2.16.840.1.113883.13.226' or @codeSystem='2.16.840.1.113883.13.227']/@displayName": [ - "veddra_term_name"], - "./ns:referenceRange/ns:observationRange/ns:value/@value": ["number_of_animals_affected"], - "./ns:referenceRange/ns:observationRange/ns:interpretationCode/@displayName": ["accuracy"] + VALUE_DECODE_MAP = { + "unit": UNIT_OF_MEASUREMENTS_MAP, + "numerator_unit": UNIT_OF_MEASUREMENTS_MAP, + "denominator_unit": UNIT_OF_MEASUREMENTS_MAP, } - json["reaction"] = [] - for node in nodeset: - reaction = {} - self.process_xpath_map(XPATH_MAP, node, reaction) - json["reaction"].append(reaction) + NS = {"ns": "urn:hl7-org:v3"} + + DATE_FIELDS = [ + "original_receive_date", + "onset_date", + "first_exposure_date", + "last_exposure_date", + "manufacturing_date", + "lot_expiration", + ] + + def map_shard(self, map_input, map_output): + + XPATH_MAP = { + "//ns:investigationEvent/ns:id[@root='1.2.3.4']/@extension": [ + "@id", + "unique_aer_id_number", + ], + "//ns:attentionLine/ns:keyWordText[text()='Report Identifier']/following-sibling::ns:value": [ + "report_id" + ], + "//ns:outboundRelationship/ns:priorityNumber[@value='1']/following-sibling::ns:relatedInvestigation/ns:effectiveTime/@value": [ + "original_receive_date" + ], + "//ns:investigationEvent//ns:primaryInformationRecipient/ns:assignedEntity/ns:representedOrganization/ns:name": [ + "receiver.organization" + ], + "//ns:investigationEvent//ns:primaryInformationRecipient/ns:assignedEntity/ns:addr/ns:streetAddressLine": [ + "receiver.street_address" + ], + "//ns:investigationEvent//ns:primaryInformationRecipient/ns:assignedEntity/ns:addr/ns:city": [ + "receiver.city" + ], + "//ns:investigationEvent//ns:primaryInformationRecipient/ns:assignedEntity/ns:addr/ns:state": [ + "receiver.state" + ], + "//ns:investigationEvent//ns:primaryInformationRecipient/ns:assignedEntity/ns:addr/ns:postalCode": [ + "receiver.postal_code" + ], + "//ns:investigationEvent//ns:primaryInformationRecipient/ns:assignedEntity/ns:addr/ns:country": [ + "receiver.country" + ], + "//ns:outboundRelationship/ns:priorityNumber[@value='1']/following-sibling::ns:relatedInvestigation//ns:code[@codeSystem='2.16.840.1.113883.13.194']/@displayName": [ + "primary_reporter" + ], + "//ns:outboundRelationship/ns:priorityNumber[@value='2']/following-sibling::ns:relatedInvestigation//ns:code[@codeSystem='2.16.840.1.113883.13.194']/@displayName": [ + "secondary_reporter" + ], + "//ns:investigationCharacteristic/ns:code[@code='T95004']/following-sibling::ns:value/@displayName": [ + "type_of_information" + ], + "//ns:investigationCharacteristic/ns:code[@code='T95022']/following-sibling::ns:value/@value": [ + "serious_ae" + ], + "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:player2[@classCode='ANM']/ns:quantity/@value": [ + "number_of_animals_treated" + ], + "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2//ns:code[@code='T95005']/following-sibling::ns:value/@value": [ + "number_of_animals_affected" + ], + "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:player2[@classCode='ANM']/ns:code[@codeSystem='2.16.840.1.113883.4.341']/@displayName": [ + "animal.species" + ], + 
"//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:player2[@classCode='ANM']/ns:administrativeGenderCode/@displayName": [ + "animal.gender" + ], + "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:player2[@classCode='ANM']/ns:genderStatusCode/@displayName": [ + "animal.reproductive_status" + ], + "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95010']/following-sibling::ns:value/@displayName": [ + "animal.female_animal_physiological_status" + ], + "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95012']/following-sibling::ns:value/ns:low/@value": [ + "animal.age.min" + ], + "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95012']/following-sibling::ns:value/ns:high/@value": [ + "animal.age.max" + ], + "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95012']/following-sibling::ns:value/ns:low/@unit": [ + "animal.age.unit" + ], + "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95012']/following-sibling::ns:value/following-sibling::ns:methodCode[@codeSystem='2.16.840.1.113883.13.200']/@displayName": [ + "animal.age.qualifier" + ], + "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95011']/following-sibling::ns:value/ns:low/@value": [ + "animal.weight.min" + ], + "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95011']/following-sibling::ns:value/ns:high/@value": [ + "animal.weight.max" + ], + "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95011']/following-sibling::ns:value/ns:low/@unit": [ + "animal.weight.unit" + ], + "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95011']/following-sibling::ns:value/following-sibling::ns:methodCode[@codeSystem='2.16.840.1.113883.13.200']/@displayName": [ + "animal.weight.qualifier" + ], + "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95007']/following-sibling::ns:value/@value": [ + "animal.breed.is_crossbred" + ], + "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95008']/following-sibling::ns:value[@codeSystem='2.16.840.1.113883.4.342']/@displayName": [ + "animal.breed.breed_component" + ], + "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='C53279' or @code='C82467' or @code='C49495' or @code='C28554' or @code='C21115' or @code='C17998']/following-sibling::ns:value[@value>0]/..": self.outcome, + 
"//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95006']/following-sibling::ns:value/@displayName": [ + "health_assessment_prior_to_exposure.condition" + ], + "//ns:adverseEventAssessment/ns:subject1[@typeCode='SBJ']/ns:primaryRole[@classCode='INVSBJ']/ns:subjectOf2[@typeCode='SBJ']//ns:code[@code='T95006']/following-sibling::ns:author//ns:code/@displayName": [ + "health_assessment_prior_to_exposure.assessed_by" + ], + "//ns:adverseEventAssessment//ns:subjectOf2[@typeCode='SBJ']/ns:substanceAdministration": self.drug, + "//ns:adverseEventAssessment//ns:subjectOf2[@typeCode='SBJ']/ns:observation/ns:code[@code='T95020']/following-sibling::ns:effectiveTime/ns:low/@value": [ + "onset_date" + ], + "//ns:adverseEventAssessment//ns:subjectOf2[@typeCode='SBJ']/ns:observation/ns:code[@code='T95020']/following-sibling::ns:effectiveTime/ns:width/@value": [ + "duration.value" + ], + "//ns:adverseEventAssessment//ns:subjectOf2[@typeCode='SBJ']/ns:observation/ns:code[@code='T95020']/following-sibling::ns:effectiveTime/ns:width/@unit": [ + "duration.unit" + ], + "//ns:adverseEventAssessment//ns:subjectOf2[@typeCode='SBJ']/ns:observation/ns:code[@code='T95020']/following-sibling::ns:value[@codeSystem='2.16.840.1.113883.4.358' or @codeSystem='2.16.840.1.113883.13.226' or @codeSystem='2.16.840.1.113883.13.227']/..": self.reaction, + "//ns:adverseEventAssessment//ns:subjectOf2[@typeCode='SBJ']/ns:observation/ns:code[@code='T95021']/following-sibling::ns:value/@displayName": [ + "time_between_exposure_and_onset" + ], + "//ns:adverseEventAssessment//ns:subjectOf2[@typeCode='SBJ']/ns:observation/ns:code[@code='T95023']/following-sibling::ns:value/@value": [ + "treated_for_ae" + ], + } + + ae = {} + try: + if os.path.getsize(map_input.filename) > 0: + tree = etree.parse(open(map_input.filename)) + self.process_xpath_map(XPATH_MAP, tree, ae) + map_output.add(ae["unique_aer_id_number"], ae) + else: + logging.warning("Zero length input file: " + map_input.filename) + except lxml.etree.XMLSyntaxError as err: + logging.warning( + "Malformed input file: " + + map_input.filename + + ". 
Error was " + + str(err) + ) + except Exception: + traceback.print_exc() + logging.error(sys.exc_info()[0]) + raise + + def process_xpath_map(self, xpath_map, root_node, json): + for xpath, json_fields in iter(xpath_map.items()): + nodeset = root_node.xpath(xpath, namespaces=self.NS) + if type(nodeset) == list and len(nodeset) > 0: + self.set_value_in_json(nodeset, json_fields, json) + + def set_value_in_json(self, nodeset, json_fields, json): + + if callable(json_fields): + json_fields(nodeset, json) + return + + val = [] + for el in nodeset: + if type(el) == etree._Element: + val.append(el.text) + else: + val.append(str(el)) + + val = val if len(val) > 1 else val[0] + + for field in json_fields: + if type(field) == str: + self.set_field_dot_notation(val, field, json) + + def set_field_dot_notation(self, val, field, json): + # Decode value if needed + if field in self.VALUE_DECODE_MAP: + if val in self.VALUE_DECODE_MAP[field]: + val = self.VALUE_DECODE_MAP[field][val] + + parts = field.split(".") + if len(parts) > 1: + first = parts[0] + if json.get(first) == None: + json[first] = {} + self.set_field_dot_notation(val, ".".join(parts[1:]), json[first]) + elif val is not None: + if not (field in self.DATE_FIELDS and not val): + json[field] = val + + def isdigit(self, s): + try: + float(s) + return True + except ValueError: + return False + + def drug(self, nodeset, json): + + XPATH_MAP = { + "./ns:effectiveTime/ns:comp/ns:low/@value": ["first_exposure_date"], + "./ns:effectiveTime/ns:comp/ns:high/@value": ["last_exposure_date"], + "./ns:effectiveTime/ns:comp/ns:period/@value": [ + "frequency_of_administration.value" + ], + "./ns:effectiveTime/ns:comp/ns:period/@unit": [ + "frequency_of_administration.unit" + ], + "./ns:performer/ns:assignedEntity/ns:code/@displayName": [ + "administered_by" + ], + "./ns:routeCode/@displayName": ["route"], + "./ns:doseCheckQuantity/ns:numerator/@value": ["dose.numerator"], + "./ns:doseCheckQuantity/ns:numerator/ns:translation/@displayName": [ + "dose.numerator_unit" + ], + "./ns:doseCheckQuantity/ns:denominator/@value": ["dose.denominator"], + "./ns:doseCheckQuantity/ns:denominator/ns:translation/@displayName": [ + "dose.denominator_unit" + ], + ".//ns:code[@code='T95015']/following-sibling::ns:value/@value": [ + "used_according_to_label" + ], + ".//ns:code[@code='T95016']/following-sibling::ns:outboundRelationship2//ns:value[@value='true']/preceding-sibling::ns:code/@displayName": [ + "off_label_use" + ], + ".//ns:code[@code='T95024']/following-sibling::ns:value/@value": [ + "previous_exposure_to_drug" + ], + ".//ns:code[@code='T95025']/following-sibling::ns:value/@value": [ + "previous_ae_to_drug" + ], + ".//ns:code[@code='T95026']/following-sibling::ns:value/@value": [ + "ae_abated_after_stopping_drug" + ], + ".//ns:code[@code='T95027']/following-sibling::ns:value/@value": [ + "ae_reappeared_after_resuming_drug" + ], + "./ns:consumable/ns:instanceOfKind/ns:productInstanceInstance/ns:existenceTime/ns:low/@value": [ + "manufacturing_date" + ], + "./ns:consumable/ns:instanceOfKind/ns:productInstanceInstance/ns:lotNumberText": [ + "lot_number" + ], + "./ns:consumable/ns:instanceOfKind/ns:productInstanceInstance/ns:lotNumberText/following-sibling::ns:expirationTime/@value": [ + "lot_expiration" + ], + "./ns:consumable/ns:instanceOfKind/ns:kindOfProduct/ns:code/@code": [ + "product_ndc" + ], + "./ns:consumable/ns:instanceOfKind/ns:kindOfProduct/ns:code/following-sibling::ns:name": [ + "brand_name" + ], + 
"./ns:consumable/ns:instanceOfKind/ns:kindOfProduct/ns:code/following-sibling::ns:formCode/@displayName": [ + "dosage_form" + ], + "./ns:consumable/ns:instanceOfKind/ns:kindOfProduct/ns:asManufacturedProduct/ns:manufacturerOrganization/ns:name": [ + "manufacturer.name" + ], + "./ns:consumable/ns:instanceOfKind/ns:kindOfProduct/ns:asManufacturedProduct/ns:subjectOf/ns:approval/ns:id/@extension": [ + "manufacturer.registration_number" + ], + "./ns:consumable/ns:instanceOfKind/ns:kindOfProduct/ns:instanceOfKind//ns:code[@code='T95017']/following-sibling::ns:value/@value": [ + "number_of_defective_items" + ], + "./ns:consumable/ns:instanceOfKind/ns:kindOfProduct/ns:instanceOfKind//ns:code[@code='T95018']/following-sibling::ns:value/@value": [ + "number_of_items_returned" + ], + "./ns:consumable/ns:instanceOfKind/ns:kindOfProduct/ns:asSpecializedKind//ns:code[@code='T95013']/following-sibling::ns:name": [ + "atc_vet_code" + ], + "./ns:consumable/ns:instanceOfKind/ns:kindOfProduct/ns:ingredient": self.ingredient, + } + + json["drug"] = [] + for node in nodeset: + drug = {} + self.process_xpath_map(XPATH_MAP, node, drug) + # Convert yyyymm to yyyy-mm that ES can understand. + if drug.get("lot_expiration") is not None: + drug["lot_expiration"] = re.sub( + r"^(\d{4})(\d{2})$", r"\1-\2", drug["lot_expiration"] + ) + + # A corner-case scenario (US-FDACVM-2014-US-065247.xml) + if ( + drug.get("dose") is not None + and drug["dose"].get("denominator") is not None + and not self.isdigit(drug["dose"]["denominator"]) + ): + logging.warning( + "Non-numeric denominator: " + drug["dose"]["denominator"] + ) + drug["dose"]["denominator"] = "0" + + json["drug"].append(drug) + + def ingredient(self, nodeset, json): + + XPATH_MAP = { + "./ns:ingredientSubstance/ns:name": ["name"], + "./ns:quantity/ns:numerator/@value": ["dose.numerator"], + "./ns:quantity/ns:numerator/ns:translation/@displayName": [ + "dose.numerator_unit" + ], + "./ns:quantity/ns:denominator/@value": ["dose.denominator"], + "./ns:quantity/ns:denominator/ns:translation/@displayName": [ + "dose.denominator_unit" + ], + } + + json["active_ingredients"] = [] + for node in nodeset: + ingredient = {} + self.process_xpath_map(XPATH_MAP, node, ingredient) + if "name" in list(ingredient.keys()): + ingredient["name"] = ingredient["name"].title() + json["active_ingredients"].append(ingredient) + + def outcome(self, nodeset, json): + + XPATH_MAP = { + "./ns:code/@displayName": ["medical_status"], + "./ns:value/@value": ["number_of_animals_affected"], + } + + json["outcome"] = [] + for node in nodeset: + outcome = {} + self.process_xpath_map(XPATH_MAP, node, outcome) + json["outcome"].append(outcome) + + def reaction(self, nodeset, json): + + XPATH_MAP = { + "./ns:value[@codeSystem='2.16.840.1.113883.4.358' or @codeSystem='2.16.840.1.113883.13.226' or @codeSystem='2.16.840.1.113883.13.227']/@codeSystemVersion": [ + "veddra_version" + ], + "./ns:value[@codeSystem='2.16.840.1.113883.4.358' or @codeSystem='2.16.840.1.113883.13.226' or @codeSystem='2.16.840.1.113883.13.227']/@code": [ + "veddra_term_code" + ], + "./ns:value[@codeSystem='2.16.840.1.113883.4.358' or @codeSystem='2.16.840.1.113883.13.226' or @codeSystem='2.16.840.1.113883.13.227']/@displayName": [ + "veddra_term_name" + ], + "./ns:referenceRange/ns:observationRange/ns:value/@value": [ + "number_of_animals_affected" + ], + "./ns:referenceRange/ns:observationRange/ns:interpretationCode/@displayName": [ + "accuracy" + ], + } + + json["reaction"] = [] + for node in nodeset: + reaction = {} + 
self.process_xpath_map(XPATH_MAP, node, reaction) + json["reaction"].append(reaction) class XML2JSON(luigi.Task): - def requires(self): - return ExtractXML() - - def output(self): - return luigi.LocalTarget(os.path.join(config.data_dir('adae/json.db'), BATCH)) - - def run(self): - input_shards = [] - input_dir = self.input().path - for subdir, dirs, files in os.walk(input_dir): - for file in files: - if file.endswith('.xml'): - if not file in NULLIFIED: - input_shards.append(os.path.join(subdir, file)) - else: - logging.info("Skipping a nullified case: " + file) - - parallel.mapreduce( - parallel.Collection.from_list(input_shards), - mapper=XML2JSONMapper(), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path) + def requires(self): + return ExtractXML() + + def output(self): + return luigi.LocalTarget(os.path.join(config.data_dir("adae/json.db"), BATCH)) + + def run(self): + input_shards = [] + input_dir = self.input().path + for subdir, dirs, files in os.walk(input_dir): + for file in files: + if file.endswith(".xml"): + if not file in NULLIFIED: + input_shards.append(os.path.join(subdir, file)) + else: + logging.info("Skipping a nullified case: " + file) + + parallel.mapreduce( + parallel.Collection.from_list(input_shards), + mapper=XML2JSONMapper(), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + ) # Annotation code left in for posterity. This class is not currently used at the request of CVM. class AnnotateEvent(luigi.Task): - def requires(self): - return [CombineHarmonization(), XML2JSON()] - - def output(self): - return luigi.LocalTarget(os.path.join(config.data_dir('adae/annotate.db'), BATCH)) - - def run(self): - harmonized_file = self.input()[0].path - parallel.mapreduce( - parallel.Collection.from_sharded(self.input()[1].path), - mapper=annotate.AnnotateMapper(harmonized_dict=annotate.read_harmonized_file(open(harmonized_file))), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path) + def requires(self): + return [CombineHarmonization(), XML2JSON()] + + def output(self): + return luigi.LocalTarget( + os.path.join(config.data_dir("adae/annotate.db"), BATCH) + ) + + def run(self): + harmonized_file = self.input()[0].path + parallel.mapreduce( + parallel.Collection.from_sharded(self.input()[1].path), + mapper=annotate.AnnotateMapper( + harmonized_dict=annotate.read_harmonized_file(open(harmonized_file)) + ), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + ) class LoadJSON(index_util.LoadJSONBase): - index_name = 'animalandveterinarydrugevent' - mapping_file = './schemas/animalandveterinarydrugevent_mapping.json' - data_source = XML2JSON() - use_checksum = False - optimize_index = True - last_update_date = lambda _: newest_file_timestamp(ADAE_LOCAL_DIR) + index_name = "animalandveterinarydrugevent" + mapping_file = "./schemas/animalandveterinarydrugevent_mapping.json" + data_source = XML2JSON() + use_checksum = False + optimize_index = True + last_update_date = lambda _: newest_file_timestamp(ADAE_LOCAL_DIR) -if __name__ == '__main__': - luigi.run() +if __name__ == "__main__": + luigi.run() diff --git a/openfda/adae/tests/pipeline_test.py b/openfda/adae/tests/pipeline_test.py index 67c44fb..770426e 100644 --- a/openfda/adae/tests/pipeline_test.py +++ b/openfda/adae/tests/pipeline_test.py @@ -14,204 +14,219 @@ class ADAEPipelineTests(unittest.TestCase): - def setUp(self): - self.test_dir = tempfile.mkdtemp() - - def tearDown(self): - shutil.rmtree(self.test_dir) - - def 
test_extract_xml(self): - shutil.copyfile(os.path.join(dirname(os.path.abspath(__file__)), "file.zip"), - os.path.join(self.test_dir, "file.zip")) - extract = ExtractXML() - extract.local_dir = self.test_dir - extract.output = MagicMock( - return_value=luigi.LocalTarget(self.test_dir)) - extract.map(os.path.join(self.test_dir, "file.zip"),'', []) - ok_(os.path.isfile(os.path.join(self.test_dir, "file1.xml"))) - - - def test_normalize_product_ndc(self): - eq_("57319-485", normalize_product_ndc("NDC57319-485-05")) - - def test_is_digit(self): - mapper = XML2JSONMapper() - ok_(mapper.isdigit('1')) - ok_(mapper.isdigit('0')) - ok_(mapper.isdigit('1.5')) - ok_(mapper.isdigit('-10')) - ok_(mapper.isdigit('-20.5')) - ok_(mapper.isdigit('+1245.33434')) - ok_(mapper.isdigit('-1245.33434')) - ok_(mapper.isdigit('0.00')) - ok_(not mapper.isdigit('I')) - ok_(not mapper.isdigit('I0')) - ok_(not mapper.isdigit('.')) - - - def test_xml_to_json(self): - mapper = XML2JSONMapper() - map_input = MagicMock() - map_input.filename = os.path.join(dirname(os.path.abspath(__file__)), "test.xml") - - map_dict = {} - - def add(id, json): - map_dict[id] = json - - map_output = MagicMock() - map_output.add = add - mapper.map_shard(map_input, map_output) - - uid = "USA-USFDACVM-2016-US-017128" - - ae = map_dict[uid] - eq_(uid, ae["unique_aer_id_number"]) - eq_(uid, ae["@id"]) - eq_("N141203", ae["report_id"]) - eq_("20160502", ae["original_receive_date"]) - eq_("Food and Drug Administration Center for Veterinary Medicine", ae["receiver"]["organization"]) - eq_("7500 Standish Place (HFV-210) Room N403", ae["receiver"]["street_address"]) - eq_("Rockville", ae["receiver"]["city"]) - eq_("MD", ae["receiver"]["state"]) - eq_("20855", ae["receiver"]["postal_code"]) - eq_("USA", ae["receiver"]["country"]) - eq_("Other", ae["primary_reporter"]) - eq_("Animal Owner", ae["secondary_reporter"]) - eq_("Safety Issue", ae["type_of_information"]) - eq_("true", ae["serious_ae"]) - eq_("2", ae["number_of_animals_treated"]) - eq_("3", ae["number_of_animals_affected"]) - eq_("Dog", ae["animal"]["species"]) - eq_("Male", ae["animal"]["gender"]) - eq_("Neutered", ae["animal"]["reproductive_status"]) - eq_("NOT APPLICABLE", ae["animal"]["female_animal_physiological_status"]) - - eq_("7.00", ae["animal"]["age"]["min"]) - eq_("17.5", ae["animal"]["age"]["max"]) - eq_("Year", ae["animal"]["age"]["unit"]) - eq_("Measured", ae["animal"]["age"]["qualifier"]) - - eq_("6.123", ae["animal"]["weight"]["min"]) - eq_("16.123", ae["animal"]["weight"]["max"]) - eq_("Kilogram", ae["animal"]["weight"]["unit"]) - eq_("Measured", ae["animal"]["weight"]["qualifier"]) - - eq_("false", ae["animal"]["breed"]["is_crossbred"]) - eq_("Terrier - Yorkshire", ae["animal"]["breed"]["breed_component"]) - - eq_("Ongoing", ae["outcome"][0]["medical_status"]) - eq_("1", ae["outcome"][0]["number_of_animals_affected"]) - - eq_("Good", ae["health_assessment_prior_to_exposure"]["condition"]) - eq_("Veterinarian", ae["health_assessment_prior_to_exposure"]["assessed_by"]) - - eq_("20150601", ae["onset_date"]) - eq_("14", ae["duration"]["value"]) - eq_("Month", ae["duration"]["unit"]) - - eq_("11", ae["reaction"][0]["veddra_version"]) - eq_("2227", ae["reaction"][0]["veddra_term_code"]) - eq_("Dental disease", ae["reaction"][0]["veddra_term_name"]) - eq_("1", ae["reaction"][0]["number_of_animals_affected"]) - eq_("Actual", ae["reaction"][0]["accuracy"]) - - eq_("11", ae["reaction"][1]["veddra_version"]) - eq_("1026", ae["reaction"][1]["veddra_term_code"]) - eq_("Localised pain 
NOS (see other 'SOCs' for specific pain)", ae["reaction"][1]["veddra_term_name"]) - eq_("1", ae["reaction"][1]["number_of_animals_affected"]) - eq_("Actual", ae["reaction"][1]["accuracy"]) - - eq_("2", ae["reaction"][13]["veddra_version"]) - eq_("99115", ae["reaction"][13]["veddra_term_code"]) - eq_("INEFFECTIVE, HEARTWORM LARVAE", ae["reaction"][13]["veddra_term_name"]) - eq_("1", ae["reaction"][13]["number_of_animals_affected"]) - eq_("Actual", ae["reaction"][13]["accuracy"]) - - eq_("10 days", ae["time_between_exposure_and_onset"]) - eq_("true", ae["treated_for_ae"]) - - eq_(8, len(ae["drug"])) - - eq_("20150601", ae["drug"][0]["first_exposure_date"]) - eq_("20151201", ae["drug"][0]["last_exposure_date"]) - eq_("1", ae["drug"][0]["frequency_of_administration"]["value"]) - eq_("Day", ae["drug"][0]["frequency_of_administration"]["unit"]) - eq_("Animal Owner", ae["drug"][0]["administered_by"]) - eq_("Oral", ae["drug"][0]["route"]) - eq_("0.50", ae["drug"][0]["dose"]["numerator"]) - eq_("tablet", ae["drug"][0]["dose"]["numerator_unit"]) - eq_("1", ae["drug"][0]["dose"]["denominator"]) - eq_("Unknown", ae["drug"][0]["dose"]["denominator_unit"]) - eq_("false", ae["drug"][0]["used_according_to_label"]) - eq_([u'Route Off-Label', u'Underdosed'], ae["drug"][0]["off_label_use"]) - eq_("false", ae["drug"][0]["previous_exposure_to_drug"]) - eq_("false", ae["drug"][0]["previous_ae_to_drug"]) - eq_("false", ae["drug"][0]["ae_abated_after_stopping_drug"]) - eq_("true", ae["drug"][0]["ae_reappeared_after_resuming_drug"]) - eq_("20160101", ae["drug"][0]["manufacturing_date"]) - eq_("71423", ae["drug"][0]["lot_number"]) - eq_("20180228", ae["drug"][0]["lot_expiration"]) - eq_("1111-2222", ae["drug"][0]["product_ndc"]) - eq_("Deramaxx Chewable Tablets", ae["drug"][0]["brand_name"]) - eq_("Tablet", ae["drug"][0]["dosage_form"]) - eq_("Elanco US Inc", ae["drug"][0]["manufacturer"]["name"]) - eq_("USA-USFDACVM-N141203", ae["drug"][0]["manufacturer"]["registration_number"]) - eq_("1", ae["drug"][0]["number_of_defective_items"]) - eq_("11", ae["drug"][0]["number_of_items_returned"]) - eq_("QM01AH94", ae["drug"][0]["atc_vet_code"]) - eq_("Deracoxib", ae["drug"][0]["active_ingredients"][0]["name"]) - eq_("25", ae["drug"][0]["active_ingredients"][0]["dose"]["numerator"]) - eq_("Milligram", ae["drug"][0]["active_ingredients"][0]["dose"]["numerator_unit"]) - eq_("1", ae["drug"][0]["active_ingredients"][0]["dose"]["denominator"]) - eq_("dose", ae["drug"][0]["active_ingredients"][0]["dose"]["denominator_unit"]) - - def test_empty_date_handling(self): - mapper = XML2JSONMapper() - map_input = MagicMock() - map_input.filename = os.path.join(dirname(os.path.abspath(__file__)), "test_blank_orig_date.xml") - - map_dict = {} - - def add(id, json): - map_dict[id] = json - - map_output = MagicMock() - map_output.add = add - mapper.map_shard(map_input, map_output) - - uid = "USA-USFDACVM-2016-US-017128" - - ae = map_dict[uid] - eq_(None, ae.get("original_receive_date")) - - - def test_non_numeric_denominator(self): - mapper = XML2JSONMapper() - map_input = MagicMock() - map_input.filename = os.path.join(dirname(os.path.abspath(__file__)), "test-non-numeric-denominator.xml") - - map_dict = {} - - def add(id, json): - map_dict[id] = json - - map_output = MagicMock() - map_output.add = add - mapper.map_shard(map_input, map_output) - - uid = "USA-USFDACVM-2016-US-017128" - - ae = map_dict[uid] - eq_(uid, ae["unique_aer_id_number"]) - eq_(uid, ae["@id"]) - eq_("0", ae["drug"][0]["dose"]["denominator"]) - eq_("Unknown", 
ae["drug"][0]["dose"]["denominator_unit"]) + def setUp(self): + self.test_dir = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.test_dir) + + def test_extract_xml(self): + shutil.copyfile( + os.path.join(dirname(os.path.abspath(__file__)), "file.zip"), + os.path.join(self.test_dir, "file.zip"), + ) + extract = ExtractXML() + extract.local_dir = self.test_dir + extract.output = MagicMock(return_value=luigi.LocalTarget(self.test_dir)) + extract.map(os.path.join(self.test_dir, "file.zip"), "", []) + ok_(os.path.isfile(os.path.join(self.test_dir, "file1.xml"))) + + def test_normalize_product_ndc(self): + eq_("57319-485", normalize_product_ndc("NDC57319-485-05")) + + def test_is_digit(self): + mapper = XML2JSONMapper() + ok_(mapper.isdigit("1")) + ok_(mapper.isdigit("0")) + ok_(mapper.isdigit("1.5")) + ok_(mapper.isdigit("-10")) + ok_(mapper.isdigit("-20.5")) + ok_(mapper.isdigit("+1245.33434")) + ok_(mapper.isdigit("-1245.33434")) + ok_(mapper.isdigit("0.00")) + ok_(not mapper.isdigit("I")) + ok_(not mapper.isdigit("I0")) + ok_(not mapper.isdigit(".")) + + def test_xml_to_json(self): + mapper = XML2JSONMapper() + map_input = MagicMock() + map_input.filename = os.path.join( + dirname(os.path.abspath(__file__)), "test.xml" + ) + + map_dict = {} + + def add(id, json): + map_dict[id] = json + + map_output = MagicMock() + map_output.add = add + mapper.map_shard(map_input, map_output) + + uid = "USA-USFDACVM-2016-US-017128" + + ae = map_dict[uid] + eq_(uid, ae["unique_aer_id_number"]) + eq_(uid, ae["@id"]) + eq_("N141203", ae["report_id"]) + eq_("20160502", ae["original_receive_date"]) + eq_( + "Food and Drug Administration Center for Veterinary Medicine", + ae["receiver"]["organization"], + ) + eq_("7500 Standish Place (HFV-210) Room N403", ae["receiver"]["street_address"]) + eq_("Rockville", ae["receiver"]["city"]) + eq_("MD", ae["receiver"]["state"]) + eq_("20855", ae["receiver"]["postal_code"]) + eq_("USA", ae["receiver"]["country"]) + eq_("Other", ae["primary_reporter"]) + eq_("Animal Owner", ae["secondary_reporter"]) + eq_("Safety Issue", ae["type_of_information"]) + eq_("true", ae["serious_ae"]) + eq_("2", ae["number_of_animals_treated"]) + eq_("3", ae["number_of_animals_affected"]) + eq_("Dog", ae["animal"]["species"]) + eq_("Male", ae["animal"]["gender"]) + eq_("Neutered", ae["animal"]["reproductive_status"]) + eq_("NOT APPLICABLE", ae["animal"]["female_animal_physiological_status"]) + + eq_("7.00", ae["animal"]["age"]["min"]) + eq_("17.5", ae["animal"]["age"]["max"]) + eq_("Year", ae["animal"]["age"]["unit"]) + eq_("Measured", ae["animal"]["age"]["qualifier"]) + + eq_("6.123", ae["animal"]["weight"]["min"]) + eq_("16.123", ae["animal"]["weight"]["max"]) + eq_("Kilogram", ae["animal"]["weight"]["unit"]) + eq_("Measured", ae["animal"]["weight"]["qualifier"]) + + eq_("false", ae["animal"]["breed"]["is_crossbred"]) + eq_("Terrier - Yorkshire", ae["animal"]["breed"]["breed_component"]) + + eq_("Ongoing", ae["outcome"][0]["medical_status"]) + eq_("1", ae["outcome"][0]["number_of_animals_affected"]) + + eq_("Good", ae["health_assessment_prior_to_exposure"]["condition"]) + eq_("Veterinarian", ae["health_assessment_prior_to_exposure"]["assessed_by"]) + + eq_("20150601", ae["onset_date"]) + eq_("14", ae["duration"]["value"]) + eq_("Month", ae["duration"]["unit"]) + + eq_("11", ae["reaction"][0]["veddra_version"]) + eq_("2227", ae["reaction"][0]["veddra_term_code"]) + eq_("Dental disease", ae["reaction"][0]["veddra_term_name"]) + eq_("1", 
ae["reaction"][0]["number_of_animals_affected"]) + eq_("Actual", ae["reaction"][0]["accuracy"]) + + eq_("11", ae["reaction"][1]["veddra_version"]) + eq_("1026", ae["reaction"][1]["veddra_term_code"]) + eq_( + "Localised pain NOS (see other 'SOCs' for specific pain)", + ae["reaction"][1]["veddra_term_name"], + ) + eq_("1", ae["reaction"][1]["number_of_animals_affected"]) + eq_("Actual", ae["reaction"][1]["accuracy"]) + + eq_("2", ae["reaction"][13]["veddra_version"]) + eq_("99115", ae["reaction"][13]["veddra_term_code"]) + eq_("INEFFECTIVE, HEARTWORM LARVAE", ae["reaction"][13]["veddra_term_name"]) + eq_("1", ae["reaction"][13]["number_of_animals_affected"]) + eq_("Actual", ae["reaction"][13]["accuracy"]) + + eq_("10 days", ae["time_between_exposure_and_onset"]) + eq_("true", ae["treated_for_ae"]) + + eq_(8, len(ae["drug"])) + + eq_("20150601", ae["drug"][0]["first_exposure_date"]) + eq_("20151201", ae["drug"][0]["last_exposure_date"]) + eq_("1", ae["drug"][0]["frequency_of_administration"]["value"]) + eq_("Day", ae["drug"][0]["frequency_of_administration"]["unit"]) + eq_("Animal Owner", ae["drug"][0]["administered_by"]) + eq_("Oral", ae["drug"][0]["route"]) + eq_("0.50", ae["drug"][0]["dose"]["numerator"]) + eq_("tablet", ae["drug"][0]["dose"]["numerator_unit"]) + eq_("1", ae["drug"][0]["dose"]["denominator"]) + eq_("Unknown", ae["drug"][0]["dose"]["denominator_unit"]) + eq_("false", ae["drug"][0]["used_according_to_label"]) + eq_(["Route Off-Label", "Underdosed"], ae["drug"][0]["off_label_use"]) + eq_("false", ae["drug"][0]["previous_exposure_to_drug"]) + eq_("false", ae["drug"][0]["previous_ae_to_drug"]) + eq_("false", ae["drug"][0]["ae_abated_after_stopping_drug"]) + eq_("true", ae["drug"][0]["ae_reappeared_after_resuming_drug"]) + eq_("20160101", ae["drug"][0]["manufacturing_date"]) + eq_("71423", ae["drug"][0]["lot_number"]) + eq_("20180228", ae["drug"][0]["lot_expiration"]) + eq_("1111-2222", ae["drug"][0]["product_ndc"]) + eq_("Deramaxx Chewable Tablets", ae["drug"][0]["brand_name"]) + eq_("Tablet", ae["drug"][0]["dosage_form"]) + eq_("Elanco US Inc", ae["drug"][0]["manufacturer"]["name"]) + eq_( + "USA-USFDACVM-N141203", ae["drug"][0]["manufacturer"]["registration_number"] + ) + eq_("1", ae["drug"][0]["number_of_defective_items"]) + eq_("11", ae["drug"][0]["number_of_items_returned"]) + eq_("QM01AH94", ae["drug"][0]["atc_vet_code"]) + eq_("Deracoxib", ae["drug"][0]["active_ingredients"][0]["name"]) + eq_("25", ae["drug"][0]["active_ingredients"][0]["dose"]["numerator"]) + eq_( + "Milligram", + ae["drug"][0]["active_ingredients"][0]["dose"]["numerator_unit"], + ) + eq_("1", ae["drug"][0]["active_ingredients"][0]["dose"]["denominator"]) + eq_("dose", ae["drug"][0]["active_ingredients"][0]["dose"]["denominator_unit"]) + + def test_empty_date_handling(self): + mapper = XML2JSONMapper() + map_input = MagicMock() + map_input.filename = os.path.join( + dirname(os.path.abspath(__file__)), "test_blank_orig_date.xml" + ) + + map_dict = {} + + def add(id, json): + map_dict[id] = json + + map_output = MagicMock() + map_output.add = add + mapper.map_shard(map_input, map_output) + + uid = "USA-USFDACVM-2016-US-017128" + + ae = map_dict[uid] + eq_(None, ae.get("original_receive_date")) + + def test_non_numeric_denominator(self): + mapper = XML2JSONMapper() + map_input = MagicMock() + map_input.filename = os.path.join( + dirname(os.path.abspath(__file__)), "test-non-numeric-denominator.xml" + ) + + map_dict = {} + + def add(id, json): + map_dict[id] = json + + map_output = MagicMock() + 
map_output.add = add + mapper.map_shard(map_input, map_output) + + uid = "USA-USFDACVM-2016-US-017128" + + ae = map_dict[uid] + eq_(uid, ae["unique_aer_id_number"]) + eq_(uid, ae["@id"]) + eq_("0", ae["drug"][0]["dose"]["denominator"]) + eq_("Unknown", ae["drug"][0]["dose"]["denominator_unit"]) def main(argv): - unittest.main(argv=argv) + unittest.main(argv=argv) -if __name__ == '__main__': - unittest.main() +if __name__ == "__main__": + unittest.main() diff --git a/openfda/annotation_table/combine_harmonization.py b/openfda/annotation_table/combine_harmonization.py index 9ac037c..edf9662 100644 --- a/openfda/annotation_table/combine_harmonization.py +++ b/openfda/annotation_table/combine_harmonization.py @@ -28,109 +28,108 @@ def read_json_file(json_file): - for line in json_file: - yield json.loads(line) + for line in json_file: + yield json.loads(line) + def _joinable_dict(record_list, join_key_list): - joinable_dict = {} - key_seperator = '' + joinable_dict = {} + key_seperator = "" + + if len(join_key_list) > 1: + key_seperator = ":" - if len(join_key_list) > 1: - key_seperator = ':' + for record in record_list: + join_key = key_seperator.join([record[s] for s in join_key_list]) + if join_key in joinable_dict: + joinable_dict[join_key].append(record) + else: + joinable_dict[join_key] = [record] + return joinable_dict - for record in record_list: - join_key = key_seperator.join([record[s] for s in join_key_list]) - if join_key in joinable_dict: - joinable_dict[join_key].append(record) - else: - joinable_dict[join_key] = [record] - return joinable_dict def _combine_dicts(record_dict, new_data_dict, new_data_key): - record_list = [] - for join_key, record_value in iter(record_dict.items()): - for pivot in record_value: - joined_dict = {} - if join_key in new_data_dict: - for new_data_value in new_data_dict[join_key]: - joined_dict = dict(list(pivot.items()) + list(new_data_value.items())) - record_list.append(joined_dict) - else: - if new_data_key == None: - continue - joined_dict = dict(list(pivot.items()) + list({new_data_key: []}.items())) - record_list.append(joined_dict) - return record_list - - -def combine(product_file, - spl_extract, - rx_extract, - unii_extract, - upc_extract, - json_out): - json_output_file = open(json_out, 'w') - - product = csv.DictReader(open(product_file), delimiter='\t') - label = read_json_file(open(spl_extract)) - rxs = read_json_file(open(rx_extract)) - unii = read_json_file(open(unii_extract)) - upcs = read_json_file(open(upc_extract)) - - all_products = [] - for row in product: - clean_product = {} - #the productid has a date on the front of it, so need to split - clean_product['id'] = row['PRODUCTID'].split('_')[1] - clean_product['application_number'] = row['APPLICATIONNUMBER'] - clean_product['product_type'] = row['PRODUCTTYPENAME'] - clean_product['generic_name'] = row['NONPROPRIETARYNAME'] - clean_product['manufacturer_name'] = row['LABELERNAME'] - clean_product['brand_name'] = row['PROPRIETARYNAME'] - clean_product['brand_name_suffix'] = row['PROPRIETARYNAMESUFFIX'] - clean_product['product_ndc'] = row['PRODUCTNDC'] - clean_product['dosage_form'] = row['DOSAGEFORMNAME'] - clean_product['route'] = row['ROUTENAME'] - clean_product['substance_name'] = row['SUBSTANCENAME'] - - all_products.append(clean_product) - - joinable_labels = {} - - for spl_data in label: - clean_label = {} - clean_label['spl_set_id'] = spl_data['spl_set_id'] - clean_label['id'] = spl_data['id'] - clean_label['spl_version'] = spl_data['spl_version'] - 
clean_label['is_original_packager'] = spl_data['is_original_packager'] - clean_label['spl_product_ndc'] = spl_data['ProductNDCs'] - clean_label['original_packager_product_ndc'] = \ - spl_data['OriginalPackagerProductNDSs'] - clean_label['package_ndc'] = spl_data['PackageNDCs'] - - joinable_labels[clean_label['id']] = [clean_label] - - # Adding labels - joinable_products = _joinable_dict(all_products, ['id']) - record_list = _combine_dicts(joinable_labels, joinable_products, None) - - # Adding UNII - joinable_record_dict = _joinable_dict(record_list, ['id']) - joinable_unii = _joinable_dict(unii, ['spl_id']) - record_list = _combine_dicts(joinable_record_dict, - joinable_unii, - 'unii_indexing') - # Adding RXNorm - joinable_record_dict = _joinable_dict(record_list, - ['spl_set_id', 'spl_version']) - joinable_rxnorm = _joinable_dict(rxs, ['spl_set_id', 'spl_version']) - record_list = _combine_dicts(joinable_record_dict, joinable_rxnorm, 'rxnorm') - - # Adding UPC - joinable_record_dict = _joinable_dict(record_list, ['spl_set_id']) - joinable_upc = _joinable_dict(upcs, ['set_id']) - record_list = _combine_dicts(joinable_record_dict, joinable_upc, 'upc') - - for row in record_list: - json.dump(row, json_output_file, encoding='utf-8-sig') - json_output_file.write('\n') + record_list = [] + for join_key, record_value in iter(record_dict.items()): + for pivot in record_value: + joined_dict = {} + if join_key in new_data_dict: + for new_data_value in new_data_dict[join_key]: + joined_dict = dict( + list(pivot.items()) + list(new_data_value.items()) + ) + record_list.append(joined_dict) + else: + if new_data_key == None: + continue + joined_dict = dict( + list(pivot.items()) + list({new_data_key: []}.items()) + ) + record_list.append(joined_dict) + return record_list + + +def combine(product_file, spl_extract, rx_extract, unii_extract, upc_extract, json_out): + json_output_file = open(json_out, "w") + + product = csv.DictReader(open(product_file), delimiter="\t") + label = read_json_file(open(spl_extract)) + rxs = read_json_file(open(rx_extract)) + unii = read_json_file(open(unii_extract)) + upcs = read_json_file(open(upc_extract)) + + all_products = [] + for row in product: + clean_product = {} + # the productid has a date on the front of it, so need to split + clean_product["id"] = row["PRODUCTID"].split("_")[1] + clean_product["application_number"] = row["APPLICATIONNUMBER"] + clean_product["product_type"] = row["PRODUCTTYPENAME"] + clean_product["generic_name"] = row["NONPROPRIETARYNAME"] + clean_product["manufacturer_name"] = row["LABELERNAME"] + clean_product["brand_name"] = row["PROPRIETARYNAME"] + clean_product["brand_name_suffix"] = row["PROPRIETARYNAMESUFFIX"] + clean_product["product_ndc"] = row["PRODUCTNDC"] + clean_product["dosage_form"] = row["DOSAGEFORMNAME"] + clean_product["route"] = row["ROUTENAME"] + clean_product["substance_name"] = row["SUBSTANCENAME"] + + all_products.append(clean_product) + + joinable_labels = {} + + for spl_data in label: + clean_label = {} + clean_label["spl_set_id"] = spl_data["spl_set_id"] + clean_label["id"] = spl_data["id"] + clean_label["spl_version"] = spl_data["spl_version"] + clean_label["is_original_packager"] = spl_data["is_original_packager"] + clean_label["spl_product_ndc"] = spl_data["ProductNDCs"] + clean_label["original_packager_product_ndc"] = spl_data[ + "OriginalPackagerProductNDSs" + ] + clean_label["package_ndc"] = spl_data["PackageNDCs"] + + joinable_labels[clean_label["id"]] = [clean_label] + + # Adding labels + joinable_products = 
_joinable_dict(all_products, ["id"]) + record_list = _combine_dicts(joinable_labels, joinable_products, None) + + # Adding UNII + joinable_record_dict = _joinable_dict(record_list, ["id"]) + joinable_unii = _joinable_dict(unii, ["spl_id"]) + record_list = _combine_dicts(joinable_record_dict, joinable_unii, "unii_indexing") + # Adding RXNorm + joinable_record_dict = _joinable_dict(record_list, ["spl_set_id", "spl_version"]) + joinable_rxnorm = _joinable_dict(rxs, ["spl_set_id", "spl_version"]) + record_list = _combine_dicts(joinable_record_dict, joinable_rxnorm, "rxnorm") + + # Adding UPC + joinable_record_dict = _joinable_dict(record_list, ["spl_set_id"]) + joinable_upc = _joinable_dict(upcs, ["set_id"]) + record_list = _combine_dicts(joinable_record_dict, joinable_upc, "upc") + + for row in record_list: + json.dump(row, json_output_file, encoding="utf-8-sig") + json_output_file.write("\n") diff --git a/openfda/annotation_table/extract_unii.py b/openfda/annotation_table/extract_unii.py index 9cfba6e..38938c5 100644 --- a/openfda/annotation_table/extract_unii.py +++ b/openfda/annotation_table/extract_unii.py @@ -24,72 +24,96 @@ from lxml import etree # http://www.fda.gov/ForIndustry/DataStandards/StructuredProductLabeling/ucm162061.htm -UNII_OID = '2.16.840.1.113883.4.9' -UNII_OTHER_OID1 = '2.16.840.1.113883.3.26.1.5' -UNII_OTHER_OID2 = '2.16.840.1.113883.6.177' -UNII_OTHER_OID3 = '2.16.840.1.113883.6.345' +UNII_OID = "2.16.840.1.113883.4.9" +UNII_OTHER_OID1 = "2.16.840.1.113883.3.26.1.5" +UNII_OTHER_OID2 = "2.16.840.1.113883.6.177" +UNII_OTHER_OID3 = "2.16.840.1.113883.6.345" + def remove_namespace(xml_file): - """SPL have namespaces that don't play nice with python's libxml. For now - we just remove namespaces to work around this.""" - # TODO(mattmo): Figure out a better solution to this. - # http://stackoverflow.com/questions/5572247/how-to-find-xml-elements- - # via-xpath-in-python-in-a-namespace-agnostic-way - lines = xml_file.read() - lines = re.sub(r']+>', '', lines) - lines = re.sub(r'\w+:type="[^"]+"', '', lines) - return BytesIO(lines.encode()) + """SPL have namespaces that don't play nice with python's libxml. For now + we just remove namespaces to work around this.""" + # TODO(mattmo): Figure out a better solution to this. 
+ # http://stackoverflow.com/questions/5572247/how-to-find-xml-elements- + # via-xpath-in-python-in-a-namespace-agnostic-way + lines = xml_file.read() + lines = re.sub(r"]+>", "", lines) + lines = re.sub(r'\w+:type="[^"]+"', "", lines) + return BytesIO(lines.encode()) + def parse_xml(xml_file): - p = etree.XMLParser(huge_tree=True) - tree = etree.parse(remove_namespace(open(xml_file)), parser=p) - return tree + p = etree.XMLParser(huge_tree=True) + tree = etree.parse(remove_namespace(open(xml_file)), parser=p) + return tree + def first_match_or_empty_string(matches): - if len(matches) > 0: - return matches[0] - else: - return '' + if len(matches) > 0: + return matches[0] + else: + return "" + def extract_set_id(tree): - return first_match_or_empty_string( - tree.getroot().xpath('/document/setId/@root')) + return first_match_or_empty_string(tree.getroot().xpath("/document/setId/@root")) def load_unii_from_csv(file): - unii_rows = {} - df = pd.read_csv(open(file, 'r', encoding='utf-8', errors='ignore'), header=0, names=['Name', 'Type', 'UNII', 'PT'], - sep='\t', index_col=False, encoding='utf-8', - dtype=str, low_memory=False, na_values=[], keep_default_na=False) - for row in df.itertuples(): - unii_rows[row.Name.lower()] = {'va': [], 'name': row.Name, 'set_id': '', - 'unii': row.UNII} - unii_rows[row.PT.lower()] = {'va': [], 'name': row.PT, 'set_id': '', - 'unii': row.UNII} - return unii_rows + unii_rows = {} + df = pd.read_csv( + open(file, "r", encoding="utf-8", errors="ignore"), + header=0, + names=["Name", "Type", "UNII", "PT"], + sep="\t", + index_col=False, + encoding="utf-8", + dtype=str, + low_memory=False, + na_values=[], + keep_default_na=False, + ) + for row in df.itertuples(): + unii_rows[row.Name.lower()] = { + "va": [], + "name": row.Name, + "set_id": "", + "unii": row.UNII, + } + unii_rows[row.PT.lower()] = { + "va": [], + "name": row.PT, + "set_id": "", + "unii": row.UNII, + } + return unii_rows + def extract_unii(tree): - """Extracts the UNII from Pharmalogical Indexing XML""" - unii_xpath = '//id[@root="%s"]/@extension' % UNII_OID - return first_match_or_empty_string(tree.getroot().xpath(unii_xpath)) + """Extracts the UNII from Pharmalogical Indexing XML""" + unii_xpath = '//id[@root="%s"]/@extension' % UNII_OID + return first_match_or_empty_string(tree.getroot().xpath(unii_xpath)) + def extract_unii_name(tree): - """Extracts the name of the UNII from the Pharmalogical Indexing XML""" - unii_name_xpath = '//identifiedSubstance/name/descendant::text()' - return first_match_or_empty_string(tree.getroot().xpath(unii_name_xpath)) + """Extracts the name of the UNII from the Pharmalogical Indexing XML""" + unii_name_xpath = "//identifiedSubstance/name/descendant::text()" + return first_match_or_empty_string(tree.getroot().xpath(unii_name_xpath)) def extract_unii_other_code(tree): - """Extract the codes for other ingredients""" - unii_other_xpath = \ - '//generalizedMaterialKind/code[@codeSystem="%s" or @codeSystem="%s" or @codeSystem="%s"]/@code' \ - % (UNII_OTHER_OID1, UNII_OTHER_OID2, UNII_OTHER_OID3) - return tree.getroot().xpath(unii_other_xpath) + """Extract the codes for other ingredients""" + unii_other_xpath = ( + '//generalizedMaterialKind/code[@codeSystem="%s" or @codeSystem="%s" or @codeSystem="%s"]/@code' + % (UNII_OTHER_OID1, UNII_OTHER_OID2, UNII_OTHER_OID3) + ) + return tree.getroot().xpath(unii_other_xpath) def extract_unii_other_name(tree): - """Extract the display names for other ingredients""" - unii_other_xpath = \ - 
'//generalizedMaterialKind/code[@codeSystem="%s" or @codeSystem="%s" or @codeSystem="%s"]/@displayName' \ - % (UNII_OTHER_OID1, UNII_OTHER_OID2, UNII_OTHER_OID3) - return tree.getroot().xpath(unii_other_xpath) + """Extract the display names for other ingredients""" + unii_other_xpath = ( + '//generalizedMaterialKind/code[@codeSystem="%s" or @codeSystem="%s" or @codeSystem="%s"]/@displayName' + % (UNII_OTHER_OID1, UNII_OTHER_OID2, UNII_OTHER_OID3) + ) + return tree.getroot().xpath(unii_other_xpath) diff --git a/openfda/annotation_table/pipeline.py b/openfda/annotation_table/pipeline.py index da44c42..df17da8 100755 --- a/openfda/annotation_table/pipeline.py +++ b/openfda/annotation_table/pipeline.py @@ -29,582 +29,642 @@ from openfda.spl import process_barcodes, extract from openfda.tasks import DependencyTriggeredTask -data_dir = config.data_dir('harmonization') -BATCH_DATE = arrow.utcnow().ceil('week').format('YYYYMMDD') -BASE_DIR = config.data_dir('harmonization/batches/%s' % BATCH_DATE) -SPL_S3_DIR = config.data_dir('spl/s3_sync') +data_dir = config.data_dir("harmonization") +BATCH_DATE = arrow.utcnow().ceil("week").format("YYYYMMDD") +BASE_DIR = config.data_dir("harmonization/batches/%s" % BATCH_DATE) +SPL_S3_DIR = config.data_dir("spl/s3_sync") TMP_DIR = config.tmp_dir() -common.shell_cmd_quiet('mkdir -p %s', data_dir) -common.shell_cmd_quiet('mkdir -p %s', BASE_DIR) -common.shell_cmd_quiet('mkdir -p %s', TMP_DIR) +common.shell_cmd_quiet("mkdir -p %s", data_dir) +common.shell_cmd_quiet("mkdir -p %s", BASE_DIR) +common.shell_cmd_quiet("mkdir -p %s", TMP_DIR) -SPL_SET_ID_INDEX = join(BASE_DIR, 'spl_index.db') -DAILYMED_PREFIX = 'ftp://public.nlm.nih.gov/nlmdata/.dailymed/' +SPL_SET_ID_INDEX = join(BASE_DIR, "spl_index.db") +DAILYMED_PREFIX = "ftp://public.nlm.nih.gov/nlmdata/.dailymed/" -PHARM_CLASS_DOWNLOAD = \ - DAILYMED_PREFIX + 'pharmacologic_class_indexing_spl_files.zip' +PHARM_CLASS_DOWNLOAD = DAILYMED_PREFIX + "pharmacologic_class_indexing_spl_files.zip" -RXNORM_DOWNLOAD = \ - DAILYMED_PREFIX + 'rxnorm_mappings.zip' +RXNORM_DOWNLOAD = DAILYMED_PREFIX + "rxnorm_mappings.zip" -UNII_WEBSITE = 'https://precision.fda.gov' -UNII_DOWNLOAD_PAGE = UNII_WEBSITE + '/uniisearch/archive' +UNII_WEBSITE = "https://precision.fda.gov" +UNII_DOWNLOAD_PAGE = UNII_WEBSITE + "/uniisearch/archive" -NDC_DOWNLOAD_PAGE = \ - 'https://www.fda.gov/drugs/drug-approvals-and-databases/national-drug-code-directory' +NDC_DOWNLOAD_PAGE = "https://www.fda.gov/drugs/drug-approvals-and-databases/national-drug-code-directory" # The database names play a central role in terms of output and the joiner. # For instance: SPL_EXTRACT_DB is used as both an output of a download/json/etc # pipeline AND it is used as input into a join mapreduce() AND it is used # as a source identifier on a mapper output record for the actural JoinReducer. 
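# --- Illustrative sketch (editorial, not part of this patch or the pipeline) --
# The module comment above explains that each *_DB name doubles as a "source"
# tag on mapper output records so the join reducer can tell which database a
# value came from. This is a minimal standalone sketch of that idea using plain
# dicts; the group_by_source() helper and the sample records are hypothetical,
# not openfda APIs.
import collections


def group_by_source(tagged_records):
    """Group (join_key, (source_db, value)) pairs the way a join reducer sees them."""
    grouped = collections.defaultdict(lambda: collections.defaultdict(list))
    for join_key, (source_db, value) in tagged_records:
        grouped[join_key][source_db].append(value)
    return grouped


if __name__ == "__main__":
    tagged = [
        ("set-id-1", ("ndc.db", {"product_ndc": "1111-2222"})),
        ("set-id-1", ("spl_extract.db", {"spl_version": "3"})),
        ("set-id-1", ("rxnorm.db", {"rxcui": "12345"})),
    ]
    by_source = group_by_source(tagged)
    # Each source database's records are now addressable by name for the join.
    print(dict(by_source["set-id-1"]))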
-SPL_EXTRACT_DB = 'spl_extract.db' -NDC_EXTRACT_DB = 'ndc.db' -UNII_EXTRACT_DB = 'unii.db' -RXNORM_EXTRACT_DB = 'rxnorm.db' -UPC_EXTRACT_DB = 'upc.db' +SPL_EXTRACT_DB = "spl_extract.db" +NDC_EXTRACT_DB = "ndc.db" +UNII_EXTRACT_DB = "unii.db" +RXNORM_EXTRACT_DB = "rxnorm.db" +UPC_EXTRACT_DB = "upc.db" class DownloadNDC(luigi.Task): - def requires(self): - return [] + def requires(self): + return [] - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'ndc/raw/ndc_database.zip')) + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "ndc/raw/ndc_database.zip")) - def run(self): - zip_url = None - soup = BeautifulSoup(urlopen(NDC_DOWNLOAD_PAGE).read(), 'lxml') - for a in soup.find_all(href=re.compile('.*.zip')): - if 'ndc database file - text' in a.text.lower(): - zip_url = urljoin('https://www.fda.gov', a['href']) - break + def run(self): + zip_url = None + soup = BeautifulSoup(urlopen(NDC_DOWNLOAD_PAGE).read(), "lxml") + for a in soup.find_all(href=re.compile(".*.zip")): + if "ndc database file - text" in a.text.lower(): + zip_url = urljoin("https://www.fda.gov", a["href"]) + break - if not zip_url: - logging.fatal('NDC database file not found!') - raise + if not zip_url: + logging.fatal("NDC database file not found!") + raise - common.download(zip_url, self.output().path) + common.download(zip_url, self.output().path) class DownloadPharmaClass(luigi.Task): - def requires(self): - return [] + def requires(self): + return [] - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'pharma_class/raw/pharmacologic_class.zip')) + def output(self): + return luigi.LocalTarget( + join(BASE_DIR, "pharma_class/raw/pharmacologic_class.zip") + ) + + def run(self): + common.download(PHARM_CLASS_DOWNLOAD, self.output().path) - def run(self): - common.download(PHARM_CLASS_DOWNLOAD, self.output().path) class DownloadUNII(luigi.Task): - def requires(self): - return [] - - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'unii/raw/unii.zip')) - - def run(self): - ''' - UNII downloading has changed from being a simple URL pull to a SPA application backed by a Lambda and AWS S3 pre-signed - URLs. So we brute force into the HTML/JS here, construct the AWS Lambda URL, obtain a pre-signed URL and finally - pull the file. This works, but is fragile and will break if FDA makes significant changes to the SPA structure. We'd - need something like Cypress to make this more flexible. - ''' - # SPA Main HTML - soup = BeautifulSoup(urlopen(UNII_DOWNLOAD_PAGE).read(), 'lxml') - # This script tag has the file name we need - script_tag = soup.find(id="__NEXT_DATA__") - script_content = json.loads(script_tag.contents[0]) - unii_file_name = script_content['props']['pageProps']['namesRecords'][0]['fileName'] - - # Now go through the page scripts and find the AWS Lambda endpoint URL. 
- for tag in soup.find_all('script'): - script_src = tag.get('src') - if script_src and '/static/chunks/pages/archive-' in script_src: - full_script_url = UNII_WEBSITE + script_src - script_content = requests.get(full_script_url).text - lambda_endpoint = re.compile('https://\\w+.execute-api.us-east-1.amazonaws.com/production/v\\d').search( - script_content).group(0) - get_file_url = lambda_endpoint + '/get-file/' + unii_file_name - # The lambda endpoint will return a pre-signed S3 URL to download from - aws_file_url = requests.get(get_file_url).json()['url'] - common.download_requests(aws_file_url, self.output().path) + def requires(self): + return [] + + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "unii/raw/unii.zip")) + + def run(self): + """ + UNII downloading has changed from being a simple URL pull to a SPA application backed by a Lambda and AWS S3 pre-signed + URLs. So we brute force into the HTML/JS here, construct the AWS Lambda URL, obtain a pre-signed URL and finally + pull the file. This works, but is fragile and will break if FDA makes significant changes to the SPA structure. We'd + need something like Cypress to make this more flexible. + """ + # SPA Main HTML + soup = BeautifulSoup(urlopen(UNII_DOWNLOAD_PAGE).read(), "lxml") + # This script tag has the file name we need + script_tag = soup.find(id="__NEXT_DATA__") + script_content = json.loads(script_tag.contents[0]) + unii_file_name = script_content["props"]["pageProps"]["namesRecords"][0][ + "fileName" + ] + + # Now go through the page scripts and find the AWS Lambda endpoint URL. + for tag in soup.find_all("script"): + script_src = tag.get("src") + if script_src and "/static/chunks/pages/archive-" in script_src: + full_script_url = UNII_WEBSITE + script_src + script_content = requests.get(full_script_url).text + lambda_endpoint = ( + re.compile( + "https://\\w+.execute-api.us-east-1.amazonaws.com/production/v\\d" + ) + .search(script_content) + .group(0) + ) + get_file_url = lambda_endpoint + "/get-file/" + unii_file_name + # The lambda endpoint will return a pre-signed S3 URL to download from + aws_file_url = requests.get(get_file_url).json()["url"] + common.download_requests(aws_file_url, self.output().path) class DownloadRXNorm(luigi.Task): - def requires(self): - return [] + def requires(self): + return [] - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'rxnorm/raw/rxnorm_mappings.zip')) + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "rxnorm/raw/rxnorm_mappings.zip")) - def run(self): - common.download(RXNORM_DOWNLOAD, self.output().path) + def run(self): + common.download(RXNORM_DOWNLOAD, self.output().path) def list_zip_files_in_zip(zip_filename): - return subprocess.check_output("unzip -l %s | \ + return ( + subprocess.check_output( + "unzip -l %s | \ grep zip | \ - awk '{print $4}'" % zip_filename, - shell=True, encoding='utf-8').strip().split('\n') + awk '{print $4}'" + % zip_filename, + shell=True, + encoding="utf-8", + ) + .strip() + .split("\n") + ) def ExtractXMLFromNestedZip(zip_filename, output_dir, exclude_images=True): - for child_zip_filename in list_zip_files_in_zip(zip_filename): - base_zip = basename(child_zip_filename) - target_dir = base_zip.split('.')[0] - cmd = 'unzip -j -d %(output_dir)s/%(target_dir)s \ + for child_zip_filename in list_zip_files_in_zip(zip_filename): + base_zip = basename(child_zip_filename) + target_dir = base_zip.split(".")[0] + cmd = ( + "unzip -j -d %(output_dir)s/%(target_dir)s \ %(zip_filename)s \ - %(child_zip_filename)s' % 
locals() - common.shell_cmd_quiet(cmd) + %(child_zip_filename)s" + % locals() + ) + common.shell_cmd_quiet(cmd) - cmd = 'unzip %(output_dir)s/%(target_dir)s/%(base_zip)s -d \ - %(output_dir)s/%(target_dir)s' % locals() - if exclude_images: - cmd += ' -x *.jpg' + cmd = ( + "unzip %(output_dir)s/%(target_dir)s/%(base_zip)s -d \ + %(output_dir)s/%(target_dir)s" + % locals() + ) + if exclude_images: + cmd += " -x *.jpg" - common.shell_cmd_quiet(cmd) - common.shell_cmd_quiet('rm %(output_dir)s/%(target_dir)s/%(base_zip)s' % locals()) + common.shell_cmd_quiet(cmd) + common.shell_cmd_quiet( + "rm %(output_dir)s/%(target_dir)s/%(base_zip)s" % locals() + ) class ExtractNDC(luigi.Task): - def requires(self): - return DownloadNDC() - - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'ndc/extracted/product.txt')) - - def run(self): - zip_filename = self.input().path - output_filename = self.output().path - common.shell_cmd_quiet('mkdir -p %s' % dirname(self.output().path)) - cmd = 'unzip -p %(zip_filename)s product.txt > \ - %(output_filename)s' % locals() - common.shell_cmd_quiet(cmd) + def requires(self): + return DownloadNDC() + + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "ndc/extracted/product.txt")) + + def run(self): + zip_filename = self.input().path + output_filename = self.output().path + common.shell_cmd_quiet("mkdir -p %s" % dirname(self.output().path)) + cmd = ( + "unzip -p %(zip_filename)s product.txt > \ + %(output_filename)s" + % locals() + ) + common.shell_cmd_quiet(cmd) class ExtractRXNorm(luigi.Task): - def requires(self): - return DownloadRXNorm() - - def output(self): - return luigi.LocalTarget(join(BASE_DIR, - 'rxnorm/extracted/rxnorm_mappings.txt')) - - def run(self): - zip_filename = self.input().path - output_filename = self.output().path - common.shell_cmd_quiet('mkdir -p %s' % dirname(self.output().path)) - cmd = 'unzip -p %(zip_filename)s rxnorm_mappings.txt > \ - %(output_filename)s' % locals() - common.shell_cmd_quiet(cmd) + def requires(self): + return DownloadRXNorm() + + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "rxnorm/extracted/rxnorm_mappings.txt")) + + def run(self): + zip_filename = self.input().path + output_filename = self.output().path + common.shell_cmd_quiet("mkdir -p %s" % dirname(self.output().path)) + cmd = ( + "unzip -p %(zip_filename)s rxnorm_mappings.txt > \ + %(output_filename)s" + % locals() + ) + common.shell_cmd_quiet(cmd) class ExtractPharmaClass(luigi.Task): - def requires(self): - return DownloadPharmaClass() + def requires(self): + return DownloadPharmaClass() + + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "pharma_class/extracted")) - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'pharma_class/extracted')) + def run(self): + zip_filename = self.input().path + output_dir = self.output().path + common.shell_cmd_quiet("mkdir -p %s" % output_dir) + ExtractXMLFromNestedZip(zip_filename, output_dir) - def run(self): - zip_filename = self.input().path - output_dir = self.output().path - common.shell_cmd_quiet('mkdir -p %s' % output_dir) - ExtractXMLFromNestedZip(zip_filename, output_dir) -''' +""" We need to make sure we take the latest version for each Pharmacologic Class Indexing SPL Set Id. The pharma class file we download from DailyMed may contain multiple versions for the same SPL Set ID, which causes problems down the line. 
This task will go over all the files, extract the current versions only, and discard the rest (also normalizing the directory structure in the process). -''' -class NormalizePharmaClass(parallel.MRTask): - NS = {'ns': 'urn:hl7-org:v3'} - def requires(self): - return ExtractPharmaClass() +""" - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'pharma_class/normalized')) - def mapreduce_inputs(self): - return parallel.Collection.from_glob(join(self.input().path, '*/*.xml')) +class NormalizePharmaClass(parallel.MRTask): + NS = {"ns": "urn:hl7-org:v3"} - def map(self, xml_file, value, output): - p = etree.XMLParser() - tree = etree.parse(open(xml_file), parser=p) - spl_id = tree.xpath('//ns:document/ns:id/@root', namespaces=self.NS)[0].lower() - spl_set_id = tree.xpath('//ns:document/ns:setId/@root', namespaces=self.NS)[0].lower() - version = tree.xpath('//ns:document/ns:versionNumber/@value', namespaces=self.NS)[0] - output.add(spl_set_id, {'spl_id': spl_id, 'version': version, 'file': xml_file}) + def requires(self): + return ExtractPharmaClass() - def reduce(self, key, values, output): - values.sort(key=lambda spl: int(spl['version'])) - output.put(key, values[-1]) - os.makedirs(name=self.output().path, exist_ok=True) - shutil.copy2(values[-1]['file'], self.output().path) + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "pharma_class/normalized")) - def output_format(self): - return NullOutput() + def mapreduce_inputs(self): + return parallel.Collection.from_glob(join(self.input().path, "*/*.xml")) -class ExtractUNII(luigi.Task): - def requires(self): - return DownloadUNII() + def map(self, xml_file, value, output): + p = etree.XMLParser() + tree = etree.parse(open(xml_file), parser=p) + spl_id = tree.xpath("//ns:document/ns:id/@root", namespaces=self.NS)[0].lower() + spl_set_id = tree.xpath("//ns:document/ns:setId/@root", namespaces=self.NS)[ + 0 + ].lower() + version = tree.xpath( + "//ns:document/ns:versionNumber/@value", namespaces=self.NS + )[0] + output.add(spl_set_id, {"spl_id": spl_id, "version": version, "file": xml_file}) - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'unii/extracted/unii.csv')) + def reduce(self, key, values, output): + values.sort(key=lambda spl: int(spl["version"])) + output.put(key, values[-1]) + os.makedirs(name=self.output().path, exist_ok=True) + shutil.copy2(values[-1]["file"], self.output().path) - def run(self): - zip_filename = self.input().path - output_filename = self.output().path - output_dir = dirname(output_filename) - common.shell_cmd('mkdir -p %s' % output_dir) - cmd = 'unzip -o %(zip_filename)s \ - -d %(output_dir)s' % locals() - common.shell_cmd(cmd) + def output_format(self): + return NullOutput() - # UNII filename varies; find and rename to a standardized name. - # It is now a tab-delimited CSV instead of an XML as before. - for file in glob.glob(join(output_dir, 'UNII*Names*.txt')): - logging.info('Renaming %s', file) - os.rename(file, output_filename) + +class ExtractUNII(luigi.Task): + def requires(self): + return DownloadUNII() + + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "unii/extracted/unii.csv")) + + def run(self): + zip_filename = self.input().path + output_filename = self.output().path + output_dir = dirname(output_filename) + common.shell_cmd("mkdir -p %s" % output_dir) + cmd = ( + "unzip -o %(zip_filename)s \ + -d %(output_dir)s" + % locals() + ) + common.shell_cmd(cmd) + + # UNII filename varies; find and rename to a standardized name. 
+ # It is now a tab-delimited CSV instead of an XML as before. + for file in glob.glob(join(output_dir, "UNII*Names*.txt")): + logging.info("Renaming %s", file) + os.rename(file, output_filename) class RXNorm2JSONMapper(parallel.Mapper): - def map(self, key, value, output): - keepers = ['rxcui', 'rxstring', 'rxtty','setid', 'spl_version'] - rename_map = { - 'setid': 'spl_set_id' - } - def _cleaner(k, v): - k = k.lower() - if k in keepers: - if k in rename_map: - return (rename_map[k], v) - return (k, v) - new_value = common.transform_dict(value, _cleaner) - new_key = new_value['spl_set_id'] + ':'+ new_value['spl_version'] - output.add(new_key, new_value) + def map(self, key, value, output): + keepers = ["rxcui", "rxstring", "rxtty", "setid", "spl_version"] + rename_map = {"setid": "spl_set_id"} + + def _cleaner(k, v): + k = k.lower() + if k in keepers: + if k in rename_map: + return (rename_map[k], v) + return (k, v) + + new_value = common.transform_dict(value, _cleaner) + new_key = new_value["spl_set_id"] + ":" + new_value["spl_version"] + output.add(new_key, new_value) class RXNormReducer(parallel.Reducer): - def reduce(self, key, values, output): - out = {} - out['spl_set_id'] = values[0]['spl_set_id'] - out['spl_version'] = values[0]['spl_version'] - out['rxnorm'] = [] - for v in values: - out['rxnorm'].append({ 'rxcui' : v['rxcui'], - 'rxstring' : v['rxstring'], - 'rxtty' : v['rxtty'] }) - output.put(key, out) + def reduce(self, key, values, output): + out = {} + out["spl_set_id"] = values[0]["spl_set_id"] + out["spl_version"] = values[0]["spl_version"] + out["rxnorm"] = [] + for v in values: + out["rxnorm"].append( + {"rxcui": v["rxcui"], "rxstring": v["rxstring"], "rxtty": v["rxtty"]} + ) + output.put(key, out) class RXNorm2JSON(luigi.Task): - ''' The Rxnorm data needs to be rolled up to the SPL_SET_ID level, so first - we clean the data and group it with a ListReducer. - ''' - def requires(self): - return ExtractRXNorm() - - def output(self): - return luigi.LocalTarget(join(BASE_DIR, RXNORM_EXTRACT_DB)) - - def run(self): - parallel.mapreduce( - parallel.Collection.from_glob( - self.input().path, parallel.CSVDictLineInput(delimiter='|')), - mapper=RXNorm2JSONMapper(), - reducer=RXNormReducer(), - output_prefix=self.output().path, - num_shards=10) + """The Rxnorm data needs to be rolled up to the SPL_SET_ID level, so first + we clean the data and group it with a ListReducer. + """ + + def requires(self): + return ExtractRXNorm() + + def output(self): + return luigi.LocalTarget(join(BASE_DIR, RXNORM_EXTRACT_DB)) + + def run(self): + parallel.mapreduce( + parallel.Collection.from_glob( + self.input().path, parallel.CSVDictLineInput(delimiter="|") + ), + mapper=RXNorm2JSONMapper(), + reducer=RXNormReducer(), + output_prefix=self.output().path, + num_shards=10, + ) + # TODO(hansnelsen): refactor this task into a join reduction. Leaving it in # place for now since the logic is a bit harry. Ideally, we # go directly to leveldb, avoiding the JSON step. 
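# --- Illustrative sketch (editorial, not part of this patch or the pipeline) --
# RXNorm2JSONMapper above keys each rxnorm_mappings row by
# "spl_set_id:spl_version", and RXNormReducer rolls the grouped rows up into a
# single record carrying an "rxnorm" list. This standalone sketch shows the
# same roll-up on a couple of made-up rows; the rollup_rxnorm() helper and the
# sample values are hypothetical.
import collections


def rollup_rxnorm(rows):
    grouped = collections.defaultdict(list)
    for row in rows:
        grouped[(row["setid"], row["spl_version"])].append(row)

    rolled_up = {}
    for (set_id, version), group in grouped.items():
        rolled_up[set_id + ":" + version] = {
            "spl_set_id": set_id,
            "spl_version": version,
            "rxnorm": [
                {"rxcui": r["rxcui"], "rxstring": r["rxstring"], "rxtty": r["rxtty"]}
                for r in group
            ],
        }
    return rolled_up


if __name__ == "__main__":
    sample = [
        {"setid": "abc", "spl_version": "2", "rxcui": "1", "rxstring": "drug a", "rxtty": "SCD"},
        {"setid": "abc", "spl_version": "2", "rxcui": "2", "rxstring": "drug b", "rxtty": "SBD"},
    ]
    # Both rows collapse into one record keyed by "abc:2" with two rxnorm entries.
    print(rollup_rxnorm(sample))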
class UNIIHarmonizationJSON(luigi.Task): - def requires(self): - return [ExtractNDC(), NormalizePharmaClass(), ExtractUNII()] + def requires(self): + return [ExtractNDC(), NormalizePharmaClass(), ExtractUNII()] - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'unii_extract.json')) + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "unii_extract.json")) - def run(self): - ndc_file = self.input()[0].path - pharma_class_dir = self.input()[1].path - unii_file = self.input()[2].path - output_file = self.output().path - common.shell_cmd_quiet('mkdir -p %s' % dirname(self.output().path)) - unii_harmonization.harmonize_unii(output_file, ndc_file, unii_file, pharma_class_dir) + def run(self): + ndc_file = self.input()[0].path + pharma_class_dir = self.input()[1].path + unii_file = self.input()[2].path + output_file = self.output().path + common.shell_cmd_quiet("mkdir -p %s" % dirname(self.output().path)) + unii_harmonization.harmonize_unii( + output_file, ndc_file, unii_file, pharma_class_dir + ) class UNII2JSON(luigi.Task): - def requires(self): - return UNIIHarmonizationJSON() + def requires(self): + return UNIIHarmonizationJSON() - def output(self): - return luigi.LocalTarget(join(BASE_DIR, UNII_EXTRACT_DB)) + def output(self): + return luigi.LocalTarget(join(BASE_DIR, UNII_EXTRACT_DB)) - def run(self): - parallel.mapreduce( - parallel.Collection.from_glob( - self.input().path, parallel.JSONLineInput()), - mapper=parallel.IdentityMapper(), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, - num_shards=1) + def run(self): + parallel.mapreduce( + parallel.Collection.from_glob(self.input().path, parallel.JSONLineInput()), + mapper=parallel.IdentityMapper(), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + num_shards=1, + ) class NDC2JSONMapper(parallel.Mapper): - rename_map = { - 'PRODUCTID': 'id', - 'APPLICATIONNUMBER': 'application_number', - 'PRODUCTTYPENAME': 'product_type', - 'NONPROPRIETARYNAME': 'generic_name', - 'LABELERNAME': 'manufacturer_name', - 'PROPRIETARYNAME': 'brand_name', - 'PROPRIETARYNAMESUFFIX': 'brand_name_suffix', - 'PRODUCTNDC': 'product_ndc', - 'DOSAGEFORMNAME': 'dosage_form', - 'ROUTENAME': 'route', - 'SUBSTANCENAME': 'substance_name', - } - - def map(self, key, value, output): - def _cleaner(k, v): - ''' Helper function to rename keys and purge any keys that are not in - the map. - ''' - if k == 'PRODUCTID': - v = v.split('_')[1] - if k in self.rename_map: - return (self.rename_map[k], v) - - new_value = common.transform_dict(value, _cleaner) - output.add(key, new_value) + rename_map = { + "PRODUCTID": "id", + "APPLICATIONNUMBER": "application_number", + "PRODUCTTYPENAME": "product_type", + "NONPROPRIETARYNAME": "generic_name", + "LABELERNAME": "manufacturer_name", + "PROPRIETARYNAME": "brand_name", + "PROPRIETARYNAMESUFFIX": "brand_name_suffix", + "PRODUCTNDC": "product_ndc", + "DOSAGEFORMNAME": "dosage_form", + "ROUTENAME": "route", + "SUBSTANCENAME": "substance_name", + } + + def map(self, key, value, output): + def _cleaner(k, v): + """Helper function to rename keys and purge any keys that are not in + the map. + """ + if k == "PRODUCTID": + v = v.split("_")[1] + if k in self.rename_map: + return (self.rename_map[k], v) + + new_value = common.transform_dict(value, _cleaner) + output.add(key, new_value) # TODO(hansnelsen): Refactor the UNII steps to not need NDC File. Once done, # Everything can use this step. As it stands now, it is only # used by the JoinAll() task. 
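# --- Illustrative sketch (editorial, not part of this patch or the pipeline) --
# NDC2JSONMapper above keeps only the product.txt columns named in rename_map,
# renames them, and strips the prefix from PRODUCTID by splitting on "_". This
# standalone sketch applies the same cleaning to one made-up row using a subset
# of that rename_map and an inline helper in place of common.transform_dict;
# the sample values are hypothetical.
RENAME_MAP = {
    "PRODUCTID": "id",
    "PRODUCTNDC": "product_ndc",
    "PROPRIETARYNAME": "brand_name",
    "NONPROPRIETARYNAME": "generic_name",
}


def clean_ndc_row(row):
    cleaned = {}
    for key, value in row.items():
        if key == "PRODUCTID":
            # Keep only the portion after the "_" separator, as the mapper does.
            value = value.split("_")[1]
        if key in RENAME_MAP:
            cleaned[RENAME_MAP[key]] = value
    return cleaned


if __name__ == "__main__":
    sample_row = {
        "PRODUCTID": "1111-2222_0abc1234-5678-90de-f123-456789abcdef",
        "PRODUCTNDC": "1111-2222",
        "PROPRIETARYNAME": "Example Brand",
        "NONPROPRIETARYNAME": "examplamine",
        "STARTMARKETINGDATE": "20200101",  # not in the map, so it is dropped
    }
    print(clean_ndc_row(sample_row))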
class NDC2JSON(luigi.Task): - def requires(self): - return ExtractNDC() - - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'ndc', NDC_EXTRACT_DB)) - - def run(self): - import sys - import csv - maxInt = sys.maxsize - decrement = True - - while decrement: - # decrease the maxInt value by factor 10 - # as long as the OverflowError occurs. - - decrement = False - try: - csv.field_size_limit(maxInt) - except OverflowError: - maxInt = int(maxInt / 10) + def requires(self): + return ExtractNDC() + + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "ndc", NDC_EXTRACT_DB)) + + def run(self): + import sys + import csv + + maxInt = sys.maxsize decrement = True - parallel.mapreduce( - parallel.Collection.from_glob( - self.input().path, parallel.CSVDictLineInput(delimiter='\t')), - mapper=NDC2JSONMapper(), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, - num_shards=1) + + while decrement: + # decrease the maxInt value by factor 10 + # as long as the OverflowError occurs. + + decrement = False + try: + csv.field_size_limit(maxInt) + except OverflowError: + maxInt = int(maxInt / 10) + decrement = True + parallel.mapreduce( + parallel.Collection.from_glob( + self.input().path, parallel.CSVDictLineInput(delimiter="\t") + ), + mapper=NDC2JSONMapper(), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + num_shards=1, + ) class SPLSetIDMapper(parallel.Mapper): - ''' Creates an index that is used for both getting the latest version of the - SPL AND for mapping the id to set_id, so that there can be a single - reduction key (spl_set_id). The same SPL raw file is indexed twice, with - a different prefix and a different key, because different downstream - pipelines use different keys. We want to give all pipelines a way to get - an SPL_SET_ID. - ''' - def __init__(self, index_db): - parallel.Mapper.__init__(self) - self.index_db = index_db - - def map(self, key, value, output): - set_id, version, _id = value.get('set_id'), value.get('version'), value.get('id') - - if set_id is None: - logging.warning('SPLSetIDMapper encountered a blank SPL Set ID!') - else: - _index = { - '_version': version, - '_id': _id, - '_set_id': set_id, - '_key': key - } - - # Limiting the output to the known universe of SPL IDs that exist in the - # main join file (NDC). - if _id in self.index_db: - output.add('_set_id:' + set_id, (version, _index)) - output.add('_id:' + _id, (version, _index)) + """Creates an index that is used for both getting the latest version of the + SPL AND for mapping the id to set_id, so that there can be a single + reduction key (spl_set_id). The same SPL raw file is indexed twice, with + a different prefix and a different key, because different downstream + pipelines use different keys. We want to give all pipelines a way to get + an SPL_SET_ID. + """ + + def __init__(self, index_db): + parallel.Mapper.__init__(self) + self.index_db = index_db + + def map(self, key, value, output): + set_id, version, _id = ( + value.get("set_id"), + value.get("version"), + value.get("id"), + ) + + if set_id is None: + logging.warning("SPLSetIDMapper encountered a blank SPL Set ID!") + else: + _index = {"_version": version, "_id": _id, "_set_id": set_id, "_key": key} + + # Limiting the output to the known universe of SPL IDs that exist in the + # main join file (NDC). 
+ if _id in self.index_db: + output.add("_set_id:" + set_id, (version, _index)) + output.add("_id:" + _id, (version, _index)) class SPLSetIDIndex(luigi.Task): - ''' Creates an index used for SPL version resolution and ID/SET_ID mapping. - This task has a cross-dependency upon the SPL pipeline. It returns a list - of every json.db every made by the SPL pipeline. - ''' - def requires(self): - from openfda.spl.pipeline import SPL2JSON - return [SPL2JSON(), NDC2JSON()] - - def output(self): - return luigi.LocalTarget(SPL_SET_ID_INDEX) - - def run(self): - ndc_spl_id_index = {} - ndc_db = self.input()[1].path - logging.info('Joining data from NDC DB: %s', ndc_db) - db = parallel.ShardedDB.open(ndc_db) - db_iter = db.range_iter(None, None) - - # We want each SPL ID that is in the NDC file so that we always use the - # same SPL file for both ID and SET_ID based joins. - for (key, val) in db_iter: - ndc_spl_id_index[val['id']] = True - - - parallel.mapreduce( - parallel.Collection.from_sharded(self.input()[0].path), - mapper=SPLSetIDMapper(index_db=ndc_spl_id_index), - reducer=parallel.ListReducer(), - output_prefix=self.output().path, - num_shards=16) + """Creates an index used for SPL version resolution and ID/SET_ID mapping. + This task has a cross-dependency upon the SPL pipeline. It returns a list + of every json.db every made by the SPL pipeline. + """ + + def requires(self): + from openfda.spl.pipeline import SPL2JSON + + return [SPL2JSON(), NDC2JSON()] + + def output(self): + return luigi.LocalTarget(SPL_SET_ID_INDEX) + + def run(self): + ndc_spl_id_index = {} + ndc_db = self.input()[1].path + logging.info("Joining data from NDC DB: %s", ndc_db) + db = parallel.ShardedDB.open(ndc_db) + db_iter = db.range_iter(None, None) + + # We want each SPL ID that is in the NDC file so that we always use the + # same SPL file for both ID and SET_ID based joins. 
+ for key, val in db_iter: + ndc_spl_id_index[val["id"]] = True + + parallel.mapreduce( + parallel.Collection.from_sharded(self.input()[0].path), + mapper=SPLSetIDMapper(index_db=ndc_spl_id_index), + reducer=parallel.ListReducer(), + output_prefix=self.output().path, + num_shards=16, + ) class CurrentSPLMapper(parallel.Mapper): - def _extract(self, filename): - ''' Moving this code from the `spl_harmonization` file, since it is the + def _extract(self, filename): + """Moving this code from the `spl_harmonization` file, since it is the only part of that file that is needed now that we have converted to a map reduction.` - ''' - try: - tree = extract.parse_xml(filename) - harmonized = {} - harmonized['spl_set_id'] = spl.extract.extract_set_id(tree) - harmonized['id'] = spl.extract.extract_id(tree) - harmonized['spl_version'] = spl.extract.extract_version_number(tree) - harmonized['is_original_packager'] = \ - spl.extract.is_original_packager(tree) - harmonized['spl_product_ndc'] = spl.extract.extract_product_ndcs(tree) - harmonized['original_packager_product_ndc'] = \ - spl.extract.extract_original_packager_product_ndcs(tree) - harmonized['package_ndc'] = spl.extract.extract_package_ndcs(tree) - return harmonized - except: - logging.warning('ERROR processing SPL data: %s', filename) - traceback.print_exc() - return None - - def map(self, key, value, output): - if '_set_id:' in key: - version, _index = sorted(value)[-1] - xml_file = _index['_key'] - new_value = self._extract(xml_file) - if new_value is not None: - new_key = key.replace('_set_id:', '') - output.add(new_key, new_value) + """ + try: + tree = extract.parse_xml(filename) + harmonized = {} + harmonized["spl_set_id"] = spl.extract.extract_set_id(tree) + harmonized["id"] = spl.extract.extract_id(tree) + harmonized["spl_version"] = spl.extract.extract_version_number(tree) + harmonized["is_original_packager"] = spl.extract.is_original_packager(tree) + harmonized["spl_product_ndc"] = spl.extract.extract_product_ndcs(tree) + harmonized["original_packager_product_ndc"] = ( + spl.extract.extract_original_packager_product_ndcs(tree) + ) + harmonized["package_ndc"] = spl.extract.extract_package_ndcs(tree) + return harmonized + except: + logging.warning("ERROR processing SPL data: %s", filename) + traceback.print_exc() + return None + + def map(self, key, value, output): + if "_set_id:" in key: + version, _index = sorted(value)[-1] + xml_file = _index["_key"] + new_value = self._extract(xml_file) + if new_value is not None: + new_key = key.replace("_set_id:", "") + output.add(new_key, new_value) class GenerateCurrentSPLJSON(luigi.Task): - ''' All SPL files on S3 (IDs), we only want the most recent versions. This - task generates a spl_extract.db that is only the current version. - ''' - def requires(self): - return SPLSetIDIndex() + """All SPL files on S3 (IDs), we only want the most recent versions. This + task generates a spl_extract.db that is only the current version. 
+ """ + + def requires(self): + return SPLSetIDIndex() - def output(self): - return luigi.LocalTarget(join(BASE_DIR, SPL_EXTRACT_DB)) + def output(self): + return luigi.LocalTarget(join(BASE_DIR, SPL_EXTRACT_DB)) - def run(self): - parallel.mapreduce( - parallel.Collection.from_sharded(self.input().path), - mapper=CurrentSPLMapper(), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path) + def run(self): + parallel.mapreduce( + parallel.Collection.from_sharded(self.input().path), + mapper=CurrentSPLMapper(), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + ) class ExtractUPCFromSPL(luigi.Task): - ''' A task that will generate otc-bars.xml for ANY SPL IDs that does not have - one yet. There is a simple output file to represent that this process has - run for this batch cycle. - ''' - def requires(self): - # Just need a task that forces an S3 sync, which GenerateCurrentSPLJSON does - return GenerateCurrentSPLJSON() - - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'barcode_update.done')) - - def run(self): - for filename in glob.glob(SPL_S3_DIR + '/*/*.xml'): - src_dir = dirname(filename) - barcode_target = join(src_dir, 'barcodes') - xml_out = join(barcode_target, 'otc-bars.xml') - json_out = xml_out.replace('.xml', '.json') - - if not os.path.exists(xml_out): - common.shell_cmd_quiet('mkdir -p %s', barcode_target) - # logging.info('Zbarimg on directory %s', src_dir) - cmd = 'find %(src_dir)s -name "*.jpg" -size +0\ + """A task that will generate otc-bars.xml for ANY SPL IDs that does not have + one yet. There is a simple output file to represent that this process has + run for this batch cycle. + """ + + def requires(self): + # Just need a task that forces an S3 sync, which GenerateCurrentSPLJSON does + return GenerateCurrentSPLJSON() + + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "barcode_update.done")) + + def run(self): + for filename in glob.glob(SPL_S3_DIR + "/*/*.xml"): + src_dir = dirname(filename) + barcode_target = join(src_dir, "barcodes") + xml_out = join(barcode_target, "otc-bars.xml") + json_out = xml_out.replace(".xml", ".json") + + if not os.path.exists(xml_out): + common.shell_cmd_quiet("mkdir -p %s", barcode_target) + # logging.info('Zbarimg on directory %s', src_dir) + cmd = ( + 'find %(src_dir)s -name "*.jpg" -size +0\ -exec zbarimg -q --xml {} \; > \ - %(xml_out)s' % locals() - common.shell_cmd_quiet(cmd) + %(xml_out)s' + % locals() + ) + common.shell_cmd_quiet(cmd) - if common.is_older(json_out, xml_out): - # logging.info('%s does not exist, producing...', json_out) - process_barcodes.XML2JSON(xml_out) + if common.is_older(json_out, xml_out): + # logging.info('%s does not exist, producing...', json_out) + process_barcodes.XML2JSON(xml_out) - common.shell_cmd_quiet('touch %s', self.output().path) + common.shell_cmd_quiet("touch %s", self.output().path) class UpcMapper(parallel.Mapper): - def __init__(self, spl_s3_dir): - parallel.Mapper.__init__(self) - self.spl_s3_dir = spl_s3_dir - - def map(self, key, value, output): - if value is None: return - _id = value['id'] - # otc-bars.json contains new-line delimited JSON strings. we output each - # line along with it's id. 
- upc_json = join(self.spl_s3_dir, _id, 'barcodes/otc-bars.json') - if os.path.exists(upc_json): - upcs = open(upc_json, 'r') - upcList = [] - for upc in upcs: - upcList.append(json.loads(upc)) - output.add(_id, upcList) + def __init__(self, spl_s3_dir): + parallel.Mapper.__init__(self) + self.spl_s3_dir = spl_s3_dir + + def map(self, key, value, output): + if value is None: + return + _id = value["id"] + # otc-bars.json contains new-line delimited JSON strings. we output each + # line along with it's id. + upc_json = join(self.spl_s3_dir, _id, "barcodes/otc-bars.json") + if os.path.exists(upc_json): + upcs = open(upc_json, "r") + upcList = [] + for upc in upcs: + upcList.append(json.loads(upc)) + output.add(_id, upcList) class UpcXml2JSON(luigi.Task): - def requires(self): - return [ExtractUPCFromSPL(), GenerateCurrentSPLJSON()] + def requires(self): + return [ExtractUPCFromSPL(), GenerateCurrentSPLJSON()] + + def output(self): + return luigi.LocalTarget(join(BASE_DIR, UPC_EXTRACT_DB)) - def output(self): - return luigi.LocalTarget(join(BASE_DIR, UPC_EXTRACT_DB)) + def run(self): + parallel.mapreduce( + parallel.Collection.from_sharded(self.input()[1].path), + mapper=UpcMapper(spl_s3_dir=SPL_S3_DIR), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + ) - def run(self): - parallel.mapreduce( - parallel.Collection.from_sharded(self.input()[1].path), - mapper=UpcMapper(spl_s3_dir=SPL_S3_DIR), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path) # The index key has a prefix to avoid collision, we need to data structure # that tells us which prefix maps to which database and which value to pull @@ -615,145 +675,149 @@ def run(self): # We then concatenate these to find the index entry. # Which then results in a spl_set_id, which is our new output key. ID_PREFIX_DB_MAP = { - SPL_EXTRACT_DB: ('_set_id', 'spl_set_id'), - NDC_EXTRACT_DB: ('_id', 'id'), - UNII_EXTRACT_DB: ('_id', 'spl_id'), - RXNORM_EXTRACT_DB: ('_set_id', 'spl_set_id'), - UPC_EXTRACT_DB: ('_id', 'id') + SPL_EXTRACT_DB: ("_set_id", "spl_set_id"), + NDC_EXTRACT_DB: ("_id", "id"), + UNII_EXTRACT_DB: ("_id", "spl_id"), + RXNORM_EXTRACT_DB: ("_set_id", "spl_set_id"), + UPC_EXTRACT_DB: ("_id", "id"), } class JoinAllMapper(parallel.Mapper): - def __init__(self, index_db): - parallel.Mapper.__init__(self) - self.index_db = index_db - - def map_shard(self, map_input, map_output): - self.filename = map_input.filename - return parallel.Mapper.map_shard(self, map_input, map_output) - - def get_set_id(self, lookup_value): - ''' Helper function to pull the set_id from the first entry in the index. 
- ''' - if lookup_value in self.index_db: - index = self.index_db[lookup_value] - set_id = index['_set_id'] - return set_id - else: - return None - - def map(self, key, value, output): - db_name = basename(dirname(self.filename)) - prefix, lookup_key = ID_PREFIX_DB_MAP[db_name] - if not value: - #logging.warning('Bad value for map input: %s, %s', db_name, key) - return - if not isinstance(value, list): value = [value] - for val in value: - lookup_value = prefix + ':' + val[lookup_key] - set_id = self.get_set_id(lookup_value) - if set_id: - output.add(set_id, (db_name, val)) - #else: - # logging.warning('Missing set id for %s', lookup_value) + def __init__(self, index_db): + parallel.Mapper.__init__(self) + self.index_db = index_db + + def map_shard(self, map_input, map_output): + self.filename = map_input.filename + return parallel.Mapper.map_shard(self, map_input, map_output) + + def get_set_id(self, lookup_value): + """Helper function to pull the set_id from the first entry in the index.""" + if lookup_value in self.index_db: + index = self.index_db[lookup_value] + set_id = index["_set_id"] + return set_id + else: + return None + + def map(self, key, value, output): + db_name = basename(dirname(self.filename)) + prefix, lookup_key = ID_PREFIX_DB_MAP[db_name] + if not value: + # logging.warning('Bad value for map input: %s, %s', db_name, key) + return + if not isinstance(value, list): + value = [value] + for val in value: + lookup_value = prefix + ":" + val[lookup_key] + set_id = self.get_set_id(lookup_value) + if set_id: + output.add(set_id, (db_name, val)) + # else: + # logging.warning('Missing set id for %s', lookup_value) class JoinAllReducer(parallel.Reducer): - ''' A custom joiner for combining all of the data sources into a single - unique record. - ''' - def _join(self, values): - # The mapper output values: (database_file, dictionary value) - # Each database_file lines up with an instance variable so that we can - # grab the appropriate data for the appropriate join. - intermediate = collections.defaultdict(list) - for row in values: - db_name, val = row - intermediate[db_name].append(val) - - result = [] - for ndc_row in intermediate[NDC_EXTRACT_DB]: - for spl_row in intermediate[SPL_EXTRACT_DB]: - result.append(dict(list(ndc_row.items()) + list(spl_row.items()))) - - final_result = [] - - for row in result: - final = dict(row) - - # If there is unii_indexing, there is only one record, so we make sure - # something is there and grab the first one. - final['unii_indexing'] = {} - unii_data = intermediate.get(UNII_EXTRACT_DB, None) - if unii_data: - final['unii_indexing'] = unii_data[0]['unii_indexing'] - - final['rxnorm'] = [] - for row in intermediate.get(RXNORM_EXTRACT_DB, []): - for data in row['rxnorm']: - final['rxnorm'].append(data) - - # TODO(hansnelsen): clean this up, it saves either a string or an empty - # list. It is bad, but it will require a refactor of - # annotation steps in each pipeline, so leaving it for - # now. - # If upc data exists, there is only one, so we make sure it is there and - # then grab the first one. - final['upc'] = [] - upc_data = intermediate.get(UPC_EXTRACT_DB, None) - if upc_data: - final['upc'] = [upc['upc'] for upc in upc_data] - - final_result.append(final) - - return final_result - - def reduce(self, key, values, output): - # we're writing to a JSONLine output, which ignores - # out key field and simply writes one value per line. 
- for row in self._join(values): - output.put(key, row) - #else: - # logging.warning('No data for key: %s', key) + """A custom joiner for combining all of the data sources into a single + unique record. + """ + + def _join(self, values): + # The mapper output values: (database_file, dictionary value) + # Each database_file lines up with an instance variable so that we can + # grab the appropriate data for the appropriate join. + intermediate = collections.defaultdict(list) + for row in values: + db_name, val = row + intermediate[db_name].append(val) + + result = [] + for ndc_row in intermediate[NDC_EXTRACT_DB]: + for spl_row in intermediate[SPL_EXTRACT_DB]: + result.append(dict(list(ndc_row.items()) + list(spl_row.items()))) + + final_result = [] + + for row in result: + final = dict(row) + + # If there is unii_indexing, there is only one record, so we make sure + # something is there and grab the first one. + final["unii_indexing"] = {} + unii_data = intermediate.get(UNII_EXTRACT_DB, None) + if unii_data: + final["unii_indexing"] = unii_data[0]["unii_indexing"] + + final["rxnorm"] = [] + for row in intermediate.get(RXNORM_EXTRACT_DB, []): + for data in row["rxnorm"]: + final["rxnorm"].append(data) + + # TODO(hansnelsen): clean this up, it saves either a string or an empty + # list. It is bad, but it will require a refactor of + # annotation steps in each pipeline, so leaving it for + # now. + # If upc data exists, there is only one, so we make sure it is there and + # then grab the first one. + final["upc"] = [] + upc_data = intermediate.get(UPC_EXTRACT_DB, None) + if upc_data: + final["upc"] = [upc["upc"] for upc in upc_data] + + final_result.append(final) + + return final_result + + def reduce(self, key, values, output): + # we're writing to a JSONLine output, which ignores + # out key field and simply writes one value per line. + for row in self._join(values): + output.put(key, row) + # else: + # logging.warning('No data for key: %s', key) class CombineHarmonization(DependencyTriggeredTask): - def requires(self): - return {'ndc': NDC2JSON(), - 'spl': GenerateCurrentSPLJSON(), - 'unii': UNII2JSON(), - 'rxnorm': RXNorm2JSON(), - 'upc': UpcXml2JSON(), - 'setid': SPLSetIDIndex() - } - - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'harmonized.json')) - - def run(self): - index_db = {} - setid_db = self.input()['setid'].path - logging.info('Joining data from setid DB: %s', setid_db) - db = parallel.ShardedDB.open(setid_db) - db_iter = db.range_iter(None, None) - - for (key, val) in db_iter: - # Only need one entry from the index, pruning here to simplify mapper - # The index is also used to find the latest version of SPL, so it is a bit - # clunky to use in this particular case. 
- index_db[key] = val[0][1] - - db_list = [ - self.input()[_db].path for _db in ['ndc', 'spl', 'unii', 'rxnorm', 'upc'] - ] - - logging.info('DB %s', db_list) - parallel.mapreduce( - parallel.Collection.from_sharded_list(db_list), - mapper=JoinAllMapper(index_db=index_db), - reducer=JoinAllReducer(), - output_prefix=self.output().path, - output_format=parallel.JSONLineOutput()) - -if __name__ == '__main__': - luigi.run() + def requires(self): + return { + "ndc": NDC2JSON(), + "spl": GenerateCurrentSPLJSON(), + "unii": UNII2JSON(), + "rxnorm": RXNorm2JSON(), + "upc": UpcXml2JSON(), + "setid": SPLSetIDIndex(), + } + + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "harmonized.json")) + + def run(self): + index_db = {} + setid_db = self.input()["setid"].path + logging.info("Joining data from setid DB: %s", setid_db) + db = parallel.ShardedDB.open(setid_db) + db_iter = db.range_iter(None, None) + + for key, val in db_iter: + # Only need one entry from the index, pruning here to simplify mapper + # The index is also used to find the latest version of SPL, so it is a bit + # clunky to use in this particular case. + index_db[key] = val[0][1] + + db_list = [ + self.input()[_db].path for _db in ["ndc", "spl", "unii", "rxnorm", "upc"] + ] + + logging.info("DB %s", db_list) + parallel.mapreduce( + parallel.Collection.from_sharded_list(db_list), + mapper=JoinAllMapper(index_db=index_db), + reducer=JoinAllReducer(), + output_prefix=self.output().path, + output_format=parallel.JSONLineOutput(), + ) + + +if __name__ == "__main__": + luigi.run() diff --git a/openfda/annotation_table/tests/extract_unii_test.py b/openfda/annotation_table/tests/extract_unii_test.py index 4438bee..03c464c 100644 --- a/openfda/annotation_table/tests/extract_unii_test.py +++ b/openfda/annotation_table/tests/extract_unii_test.py @@ -6,52 +6,60 @@ from openfda.annotation_table import extract_unii as extract from openfda.spl import extract as extract_spl + class UNIIExtractMethoxsalenUnitTest(unittest.TestCase): - 'UNII Extract Unit Test' - - def setUp(self): - self.tree = extract_spl.parse_xml(os.path.dirname(__file__) + - '/data/METHOXSALEN.xml') - - def test_extract_setid(self): - expected_setid = 'ea1a6225-0eb9-4743-9601-23d5795936a3' - extracted_setid = extract.extract_set_id(self.tree) - self.assertEqual(expected_setid, extracted_setid, extracted_setid) - - def test_extract_unii(self): - expected_unii = 'U4VJ29L7BQ' - extracted_unii = extract.extract_unii(self.tree) - self.assertEqual(expected_unii, extracted_unii, extracted_unii) - - def test_extract_unii_name(self): - expected_unii_name = 'METHOXSALEN' - extracted_unii_name = extract.extract_unii_name(self.tree) - self.assertEqual(expected_unii_name, - extracted_unii_name, - extracted_unii_name) - - def test_extract_unii_other_code(self): - expected_unii_other_code = ['N0000010217', - 'N0000175984', - 'N0000009801', - 'N0000175879', - 'N0000007909'] - extracted_unii_other_code = extract.extract_unii_other_code(self.tree) - self.assertListEqual(expected_unii_other_code, - extracted_unii_other_code, - extracted_unii_other_code) - - def test_extract_unii_other_name(self): - expected_unii_other_name = ['Photoabsorption [MoA]', - 'Photoactivated Radical Generator [EPC]', - 'Photosensitizing Activity [PE]', - 'Psoralen [EPC]', - 'Psoralens [Chemical/Ingredient]'] - extracted_unii_other_name = extract.extract_unii_other_name(self.tree) - self.assertListEqual(expected_unii_other_name, - extracted_unii_other_name, - extracted_unii_other_name) - -if __name__ == 
'__main__': - unittest.main() + "UNII Extract Unit Test" + + def setUp(self): + self.tree = extract_spl.parse_xml( + os.path.dirname(__file__) + "/data/METHOXSALEN.xml" + ) + + def test_extract_setid(self): + expected_setid = "ea1a6225-0eb9-4743-9601-23d5795936a3" + extracted_setid = extract.extract_set_id(self.tree) + self.assertEqual(expected_setid, extracted_setid, extracted_setid) + + def test_extract_unii(self): + expected_unii = "U4VJ29L7BQ" + extracted_unii = extract.extract_unii(self.tree) + self.assertEqual(expected_unii, extracted_unii, extracted_unii) + + def test_extract_unii_name(self): + expected_unii_name = "METHOXSALEN" + extracted_unii_name = extract.extract_unii_name(self.tree) + self.assertEqual(expected_unii_name, extracted_unii_name, extracted_unii_name) + + def test_extract_unii_other_code(self): + expected_unii_other_code = [ + "N0000010217", + "N0000175984", + "N0000009801", + "N0000175879", + "N0000007909", + ] + extracted_unii_other_code = extract.extract_unii_other_code(self.tree) + self.assertListEqual( + expected_unii_other_code, + extracted_unii_other_code, + extracted_unii_other_code, + ) + + def test_extract_unii_other_name(self): + expected_unii_other_name = [ + "Photoabsorption [MoA]", + "Photoactivated Radical Generator [EPC]", + "Photosensitizing Activity [PE]", + "Psoralen [EPC]", + "Psoralens [Chemical/Ingredient]", + ] + extracted_unii_other_name = extract.extract_unii_other_name(self.tree) + self.assertListEqual( + expected_unii_other_name, + extracted_unii_other_name, + extracted_unii_other_name, + ) + +if __name__ == "__main__": + unittest.main() diff --git a/openfda/annotation_table/tests/pipeline_test.py b/openfda/annotation_table/tests/pipeline_test.py index 49fa6b4..b3514cd 100644 --- a/openfda/annotation_table/tests/pipeline_test.py +++ b/openfda/annotation_table/tests/pipeline_test.py @@ -15,92 +15,109 @@ class AnnotationPipelineTests(unittest.TestCase): - def setUp(self): - self.test_dir = tempfile.mkdtemp() - openfda.annotation_table.pipeline.data_dir = os.path.join(self.test_dir, openfda.annotation_table.pipeline.data_dir) - openfda.annotation_table.pipeline.BASE_DIR = os.path.join(self.test_dir, openfda.annotation_table.pipeline.BASE_DIR) - openfda.annotation_table.pipeline.SPL_S3_DIR = os.path.join(self.test_dir, - openfda.annotation_table.pipeline.SPL_S3_DIR) - openfda.annotation_table.pipeline.TMP_DIR = os.path.join(self.test_dir, - openfda.annotation_table.pipeline.TMP_DIR) - openfda.annotation_table.pipeline.SPL_SET_ID_INDEX = join(openfda.annotation_table.pipeline.BASE_DIR, - 'spl_index.db') - - openfda.spl.pipeline.SPL_S3_LOCAL_DIR = os.path.join(self.test_dir, - openfda.spl.pipeline.SPL_S3_LOCAL_DIR) - openfda.spl.pipeline.SPL_INDEX_DIR = os.path.join(self.test_dir, - openfda.spl.pipeline.SPL_INDEX_DIR) - - common.shell_cmd('mkdir -p %s', openfda.annotation_table.pipeline.data_dir) - common.shell_cmd('mkdir -p %s', openfda.annotation_table.pipeline.BASE_DIR) - common.shell_cmd('mkdir -p %s', openfda.annotation_table.pipeline.TMP_DIR) - - common.shell_cmd('mkdir -p %s', openfda.spl.pipeline.SPL_S3_LOCAL_DIR) - common.shell_cmd('mkdir -p %s', openfda.spl.pipeline.SPL_INDEX_DIR) - - - def tearDown(self): - shutil.rmtree(self.test_dir) - - def test_multiple_barcodes_handling(self): - common.shell_cmd('cp -rf %s/* %s', os.path.join(RUN_DIR, - 'data/multi_upc_test/annotation'), - openfda.annotation_table.pipeline.BASE_DIR) - common.shell_cmd('cp -rf %s/* %s', os.path.join(RUN_DIR, - 'data/multi_upc_test/spl'), - 
(dirname(openfda.spl.pipeline.SPL_S3_LOCAL_DIR))) - - RXNorm2JSON().run() - UNIIHarmonizationJSON().run() - UNII2JSON().run() - NDC2JSON().run() - - splToIndex = DetermineSPLToIndex() - splToIndex.run() - - spl2json = SPL2JSON() - spl2json.spl_path = openfda.spl.pipeline.SPL_S3_LOCAL_DIR - spl2json.run() - - SPLSetIDIndex().run() - GenerateCurrentSPLJSON().run() - UpcXml2JSON().run() - CombineHarmonization().run() - annotate_json = AnnotateJSON() - annotate_json.run() - - # Make sure there are now 3, not 1, UPCs for this SPL ID stored in LevelDB. - db = parallel.ShardedDB.open( - join(openfda.annotation_table.pipeline.BASE_DIR, openfda.annotation_table.pipeline.UPC_EXTRACT_DB)) - db_iter = db.range_iter(None, None) - (k, v) = next(db_iter) - ok_(isinstance(v, list)) - eq_(k, '64f8040f-938d-4236-8e22-c838c9b5f8da') - eq_(len(v), 3) - sorted(v, key=lambda upc: upc['upc']) - eq_(v[0]['upc'], '0300694200305') - eq_(v[1]['upc'], '0300694210304') - eq_(v[2]['upc'], '0300694220303') - - # Make sure harmonized.json also contains 3 UPCs. - harmonized = open(join(openfda.annotation_table.pipeline.BASE_DIR, 'harmonized.json'), 'r') - for jsonLine in harmonized: - obj = simplejson.loads(jsonLine) - eq_(obj['id'], '64f8040f-938d-4236-8e22-c838c9b5f8da') - eq_(obj['upc'], ['0300694200305', '0300694210304', '0300694220303']) - - # Make sure the drug label JSON got openFDA section showing all 3 UPCs. - db = parallel.ShardedDB.open(annotate_json.output().path) - db_iter = db.range_iter(None, None) - (k, v) = next(db_iter) - v['openfda']['upc'].sort() - eq_(v['openfda']['upc'], ['0300694200305', '0300694210304', '0300694220303']) - + def setUp(self): + self.test_dir = tempfile.mkdtemp() + openfda.annotation_table.pipeline.data_dir = os.path.join( + self.test_dir, openfda.annotation_table.pipeline.data_dir + ) + openfda.annotation_table.pipeline.BASE_DIR = os.path.join( + self.test_dir, openfda.annotation_table.pipeline.BASE_DIR + ) + openfda.annotation_table.pipeline.SPL_S3_DIR = os.path.join( + self.test_dir, openfda.annotation_table.pipeline.SPL_S3_DIR + ) + openfda.annotation_table.pipeline.TMP_DIR = os.path.join( + self.test_dir, openfda.annotation_table.pipeline.TMP_DIR + ) + openfda.annotation_table.pipeline.SPL_SET_ID_INDEX = join( + openfda.annotation_table.pipeline.BASE_DIR, "spl_index.db" + ) + + openfda.spl.pipeline.SPL_S3_LOCAL_DIR = os.path.join( + self.test_dir, openfda.spl.pipeline.SPL_S3_LOCAL_DIR + ) + openfda.spl.pipeline.SPL_INDEX_DIR = os.path.join( + self.test_dir, openfda.spl.pipeline.SPL_INDEX_DIR + ) + + common.shell_cmd("mkdir -p %s", openfda.annotation_table.pipeline.data_dir) + common.shell_cmd("mkdir -p %s", openfda.annotation_table.pipeline.BASE_DIR) + common.shell_cmd("mkdir -p %s", openfda.annotation_table.pipeline.TMP_DIR) + + common.shell_cmd("mkdir -p %s", openfda.spl.pipeline.SPL_S3_LOCAL_DIR) + common.shell_cmd("mkdir -p %s", openfda.spl.pipeline.SPL_INDEX_DIR) + + def tearDown(self): + shutil.rmtree(self.test_dir) + + def test_multiple_barcodes_handling(self): + common.shell_cmd( + "cp -rf %s/* %s", + os.path.join(RUN_DIR, "data/multi_upc_test/annotation"), + openfda.annotation_table.pipeline.BASE_DIR, + ) + common.shell_cmd( + "cp -rf %s/* %s", + os.path.join(RUN_DIR, "data/multi_upc_test/spl"), + (dirname(openfda.spl.pipeline.SPL_S3_LOCAL_DIR)), + ) + + RXNorm2JSON().run() + UNIIHarmonizationJSON().run() + UNII2JSON().run() + NDC2JSON().run() + + splToIndex = DetermineSPLToIndex() + splToIndex.run() + + spl2json = SPL2JSON() + spl2json.spl_path = 
openfda.spl.pipeline.SPL_S3_LOCAL_DIR + spl2json.run() + + SPLSetIDIndex().run() + GenerateCurrentSPLJSON().run() + UpcXml2JSON().run() + CombineHarmonization().run() + annotate_json = AnnotateJSON() + annotate_json.run() + + # Make sure there are now 3, not 1, UPCs for this SPL ID stored in LevelDB. + db = parallel.ShardedDB.open( + join( + openfda.annotation_table.pipeline.BASE_DIR, + openfda.annotation_table.pipeline.UPC_EXTRACT_DB, + ) + ) + db_iter = db.range_iter(None, None) + (k, v) = next(db_iter) + ok_(isinstance(v, list)) + eq_(k, "64f8040f-938d-4236-8e22-c838c9b5f8da") + eq_(len(v), 3) + sorted(v, key=lambda upc: upc["upc"]) + eq_(v[0]["upc"], "0300694200305") + eq_(v[1]["upc"], "0300694210304") + eq_(v[2]["upc"], "0300694220303") + + # Make sure harmonized.json also contains 3 UPCs. + harmonized = open( + join(openfda.annotation_table.pipeline.BASE_DIR, "harmonized.json"), "r" + ) + for jsonLine in harmonized: + obj = simplejson.loads(jsonLine) + eq_(obj["id"], "64f8040f-938d-4236-8e22-c838c9b5f8da") + eq_(obj["upc"], ["0300694200305", "0300694210304", "0300694220303"]) + + # Make sure the drug label JSON got openFDA section showing all 3 UPCs. + db = parallel.ShardedDB.open(annotate_json.output().path) + db_iter = db.range_iter(None, None) + (k, v) = next(db_iter) + v["openfda"]["upc"].sort() + eq_(v["openfda"]["upc"], ["0300694200305", "0300694210304", "0300694220303"]) def main(argv): - unittest.main(argv=argv) + unittest.main(argv=argv) -if __name__ == '__main__': - unittest.main() +if __name__ == "__main__": + unittest.main() diff --git a/openfda/annotation_table/unii_harmonization.py b/openfda/annotation_table/unii_harmonization.py index f534be8..b0a0308 100644 --- a/openfda/annotation_table/unii_harmonization.py +++ b/openfda/annotation_table/unii_harmonization.py @@ -21,106 +21,110 @@ def parallel_extract(files, worker): - manager = multiprocessing.Manager() - name_queue = manager.Queue() - pool = multiprocessing.Pool() - pool.map_async(worker, [(f, name_queue) for f in files]) - pool.close() - pool.join() + manager = multiprocessing.Manager() + name_queue = manager.Queue() + pool = multiprocessing.Pool() + pool.map_async(worker, [(f, name_queue) for f in files]) + pool.close() + pool.join() - rows = {} - while not name_queue.empty(): - harm_row = name_queue.get() - rows[harm_row["name"].lower()] = harm_row - return rows + rows = {} + while not name_queue.empty(): + harm_row = name_queue.get() + rows[harm_row["name"].lower()] = harm_row + return rows def harmonization_extract_worker(args): - filename = args[0] - name_queue = args[1] - try: - tree = extract_unii.parse_xml(filename) - harmonized = {} - - harmonized['unii'] = extract_unii.extract_unii(tree) - harmonized['set_id'] = extract_unii.extract_set_id(tree) - harmonized['name'] = extract_unii.extract_unii_name(tree) - #zipping together two arrays, since they came from the same xpath locations - #these are the NUI codes and their respective names - #we might be able to get the names from somewhere else and avoid the zip - intermediate = list(zip(extract_unii.extract_unii_other_code(tree), - extract_unii.extract_unii_other_name(tree))) - # print intermediate - header = ['number', 'name'] - harmonized['va'] = [dict(list(zip(header, s))) for s in intermediate] - name_queue.put(harmonized) - except Exception as inst: - print(filename + 'has a problem') - print(inst) + filename = args[0] + name_queue = args[1] + try: + tree = extract_unii.parse_xml(filename) + harmonized = {} + + harmonized["unii"] = 
extract_unii.extract_unii(tree) + harmonized["set_id"] = extract_unii.extract_set_id(tree) + harmonized["name"] = extract_unii.extract_unii_name(tree) + # zipping together two arrays, since they came from the same xpath locations + # these are the NUI codes and their respective names + # we might be able to get the names from somewhere else and avoid the zip + intermediate = list( + zip( + extract_unii.extract_unii_other_code(tree), + extract_unii.extract_unii_other_name(tree), + ) + ) + # print intermediate + header = ["number", "name"] + harmonized["va"] = [dict(list(zip(header, s))) for s in intermediate] + name_queue.put(harmonized) + except Exception as inst: + print(filename + "has a problem") + print(inst) def harmonize_unii(out_file, product_file, unii_file, class_index_dir): - out = open(out_file, 'w') - meta_file = csv.DictReader(open(product_file, 'rt', encoding='utf-8', errors='replace'), delimiter='\t') - - ndc_dict = {} - for row in meta_file: - - # Building the ndc_dict which is the out_file data structure of the final loop - # A little weird because there are duplicate set_id entries in the Product - # file. Setting the key of the ndc_dict as the substance name and then - # checking to see if the set_id is already in value list of that key. - this_spl_id = row['PRODUCTID'].split('_')[1] - this_substance = row['SUBSTANCENAME'] - - if this_substance.strip() != '': - if this_spl_id in ndc_dict: - tmp_substance = [s.lstrip() for s in this_substance.split(';')] - ndc_dict[this_spl_id] = set(tmp_substance + ndc_dict[this_spl_id]) - ndc_dict[this_spl_id] = list(ndc_dict[this_spl_id]) - else: - ndc_dict[this_spl_id] = [s.lstrip() for s in this_substance.split(';')] - - - - pharma_xmls = [] - # Grab all of the xml files - for root, _, filenames in os.walk(class_index_dir): - for filename in fnmatch.filter(filenames, '*.xml'): - pharma_xmls.append(os.path.join(root, filename)) - - # call async worker - pharma_rows = parallel_extract(pharma_xmls, harmonization_extract_worker) - - unii_rows = extract_unii.load_unii_from_csv(unii_file) - - combo = [] - - # Loop over ndc_dict, split its key, look for each token as a separate - # UNII element, if it is one, then add it to the unii_info dict for this - # loop cycle, once done with all of the tokenized keys, then loop over each - # set_id in the ndc_dict value list and push a combine record onto the - # list that will be the output. - # Loop handles the many-to-many relationship of ingredients to products. 
- unii_pivot = {} - for key, value in iter(ndc_dict.items()): - for substance_name in value: - if substance_name.lower() in pharma_rows: - if key in unii_pivot: - unii_pivot[key].append(pharma_rows[substance_name.lower()]) - else: - unii_pivot[key] = [pharma_rows[substance_name.lower()]] - elif substance_name.lower() in unii_rows: - if key in unii_pivot: - unii_pivot[key].append(unii_rows[substance_name.lower()]) - else: - unii_pivot[key] = [unii_rows[substance_name.lower()]] - - for key, value in iter(unii_pivot.items()): - output_dict = {} - output_dict['spl_id'] = key - output_dict['unii_indexing'] = value - combo.append(output_dict) - - for row in combo: - out.write(json.dumps(row) + '\n') + out = open(out_file, "w") + meta_file = csv.DictReader( + open(product_file, "rt", encoding="utf-8", errors="replace"), delimiter="\t" + ) + + ndc_dict = {} + for row in meta_file: + + # Building the ndc_dict which is the out_file data structure of the final loop + # A little weird because there are duplicate set_id entries in the Product + # file. Setting the key of the ndc_dict as the substance name and then + # checking to see if the set_id is already in value list of that key. + this_spl_id = row["PRODUCTID"].split("_")[1] + this_substance = row["SUBSTANCENAME"] + + if this_substance.strip() != "": + if this_spl_id in ndc_dict: + tmp_substance = [s.lstrip() for s in this_substance.split(";")] + ndc_dict[this_spl_id] = set(tmp_substance + ndc_dict[this_spl_id]) + ndc_dict[this_spl_id] = list(ndc_dict[this_spl_id]) + else: + ndc_dict[this_spl_id] = [s.lstrip() for s in this_substance.split(";")] + + pharma_xmls = [] + # Grab all of the xml files + for root, _, filenames in os.walk(class_index_dir): + for filename in fnmatch.filter(filenames, "*.xml"): + pharma_xmls.append(os.path.join(root, filename)) + + # call async worker + pharma_rows = parallel_extract(pharma_xmls, harmonization_extract_worker) + + unii_rows = extract_unii.load_unii_from_csv(unii_file) + + combo = [] + + # Loop over ndc_dict, split its key, look for each token as a separate + # UNII element, if it is one, then add it to the unii_info dict for this + # loop cycle, once done with all of the tokenized keys, then loop over each + # set_id in the ndc_dict value list and push a combine record onto the + # list that will be the output. + # Loop handles the many-to-many relationship of ingredients to products. 
+ unii_pivot = {} + for key, value in iter(ndc_dict.items()): + for substance_name in value: + if substance_name.lower() in pharma_rows: + if key in unii_pivot: + unii_pivot[key].append(pharma_rows[substance_name.lower()]) + else: + unii_pivot[key] = [pharma_rows[substance_name.lower()]] + elif substance_name.lower() in unii_rows: + if key in unii_pivot: + unii_pivot[key].append(unii_rows[substance_name.lower()]) + else: + unii_pivot[key] = [unii_rows[substance_name.lower()]] + + for key, value in iter(unii_pivot.items()): + output_dict = {} + output_dict["spl_id"] = key + output_dict["unii_indexing"] = value + combo.append(output_dict) + + for row in combo: + out.write(json.dumps(row) + "\n") diff --git a/openfda/app.py b/openfda/app.py index 390d378..b4e85b4 100644 --- a/openfda/app.py +++ b/openfda/app.py @@ -9,23 +9,29 @@ import gflags as flags import logging -flags.DEFINE_enum('log_level', 'INFO', - ['DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL'], - 'Default logging level.') +flags.DEFINE_enum( + "log_level", + "INFO", + ["DEBUG", "INFO", "WARN", "ERROR", "FATAL"], + "Default logging level.", +) + def run(): - argv = sys.argv - try: - # parse flags - other_argv = flags.FLAGS(argv) + argv = sys.argv + try: + # parse flags + other_argv = flags.FLAGS(argv) - logging.basicConfig(level=getattr(logging, flags.FLAGS.log_level), - format='%(levelname).1s %(created)f %(filename)s:%(lineno)s [%(funcName)s] %(message)s') + logging.basicConfig( + level=getattr(logging, flags.FLAGS.log_level), + format="%(levelname).1s %(created)f %(filename)s:%(lineno)s [%(funcName)s] %(message)s", + ) - # get main function from previous call frame - user_main = inspect.currentframe().f_back.f_locals['main'] - user_main(other_argv) + # get main function from previous call frame + user_main = inspect.currentframe().f_back.f_locals["main"] + user_main(other_argv) - except flags.FlagsError as e: - print('%s\n\nUsage: %s ARGS\n%s' % (e, sys.argv[0], flags.FLAGS)) - sys.exit(1) + except flags.FlagsError as e: + print("%s\n\nUsage: %s ARGS\n%s" % (e, sys.argv[0], flags.FLAGS)) + sys.exit(1) diff --git a/openfda/caers/parse.py b/openfda/caers/parse.py index acc646a..22f3ad6 100644 --- a/openfda/caers/parse.py +++ b/openfda/caers/parse.py @@ -5,16 +5,27 @@ import simplejson from six.moves import range -reader = csv.reader(open('caers.csv')) +reader = csv.reader(open("caers.csv")) header = next(reader) -PRETTY_FIELDS = [ 'report_number', 'create_date', 'event_start_date', 'primary_product_name', 'meddra_soc', - 'event_outcome', 'age', 'age_unit', 'gender', 'mandatory_flag', 'voluntary_flag'] +PRETTY_FIELDS = [ + "report_number", + "create_date", + "event_start_date", + "primary_product_name", + "meddra_soc", + "event_outcome", + "age", + "age_unit", + "gender", + "mandatory_flag", + "voluntary_flag", +] for row in reader: - json_row = {} - for i in range(len(row)): - json_row[PRETTY_FIELDS[i]] = row[i] + json_row = {} + for i in range(len(row)): + json_row[PRETTY_FIELDS[i]] = row[i] - print('{ "index" : { "_index" : "caers", "_type" : "event" }}') - print(simplejson.dumps(json_row)) + print('{ "index" : { "_index" : "caers", "_type" : "event" }}') + print(simplejson.dumps(json_row)) diff --git a/openfda/caers/pipeline.py b/openfda/caers/pipeline.py index 76e30f9..7156e75 100644 --- a/openfda/caers/pipeline.py +++ b/openfda/caers/pipeline.py @@ -15,175 +15,181 @@ from openfda.common import convert_unicode, newest_file_timestamp RUN_DIR = dirname(dirname(os.path.abspath(__file__))) -BASE_DIR = config.data_dir('caers') 
-DOWNLOAD_DIR = config.data_dir('caers/raw') -common.shell_cmd('mkdir -p %s', BASE_DIR) +BASE_DIR = config.data_dir("caers") +DOWNLOAD_DIR = config.data_dir("caers/raw") +common.shell_cmd("mkdir -p %s", BASE_DIR) -CAERS_DOWNLOAD_PAGE_URL = 'https://www.fda.gov/food/compliance-enforcement-food/cfsan-adverse-event-reporting-system-caers' +CAERS_DOWNLOAD_PAGE_URL = "https://www.fda.gov/food/compliance-enforcement-food/cfsan-adverse-event-reporting-system-caers" RENAME_MAP = { - 'report id': 'report_number', - 'caers created date': 'date_created', - 'date of event': 'date_started', - 'product type': 'role', - 'product': 'name_brand', - 'product code': 'industry_code', - 'description': 'industry_name', - 'patient age': 'age', - 'age units': 'age_unit', - 'sex': 'gender', - 'outcomes': 'outcomes', - 'medra preferred terms': 'reactions', - 'meddra preferred terms': 'reactions' + "report id": "report_number", + "caers created date": "date_created", + "date of event": "date_started", + "product type": "role", + "product": "name_brand", + "product code": "industry_code", + "description": "industry_name", + "patient age": "age", + "age units": "age_unit", + "sex": "gender", + "outcomes": "outcomes", + "medra preferred terms": "reactions", + "meddra preferred terms": "reactions", } # Lists of keys used by the cleaner function in the CSV2JSONMapper() and the # _transform() in CSV2JSONReducer() -CONSUMER = ['age', 'age_unit', 'gender'] -PRODUCTS = ['role', 'name_brand', 'industry_code', 'industry_name'] -DATES = ['date_created', 'date_started'] +CONSUMER = ["age", "age_unit", "gender"] +PRODUCTS = ["role", "name_brand", "industry_code", "industry_name"] +DATES = ["date_created", "date_started"] class DownloadCAERS(luigi.Task): - local_dir = DOWNLOAD_DIR + local_dir = DOWNLOAD_DIR - def requires(self): - return [] - - def output(self): - return luigi.LocalTarget(self.local_dir) - - def run(self): - common.shell_cmd('mkdir -p %s', self.local_dir) - soup = BeautifulSoup(urlopen(CAERS_DOWNLOAD_PAGE_URL).read(), 'lxml') - for a in soup.find_all(title=re.compile('CAERS ASCII.*')): - if 'Download CAERS ASCII' in re.sub(r'\s', ' ', a.text): - fileURL = urljoin('https://www.fda.gov', a['href']) - common.download(fileURL, join(self.output().path, a.attrs['title']+'.csv')) + def requires(self): + return [] + def output(self): + return luigi.LocalTarget(self.local_dir) + def run(self): + common.shell_cmd("mkdir -p %s", self.local_dir) + soup = BeautifulSoup(urlopen(CAERS_DOWNLOAD_PAGE_URL).read(), "lxml") + for a in soup.find_all(title=re.compile("CAERS ASCII.*")): + if "Download CAERS ASCII" in re.sub(r"\s", " ", a.text): + fileURL = urljoin("https://www.fda.gov", a["href"]) + common.download( + fileURL, join(self.output().path, a.attrs["title"] + ".csv") + ) class CSV2JSONMapper(parallel.Mapper): - @staticmethod - def cleaner(k, v): - ''' Callback function passed into transform_dict. Takes a key/value tuple + @staticmethod + def cleaner(k, v): + """Callback function passed into transform_dict. Takes a key/value tuple and either passes them through, does a transformation either or drops both (by returning None). In this case: renaming all fields, returning None on empty keys to avoid blowing up downstream transforms, formatting dates. 
- ''' - if k.lower() in RENAME_MAP: - k = RENAME_MAP[k.lower()] - - if v is None: - return (k, None) + """ + if k.lower() in RENAME_MAP: + k = RENAME_MAP[k.lower()] - if isinstance(v, str): - v = convert_unicode(v).strip() + if v is None: + return (k, None) - if k in DATES: - if v: - try: - v = datetime.datetime.strptime(v, "%m/%d/%Y").strftime("%Y%m%d") - except ValueError: - logging.warning('Unparseable date: ' + v) - else: - return None + if isinstance(v, str): + v = convert_unicode(v).strip() + if k in DATES: + if v: + try: + v = datetime.datetime.strptime(v, "%m/%d/%Y").strftime("%Y%m%d") + except ValueError: + logging.warning("Unparseable date: " + v) + else: + return None - return (k, v) + return (k, v) - def map(self, key, value, output): - new_value = common.transform_dict(value, self.cleaner) - new_key = new_value['report_number'] + def map(self, key, value, output): + new_value = common.transform_dict(value, self.cleaner) + new_key = new_value["report_number"] - output.add(new_key, new_value) + output.add(new_key, new_value) class CSV2JSONReducer(parallel.Reducer): - def merge_two_dicts(self, x, y): - z = x.copy() # start with x's keys and values - z.update(y) # modifies z with y's keys and values & returns None - return z + def merge_two_dicts(self, x, y): + z = x.copy() # start with x's keys and values + z.update(y) # modifies z with y's keys and values & returns None + return z - def _transform(self, value): - ''' Takes several rows for the same report_number and merges them into + def _transform(self, value): + """Takes several rows for the same report_number and merges them into a single report object, which is the final JSON representation, barring any annotation steps that may follow. - ''' - result = { - 'report_number': None, - 'date_created': None, - 'date_started': None, - 'consumer': {}, - 'products': [], - 'reactions': {}, # reactions and outcomes get pivoted to arrays of keys - 'outcomes': {} - } - - track_consumer = [] - for row in value: - product = {k:v for k, v in row.items() if k in PRODUCTS} - consumer = {k:v for k, v in row.items() if k in CONSUMER and v} - reactions = row.get('reactions', '').split(',') - outcomes = row.get('outcomes', '').split(',') - - # Setting the results - result['report_number'] = row['report_number'] - result['date_created'] = row['date_created'] - if 'date_started' in row: - result['date_started'] = row['date_started'] - - result['products'].append(product) - result['consumer'] = self.merge_two_dicts(result.get('consumer', {}), consumer) - - # Eliminating duplicate reactions - for reaction in reactions: - reaction = reaction.strip() - if reaction: - result['reactions'][reaction] = True - - # Eliminating duplicate outcomes - for outcome in outcomes: - outcome = outcome.strip() - if outcome: - result['outcomes'][outcome] = True - - # Now that each list is unique, revert to list of strings - result['reactions'] = list(result['reactions'].keys()) - result['outcomes'] = list(result['outcomes'].keys()) - return result - - def reduce(self, key, values, output): - output.put(key, self._transform(values)) + """ + result = { + "report_number": None, + "date_created": None, + "date_started": None, + "consumer": {}, + "products": [], + "reactions": {}, # reactions and outcomes get pivoted to arrays of keys + "outcomes": {}, + } + + track_consumer = [] + for row in value: + product = {k: v for k, v in row.items() if k in PRODUCTS} + consumer = {k: v for k, v in row.items() if k in CONSUMER and v} + reactions = row.get("reactions", "").split(",") 
+ outcomes = row.get("outcomes", "").split(",") + + # Setting the results + result["report_number"] = row["report_number"] + result["date_created"] = row["date_created"] + if "date_started" in row: + result["date_started"] = row["date_started"] + + result["products"].append(product) + result["consumer"] = self.merge_two_dicts( + result.get("consumer", {}), consumer + ) + + # Eliminating duplicate reactions + for reaction in reactions: + reaction = reaction.strip() + if reaction: + result["reactions"][reaction] = True + + # Eliminating duplicate outcomes + for outcome in outcomes: + outcome = outcome.strip() + if outcome: + result["outcomes"][outcome] = True + + # Now that each list is unique, revert to list of strings + result["reactions"] = list(result["reactions"].keys()) + result["outcomes"] = list(result["outcomes"].keys()) + return result + + def reduce(self, key, values, output): + output.put(key, self._transform(values)) class CSV2JSON(luigi.Task): - def requires(self): - return DownloadCAERS() - - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'json.db')) - - def run(self): - parallel.mapreduce( - parallel.Collection.from_glob(glob.glob(join(self.input().path, '*.csv')), - parallel.CSVDictLineInput(delimiter=',', quoting=csv.QUOTE_MINIMAL, strip_str='\ufeff')), - mapper=CSV2JSONMapper(), - reducer=CSV2JSONReducer(), - output_prefix=self.output().path) + def requires(self): + return DownloadCAERS() + + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "json.db")) + + def run(self): + parallel.mapreduce( + parallel.Collection.from_glob( + glob.glob(join(self.input().path, "*.csv")), + parallel.CSVDictLineInput( + delimiter=",", quoting=csv.QUOTE_MINIMAL, strip_str="\ufeff" + ), + ), + mapper=CSV2JSONMapper(), + reducer=CSV2JSONReducer(), + output_prefix=self.output().path, + ) class LoadJSON(index_util.LoadJSONBase): - index_name = 'foodevent' - mapping_file = './schemas/foodevent_mapping.json' - data_source = CSV2JSON() - use_checksum = False - optimize_index = True - last_update_date = lambda _: newest_file_timestamp(DOWNLOAD_DIR) + index_name = "foodevent" + mapping_file = "./schemas/foodevent_mapping.json" + data_source = CSV2JSON() + use_checksum = False + optimize_index = True + last_update_date = lambda _: newest_file_timestamp(DOWNLOAD_DIR) -if __name__ == '__main__': - luigi.run() +if __name__ == "__main__": + luigi.run() diff --git a/openfda/classification/pipeline.py b/openfda/classification/pipeline.py index aa69bba..586eac2 100755 --- a/openfda/classification/pipeline.py +++ b/openfda/classification/pipeline.py @@ -1,8 +1,8 @@ #!/usr/bin/python -''' Device classification pipeline for downloading, converting to JSON and +""" Device classification pipeline for downloading, converting to JSON and loading into elasticsearch. 
-''' +""" import csv import glob @@ -14,127 +14,137 @@ from openfda import common, config, index_util, parallel from openfda import device_common, download_util from openfda.common import first_file_timestamp -from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA, - DeviceAnnotateMapper) +from openfda.device_harmonization.pipeline import ( + Harmonized2OpenFDA, + DeviceAnnotateMapper, +) RUN_DIR = dirname(dirname(os.path.abspath(__file__))) # A directory for holding files that track Task state -RAW_DIR = config.data_dir('classification/raw') -META_DIR = config.data_dir('classification/meta') -common.shell_cmd('mkdir -p %s', META_DIR) +RAW_DIR = config.data_dir("classification/raw") +META_DIR = config.data_dir("classification/meta") +common.shell_cmd("mkdir -p %s", META_DIR) -DEVICE_CLASS_ZIP = ('https://www.accessdata.fda.gov/premarket/' - 'ftparea/foiclass.zip') +DEVICE_CLASS_ZIP = "https://www.accessdata.fda.gov/premarket/" "ftparea/foiclass.zip" class DownloadFoiClass(luigi.Task): - def requires(self): - return [] + def requires(self): + return [] - def output(self): - return luigi.LocalTarget(RAW_DIR) + def output(self): + return luigi.LocalTarget(RAW_DIR) + + def run(self): + output_filename = join(self.output().path, DEVICE_CLASS_ZIP.split("/")[-1]) + common.download(DEVICE_CLASS_ZIP, output_filename) - def run(self): - output_filename = join(self.output().path, DEVICE_CLASS_ZIP.split('/')[-1]) - common.download(DEVICE_CLASS_ZIP, output_filename) class ExtractAndCleanDownloadsClassification(luigi.Task): - ''' Unzip each of the download files and remove all the non-UTF8 characters. - Unzip -p streams the data directly to iconv which then writes to disk. - ''' - def requires(self): - return DownloadFoiClass() + """Unzip each of the download files and remove all the non-UTF8 characters. + Unzip -p streams the data directly to iconv which then writes to disk. + """ + + def requires(self): + return DownloadFoiClass() + + def output(self): + return luigi.LocalTarget(config.data_dir("classification/extracted")) - def output(self): - return luigi.LocalTarget(config.data_dir('classification/extracted')) + def run(self): + output_dir = self.output().path + common.shell_cmd("mkdir -p %s", output_dir) + input_dir = self.input().path + download_util.extract_and_clean(input_dir, "ISO-8859-1", "UTF-8", "txt") - def run(self): - output_dir = self.output().path - common.shell_cmd('mkdir -p %s', output_dir) - input_dir = self.input().path - download_util.extract_and_clean(input_dir, 'ISO-8859-1', 'UTF-8', 'txt') class ClassificationMapper(parallel.Mapper): - def map(self, key, value, output): - IGNORE = ['physicalstate', 'technicalmethod', 'targetarea'] - - # Changing names to match the openFDA naming standard - # key = source name, value = replacement name - RENAME_MAP = { - 'productcode': 'product_code', - 'reviewcode': 'review_code', - 'regulationnumber': 'regulation_number', - 'devicename': 'device_name', - 'medicalspecialty': 'medical_specialty', - 'thirdpartyflag': 'third_party_flag', - 'gmpexemptflag': 'gmp_exempt_flag', - 'deviceclass': 'device_class', - 'summarymalfunctionreporting': 'summary_malfunction_reporting' - } - - MEDICAL_SPECIALTY = device_common.MED_SPECIALTY_ADVISORY_COMMITTEE - - def _cleaner(k, v): - ''' A helper function used for removing and renaming dictionary keys. 
- ''' - k = k.lower() - if k in IGNORE: return None - if k in RENAME_MAP: - k = RENAME_MAP[k] - if k == 'medical_specialty': - tk, tv = 'medical_specialty_description', MEDICAL_SPECIALTY[v] - return [(k, v), (tk, tv)] - return (k, v) - - new_value = common.transform_dict(value, _cleaner) - output.add(key, new_value) + def map(self, key, value, output): + IGNORE = ["physicalstate", "technicalmethod", "targetarea"] + + # Changing names to match the openFDA naming standard + # key = source name, value = replacement name + RENAME_MAP = { + "productcode": "product_code", + "reviewcode": "review_code", + "regulationnumber": "regulation_number", + "devicename": "device_name", + "medicalspecialty": "medical_specialty", + "thirdpartyflag": "third_party_flag", + "gmpexemptflag": "gmp_exempt_flag", + "deviceclass": "device_class", + "summarymalfunctionreporting": "summary_malfunction_reporting", + } + + MEDICAL_SPECIALTY = device_common.MED_SPECIALTY_ADVISORY_COMMITTEE + + def _cleaner(k, v): + """A helper function used for removing and renaming dictionary keys.""" + k = k.lower() + if k in IGNORE: + return None + if k in RENAME_MAP: + k = RENAME_MAP[k] + if k == "medical_specialty": + tk, tv = "medical_specialty_description", MEDICAL_SPECIALTY[v] + return [(k, v), (tk, tv)] + return (k, v) + + new_value = common.transform_dict(value, _cleaner) + output.add(key, new_value) + class Classification2JSON(luigi.Task): - def requires(self): - return ExtractAndCleanDownloadsClassification() - - def output(self): - return luigi.LocalTarget(config.data_dir('classification/json.db')) - - def run(self): - common.shell_cmd('mkdir -p %s', dirname(self.output().path)) - input_files = glob.glob(self.input().path + '/*.txt') - parallel.mapreduce( - parallel.Collection.from_glob( - input_files, parallel.CSVDictLineInput(delimiter='|', - quoting=csv.QUOTE_NONE, - escapechar='\\')), - mapper=ClassificationMapper(), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path) + def requires(self): + return ExtractAndCleanDownloadsClassification() + + def output(self): + return luigi.LocalTarget(config.data_dir("classification/json.db")) + + def run(self): + common.shell_cmd("mkdir -p %s", dirname(self.output().path)) + input_files = glob.glob(self.input().path + "/*.txt") + parallel.mapreduce( + parallel.Collection.from_glob( + input_files, + parallel.CSVDictLineInput( + delimiter="|", quoting=csv.QUOTE_NONE, escapechar="\\" + ), + ), + mapper=ClassificationMapper(), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + ) + class AnnotateDevice(luigi.Task): - def requires(self): - return [Harmonized2OpenFDA(), Classification2JSON()] + def requires(self): + return [Harmonized2OpenFDA(), Classification2JSON()] - def output(self): - return luigi.LocalTarget(config.data_dir('classification/annotate.db')) + def output(self): + return luigi.LocalTarget(config.data_dir("classification/annotate.db")) - def run(self): - harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict() + def run(self): + harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict() - parallel.mapreduce( - parallel.Collection.from_sharded(self.input()[1].path), - mapper=DeviceAnnotateMapper(harmonized_db=harmonized_db), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, - num_shards=10) + parallel.mapreduce( + parallel.Collection.from_sharded(self.input()[1].path), + mapper=DeviceAnnotateMapper(harmonized_db=harmonized_db), + reducer=parallel.IdentityReducer(), + 
output_prefix=self.output().path, + num_shards=10, + ) class LoadJSON(index_util.LoadJSONBase): - index_name = 'deviceclass' - mapping_file = './schemas/classification_mapping.json' - data_source = AnnotateDevice() - use_checksum = False - optimize_index = True - last_update_date = lambda _: first_file_timestamp(RAW_DIR) + index_name = "deviceclass" + mapping_file = "./schemas/classification_mapping.json" + data_source = AnnotateDevice() + use_checksum = False + optimize_index = True + last_update_date = lambda _: first_file_timestamp(RAW_DIR) -if __name__ == '__main__': - luigi.run() +if __name__ == "__main__": + luigi.run() diff --git a/openfda/common.py b/openfda/common.py index a65aef1..caa945e 100644 --- a/openfda/common.py +++ b/openfda/common.py @@ -19,318 +19,359 @@ class BatchHelper(object): - ''' - Convenience class for batching operations. - - When more than `batch_size` operations have been added to the batch, `fn` - will be invoked with `args` and `kw`. The actual data must be expected - via the `batch` argument. - ''' - def __init__(self, fn, *args, **kw): - self._fn = fn - self._args = args - self._kw = kw - self._batch = [] - self._count = 0 - self._start_time = time.time() - - def flush(self): - self._fn(*self._args, batch=self._batch, **self._kw) - del self._batch[:] - - def add(self, obj): - self._batch.append(obj) - self._count += 1 - if len(self._batch) > DEFAULT_BATCH_SIZE: - self.flush() - - def __enter__(self): - return self - - def __exit__(self, type, value, traceback): - self.flush() - elapsed = time.time() - self._start_time - logging.info('BatchHelper: %d operations in %.2f seconds, %.2f docs/s', - self._count, elapsed, self._count / elapsed) + """ + Convenience class for batching operations. + + When more than `batch_size` operations have been added to the batch, `fn` + will be invoked with `args` and `kw`. The actual data must be expected + via the `batch` argument. + """ + + def __init__(self, fn, *args, **kw): + self._fn = fn + self._args = args + self._kw = kw + self._batch = [] + self._count = 0 + self._start_time = time.time() + + def flush(self): + self._fn(*self._args, batch=self._batch, **self._kw) + del self._batch[:] + + def add(self, obj): + self._batch.append(obj) + self._count += 1 + if len(self._batch) > DEFAULT_BATCH_SIZE: + self.flush() + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + self.flush() + elapsed = time.time() - self._start_time + logging.info( + "BatchHelper: %d operations in %.2f seconds, %.2f docs/s", + self._count, + elapsed, + self._count / elapsed, + ) # Because a.b.c is easier than a['b']['c'] # This extends dict to allow for easy json.dump action. 
class ObjectDict(dict): - def __init__(self, _dict): - for k, v in iter(_dict.items()): - v = dict_to_obj(v) - self[k] = v - setattr(self, k, v) - - def get_nested(self, key_string, default=None): - result = self - nested = key_string.split(".") - try: - for i in nested: - result = getattr(result, i) - except AttributeError: - if default: - return default - else: - raise - return result + def __init__(self, _dict): + for k, v in iter(_dict.items()): + v = dict_to_obj(v) + self[k] = v + setattr(self, k, v) + + def get_nested(self, key_string, default=None): + result = self + nested = key_string.split(".") + try: + for i in nested: + result = getattr(result, i) + except AttributeError: + if default: + return default + else: + raise + return result def dict_to_obj(d): - """Create a readonly view of a dictionary as ObjectDicts""" - if isinstance(d, dict): - return ObjectDict(d) - if isinstance(d, list): - return [dict_to_obj(v) for v in d] - else: - return d + """Create a readonly view of a dictionary as ObjectDicts""" + if isinstance(d, dict): + return ObjectDict(d) + if isinstance(d, list): + return [dict_to_obj(v) for v in d] + else: + return d class ProcessException(Exception): - def __init__(self, code, stdout, stderr): - self.stdout = stdout - self.stderr = stderr - self.code = code + def __init__(self, code, stdout, stderr): + self.stdout = stdout + self.stderr = stderr + self.code = code - def __repr__(self): - return 'ProcessException(\ncode=%s\nstderr=\n%s\nstdout=\n%s\n)' % ( - self.code, self.stderr, self.stdout) + def __repr__(self): + return "ProcessException(\ncode=%s\nstderr=\n%s\nstdout=\n%s\n)" % ( + self.code, + self.stderr, + self.stdout, + ) class TeeStream(Thread): - def __init__(self, quiet, input_stream, output_stream=sys.stdout, prefix=''): - Thread.__init__(self) - self._output = BytesIO() - self.output_stream = output_stream - self.input_stream = input_stream - self.quiet = quiet - self.prefix = prefix - - def run(self): - while 1: - line = self.input_stream.read() - if not line: - return - if not self.quiet: - self.output_stream.write(self.prefix + line.decode('utf-8')) - self._output.write(line) - - def output(self): - return self._output.getvalue() + def __init__(self, quiet, input_stream, output_stream=sys.stdout, prefix=""): + Thread.__init__(self) + self._output = BytesIO() + self.output_stream = output_stream + self.input_stream = input_stream + self.quiet = quiet + self.prefix = prefix + + def run(self): + while 1: + line = self.input_stream.read() + if not line: + return + if not self.quiet: + self.output_stream.write(self.prefix + line.decode("utf-8")) + self._output.write(line) + + def output(self): + return self._output.getvalue() def _checked_subprocess(quiet=False, *args, **kw): - kw['stderr'] = subprocess.PIPE - kw['stdout'] = subprocess.PIPE + kw["stderr"] = subprocess.PIPE + kw["stdout"] = subprocess.PIPE - proc = subprocess.Popen(*args, **kw) - stdout = TeeStream(quiet, proc.stdout, prefix='OUT: ') - stderr = TeeStream(quiet, proc.stderr, prefix='ERR: ') - stdout.start() - stderr.start() - status_code = proc.wait() - stdout.join() - stderr.join() - if status_code != 0: - logging.error(stdout.output()) - logging.error(stderr.output()) - raise ProcessException(status_code, stdout.output(), stderr.output()) + proc = subprocess.Popen(*args, **kw) + stdout = TeeStream(quiet, proc.stdout, prefix="OUT: ") + stderr = TeeStream(quiet, proc.stderr, prefix="ERR: ") + stdout.start() + stderr.start() + status_code = proc.wait() + stdout.join() + stderr.join() + 
if status_code != 0: + logging.error(stdout.output()) + logging.error(stderr.output()) + raise ProcessException(status_code, stdout.output(), stderr.output()) - return stdout.output() + return stdout.output() def cmd(args): - # print 'Running: ', ' '.join(args) - return _checked_subprocess(False, args) + # print 'Running: ', ' '.join(args) + return _checked_subprocess(False, args) + def quiet_cmd(args): - # print 'Running: ', ' '.join(args) - return _checked_subprocess(True, args) + # print 'Running: ', ' '.join(args) + return _checked_subprocess(True, args) def shell_cmd(fmt, *args): - cmd = _format_cmd(args, fmt) - return _checked_subprocess(False, cmd, shell=True) + cmd = _format_cmd(args, fmt) + return _checked_subprocess(False, cmd, shell=True) + def shell_cmd_quiet(fmt, *args): - cmd = _format_cmd(args, fmt) - return _checked_subprocess(True, cmd, shell=True) + cmd = _format_cmd(args, fmt) + return _checked_subprocess(True, cmd, shell=True) def _format_cmd(args, fmt): - # print 'Running: %s: %s' % (fmt, args) - if len(args) > 0: - cmd = fmt % args - else: - cmd = fmt - return cmd + # print 'Running: %s: %s' % (fmt, args) + if len(args) > 0: + cmd = fmt % args + else: + cmd = fmt + return cmd def transform_dict(coll, transform_fn): - '''Recursively call `tranform_fn` for each key and value in `coll` + """Recursively call `tranform_fn` for each key and value in `coll` or any child dictionaries. transform_fn should return (new_key, new_value). - ''' - if isinstance(coll, list): - return [transform_dict(v, transform_fn) for v in coll] - if not isinstance(coll, dict): - return coll - res = {} - for k, v in iter(coll.items()): - v = transform_dict(v, transform_fn) - transformed = transform_fn(k, v) - if transformed is None: - continue - if isinstance(transformed, tuple): - res[transformed[0]] = transformed[1] - else: - for fk, fv in transformed: res[fk] = fv - return res + """ + if isinstance(coll, list): + return [transform_dict(v, transform_fn) for v in coll] + if not isinstance(coll, dict): + return coll + res = {} + for k, v in iter(coll.items()): + v = transform_dict(v, transform_fn) + transformed = transform_fn(k, v) + if transformed is None: + continue + if isinstance(transformed, tuple): + res[transformed[0]] = transformed[1] + else: + for fk, fv in transformed: + res[fk] = fv + return res def is_older(a, b): - 'Returns true if file `a` is older than `b`, or `a` does not exist.' - if not os.path.exists(a): - return True + "Returns true if file `a` is older than `b`, or `a` does not exist." + if not os.path.exists(a): + return True - if not os.path.exists(b): - logging.warning('Trying to compare against non-existent test file %s', b) - return True + if not os.path.exists(b): + logging.warning("Trying to compare against non-existent test file %s", b) + return True - return (os.path.getmtime(a) < os.path.getmtime(b)) + return os.path.getmtime(a) < os.path.getmtime(b) def is_newer(a, b): - 'Returns true if file `a` is newer than `b`, or `b` does not exist.' - if not os.path.exists(a): - return False + "Returns true if file `a` is newer than `b`, or `b` does not exist." 
+ if not os.path.exists(a): + return False - if not os.path.exists(b): - logging.warning('Trying to compare against non-existent test file %s', b) - return True + if not os.path.exists(b): + logging.warning("Trying to compare against non-existent test file %s", b) + return True - return (os.path.getmtime(a) >= os.path.getmtime(b)) + return os.path.getmtime(a) >= os.path.getmtime(b) def get_k_number(data): - if re.match(r'^(K|BK|DEN)', data): - return data + if re.match(r"^(K|BK|DEN)", data): + return data def get_p_number(data): - if re.match(r'^(P|N|D[0-9]|BP|H)', data): - return data + if re.match(r"^(P|N|D[0-9]|BP|H)", data): + return data def download(url, output_filename): - shell_cmd('mkdir -p %s', dirname(output_filename)) - shell_cmd("curl -fLR -o '%s.tmp' '%s'", output_filename, url) - os.rename(output_filename + '.tmp', output_filename) + shell_cmd("mkdir -p %s", dirname(output_filename)) + shell_cmd("curl -fLR -o '%s.tmp' '%s'", output_filename, url) + os.rename(output_filename + ".tmp", output_filename) + def download_requests(url, output_filename): - shell_cmd('mkdir -p %s', dirname(output_filename)) - r = requests.get(url) - with open(output_filename, 'wb') as output_file: - output_file.write(r.content) + shell_cmd("mkdir -p %s", dirname(output_filename)) + r = requests.get(url) + with open(output_filename, "wb") as output_file: + output_file.write(r.content) def download_to_file_with_retry(url, output_file): - for i in range(25): - try: - download(url, output_file) - return - except: - logging.info('Timeout trying %s, retrying...', url) - time.sleep(5) - continue + for i in range(25): + try: + download(url, output_file) + return + except: + logging.info("Timeout trying %s, retrying...", url) + time.sleep(5) + continue - raise Exception('Fetch of %s failed.' % url) + raise Exception("Fetch of %s failed." % url) def soup_with_retry(url): - for i in range(25): - try: - return BeautifulSoup(urlopen(url).read(), "lxml") - except: - logging.info('An error trying to cook soup from %s, retrying...', url) - time.sleep(5) - continue + for i in range(25): + try: + return BeautifulSoup(urlopen(url).read(), "lxml") + except: + logging.info("An error trying to cook soup from %s, retrying...", url) + time.sleep(5) + continue - raise Exception('Fetch of %s failed.' % url) + raise Exception("Fetch of %s failed." % url) -def strip_unicode(raw_str, remove_crlf = False): - # http://stackoverflow.com/questions/10993612/python-removing-xa0-from-string - a_str = raw_str - if remove_crlf: - a_str = a_str.replace(u'\x13', '')\ - .replace(u'\x0A', '') +def strip_unicode(raw_str, remove_crlf=False): + # http://stackoverflow.com/questions/10993612/python-removing-xa0-from-string + a_str = raw_str + if remove_crlf: + a_str = a_str.replace("\x13", "").replace("\x0A", "") - return a_str.replace(u'\xa0', ' ')\ - .replace(u'\u2013', '-')\ - .replace(u'\xae', ' ')\ - .replace(u'\u201c', '')\ - .replace(u'\u201d', '') + return ( + a_str.replace("\xa0", " ") + .replace("\u2013", "-") + .replace("\xae", " ") + .replace("\u201c", "") + .replace("\u201d", "") + ) def extract_date(date_str): - ''' - Will transform a timestamp string to valid date string in YYYY-MM-DD format. - :param date_str: A string having timestamp in YYYYMMDDhhmmss - :return: a date string - ''' - if not date_str: - return '1900-01-01' + """ + Will transform a timestamp string to valid date string in YYYY-MM-DD format. 
+ :param date_str: A string having timestamp in YYYYMMDDhhmmss + :return: a date string + """ + if not date_str: + return "1900-01-01" - year = date_str[0:4] - month = date_str[4:6] - day = date_str[6:8] + year = date_str[0:4] + month = date_str[4:6] + day = date_str[6:8] + if 1900 >= int(year) or 2050 < int(year): + year = "1900" + if 0 >= int(month) or 12 < int(month): + month = "01" + if 0 >= int(day) or 12 < int(day): + day = "01" - if 1900 >= int(year) or 2050 < int(year): - year = "1900" - if 0 >= int(month) or 12 < int(month): - month = "01" - if 0 >= int(day) or 12 < int(day): - day = "01" + return year + "-" + month + "-" + day - return year + '-' + month + '-' + day def convert_unicode(data): - # add recursive decoder to prevent unicode errors on writes. - #if isinstance(data, str): - # return data # six.text_type(data, 'utf8', 'ignore').encode(encoding='utf8') - #elif isinstance(data, collections.Mapping): - # return dict(list(map(convert_unicode, list(data.items())))) - #elif isinstance(data, collections.Iterable): - # return type(data)(list(map(convert_unicode, data))) - #else: - return data + # add recursive decoder to prevent unicode errors on writes. + # if isinstance(data, str): + # return data # six.text_type(data, 'utf8', 'ignore').encode(encoding='utf8') + # elif isinstance(data, collections.Mapping): + # return dict(list(map(convert_unicode, list(data.items())))) + # elif isinstance(data, collections.Iterable): + # return type(data)(list(map(convert_unicode, data))) + # else: + return data def first_file_timestamp(path): - ''' - :param path: path to an existing directory - :return: timestamp of the first file found in the given directory as YYYY-MM-DD - ''' - return arrow.get(os.path.getmtime(os.path.join(path, os.listdir(path)[0]))).format('YYYY-MM-DD') if len( - os.listdir(path)) > 0 else None + """ + :param path: path to an existing directory + :return: timestamp of the first file found in the given directory as YYYY-MM-DD + """ + return ( + arrow.get(os.path.getmtime(os.path.join(path, os.listdir(path)[0]))).format( + "YYYY-MM-DD" + ) + if len(os.listdir(path)) > 0 + else None + ) def newest_file_timestamp(path): - ''' - :param path: path to an existing directory - :return: timestamp of the newest file found in the given directory as YYYY-MM-DD - ''' - return arrow.get(sorted(list(map(lambda file: os.path.getmtime(os.path.join(path, file)), os.listdir(path))))[-1]).format( - 'YYYY-MM-DD') if len( - os.listdir(path)) > 0 else None + """ + :param path: path to an existing directory + :return: timestamp of the newest file found in the given directory as YYYY-MM-DD + """ + return ( + arrow.get( + sorted( + list( + map( + lambda file: os.path.getmtime(os.path.join(path, file)), + os.listdir(path), + ) + ) + )[-1] + ).format("YYYY-MM-DD") + if len(os.listdir(path)) > 0 + else None + ) + def oldest_file_timestamp(path): - ''' - :param path: path to an existing directory - :return: timestamp of the oldest file found in the given directory as YYYY-MM-DD - ''' - return arrow.get(sorted(list(map(lambda file: os.path.getmtime(os.path.join(path, file)), os.listdir(path))))[0]).format( - 'YYYY-MM-DD') if len( - os.listdir(path)) > 0 else None + """ + :param path: path to an existing directory + :return: timestamp of the oldest file found in the given directory as YYYY-MM-DD + """ + return ( + arrow.get( + sorted( + list( + map( + lambda file: os.path.getmtime(os.path.join(path, file)), + os.listdir(path), + ) + ) + )[0] + ).format("YYYY-MM-DD") + if len(os.listdir(path)) > 0 + 
else None + ) diff --git a/openfda/config.py b/openfda/config.py index a77cae7..55e8cbb 100644 --- a/openfda/config.py +++ b/openfda/config.py @@ -16,43 +16,65 @@ import os.path + class FDAConfig(luigi.WrapperTask): - data_dir = Parameter(default='./data') - tmp_dir = Parameter(default='./data/openfda-tmp') - es_host = Parameter(default='localhost:9200') - aws_profile = luigi.Parameter(default='openfda') - disable_downloads = luigi.BoolParameter(default=False) + data_dir = Parameter(default="./data") + tmp_dir = Parameter(default="./data/openfda-tmp") + es_host = Parameter(default="localhost:9200") + aws_profile = luigi.Parameter(default="openfda") + disable_downloads = luigi.BoolParameter(default=False) - snapshot_path = luigi.Parameter(default='elasticsearch-snapshots/opensearch') - snapshot_bucket = luigi.Parameter(default='openfda-prod') - role_arn = luigi.Parameter(default="arn:aws:iam::806972138196:role/OpenSearchS3Access") + snapshot_path = luigi.Parameter(default="elasticsearch-snapshots/opensearch") + snapshot_bucket = luigi.Parameter(default="openfda-prod") + role_arn = luigi.Parameter( + default="arn:aws:iam::806972138196:role/OpenSearchS3Access" + ) class Context(object): - def __init__(self, path=''): - self._path = path + def __init__(self, path=""): + self._path = path + + def path(self, path=None): + return os.path.join(FDAConfig().data_dir, self._path, path) + + +def snapshot_path(): + return FDAConfig().snapshot_path + + +def snapshot_bucket(): + return FDAConfig().snapshot_bucket + + +def es_host(): + return FDAConfig().es_host - def path(self, path=None): - return os.path.join(FDAConfig().data_dir, self._path, path) -def snapshot_path(): return FDAConfig().snapshot_path -def snapshot_bucket(): return FDAConfig().snapshot_bucket -def es_host(): return FDAConfig().es_host -def role_arn(): return FDAConfig().role_arn -def disable_downloads(): return FDAConfig().disable_downloads +def role_arn(): + return FDAConfig().role_arn + + +def disable_downloads(): + return FDAConfig().disable_downloads + def es_client(host=None, port=9200): - import elasticsearch - if host is None: - host = es_host() + import elasticsearch + + if host is None: + host = es_host() + + return elasticsearch.Elasticsearch(host, timeout=180, port=port) - return elasticsearch.Elasticsearch(host, timeout=180, port=port) def data_dir(*subdirs): - return os.path.join(FDAConfig().data_dir, *subdirs) + return os.path.join(FDAConfig().data_dir, *subdirs) + def tmp_dir(*subdirs): - return os.path.join(FDAConfig().tmp_dir, *subdirs) + return os.path.join(FDAConfig().tmp_dir, *subdirs) + def aws_profile(): - return FDAConfig().aws_profile + return FDAConfig().aws_profile diff --git a/openfda/covid19serology/pipeline.py b/openfda/covid19serology/pipeline.py index 2d5f188..9593efc 100644 --- a/openfda/covid19serology/pipeline.py +++ b/openfda/covid19serology/pipeline.py @@ -1,8 +1,8 @@ #!/usr/bin/python -''' Device pipeline for downloading, transforming to JSON and loading COVID-19 Serological Testing Validation Project +""" Device pipeline for downloading, transforming to JSON and loading COVID-19 Serological Testing Validation Project data into Elasticsearch. 
-''' +""" import glob from os.path import dirname @@ -14,79 +14,81 @@ from openfda.tasks import AlwaysRunTask from openfda.common import first_file_timestamp -SEROLOGY_TEST_BUCKET = 's3://openfda-covid19serology/' -SEROLOGY_TEST_SYNC_DIR = config.data_dir('covid19serology/s3_sync') -SEROLOGY_TEST_JSON_DB_DIR = config.data_dir('covid19serology/json.db') +SEROLOGY_TEST_BUCKET = "s3://openfda-covid19serology/" +SEROLOGY_TEST_SYNC_DIR = config.data_dir("covid19serology/s3_sync") +SEROLOGY_TEST_JSON_DB_DIR = config.data_dir("covid19serology/json.db") class SyncS3SerologyTest(AlwaysRunTask): - def _run(self): - common.cmd(['mkdir', '-p', SEROLOGY_TEST_SYNC_DIR]) - common.cmd(['aws', - '--profile=' + config.aws_profile(), - 's3', - 'sync', + def _run(self): + common.cmd(["mkdir", "-p", SEROLOGY_TEST_SYNC_DIR]) + common.cmd( + [ + "aws", + "--profile=" + config.aws_profile(), + "s3", + "sync", SEROLOGY_TEST_BUCKET, - SEROLOGY_TEST_SYNC_DIR]) + SEROLOGY_TEST_SYNC_DIR, + ] + ) - def output(self): - return luigi.LocalTarget(SEROLOGY_TEST_SYNC_DIR) + def output(self): + return luigi.LocalTarget(SEROLOGY_TEST_SYNC_DIR) class SerologyCSV2JSONMapper(parallel.Mapper): - def map(self, key, value, output): + def map(self, key, value, output): - # Date fields. - DATE_KEYS = ['date_performed'] + # Date fields. + DATE_KEYS = ["date_performed"] - def _cleaner(k, v): - ''' A helper function that is used formatting dates. - ''' - if v != None: - if k in DATE_KEYS: - # Elasticsearch cannot handle null dates, emit None so the wrapper - # function `transform_dict` will omit it from its transformation - v = arrow.get(v).format('M/D/YYYY') - return (k, v) + def _cleaner(k, v): + """A helper function that is used formatting dates.""" + if v != None: + if k in DATE_KEYS: + # Elasticsearch cannot handle null dates, emit None so the wrapper + # function `transform_dict` will omit it from its transformation + v = arrow.get(v).format("M/D/YYYY") + return (k, v) - new_value = common.transform_dict(value, _cleaner) - output.add(key, new_value) + new_value = common.transform_dict(value, _cleaner) + output.add(key, new_value) class SerologyCSV2JSON(luigi.Task): - def requires(self): - return SyncS3SerologyTest() + def requires(self): + return SyncS3SerologyTest() - def output(self): - return luigi.LocalTarget(SEROLOGY_TEST_JSON_DB_DIR) + def output(self): + return luigi.LocalTarget(SEROLOGY_TEST_JSON_DB_DIR) - def run(self): - input_files = glob.glob(self.requires().output().path + '/*.csv') - parallel.mapreduce( - parallel.Collection.from_glob(input_files, - parallel.CSVDictLineInput()), - mapper=SerologyCSV2JSONMapper(), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, - num_shards=1) + def run(self): + input_files = glob.glob(self.requires().output().path + "/*.csv") + parallel.mapreduce( + parallel.Collection.from_glob(input_files, parallel.CSVDictLineInput()), + mapper=SerologyCSV2JSONMapper(), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + num_shards=1, + ) - def mapreduce_inputs(self): - input_files = glob.glob(dirname(self.requires().output().path) + '/*.csv') - return parallel.Collection.from_glob(input_files, - parallel.CSVDictLineInput()) + def mapreduce_inputs(self): + input_files = glob.glob(dirname(self.requires().output().path) + "/*.csv") + return parallel.Collection.from_glob(input_files, parallel.CSVDictLineInput()) class LoadJSON(index_util.LoadJSONBase): - index_name = 'covid19serology' - mapping_file = './schemas/covid19serology_mapping.json' - 
data_source = SerologyCSV2JSON() - use_checksum = False - optimize_index = True - last_update_date = lambda _: first_file_timestamp(SEROLOGY_TEST_SYNC_DIR) + index_name = "covid19serology" + mapping_file = "./schemas/covid19serology_mapping.json" + data_source = SerologyCSV2JSON() + use_checksum = False + optimize_index = True + last_update_date = lambda _: first_file_timestamp(SEROLOGY_TEST_SYNC_DIR) -if __name__ == '__main__': - luigi.run() +if __name__ == "__main__": + luigi.run() diff --git a/openfda/covid19serology/tests/pipeline_test.py b/openfda/covid19serology/tests/pipeline_test.py index 2020e38..7b6e0a3 100644 --- a/openfda/covid19serology/tests/pipeline_test.py +++ b/openfda/covid19serology/tests/pipeline_test.py @@ -13,67 +13,79 @@ class SerologyPipelineTests(unittest.TestCase): - def setUp(self): - self.test_dir = tempfile.mkdtemp() - openfda.covid19serology.pipeline.SEROLOGY_TEST_SYNC_DIR = join(self.test_dir, 'covid19serology/s3_sync') - openfda.covid19serology.pipeline.SEROLOGY_TEST_JSON_DB_DIR = join(self.test_dir, 'covid19serology/json.db') + def setUp(self): + self.test_dir = tempfile.mkdtemp() + openfda.covid19serology.pipeline.SEROLOGY_TEST_SYNC_DIR = join( + self.test_dir, "covid19serology/s3_sync" + ) + openfda.covid19serology.pipeline.SEROLOGY_TEST_JSON_DB_DIR = join( + self.test_dir, "covid19serology/json.db" + ) - os.makedirs(openfda.covid19serology.pipeline.SEROLOGY_TEST_SYNC_DIR) - shutil.copyfile( - os.path.join(dirname(os.path.abspath(__file__)), "20200421_Euroimmun_SARS-COV-2_ELISA_(IgG)_results.csv"), - os.path.join(openfda.covid19serology.pipeline.SEROLOGY_TEST_SYNC_DIR, "input.csv")) + os.makedirs(openfda.covid19serology.pipeline.SEROLOGY_TEST_SYNC_DIR) + shutil.copyfile( + os.path.join( + dirname(os.path.abspath(__file__)), + "20200421_Euroimmun_SARS-COV-2_ELISA_(IgG)_results.csv", + ), + os.path.join( + openfda.covid19serology.pipeline.SEROLOGY_TEST_SYNC_DIR, "input.csv" + ), + ) - def tearDown(self): - shutil.rmtree(self.test_dir) + def tearDown(self): + shutil.rmtree(self.test_dir) - def test_csv_to_json(self): - csv2json = SerologyCSV2JSON() - csv2json.run() + def test_csv_to_json(self): + csv2json = SerologyCSV2JSON() + csv2json.run() - ldb = leveldb.LevelDB(os.path.join(csv2json.output().path, 'shard-00000-of-00001.db')) - data = list(ldb.RangeIter()) - eq_(110, len(data)) + ldb = leveldb.LevelDB( + os.path.join(csv2json.output().path, "shard-00000-of-00001.db") + ) + data = list(ldb.RangeIter()) + eq_(110, len(data)) - # Verify base logic. 
- row = loads(data[0][1]) - eq_('Euroimmun', row.get('manufacturer')) - eq_('SARS-COV-2 ELISA (IgG)', row.get('device')) - eq_('4/21/2020', row.get('date_performed')) - eq_('E200330DT', row.get('lot_number')) - eq_('Panel 1', row.get('panel')) - eq_('1', row.get('sample_no')) - eq_('C0001', row.get('sample_id')) - eq_('Serum', row.get('type')) - eq_('Negatives', row.get('group')) - eq_('NA', row.get('days_from_symptom')) - eq_('NA', row.get('igm_result')) - eq_('Negative', row.get('igg_result')) - eq_('NA', row.get('iga_result')) - eq_('NA', row.get('pan_result')) - eq_('NA', row.get('igm_igg_result')) - eq_('Pass', row.get('control')) - eq_('0', row.get('igm_titer')) - eq_('0', row.get('igg_titer')) - eq_('Negative', row.get('igm_truth')) - eq_('Negative', row.get('igg_truth')) - eq_('Negative', row.get('antibody_truth')) - eq_('NA', row.get('igm_agree')) - eq_('TN', row.get('igg_agree')) - eq_('NA', row.get('iga_agree')) - eq_('NA', row.get('pan_agree')) - eq_('NA', row.get('igm_igg_agree')) - eq_('TN', row.get('antibody_agree')) + # Verify base logic. + row = loads(data[0][1]) + eq_("Euroimmun", row.get("manufacturer")) + eq_("SARS-COV-2 ELISA (IgG)", row.get("device")) + eq_("4/21/2020", row.get("date_performed")) + eq_("E200330DT", row.get("lot_number")) + eq_("Panel 1", row.get("panel")) + eq_("1", row.get("sample_no")) + eq_("C0001", row.get("sample_id")) + eq_("Serum", row.get("type")) + eq_("Negatives", row.get("group")) + eq_("NA", row.get("days_from_symptom")) + eq_("NA", row.get("igm_result")) + eq_("Negative", row.get("igg_result")) + eq_("NA", row.get("iga_result")) + eq_("NA", row.get("pan_result")) + eq_("NA", row.get("igm_igg_result")) + eq_("Pass", row.get("control")) + eq_("0", row.get("igm_titer")) + eq_("0", row.get("igg_titer")) + eq_("Negative", row.get("igm_truth")) + eq_("Negative", row.get("igg_truth")) + eq_("Negative", row.get("antibody_truth")) + eq_("NA", row.get("igm_agree")) + eq_("TN", row.get("igg_agree")) + eq_("NA", row.get("iga_agree")) + eq_("NA", row.get("pan_agree")) + eq_("NA", row.get("igm_igg_agree")) + eq_("TN", row.get("antibody_agree")) - # Verify some variations. - row = loads(data[55][1]) - eq_('29', row.get('days_from_symptom')) - eq_('400', row.get('igm_titer')) - eq_('1600', row.get('igg_titer')) + # Verify some variations. 
+ row = loads(data[55][1]) + eq_("29", row.get("days_from_symptom")) + eq_("400", row.get("igm_titer")) + eq_("1600", row.get("igg_titer")) def main(argv): - unittest.main(argv=argv) + unittest.main(argv=argv) -if __name__ == '__main__': - unittest.main() +if __name__ == "__main__": + unittest.main() diff --git a/openfda/deploy/tests/adae/test_endpoint.py b/openfda/deploy/tests/adae/test_endpoint.py index 2d70a8b..e0bb326 100644 --- a/openfda/deploy/tests/adae/test_endpoint.py +++ b/openfda/deploy/tests/adae/test_endpoint.py @@ -6,119 +6,133 @@ def test_nullified_records(): - NULLIFIED = ['USA-FDACVM-2018-US-045311', 'USA-FDACVM-2018-US-048571', 'USA-FDACVM-2018-US-046672', - 'USA-FDACVM-2017-US-070108', 'USA-FDACVM-2017-US-002864', 'USA-FDACVM-2017-US-002866', - 'USA-FDACVM-2017-US-052458', 'USA-FDACVM-2017-US-055193', 'USA-FDACVM-2017-US-043931', - 'USA-FDACVM-2018-US-002321', 'USA-FDACVM-2017-US-042492', 'USA-FDACVM-2018-US-044065' - ] - - for case_num in NULLIFIED: - meta, results = fetch( - '/animalandveterinary/event.json?search=unique_aer_id_number:' + case_num) - eq_(results, None) + NULLIFIED = [ + "USA-FDACVM-2018-US-045311", + "USA-FDACVM-2018-US-048571", + "USA-FDACVM-2018-US-046672", + "USA-FDACVM-2017-US-070108", + "USA-FDACVM-2017-US-002864", + "USA-FDACVM-2017-US-002866", + "USA-FDACVM-2017-US-052458", + "USA-FDACVM-2017-US-055193", + "USA-FDACVM-2017-US-043931", + "USA-FDACVM-2018-US-002321", + "USA-FDACVM-2017-US-042492", + "USA-FDACVM-2018-US-044065", + ] + + for case_num in NULLIFIED: + meta, results = fetch( + "/animalandveterinary/event.json?search=unique_aer_id_number:" + case_num + ) + eq_(results, None) def test_single_ae_record(): - meta, results = fetch( - '/animalandveterinary/event.json?search=unique_aer_id_number:USA-USFDACVM-2015-US-094810') - eq_(len(results), 1) - - ae = results[0] - - eq_("USA-USFDACVM-2015-US-094810", ae["unique_aer_id_number"]) - eq_(None, ae.get("@id")) - eq_("N141251", ae["report_id"]) - eq_("20150126", ae["original_receive_date"]) - eq_("Food and Drug Administration Center for Veterinary Medicine", ae["receiver"]["organization"]) - eq_("7500 Standish Place (HFV-210) Room N403", ae["receiver"]["street_address"]) - eq_("Rockville", ae["receiver"]["city"]) - eq_("MD", ae["receiver"]["state"]) - eq_("20855", ae["receiver"]["postal_code"]) - eq_("USA", ae["receiver"]["country"]) - eq_("Other", ae["primary_reporter"]) - eq_("Safety Issue", ae["type_of_information"]) - eq_("true", ae["serious_ae"]) - eq_("1", ae["number_of_animals_treated"]) - eq_("1", ae["number_of_animals_affected"]) - eq_("Dog", ae["animal"]["species"]) - eq_("Male", ae["animal"]["gender"]) - eq_("Neutered", ae["animal"]["reproductive_status"]) - eq_("NOT APPLICABLE", ae["animal"]["female_animal_physiological_status"]) - - eq_("1.00", ae["animal"]["age"]["min"]) - eq_(None, ae["animal"]["age"].get("max")) - eq_("Year", ae["animal"]["age"]["unit"]) - eq_("Measured", ae["animal"]["age"]["qualifier"]) - - eq_("38.419", ae["animal"]["weight"]["min"]) - eq_(None, ae["animal"]["weight"].get("max")) - eq_("Kilogram", ae["animal"]["weight"]["unit"]) - eq_("Measured", ae["animal"]["weight"]["qualifier"]) - - eq_("false", ae["animal"]["breed"]["is_crossbred"]) - eq_("Retriever - Labrador", ae["animal"]["breed"]["breed_component"]) - - eq_("Recovered/Normal", ae["outcome"][0]["medical_status"]) - eq_("1", ae["outcome"][0]["number_of_animals_affected"]) - - eq_("Good", ae["health_assessment_prior_to_exposure"]["condition"]) - eq_("Veterinarian", 
ae["health_assessment_prior_to_exposure"]["assessed_by"]) - - eq_("20141222", ae["onset_date"]) - eq_({'value': '4', 'unit': 'Week'}, ae.get("duration")) - - eq_("11", ae["reaction"][0]["veddra_version"]) - eq_("129", ae["reaction"][0]["veddra_term_code"]) - eq_("Vocalisation", ae["reaction"][0]["veddra_term_name"]) - eq_("1", ae["reaction"][0]["number_of_animals_affected"]) - eq_("Actual", ae["reaction"][0]["accuracy"]) - - eq_("11", ae["reaction"][1]["veddra_version"]) - eq_("960", ae["reaction"][1]["veddra_term_code"]) - eq_("Pruritus", ae["reaction"][1]["veddra_term_name"]) - eq_("1", ae["reaction"][1]["number_of_animals_affected"]) - eq_("Actual", ae["reaction"][1]["accuracy"]) - - eq_(None, ae.get("time_between_exposure_and_onset")) - eq_("false", ae["treated_for_ae"]) - - eq_(1, len(ae["drug"])) - - eq_("20141222", ae["drug"][0]["first_exposure_date"]) - eq_("20141222", ae["drug"][0]["last_exposure_date"]) - - eq_("Animal Owner", ae["drug"][0]["administered_by"]) - eq_("Topical", ae["drug"][0]["route"]) - eq_("1", ae["drug"][0]["dose"]["numerator"]) - eq_("tube", ae["drug"][0]["dose"]["numerator_unit"]) - eq_("1", ae["drug"][0]["dose"]["denominator"]) - eq_("dose", ae["drug"][0]["dose"]["denominator_unit"]) - eq_('false', ae["drug"][0].get("used_according_to_label")) - eq_('Overdosed', ae["drug"][0].get("off_label_use")) - eq_("false", ae["drug"][0]["previous_exposure_to_drug"]) - eq_(None, ae["drug"][0].get("previous_ae_to_drug")) - eq_(None, ae["drug"][0].get("ae_abated_after_stopping_drug")) - eq_(None, ae["drug"][0].get("ae_reappeared_after_resuming_drug")) - eq_(None, ae["drug"][0].get("manufacturing_date")) - eq_('KP09ECX KP09C4D', ae["drug"][0].get("lot_number")) - eq_('2017-01', ae["drug"][0].get("lot_expiration")) - eq_('000859-2339', ae["drug"][0].get("product_ndc")) - eq_("MSK", ae["drug"][0]["brand_name"]) - eq_('Solution', ae["drug"][0]["dosage_form"]) - eq_("MSK", ae["drug"][0]["manufacturer"]["name"]) - eq_("USA-USFDACVM-N141251", ae["drug"][0]["manufacturer"]["registration_number"]) - eq_(None, ae["drug"][0].get("number_of_defective_items")) - eq_(None, ae["drug"][0].get("number_of_items_returned")) - eq_("QP54AB52", ae["drug"][0]["atc_vet_code"]) - eq_("Imidacloprid", ae["drug"][0]["active_ingredients"][0]["name"]) - eq_("500", ae["drug"][0]["active_ingredients"][0]["dose"]["numerator"]) - eq_("Milligram", ae["drug"][0]["active_ingredients"][0]["dose"]["numerator_unit"]) - eq_("5", ae["drug"][0]["active_ingredients"][0]["dose"]["denominator"]) - eq_("mL", ae["drug"][0]["active_ingredients"][0]["dose"]["denominator_unit"]) - - -if __name__ == '__main__': - all_functions = inspect.getmembers(sys.modules[__name__], inspect.isfunction) - for key, func in all_functions: - if key.find("test_") > -1: - func() + meta, results = fetch( + "/animalandveterinary/event.json?search=unique_aer_id_number:USA-USFDACVM-2015-US-094810" + ) + eq_(len(results), 1) + + ae = results[0] + + eq_("USA-USFDACVM-2015-US-094810", ae["unique_aer_id_number"]) + eq_(None, ae.get("@id")) + eq_("N141251", ae["report_id"]) + eq_("20150126", ae["original_receive_date"]) + eq_( + "Food and Drug Administration Center for Veterinary Medicine", + ae["receiver"]["organization"], + ) + eq_("7500 Standish Place (HFV-210) Room N403", ae["receiver"]["street_address"]) + eq_("Rockville", ae["receiver"]["city"]) + eq_("MD", ae["receiver"]["state"]) + eq_("20855", ae["receiver"]["postal_code"]) + eq_("USA", ae["receiver"]["country"]) + eq_("Other", ae["primary_reporter"]) + eq_("Safety Issue", 
ae["type_of_information"]) + eq_("true", ae["serious_ae"]) + eq_("1", ae["number_of_animals_treated"]) + eq_("1", ae["number_of_animals_affected"]) + eq_("Dog", ae["animal"]["species"]) + eq_("Male", ae["animal"]["gender"]) + eq_("Neutered", ae["animal"]["reproductive_status"]) + eq_("NOT APPLICABLE", ae["animal"]["female_animal_physiological_status"]) + + eq_("1.00", ae["animal"]["age"]["min"]) + eq_(None, ae["animal"]["age"].get("max")) + eq_("Year", ae["animal"]["age"]["unit"]) + eq_("Measured", ae["animal"]["age"]["qualifier"]) + + eq_("38.419", ae["animal"]["weight"]["min"]) + eq_(None, ae["animal"]["weight"].get("max")) + eq_("Kilogram", ae["animal"]["weight"]["unit"]) + eq_("Measured", ae["animal"]["weight"]["qualifier"]) + + eq_("false", ae["animal"]["breed"]["is_crossbred"]) + eq_("Retriever - Labrador", ae["animal"]["breed"]["breed_component"]) + + eq_("Recovered/Normal", ae["outcome"][0]["medical_status"]) + eq_("1", ae["outcome"][0]["number_of_animals_affected"]) + + eq_("Good", ae["health_assessment_prior_to_exposure"]["condition"]) + eq_("Veterinarian", ae["health_assessment_prior_to_exposure"]["assessed_by"]) + + eq_("20141222", ae["onset_date"]) + eq_({"value": "4", "unit": "Week"}, ae.get("duration")) + + eq_("11", ae["reaction"][0]["veddra_version"]) + eq_("129", ae["reaction"][0]["veddra_term_code"]) + eq_("Vocalisation", ae["reaction"][0]["veddra_term_name"]) + eq_("1", ae["reaction"][0]["number_of_animals_affected"]) + eq_("Actual", ae["reaction"][0]["accuracy"]) + + eq_("11", ae["reaction"][1]["veddra_version"]) + eq_("960", ae["reaction"][1]["veddra_term_code"]) + eq_("Pruritus", ae["reaction"][1]["veddra_term_name"]) + eq_("1", ae["reaction"][1]["number_of_animals_affected"]) + eq_("Actual", ae["reaction"][1]["accuracy"]) + + eq_(None, ae.get("time_between_exposure_and_onset")) + eq_("false", ae["treated_for_ae"]) + + eq_(1, len(ae["drug"])) + + eq_("20141222", ae["drug"][0]["first_exposure_date"]) + eq_("20141222", ae["drug"][0]["last_exposure_date"]) + + eq_("Animal Owner", ae["drug"][0]["administered_by"]) + eq_("Topical", ae["drug"][0]["route"]) + eq_("1", ae["drug"][0]["dose"]["numerator"]) + eq_("tube", ae["drug"][0]["dose"]["numerator_unit"]) + eq_("1", ae["drug"][0]["dose"]["denominator"]) + eq_("dose", ae["drug"][0]["dose"]["denominator_unit"]) + eq_("false", ae["drug"][0].get("used_according_to_label")) + eq_("Overdosed", ae["drug"][0].get("off_label_use")) + eq_("false", ae["drug"][0]["previous_exposure_to_drug"]) + eq_(None, ae["drug"][0].get("previous_ae_to_drug")) + eq_(None, ae["drug"][0].get("ae_abated_after_stopping_drug")) + eq_(None, ae["drug"][0].get("ae_reappeared_after_resuming_drug")) + eq_(None, ae["drug"][0].get("manufacturing_date")) + eq_("KP09ECX KP09C4D", ae["drug"][0].get("lot_number")) + eq_("2017-01", ae["drug"][0].get("lot_expiration")) + eq_("000859-2339", ae["drug"][0].get("product_ndc")) + eq_("MSK", ae["drug"][0]["brand_name"]) + eq_("Solution", ae["drug"][0]["dosage_form"]) + eq_("MSK", ae["drug"][0]["manufacturer"]["name"]) + eq_("USA-USFDACVM-N141251", ae["drug"][0]["manufacturer"]["registration_number"]) + eq_(None, ae["drug"][0].get("number_of_defective_items")) + eq_(None, ae["drug"][0].get("number_of_items_returned")) + eq_("QP54AB52", ae["drug"][0]["atc_vet_code"]) + eq_("Imidacloprid", ae["drug"][0]["active_ingredients"][0]["name"]) + eq_("500", ae["drug"][0]["active_ingredients"][0]["dose"]["numerator"]) + eq_("Milligram", ae["drug"][0]["active_ingredients"][0]["dose"]["numerator_unit"]) + eq_("5", 
ae["drug"][0]["active_ingredients"][0]["dose"]["denominator"]) + eq_("mL", ae["drug"][0]["active_ingredients"][0]["dose"]["denominator_unit"]) + + +if __name__ == "__main__": + all_functions = inspect.getmembers(sys.modules[__name__], inspect.isfunction) + for key, func in all_functions: + if key.find("test_") > -1: + func() diff --git a/openfda/deploy/tests/device510k/test_endpoint.py b/openfda/deploy/tests/device510k/test_endpoint.py index 6c2516b..bb9d601 100644 --- a/openfda/deploy/tests/device510k/test_endpoint.py +++ b/openfda/deploy/tests/device510k/test_endpoint.py @@ -3,29 +3,47 @@ def test_openfda_section(): - meta, results = fetch('/device/510k.json?search=openfda.registration_number:3006630150') - assert results[0]['openfda'], results + meta, results = fetch( + "/device/510k.json?search=openfda.registration_number:3006630150" + ) + assert results[0]["openfda"], results + def test_not_analyzed(): - assert_total('/device/510k.json?search=openfda.regulation_number:874.3450', 70) + assert_total("/device/510k.json?search=openfda.regulation_number:874.3450", 70) + def test_date(): - assert_total('/device/510k.json?search=date_received:20000410', 30) + assert_total("/device/510k.json?search=date_received:20000410", 30) + def test_date_range(): - assert_total('/device/510k.json?search=date_received:([20000410+TO+20001231])', 2384) + assert_total( + "/device/510k.json?search=date_received:([20000410+TO+20001231])", 2384 + ) + def test_exact_count(): - assert_count_top('/device/510k.json?count=advisory_committee_description.exact', [ - 'Cardiovascular', 'General Hospital', 'Clinical Chemistry', 'General, Plastic Surgery', - ]) + assert_count_top( + "/device/510k.json?count=advisory_committee_description.exact", + [ + "Cardiovascular", + "General Hospital", + "Clinical Chemistry", + "General, Plastic Surgery", + ], + ) + def test_openfda_count(): - assert_count_top('/device/510k.json?count=openfda.device_name.exact', [ - 'Powered Laser Surgical Instrument', - 'Latex Patient Examination Glove', - 'Electrosurgical, Cutting & Coagulation & Accessories', - 'System, Image Processing, Radiological', - 'Hearing Aid, Air Conduction', - 'Laparoscope, General & Plastic Surgery', - ]) + assert_count_top( + "/device/510k.json?count=openfda.device_name.exact", + [ + "Powered Laser Surgical Instrument", + "Latex Patient Examination Glove", + "Electrosurgical, Cutting & Coagulation & Accessories", + "System, Image Processing, Radiological", + "Hearing Aid, Air Conduction", + "Laparoscope, General & Plastic Surgery", + ], + ) diff --git a/openfda/deploy/tests/deviceclass/test_endpoint.py b/openfda/deploy/tests/deviceclass/test_endpoint.py index 20be25d..0c5b3d8 100644 --- a/openfda/deploy/tests/deviceclass/test_endpoint.py +++ b/openfda/deploy/tests/deviceclass/test_endpoint.py @@ -1,33 +1,43 @@ - import requests from openfda.tests.api_test_helpers import * def test_exact_count(): - assert_count_contains('/device/classification.json?count=device_name.exact', - [ 'Endoscopic Video Imaging System/Component, Gastroenterology-Urology', - 'Catheter Introducer Kit' - ]) + assert_count_contains( + "/device/classification.json?count=device_name.exact", + [ + "Endoscopic Video Imaging System/Component, Gastroenterology-Urology", + "Catheter Introducer Kit", + ], + ) + def test_summary_malfunction_reporting_counts(): - assert_count_contains('/device/classification.json?count=summary_malfunction_reporting', - [ 'Eligible', - 'Ineligible' - ]) + assert_count_contains( + 
"/device/classification.json?count=summary_malfunction_reporting", + ["Eligible", "Ineligible"], + ) def test_exact_count_desc(): - assert_count_top('/device/classification.json?count=medical_specialty_description.exact', - [ 'Unknown', - 'Clinical Chemistry', - 'Gastroenterology, Urology', - 'General, Plastic Surgery', - 'Microbiology' - ]) + assert_count_top( + "/device/classification.json?count=medical_specialty_description.exact", + [ + "Unknown", + "Clinical Chemistry", + "Gastroenterology, Urology", + "General, Plastic Surgery", + "Microbiology", + ], + ) + def test_openfda(): - assert_total('/device/classification.json?search=openfda.registration_number:3005809810', 1) + assert_total( + "/device/classification.json?search=openfda.registration_number:3005809810", 1 + ) + def test_not_analyzed(): - assert_total('/device/classification.json?search=regulation_number:878.4200', 1) + assert_total("/device/classification.json?search=regulation_number:878.4200", 1) diff --git a/openfda/deploy/tests/deviceevent/test_endpoint.py b/openfda/deploy/tests/deviceevent/test_endpoint.py index d2b7335..912e4f1 100644 --- a/openfda/deploy/tests/deviceevent/test_endpoint.py +++ b/openfda/deploy/tests/deviceevent/test_endpoint.py @@ -1,1480 +1,1530 @@ from openfda.tests.api_test_helpers import * - def assert_enough_results(query, count): - meta, results = fetch(query) - assert meta['results']['total'] > count, \ - 'Query %s returned fewer results than expected: %d vs %d' % ( - query, count, meta['results']['total']) + meta, results = fetch(query) + assert ( + meta["results"]["total"] > count + ), "Query %s returned fewer results than expected: %d vs %d" % ( + query, + count, + meta["results"]["total"], + ) def test_counts_with_patient_problems(): - assert_enough_results('/device/event.json?search=_exists_:patient.patient_problems', 10361000) + assert_enough_results( + "/device/event.json?search=_exists_:patient.patient_problems", 10361000 + ) def test_counts_with_product_problems(): - assert_enough_results('/device/event.json?search=_exists_:product_problems', 10375000) + assert_enough_results( + "/device/event.json?search=_exists_:product_problems", 10375000 + ) def test_exact_count(): - assert_count_top('/device/event.json?count=type_of_report.exact', ['Initial submission', 'Followup']) + assert_count_top( + "/device/event.json?count=type_of_report.exact", + ["Initial submission", "Followup"], + ) def test_exact_product_problems_count(): - assert_count_top(query='/device/event.json?count=product_problems.exact&limit=1000', - N=1000, expected_terms= - ["Adverse Event Without Identified Device or Use Problem", - "Device Displays Incorrect Message", - "Break", - "Device Operates Differently Than Expected", - "Insufficient Information", - "Failure to Osseointegrate", - "Wireless Communication Problem", - "Appropriate Term/Code Not Available", - "Loss of Osseointegration", - "No Device Output", - "Patient Device Interaction Problem", - "Incorrect, Inadequate or Imprecise Resultor Readings", - "Fluid Leak", - "Device Dislodged or Dislocated", - "Over-Sensing", - "No Display/Image", - "Mechanical Problem", - "Imprecision", - "Leak/Splash", - "High impedance", - "Fracture", - "Crack", - "Battery Problem", - "Device Inoperable", - "Failure to Power Up", - "Osseointegration Problem", - "Device Alarm System", - "Use of Device Problem", - "Material Rupture", - "Pumping Stopped", - "Detachment Of Device Component", - "No Apparent Adverse Event", - "Premature Discharge of Battery", - "Power Problem", - "Detachment 
of Device or Device Component", - "Improper or Incorrect Procedure or Method", - "Communication or Transmission Problem", - "Incorrect Or Inadequate Test Results", - "Migration or Expulsion of Device", - "Connection Problem", - "Defective Device", - "Charging Problem", - "Obstruction of Flow", - "High Test Results", - "Malposition of Device", - "High Capture Threshold", - "Inappropriate/Inadequate Shock/Stimulation", - "Nonstandard Device", - "Failure to Capture", - "Device Sensing Problem", - "Material Integrity Problem", - "Bent", - "Difficult to Remove", - "Device Slipped", - "Incorrect Measurement", - "Device Difficult to Program or Calibrate", - "Signal Artifact/Noise", - "Display or Visual Feedback Problem", - "Patient-Device Incompatibility", - "Device Contamination with Chemical or Other Material", - "Material Deformation", - "Electrical /Electronic Property Problem", - "Overheating of Device", - "Material Separation", - "Loss of Power", - "Naturally Worn", - "Occlusion Within Device", - "Under-Sensing", - "Material Twisted/Bent", - "Sticking", - "Computer Software Problem", - "Component Missing", - "Loose or Intermittent Connection", - "Unintended Movement", - "Material Erosion", - "Failure to Deliver", - "Failure to Fire", - "Invalid Sensing", - "Failure to Sense", - "Noise, Audible", - "Low impedance", - "Poor Quality Image", - "Loss of or Failure to Bond", - "Low Test Results", - "Corroded", - "Impedance Problem", - "Positioning Problem", - "Structural Problem", - "Air Leak", - "Failure to Advance", - "Premature End-of-Life Indicator", - "Overfill", - "Moisture or Humidity Problem", - "No Audible Alarm", - "Pacing Problem", - "Migration", - "Physical Resistance/Sticking", - "Activation, Positioning or SeparationProblem", - "Inappropriate or Unexpected Reset", - "Retraction Problem", - "Mechanical Jam", - "Filling Problem", - "Positioning Failure", - "Insufficient Flow or Under Infusion", - "Device Stops Intermittently", - "Output Problem", - "Circuit Failure", - "Failure to Form Staple", - "Difficult to Insert", - "Failure to Deliver Energy", - "Temperature Problem", - "Capturing Problem", - "Image Display Error/Artifact", - "Moisture Damage", - "Disconnection", - "Failure to Charge", - "Difficult to Open or Close", - "Unable to Obtain Readings", - "Excess Flow or Over-Infusion", - "Therapeutic or Diagnostic Output Failure", - "Failure to Prime", - "Failure of Device to Self-Test", - "False Alarm", - "Contamination", - "Self-Activation or Keying", - "Computer Operating System Problem", - "Infusion or Flow Problem", - "Unstable", - "Component Falling", - "Defective Component", - "Defective Alarm", - "Inadequacy of Device Shape and/or Size", - "Failure To Adhere Or Bond", - "Metal Shedding Debris", - "Failure to Interrogate", - "Failure to Read Input Signal", - "Low Battery", - "Entrapment of Device", - "Failure to Cycle", - "Difficult or Delayed Activation", - "False Positive Result", - "Kinked", - "Application Interface Becomes Non-Functional Or Program Exits Abnormally", - "Data Problem", - "Inaccurate Delivery", - "Visual Prompts will not Clear", - "Product Quality Problem", - "Failure to Cut", - "Material Protrusion/Extrusion", - "Inflation Problem", - "Other (for use when an appropriate device code cannot be identified)", - "Material Fragmentation", - "Scratched Material", - "Device-Device Incompatibility", - "Suction Problem", - "Human-Device Interface Problem", - "Unexpected Therapeutic Results", - "Improper Flow or Infusion", - "Ambient Noise Problem", - "Deflation 
Problem", - "Protective Measures Problem", - "Device Markings/Labelling Problem", - "Fail-Safe Problem", - "Partial Blockage", - "Degraded", - "Unexpected Shutdown", - "Unintended System Motion", - "Misfire", - "Burst Container or Vessel", - "Mechanics Altered", - "Unintended Collision", - "Packaging Problem", - "Fitting Problem", - "Display Difficult to Read", - "Device Operational Issue", - "Failure to Align", - "Torn Material", - "Electromagnetic Interference", - "No Flow", - "Thermal Decomposition of Device", - "Calibration Problem", - "Failure to Calibrate", - "Volume Accuracy Problem", - "Difficult To Position", - "Dull, Blunt", - "Peeled/Delaminated", - "Material Split, Cut or Torn", - "Hole In Material", - "False Negative Result", - "Deformation Due to Compressive Stress", - "False Reading From Device Non-Compliance", - "Physical Resistance", - "High Readings", - "Device Damaged Prior to Use", - "Therapy Delivered to Incorrect Body Area", - "Difficult to Advance", - "Energy Output Problem", - "Reset Problem", - "Pacemaker Found in Back-Up Mode", - "Material Puncture/Hole", - "Material Discolored", - "Premature Activation", - "Microbial Contamination of Device", - "Premature Elective Replacement Indicator", - "Pumping Problem", - "Intermittent Continuity", - "Decreased Sensitivity", - "Date/Time-Related Software Problem", - "Separation Failure", - "Biocompatibility", - "Gas Leak", - "Application Program Freezes, Becomes Nonfunctional", - "Material Frayed", - "Smoking", - "Device Damaged by Another Device", - "Device Emits Odor", - "Defibrillation/Stimulation Problem", - "Off-Label Use", - "Difficult or Delayed Positioning", - "Loss of Data", - "Misassembled", - "Pressure Problem", - "Perivalvular Leak", - "Priming Problem", - "Device Issue", - "Aspiration Issue", - "Device Appears to Trigger Rejection", - "Unintended Application Program Shut Down", - "Output below Specifications", - "Use of Incorrect Control Settings", - "Melted", - "Disassembly", - "Loosening of Implant Not Related to Bone-Ingrowth", - "Non Reproducible Results", - "Erratic or Intermittent Display", - "Activation Failure", - "Intermittent Capture", - "Electromagnetic Compatibility Problem", - "Vibration", - "Delivered as Unsterile Product", - "Contamination /Decontamination Problem", - "Low Readings", - "Incomplete Coaptation", - "Delayed Charge Time", - "Stretched", - "Material Disintegration", - "Device Or Device Fragments Location Unknown", - "Device Handling Problem", - "Complete Blockage", - "Device Reprocessing Problem", - "Inadequate User Interface", - "Unstable Capture Threshold", - "Fire", - "Calcified", - "Tear, Rip or Hole in Device Packaging", - "Unsealed Device Packaging", - "Sparking", - "Device Packaging Compromised", - "Pocket Stimulation", - "Extrusion", - "Collapse", - "Gradient Increase", - "Short Fill", - "Manufacturing, Packaging or Shipping Problem", - "Cut In Material", - "Electrical Shorting", - "Improper Device Output", - "Programming Issue", - "Expulsion", - "Unraveled Material", - "Out-Of-Box Failure", - "Device Contaminated During Manufacture or Shipping", - "Device Remains Activated", - "Inaccurate Flow Rate", - "Misconnection", - "Failure to Shut Off", - "High Sensing Threshold", - "Split", - "Complete Loss of Power", - "Decrease in Pressure", - "Application Program Problem", - "Device Fell", - "Decrease in Suction", - "Restricted Flow rate", - "Failure to Infuse", - "Device Contamination with Body Fluid", - "Failure To Run On AC/DC", - "Device Tipped Over", - "Patient Data Problem", - 
"Failure to Recalibrate", - "Grounding Malfunction", - "Reflux within Device", - "Excessive Heating", - "Firing Problem", - "Failure to Deliver Shock/Stimulation", - "Failure To Service", - "No Audible Prompt/Feedback", - "Flaked", - "Failure to Discharge", - "Device Ingredient or Reagent Problem", - "Unintended Head Motion", - "Device Difficult to Setup or Prepare", - "Optical Problem", - "Unknown (for use when the device problem is not known)", - "Low Sensing Threshold", - "Failure to Convert Rhythm", - "Energy Output To Patient Tissue Incorrect", - "Unintended Ejection", - "Increase in Pressure", - "Failure to Obtain Sample", - "Calibration Error", - "Operating System Becomes Nonfunctional", - "Material Perforation", - "Failure to Disconnect", - "Device Expiration Issue", - "Intermittent Loss of Power", - "Environmental Compatibility Problem", - "Missing Test Results", - "Activation Problem", - "Premature Separation", - "Free or Unrestricted Flow", - "Electronic Property Issue", - "Insufficient Heating", - "Particulates", - "Intermittent Infusion", - "Suction Failure", - "No Pacing", - "Compatibility Problem", - "Arcing", - "Inappropriate Audible Prompt/Feedback", - "Insufficient Cooling", - "Coagulation in Device or Device Ingredient", - "Fail-Safe Did Not Operate", - "Telemetry Discrepancy", - "Device Maintenance Issue", - "Physical Property Issue", - "Unintended Power Up", - "Delamination", - "Component Misassembled", - "Gas Output Problem", - "Material Too Rigid or Stiff", - "Decreased Pump Speed", - "Backflow", - "Device Contamination With Biological Material", - "Low Audible Alarm", - "Audible Prompt/Feedback Problem", - "Separation Problem", - "Incorrect Interpretation of Signal", - "Needle, separation", - "Electrical Power Problem", - "Sensing Intermittently", - "Component or Accessory Incompatibility", - "Shelf Life Exceeded", - "Failure to Auto Stop", - "Failure to Pump", - "Contamination of Device Ingredient or Reagent", - "Failure to Analyze Signal", - "Material Opacification", - "Shipping Damage or Problem", - "Failure to Clean Adequately", - "No Pressure", - "Labelling, Instructions for Use or Training Problem", - "Device Misassembled During Manufacturing /Shipping", - "Explanted", - "Chemical Spillage", - "Intermittent Communication Failure", - "Chemical Problem", - "Operating System Version or Upgrade Problem", - "Tidal Volume Fluctuations", - "Gel Leak", - "Misfocusing", - "Power Conditioning Problem", - "Folded", - "Charred", - "Application Program Version or Upgrade Problem", - "Alarm Not Visible", - "Device Disinfection Or Sterilization Issue", - "Failure to Run on Battery", - "Difficult to Interrogate", - "Missing Value Reason", - "Failure to Back-Up", - "Installation-Related Problem", - "Environmental Particulates", - "Inability to Irrigate", - "Sharp Edges", - "Tip breakage", - "Battery Problem: High Impedance", - "Malfunction", - "Difficult to Fold, Unfold or Collapse", - "Failure to Select Signal", - "Reaction", - "Incorrect Software Programming Calculations", - "Incorrect Device Or Component Shipped", - "Shielding Failure", - "Material Distortion", - "Inaudible or Unclear Audible Prompt/Feedback", - "Dent in Material", - "Output above Specifications", - "Intermittent Energy Output", - "Loss of Threshold", - "Component Incompatible", - "Image Resolution Poor", - "Delivery System Failure", - "Accessory Incompatible", - "Incomplete or Missing Packaging", - "Implant, removal of", - "Improper Chemical Reaction", - "Inadequate or Insufficient Training", - "Illegible 
Information", - "Incomplete or Inadequate Connection", - "Difficult to Open or Remove Packaging Material", - "Application Program Problem: Dose Calculation Error", - "Missing Information", - "Failure to Reset", - "Maintenance Does Not Comply To Manufacturers Recommendations", - "Improper Alarm", - "Interrogation Problem", - "Inadequate Instructions for Healthcare Professional", - "Difficult to Flush", - "Device Abrasion From Instrument Or Another Object", - "Inability to Auto-Fill", - "Expiration Date Error", - "Inadequate Instructions for Non-Healthcare Professional", - "Failure to Unfold or Unwrap", - "Looping", - "Problem with Software Installation", - "Blocked Connection", - "Residue After Decontamination", - "Aborted Charge", - "Difficult or Delayed Separation", - "Failure to Zero", - "Flare or Flash", - "Filtration Problem", - "Pacing Inadequately", - "Inadequate Ultra Filtration", - "Component(s), broken", - "Misassembly by Users", - "Device remains implanted", - "Failure to Transmit Record", - "Issue With Displayed Error Message", - "Loose", - "Failure to Eject", - "Human Factors Issue", - "Data Back-Up Problem", - "Flushing Problem", - "Inaccurate Synchronization", - "Unintended Electrical Shock", - "Unintended Arm Motion", - "Electro-Static Discharge", - "Failure to Conduct", - "Pacing Intermittently", - "Inaccurate Information", - "Knotted", - "Application Program Problem: Medication Error", - "Failure to Fold", - "Device, or device fragments remain in patient", - "Fail-Safe Design Failure", - "Solder Joint Fracture", - "Ejection Problem", - "Ventilation Problem in Device Environment", - "Air/Gas in Device", - "Image Orientation Incorrect", - "Inadequate Lighting", - "Tears, rips, holes in device, device material", - "Problem with Sterilization", - "Radiation Overexposure", - "Optical Distortion", - "Application Program Problem: Parameter Calculation Error", - "Material Invagination", - "Pitted", - "Coiled", - "Inadequate Filtration Process", - "Lens, vaulting", - "Increase in Suction", - "Key or Button Unresponsive/not Working", - "Alarm, audible", - "Radiofrequency Interference (RFI)", - "Arcing of Electrodes", - "Sediment, Precipitate Or Deposit In Device Or Device Ingredient", - "Material Too Soft/Flexible", - "Inaccurate Dispensing", - "Increased Sensitivity", - "Battery Impedance Issue", - "False Device Output", - "Precipitate in Device or Device Ingredient", - "Explosion", - "Material Torqued", - "Pacing Asynchronously", - "Excessive Cooling", - "Mushroomed", - "Uncoiled", - "Overcorrection", - "Decoupling", - "Lens (IOL), torn, split, cracked", - "Device Difficult to Maintain", - "Increased Pump Speed", - "Fumes or Vapors", - "Inadequate Storage", - "Fungus in Device Environment", - "Measurement System Incompatibility", - "Implant extrusion", - "Medical Gas Supply Problem", - "Battery Problem: Low Impedance", - "Inappropriate Waveform", - "Balloon leak(s)", - "Optical Discoloration", - "Intermittent Shock/Stimulation", - "Wrinkled", - "Continuous Firing", - "Fogging", - "Ambient Temperature Problem", - "Balloon rupture", - "No Fail-Safe Mechanism", - "Image Reversal", - "Normal", - "Disposable", - "Delayed Alarm", - "Lack of Maintenance Documentation or Guidelines", - "Radio Signal Problem", - "Implant Mobility NOS (Not otherwise specified)", - "Program, failure to", - "Optical Obstruction", - "Device Unsafe to Use in Environment", - "Biofilm coating in Device", - "Wire(s), breakage of", - "Inadequate Service", - "Unclear Information", - "Misassembled During 
Installation", - "Program or Algorithm Execution Problem", - "Failure to Deflate", - "Failure to Disinfect", - "Application Network Problem", - "Dome collapse", - "Optical Decentration", - "Unauthorized Access to Computer System", - "Absorption", - "Clumping in Device or Device Ingredient", - "Electrical Overstress", - "Radiation Output Failure", - "Computer System Security Problem", - "No Visual Prompts/Feedback", - "Valve(s), failure of", - "Misassembly During Maintenance/Repair", - "Dislocated", - "Light Interference", - "No Tactile Prompts/Feedback", - "Balloon burst", - "Changes In Ambient Temperature In Device Environment", - "Plunge", - "Program or Algorithm Execution Failure", - "Device Rinsing Issue", - "Hot Oil Leak", - "Radiation Leak", - "Unintended Deflation", - "Motor failure", - "Tactile Prompts/Feedback", - "Battery charger, defective", - "Application Program Problem: Power Calculation Error", - "Biological Environmental Factor", - "Component(s), worn", - "Foreign material", - "Radiation Output Problem", - "Energy Spectrum Incorrect", - "Deflation, cause unknown", - "Unexpected/Unintended Radiation Output", - "Measurements, inaccurate", - "Premature Indicator Activation", - "Radiation Underexposure", - "Failure to Convert to Back-Up", - "Locking mechanism failure", - "Failure To Unwrap", - "Haptic(s), broken", - "High pH", - "Hose line occlusion", - "Reservoir", - "Cross Reactivity", - "Disengaged", - "Arcing at Paddles", - "Erratic Results", - "Unable to confirm conditions of use", - "Clips, scissored", - "Alarm, intermittent", - "Dissection", - "Device, removal of (non-implant)", - "Inappropriate Tactile Prompt/Feedback", - "Facilities Issue", - "Incorrect Error Code", - "Lens Damaged in Delivery System", - "Seal, defective", - "Application Security Problem", - "Dislodged", - "Safety interlock(s) inadequate", - "Mislabeled", - "Noise", - "System fails to activate", - "Tube(s), splitting of", - "Overdelivery", - "Inadequate Lubrication", - "Unintended Magnet Quench", - "Problem with Removal of Enzymatic Cleaner", - "Sharp/jagged/rough/etched/scratched", - "Needle, unsheathed", - "Pressure, insufficient", - "Compatibility", - "Device Contaminated at the User Facility", - "Performance", - "Connection error", - "Emergency Power Failure", - "Implant breakage or physical damage", - "Membrane leak(s)", - "Pressure sensor failure", - "Bolus mechanism failure", - "Interference", - "Mineralization", - "Rupture, cause unknown", - "Escape", - "Fuse, blown", - "Pre Or Post-Pumping Problem", - "Source, detachment from", - "Thickening of Material", - "Capacitative Coupling", - "Unexpected Color", - "Cardiac enzyme evaluation, erroneous", - "Delayed Program or Algorithm Execution", - "Design/structure problem", - "Haptic(s), bent", - "Tube(s), buckling of", - "Incomplete or Inadequate Priming", - "Internal fixation, revision of", - "Lens Stuck in Delivery System", - "Undercorrection", - "Replace", - "Sterility", - "Blank screen", - "Close, difficult to", - "Component(s), overheating of", - "Deterioration of prosthesis", - "Displacement", - "Electrode(s), fracture of", - "Filter break(s)", - "Lead(s), fracture of", - "Misplacement", - "Poor gas exchange", - "Alarm, no lead", - "Spring loading mechanism problem", - "Unintended Compatibility", - "Alarm, failure of warning", - "Cautery", - "Electro-magnetic interference (EMI), compatibility/incompatibility", - "Filter", - "Filter leak(s)", - "Haptic Damaged in Delivery System", - "Intended use for traditional use", - "Intraprocedure, fire or 
flash during", - "Lens, disc", - "Lens, repositioning of", - "Occlusion, incorrect" - ]) + assert_count_top( + query="/device/event.json?count=product_problems.exact&limit=1000", + N=1000, + expected_terms=[ + "Adverse Event Without Identified Device or Use Problem", + "Device Displays Incorrect Message", + "Break", + "Device Operates Differently Than Expected", + "Insufficient Information", + "Failure to Osseointegrate", + "Wireless Communication Problem", + "Appropriate Term/Code Not Available", + "Loss of Osseointegration", + "No Device Output", + "Patient Device Interaction Problem", + "Incorrect, Inadequate or Imprecise Resultor Readings", + "Fluid Leak", + "Device Dislodged or Dislocated", + "Over-Sensing", + "No Display/Image", + "Mechanical Problem", + "Imprecision", + "Leak/Splash", + "High impedance", + "Fracture", + "Crack", + "Battery Problem", + "Device Inoperable", + "Failure to Power Up", + "Osseointegration Problem", + "Device Alarm System", + "Use of Device Problem", + "Material Rupture", + "Pumping Stopped", + "Detachment Of Device Component", + "No Apparent Adverse Event", + "Premature Discharge of Battery", + "Power Problem", + "Detachment of Device or Device Component", + "Improper or Incorrect Procedure or Method", + "Communication or Transmission Problem", + "Incorrect Or Inadequate Test Results", + "Migration or Expulsion of Device", + "Connection Problem", + "Defective Device", + "Charging Problem", + "Obstruction of Flow", + "High Test Results", + "Malposition of Device", + "High Capture Threshold", + "Inappropriate/Inadequate Shock/Stimulation", + "Nonstandard Device", + "Failure to Capture", + "Device Sensing Problem", + "Material Integrity Problem", + "Bent", + "Difficult to Remove", + "Device Slipped", + "Incorrect Measurement", + "Device Difficult to Program or Calibrate", + "Signal Artifact/Noise", + "Display or Visual Feedback Problem", + "Patient-Device Incompatibility", + "Device Contamination with Chemical or Other Material", + "Material Deformation", + "Electrical /Electronic Property Problem", + "Overheating of Device", + "Material Separation", + "Loss of Power", + "Naturally Worn", + "Occlusion Within Device", + "Under-Sensing", + "Material Twisted/Bent", + "Sticking", + "Computer Software Problem", + "Component Missing", + "Loose or Intermittent Connection", + "Unintended Movement", + "Material Erosion", + "Failure to Deliver", + "Failure to Fire", + "Invalid Sensing", + "Failure to Sense", + "Noise, Audible", + "Low impedance", + "Poor Quality Image", + "Loss of or Failure to Bond", + "Low Test Results", + "Corroded", + "Impedance Problem", + "Positioning Problem", + "Structural Problem", + "Air Leak", + "Failure to Advance", + "Premature End-of-Life Indicator", + "Overfill", + "Moisture or Humidity Problem", + "No Audible Alarm", + "Pacing Problem", + "Migration", + "Physical Resistance/Sticking", + "Activation, Positioning or SeparationProblem", + "Inappropriate or Unexpected Reset", + "Retraction Problem", + "Mechanical Jam", + "Filling Problem", + "Positioning Failure", + "Insufficient Flow or Under Infusion", + "Device Stops Intermittently", + "Output Problem", + "Circuit Failure", + "Failure to Form Staple", + "Difficult to Insert", + "Failure to Deliver Energy", + "Temperature Problem", + "Capturing Problem", + "Image Display Error/Artifact", + "Moisture Damage", + "Disconnection", + "Failure to Charge", + "Difficult to Open or Close", + "Unable to Obtain Readings", + "Excess Flow or Over-Infusion", + "Therapeutic or Diagnostic Output 
Failure", + "Failure to Prime", + "Failure of Device to Self-Test", + "False Alarm", + "Contamination", + "Self-Activation or Keying", + "Computer Operating System Problem", + "Infusion or Flow Problem", + "Unstable", + "Component Falling", + "Defective Component", + "Defective Alarm", + "Inadequacy of Device Shape and/or Size", + "Failure To Adhere Or Bond", + "Metal Shedding Debris", + "Failure to Interrogate", + "Failure to Read Input Signal", + "Low Battery", + "Entrapment of Device", + "Failure to Cycle", + "Difficult or Delayed Activation", + "False Positive Result", + "Kinked", + "Application Interface Becomes Non-Functional Or Program Exits Abnormally", + "Data Problem", + "Inaccurate Delivery", + "Visual Prompts will not Clear", + "Product Quality Problem", + "Failure to Cut", + "Material Protrusion/Extrusion", + "Inflation Problem", + "Other (for use when an appropriate device code cannot be identified)", + "Material Fragmentation", + "Scratched Material", + "Device-Device Incompatibility", + "Suction Problem", + "Human-Device Interface Problem", + "Unexpected Therapeutic Results", + "Improper Flow or Infusion", + "Ambient Noise Problem", + "Deflation Problem", + "Protective Measures Problem", + "Device Markings/Labelling Problem", + "Fail-Safe Problem", + "Partial Blockage", + "Degraded", + "Unexpected Shutdown", + "Unintended System Motion", + "Misfire", + "Burst Container or Vessel", + "Mechanics Altered", + "Unintended Collision", + "Packaging Problem", + "Fitting Problem", + "Display Difficult to Read", + "Device Operational Issue", + "Failure to Align", + "Torn Material", + "Electromagnetic Interference", + "No Flow", + "Thermal Decomposition of Device", + "Calibration Problem", + "Failure to Calibrate", + "Volume Accuracy Problem", + "Difficult To Position", + "Dull, Blunt", + "Peeled/Delaminated", + "Material Split, Cut or Torn", + "Hole In Material", + "False Negative Result", + "Deformation Due to Compressive Stress", + "False Reading From Device Non-Compliance", + "Physical Resistance", + "High Readings", + "Device Damaged Prior to Use", + "Therapy Delivered to Incorrect Body Area", + "Difficult to Advance", + "Energy Output Problem", + "Reset Problem", + "Pacemaker Found in Back-Up Mode", + "Material Puncture/Hole", + "Material Discolored", + "Premature Activation", + "Microbial Contamination of Device", + "Premature Elective Replacement Indicator", + "Pumping Problem", + "Intermittent Continuity", + "Decreased Sensitivity", + "Date/Time-Related Software Problem", + "Separation Failure", + "Biocompatibility", + "Gas Leak", + "Application Program Freezes, Becomes Nonfunctional", + "Material Frayed", + "Smoking", + "Device Damaged by Another Device", + "Device Emits Odor", + "Defibrillation/Stimulation Problem", + "Off-Label Use", + "Difficult or Delayed Positioning", + "Loss of Data", + "Misassembled", + "Pressure Problem", + "Perivalvular Leak", + "Priming Problem", + "Device Issue", + "Aspiration Issue", + "Device Appears to Trigger Rejection", + "Unintended Application Program Shut Down", + "Output below Specifications", + "Use of Incorrect Control Settings", + "Melted", + "Disassembly", + "Loosening of Implant Not Related to Bone-Ingrowth", + "Non Reproducible Results", + "Erratic or Intermittent Display", + "Activation Failure", + "Intermittent Capture", + "Electromagnetic Compatibility Problem", + "Vibration", + "Delivered as Unsterile Product", + "Contamination /Decontamination Problem", + "Low Readings", + "Incomplete Coaptation", + "Delayed Charge Time", + 
"Stretched", + "Material Disintegration", + "Device Or Device Fragments Location Unknown", + "Device Handling Problem", + "Complete Blockage", + "Device Reprocessing Problem", + "Inadequate User Interface", + "Unstable Capture Threshold", + "Fire", + "Calcified", + "Tear, Rip or Hole in Device Packaging", + "Unsealed Device Packaging", + "Sparking", + "Device Packaging Compromised", + "Pocket Stimulation", + "Extrusion", + "Collapse", + "Gradient Increase", + "Short Fill", + "Manufacturing, Packaging or Shipping Problem", + "Cut In Material", + "Electrical Shorting", + "Improper Device Output", + "Programming Issue", + "Expulsion", + "Unraveled Material", + "Out-Of-Box Failure", + "Device Contaminated During Manufacture or Shipping", + "Device Remains Activated", + "Inaccurate Flow Rate", + "Misconnection", + "Failure to Shut Off", + "High Sensing Threshold", + "Split", + "Complete Loss of Power", + "Decrease in Pressure", + "Application Program Problem", + "Device Fell", + "Decrease in Suction", + "Restricted Flow rate", + "Failure to Infuse", + "Device Contamination with Body Fluid", + "Failure To Run On AC/DC", + "Device Tipped Over", + "Patient Data Problem", + "Failure to Recalibrate", + "Grounding Malfunction", + "Reflux within Device", + "Excessive Heating", + "Firing Problem", + "Failure to Deliver Shock/Stimulation", + "Failure To Service", + "No Audible Prompt/Feedback", + "Flaked", + "Failure to Discharge", + "Device Ingredient or Reagent Problem", + "Unintended Head Motion", + "Device Difficult to Setup or Prepare", + "Optical Problem", + "Unknown (for use when the device problem is not known)", + "Low Sensing Threshold", + "Failure to Convert Rhythm", + "Energy Output To Patient Tissue Incorrect", + "Unintended Ejection", + "Increase in Pressure", + "Failure to Obtain Sample", + "Calibration Error", + "Operating System Becomes Nonfunctional", + "Material Perforation", + "Failure to Disconnect", + "Device Expiration Issue", + "Intermittent Loss of Power", + "Environmental Compatibility Problem", + "Missing Test Results", + "Activation Problem", + "Premature Separation", + "Free or Unrestricted Flow", + "Electronic Property Issue", + "Insufficient Heating", + "Particulates", + "Intermittent Infusion", + "Suction Failure", + "No Pacing", + "Compatibility Problem", + "Arcing", + "Inappropriate Audible Prompt/Feedback", + "Insufficient Cooling", + "Coagulation in Device or Device Ingredient", + "Fail-Safe Did Not Operate", + "Telemetry Discrepancy", + "Device Maintenance Issue", + "Physical Property Issue", + "Unintended Power Up", + "Delamination", + "Component Misassembled", + "Gas Output Problem", + "Material Too Rigid or Stiff", + "Decreased Pump Speed", + "Backflow", + "Device Contamination With Biological Material", + "Low Audible Alarm", + "Audible Prompt/Feedback Problem", + "Separation Problem", + "Incorrect Interpretation of Signal", + "Needle, separation", + "Electrical Power Problem", + "Sensing Intermittently", + "Component or Accessory Incompatibility", + "Shelf Life Exceeded", + "Failure to Auto Stop", + "Failure to Pump", + "Contamination of Device Ingredient or Reagent", + "Failure to Analyze Signal", + "Material Opacification", + "Shipping Damage or Problem", + "Failure to Clean Adequately", + "No Pressure", + "Labelling, Instructions for Use or Training Problem", + "Device Misassembled During Manufacturing /Shipping", + "Explanted", + "Chemical Spillage", + "Intermittent Communication Failure", + "Chemical Problem", + "Operating System Version or Upgrade 
Problem", + "Tidal Volume Fluctuations", + "Gel Leak", + "Misfocusing", + "Power Conditioning Problem", + "Folded", + "Charred", + "Application Program Version or Upgrade Problem", + "Alarm Not Visible", + "Device Disinfection Or Sterilization Issue", + "Failure to Run on Battery", + "Difficult to Interrogate", + "Missing Value Reason", + "Failure to Back-Up", + "Installation-Related Problem", + "Environmental Particulates", + "Inability to Irrigate", + "Sharp Edges", + "Tip breakage", + "Battery Problem: High Impedance", + "Malfunction", + "Difficult to Fold, Unfold or Collapse", + "Failure to Select Signal", + "Reaction", + "Incorrect Software Programming Calculations", + "Incorrect Device Or Component Shipped", + "Shielding Failure", + "Material Distortion", + "Inaudible or Unclear Audible Prompt/Feedback", + "Dent in Material", + "Output above Specifications", + "Intermittent Energy Output", + "Loss of Threshold", + "Component Incompatible", + "Image Resolution Poor", + "Delivery System Failure", + "Accessory Incompatible", + "Incomplete or Missing Packaging", + "Implant, removal of", + "Improper Chemical Reaction", + "Inadequate or Insufficient Training", + "Illegible Information", + "Incomplete or Inadequate Connection", + "Difficult to Open or Remove Packaging Material", + "Application Program Problem: Dose Calculation Error", + "Missing Information", + "Failure to Reset", + "Maintenance Does Not Comply To Manufacturers Recommendations", + "Improper Alarm", + "Interrogation Problem", + "Inadequate Instructions for Healthcare Professional", + "Difficult to Flush", + "Device Abrasion From Instrument Or Another Object", + "Inability to Auto-Fill", + "Expiration Date Error", + "Inadequate Instructions for Non-Healthcare Professional", + "Failure to Unfold or Unwrap", + "Looping", + "Problem with Software Installation", + "Blocked Connection", + "Residue After Decontamination", + "Aborted Charge", + "Difficult or Delayed Separation", + "Failure to Zero", + "Flare or Flash", + "Filtration Problem", + "Pacing Inadequately", + "Inadequate Ultra Filtration", + "Component(s), broken", + "Misassembly by Users", + "Device remains implanted", + "Failure to Transmit Record", + "Issue With Displayed Error Message", + "Loose", + "Failure to Eject", + "Human Factors Issue", + "Data Back-Up Problem", + "Flushing Problem", + "Inaccurate Synchronization", + "Unintended Electrical Shock", + "Unintended Arm Motion", + "Electro-Static Discharge", + "Failure to Conduct", + "Pacing Intermittently", + "Inaccurate Information", + "Knotted", + "Application Program Problem: Medication Error", + "Failure to Fold", + "Device, or device fragments remain in patient", + "Fail-Safe Design Failure", + "Solder Joint Fracture", + "Ejection Problem", + "Ventilation Problem in Device Environment", + "Air/Gas in Device", + "Image Orientation Incorrect", + "Inadequate Lighting", + "Tears, rips, holes in device, device material", + "Problem with Sterilization", + "Radiation Overexposure", + "Optical Distortion", + "Application Program Problem: Parameter Calculation Error", + "Material Invagination", + "Pitted", + "Coiled", + "Inadequate Filtration Process", + "Lens, vaulting", + "Increase in Suction", + "Key or Button Unresponsive/not Working", + "Alarm, audible", + "Radiofrequency Interference (RFI)", + "Arcing of Electrodes", + "Sediment, Precipitate Or Deposit In Device Or Device Ingredient", + "Material Too Soft/Flexible", + "Inaccurate Dispensing", + "Increased Sensitivity", + "Battery Impedance Issue", + "False Device 
Output", + "Precipitate in Device or Device Ingredient", + "Explosion", + "Material Torqued", + "Pacing Asynchronously", + "Excessive Cooling", + "Mushroomed", + "Uncoiled", + "Overcorrection", + "Decoupling", + "Lens (IOL), torn, split, cracked", + "Device Difficult to Maintain", + "Increased Pump Speed", + "Fumes or Vapors", + "Inadequate Storage", + "Fungus in Device Environment", + "Measurement System Incompatibility", + "Implant extrusion", + "Medical Gas Supply Problem", + "Battery Problem: Low Impedance", + "Inappropriate Waveform", + "Balloon leak(s)", + "Optical Discoloration", + "Intermittent Shock/Stimulation", + "Wrinkled", + "Continuous Firing", + "Fogging", + "Ambient Temperature Problem", + "Balloon rupture", + "No Fail-Safe Mechanism", + "Image Reversal", + "Normal", + "Disposable", + "Delayed Alarm", + "Lack of Maintenance Documentation or Guidelines", + "Radio Signal Problem", + "Implant Mobility NOS (Not otherwise specified)", + "Program, failure to", + "Optical Obstruction", + "Device Unsafe to Use in Environment", + "Biofilm coating in Device", + "Wire(s), breakage of", + "Inadequate Service", + "Unclear Information", + "Misassembled During Installation", + "Program or Algorithm Execution Problem", + "Failure to Deflate", + "Failure to Disinfect", + "Application Network Problem", + "Dome collapse", + "Optical Decentration", + "Unauthorized Access to Computer System", + "Absorption", + "Clumping in Device or Device Ingredient", + "Electrical Overstress", + "Radiation Output Failure", + "Computer System Security Problem", + "No Visual Prompts/Feedback", + "Valve(s), failure of", + "Misassembly During Maintenance/Repair", + "Dislocated", + "Light Interference", + "No Tactile Prompts/Feedback", + "Balloon burst", + "Changes In Ambient Temperature In Device Environment", + "Plunge", + "Program or Algorithm Execution Failure", + "Device Rinsing Issue", + "Hot Oil Leak", + "Radiation Leak", + "Unintended Deflation", + "Motor failure", + "Tactile Prompts/Feedback", + "Battery charger, defective", + "Application Program Problem: Power Calculation Error", + "Biological Environmental Factor", + "Component(s), worn", + "Foreign material", + "Radiation Output Problem", + "Energy Spectrum Incorrect", + "Deflation, cause unknown", + "Unexpected/Unintended Radiation Output", + "Measurements, inaccurate", + "Premature Indicator Activation", + "Radiation Underexposure", + "Failure to Convert to Back-Up", + "Locking mechanism failure", + "Failure To Unwrap", + "Haptic(s), broken", + "High pH", + "Hose line occlusion", + "Reservoir", + "Cross Reactivity", + "Disengaged", + "Arcing at Paddles", + "Erratic Results", + "Unable to confirm conditions of use", + "Clips, scissored", + "Alarm, intermittent", + "Dissection", + "Device, removal of (non-implant)", + "Inappropriate Tactile Prompt/Feedback", + "Facilities Issue", + "Incorrect Error Code", + "Lens Damaged in Delivery System", + "Seal, defective", + "Application Security Problem", + "Dislodged", + "Safety interlock(s) inadequate", + "Mislabeled", + "Noise", + "System fails to activate", + "Tube(s), splitting of", + "Overdelivery", + "Inadequate Lubrication", + "Unintended Magnet Quench", + "Problem with Removal of Enzymatic Cleaner", + "Sharp/jagged/rough/etched/scratched", + "Needle, unsheathed", + "Pressure, insufficient", + "Compatibility", + "Device Contaminated at the User Facility", + "Performance", + "Connection error", + "Emergency Power Failure", + "Implant breakage or physical damage", + "Membrane leak(s)", + "Pressure sensor 
failure", + "Bolus mechanism failure", + "Interference", + "Mineralization", + "Rupture, cause unknown", + "Escape", + "Fuse, blown", + "Pre Or Post-Pumping Problem", + "Source, detachment from", + "Thickening of Material", + "Capacitative Coupling", + "Unexpected Color", + "Cardiac enzyme evaluation, erroneous", + "Delayed Program or Algorithm Execution", + "Design/structure problem", + "Haptic(s), bent", + "Tube(s), buckling of", + "Incomplete or Inadequate Priming", + "Internal fixation, revision of", + "Lens Stuck in Delivery System", + "Undercorrection", + "Replace", + "Sterility", + "Blank screen", + "Close, difficult to", + "Component(s), overheating of", + "Deterioration of prosthesis", + "Displacement", + "Electrode(s), fracture of", + "Filter break(s)", + "Lead(s), fracture of", + "Misplacement", + "Poor gas exchange", + "Alarm, no lead", + "Spring loading mechanism problem", + "Unintended Compatibility", + "Alarm, failure of warning", + "Cautery", + "Electro-magnetic interference (EMI), compatibility/incompatibility", + "Filter", + "Filter leak(s)", + "Haptic Damaged in Delivery System", + "Intended use for traditional use", + "Intraprocedure, fire or flash during", + "Lens, disc", + "Lens, repositioning of", + "Occlusion, incorrect", + ], + ) def test_exact_patient_problems_count(): - assert_count_top(query='/device/event.json?count=patient.patient_problems.exact&limit=1000', - N=1000, expected_terms= - ["No Known Impact Or Consequence To Patient", - "No Consequences Or Impact To Patient", - "No Clinical Signs, Symptoms or Conditions", - "No Patient Involvement", - "Pain", - "Failure of Implant", - "Hyperglycemia", - "No Code Available", - "Unspecified Infection", - "No Information", - "Tissue Damage", - "Injury", - "Hypoglycemia", - "Death", - "Insufficient Information", - "Inadequate Pain Relief", - "Discomfort", - "Hemorrhage/Bleeding", - "Inflammation", - "Erosion", - "Peritonitis", - "Fibrosis", - "Therapeutic Effects, Unexpected", - "Therapeutic Response, Decreased", - "Swelling", - "Complaint, Ill-Defined", - "Diabetic Ketoacidosis", - "Foreign Body In Patient", - "Capsular Contracture", - "Surgical procedure", - "Blood Loss", - "Device Embedded In Tissue or Plaque", - "Abdominal Pain", - "Disability", - "Foreign Body Reaction", - "Shock from Patient Lead(s)", - "Fall", - "Not Applicable", - "Nausea", - "Inadequate Osseointegration", - "Vomiting", - "Bone Fracture(s)", - "Reaction", - "Numbness", - "Headache", - "Osteolysis", - "Burning Sensation", - "Dizziness", - "Bacterial Infection", - "Joint Dislocation", - "Incontinence", - "Ambulation Difficulties", - "Adhesion(s)", - "Fatigue", - "Hernia", - "Other (for use when an appropriate patient code cannot be identified)", - "Host-Tissue Reaction", - "Hypersensitivity/Allergic reaction", - "Skin Irritation", - "Hematoma", - "Appropriate Clinical Signs, Symptoms, Conditions Term / Code Not Available", - "Cardiac Arrest", - "Electric Shock", - "Sepsis", - "Myocardial Infarction", - "Thrombosis", - "Dyspnea", - "Loss of consciousness", - "Abscess", - "Test Result", - "Fistula", - "Undesired Nerve Stimulation", - "Low Blood Pressure/ Hypotension", - "Perforation", - "Loss of Range of Motion", - "Post Operative Wound Infection", - "Thrombus", - "Scar Tissue", - "Blurred Vision", - "Chest Pain", - "Syncope", - "Erythema", - "Limited Mobility Of The Implanted Joint", - "Stroke/CVA", - "Occlusion", - "Rash", - "Toxicity", - "Perforation of Vessels", - "Seizures", - "Weakness", - "Shaking/Tremors", - "Wound Dehiscence", - "Cardiac 
Perforation", - "Anxiety", - "Fever", - "Non-union Bone Fracture", - "Impaired Healing", - "Seroma", - "Urinary Tract Infection", - "Swelling/ Edema", - "Patient Problem/Medical Problem", - "Itching Sensation", - "Pericardial Effusion", - "Heavier Menses", - "Stenosis", - "Edema", - "Sweating", - "Weight Changes", - "Device Overstimulation of Tissue", - "Neurological Deficit/Dysfunction", - "Cardiac Tamponade", - "Muscle Stimulation", - "Visual Impairment", - "Burn(s)", - "Urinary Frequency", - "Arrhythmia", - "Pocket Erosion", - "Deformity/ Disfigurement", - "Aortic Valve Stenosis", - "Necrosis", - "Emotional Changes", - "Obstruction/Occlusion", - "Overdose", - "Bradycardia", - "Urinary Retention", - "Muscular Rigidity", - "Uterine Perforation", - "Irritation", - "Intimal Dissection", - "Vascular Dissection", - "Confusion/ Disorientation", - "Depression", - "Aortic Regurgitation", - "Internal Organ Perforation", - "Aneurysm", - "Bleeding", - "Fluid Discharge", - "Nerve Damage", - "Unspecified Tissue Injury", - "Purulent Discharge", - "Endocarditis", - "Rupture", - "Reocclusion", - "Scarring", - "Osteopenia/ Osteoporosis", - "Malaise", - "Atrial Fibrillation", - "Menstrual Irregularities", - "Muscle Spasm(s)", - "Hemolysis", - "Laceration(s)", - "Visual Disturbances", - "Hair Loss", - "Polydipsia", - "Mitral Regurgitation", - "Increased Sensitivity", - "Hearing Impairment", - "Tingling", - "Sleep Dysfunction", - "Ventricular Tachycardia", - "Cyst(s)", - "Complete Heart Block", - "Bruise/Contusion", - "Staphylococcus Aureus", - "Arthralgia", - "Cognitive Changes", - "Underdose", - "Angina", - "Twiddlers Syndrome", - "Cramp(s)", - "Ventricular Fibrillation", - "Cerebrospinal Fluid Leakage", - "Keratitis", - "Abdominal Distention", - "Heart Failure", - "High Blood Pressure/ Hypertension", - "Pulmonary Embolism", - "Diarrhea", - "Tachycardia", - "Renal Failure", - "Failure to Anastomose", - "Burn, Thermal", - "Joint Swelling", - "Ischemia", - "Discharge", - "Non specific EKG/ECG Changes", - "Joint Disorder", - "Needle Stick/Puncture", - "Vessel Or Plaque, Device Embedded In", - "Granuloma", - "Synovitis", - "Dehydration", - "Capsular Bag Tear", - "Muscle Weakness", - "Pneumothorax", - "Paralysis", - "Skin Inflammation/ Irritation", - "Micturition Urgency", - "Low Oxygen Saturation", - "Infarction, Cerebral", - "Palpitations", - "Ossification", - "Calcium Deposits/Calcification", - "Halo", - "Respiratory Distress", - "Partial thickness (Second Degree) Burn", - "Respiratory Failure", - "Extubate", - "Pneumonia", - "Memory Loss/Impairment", - "Embolism", - "Anemia", - "Mitral Valve Stenosis", - "Lethargy", - "Distress", - "Surgical procedure, additional", - "Abdominal Cramps", - "Aortic Insufficiency", - "Intraocular Pressure Increased", - "Skin Discoloration", - "Thrombosis/Thrombus", - "Shock", - "Dysphagia/ Odynophagia", - "Abnormal Vaginal Discharge", - "Hemorrhage, Cerebral", - "Deafness", - "Neuropathy", - "Hematuria", - "Constipation", - "Chills", - "Fainting", - "Cellulitis", - "Local Reaction", - "Coma", - "Joint Laxity", - "Vascular System (Circulation), Impaired", - "Chemical Exposure", - "Neck Pain", - "Dry Eye(s)", - "Cardiopulmonary Arrest", - "Alteration In Body Temperature", - "Dysphasia", - "Head Injury", - "Vitrectomy", - "Abrasion", - "Tissue Breakdown", - "Radiation Exposure, Unintended", - "Fungal Infection", - "Red Eye(s)", - "Pseudoaneurysm", - "Prolapse", - "Sinus Perforation", - "Eye Injury", - "Autoimmune Reaction", - "Intracranial Hemorrhage", - "Foreign body, removal 
of", - "Skin Inflammation", - "Loss of Vision", - "Congestive Heart Failure", - "Skin Erosion", - "Surgical procedure, repeated", - "Pelvic Inflammatory Disease", - "Cancer", - "Cardiac Enzyme Elevation", - "Pregnancy", - "Arthritis", - "Pleural Effusion", - "Hip Fracture", - "Transient Ischemic Attack", - "Irritability", - "Corneal Ulcer", - "Postoperative refraction, unexpected", - "Sedation", - "Regurgitation", - "Pressure Sores", - "Dysuria", - "Heart Failure/Congestive Heart Failure", - "Bowel Perforation", - "Ulcer", - "Extrusion", - "Cardiogenic Shock", - "Multiple Organ Failure", - "Therapy/non-surgical treatment, additional", - "Implant Pain", - "Unknown (for use when the patient''s condition is not known)", - "Air Embolism", - "Unspecified Respiratory Problem", - "Iatrogenic Source", - "Treatment with medication(s)", - "Thromboembolism", - "Autoimmune Disorder", - "Corneal Edema", - "Hearing Loss", - "Exposure to Body Fluids", - "Hypoxia", - "Extravasation", - "Swollen Lymph Nodes", - "Skin Infection", - "Twitching", - "Paresis", - "Syncope/Fainting", - "Diaphoresis", - "Foreign Body Sensation in Eye", - "Urticaria", - "Metal Related Pathology", - "Septic Shock", - "Skin Tears", - "Hemorrhage, Subarachnoid", - "Tinnitus", - "Vertigo", - "Full thickness (Third Degree) Burn", - "Great Vessel Perforation", - "ST Segment Elevation", - "Endophthalmitis", - "Headache, Lumbar Puncture", - "Lymphoma", - "Hot Flashes/Flushes", - "Aortic Dissection", - "Exit Block", - "Claudication", - "Vasoconstriction", - "Coagulation Disorder", - "Hydrocephalus", - "Sudden Cardiac Death", - "Eye Burn", - "Spinal Column Injury", - "Superficial (First Degree) Burn", - "Excessive Tear Production", - "Aspiration/Inhalation", - "Chest Tightness/Pressure", - "Pulmonary Valve Stenosis", - "Apnea", - "Wrinkling", - "Embolus", - "Corneal Pannus", - "Brain Injury", - "Intermenstrual Bleeding", - "Meningitis", - "Breast Mass", - "Heart Block", - "Corneal Abrasion", - "Pulmonary Edema", - "Hypervolemia", - "Atrial Perforation", - "Thyroid Problems", - "Convulsion/Seizure", - "Collapse", - "Tooth Fracture", - "Ascites", - "Tricuspid Regurgitation", - "Flatus", - "Myalgia", - "No Patient involvement", - "Anaplastic Large Cell Lymphoma", - "Genital Bleeding", - "Aortic Valve Insufficiency/ Regurgitation", - "Urinary Incontinence", - "Ectopic Pregnancy", - "Right Ventricular Failure", - "Corneal Clouding/Hazing", - "Sensitivity of Teeth", - "Asthma", - "Choking", - "Hypoesthesia", - "Atrial Flutter", - "Pyrosis/Heartburn", - "Insufficiency, Valvular", - "Infiltration into Tissue", - "Loss Of Pulse", - "Gastrointestinal Hemorrhage", - "Physical Entrapment", - "Intraocular Pressure, Delayed, Uncontrolled", - "Cardiomyopathy", - "Ulceration", - "Hospitalization required", - "Respiratory Tract Infection", - "Conjunctivitis", - "Liver Damage/Dysfunction", - "Asystole", - "Misdiagnosis", - "Corneal Infiltrates", - "Pulmonary Regurgitation", - "Menorrhagia", - "Dyskinesia", - "Encephalopathy", - "Organ Dehiscence", - "Cataract", - "Caustic/Chemical Burns", - "Mitral Valve Insufficiency/ Regurgitation", - "Sore Throat", - "Miscarriage", - "Damage to Ligament(s)", - "Hyperplasia", - "Nonresorbable materials, unretrieved in body", - "Physical Asymmetry", - "Electrolyte Imbalance", - "Neck Stiffness", - "Hemoptysis", - "Unintended Extubation", - "Awareness during Anaesthesia", - "Hemothorax", - "Hemostasis", - "Ecchymosis", - "Spinal Cord Injury", - "Atrial Tachycardia", - "Regurgitation, Valvular", - "Reaction, Injection Site", 
- "Anaphylactic Shock", - "Airway Obstruction", - "Peeling", - "Missed Dose", - "Fracture, Arm", - "Contusion", - "Paraplegia", - "Uveitis", - "Fallopian Tube Perforation", - "Missing Value Reason", - "Convulsion, Clonic", - "Viral Infection", - "Cramp(s) /Muscle Spasm(s)", - "Low Cardiac Output", - "Pallor", - "Hyphema", - "Iritis", - "Exsanguination", - "Limb Fracture", - "Valvular Insufficiency/ Regurgitation", - "Facial Nerve Paralysis", - "Cataract, Induced", - "Ptosis", - "Vitreous Floaters", - "Blister", - "Swollen Lymph Nodes/Glands", - "Cyanosis", - "Rheumatoid Arthritis", - "Venipuncture", - "Hemorrhage, Subdural", - "Surgery, prolonged", - "Mitral Insufficiency", - "Macular Edema", - "Therapeutic Response, Increased", - "Colostomy", - "Tricuspid Valve Stenosis", - "Embolism/Embolus", - "Paresthesia", - "Chronic Obstructive Pulmonary Disease (COPD)", - "Hypovolemia", - "Diminished Pulse Pressure", - "Corneal Scar", - "Retinal Detachment", - "Oversedation", - "Perforation of Esophagus", - "Necrosis Of Flap Tissue", - "Bronchitis", - "Phlebitis", - "Muscle/Tendon Damage", - "Decreased Respiratory Rate", - "Unintended Radiation Exposure", - "Bowel Burn", - "Hypopyon", - "Ventilator Dependent", - "Ischemia Stroke", - "Transfusion of blood products", - "Hyperemia", - "Cusp Tear", - "Concussion", - "Difficulty Chewing", - "Glaucoma", - "Extreme Exhaustion", - "Gastritis", - "Virus", - "Dementia", - "Cough", - "Debris, Bone Shedding", - "Malunion of Bone", - "Restenosis", - "Hypothermia", - "Unspecified Nervous System Problem", - "Lupus", - "Feeding Problem", - "Decreased Appetite", - "Hemorrhagic Stroke", - "Premature Labor", - "Hypoventilation", - "Vitreous Loss", - "Hypovolemic Shock", - "Breast Cancer", - "Seizures, Grand-Mal", - "Laceration(s) of Esophagus", - "Atherosclerosis", - "Vaso-Vagal Response", - "Radiation Overdose", - "Allergic reaction", - "Right Ventricular Dysfunction", - "Crushing Injury", - "Nervous System Injury", - "Sprain", - "Jaundice", - "Overcorrection", - "Pulmonary Insufficiency", - "Abortion", - "Nerve Proximity Nos (Not Otherwise Specified)", - "Nicks, cuts or tears of dura or other tissues by device", - "Toxic Shock Syndrome", - "Intraocular Infection", - "Disc Impingement", - "Pupillary Block", - "Intraoperative Pain", - "Toxic Anterior Segment Syndrome (TASS)", - "Dyspareunia", - "Hepatitis", - "Electro-Mechanical Dissociation", - "Congenital Defect/Deformity", - "Lactate Dehydrogenase Increased", - "Wheal(s)", - "Renal Disease, End Stage", - "Eye Pain", - "Post Traumatic Wound Infection", - "Increased Respiratory Rate", - "Hiccups", - "Ischemic Heart Disease", - "Valvular Stenosis", - "Peroneal Nerve Palsy", - "Diplopia", - "Subcutaneous Nodule", - "Laparotomy", - "Death, Intrauterine Fetal", - "Gangrene", - "Drainage", - "Raynauds Phenomenon", - "Retinal Tear", - "Vaginal Mucosa Damage", - "Anoxia", - "Irregular Pulse", - "Abnormal Blood Gases", - "Foreign Body Embolism", - "Peripheral Vascular Disease", - "Pulmonary Emphysema", - "Zonular Dehiscence", - "Anaphylactoid", - "Respiratory Acidosis", - "Shock, Insulin", - "Herpes", - "Acanthameba Keratitis", - "Surgical procedure, delayed", - "Impotence", - "Pulmonary Dysfunction", - "Adult Respiratory Distress Syndrome", - "Decreased Sensitivity", - "Migration", - "Radiation Burn", - "Corneal Decompensation", - "Hemorrhage, Intraventricular", - "Phototoxicity", - "Presyncope", - "Retinal Injury", - "Fungus", - "Left Ventricular Failure", - "Skin Disorders", - "Left Ventricular Dysfunction", - "Hemolytic 
Anemia", - "Hyperthermia", - "Hyperventilation", - "Disseminated Intravascular Coagulation (DIC)", - "Flashers", - "Pregnancy with a Contraceptive Device", - "Sneezing", - "Suture Abrasion", - "Renal Impairment", - "Alteration in Body Temperature", - "Corneal Perforation", - "Tricuspid Valve Insufficiency/ Regurgitation", - "Electrocution", - "Vasodilatation", - "Torsades-de-Pointes", - "Skull Fracture", - "Arachnoiditis, Spinal", - "Unequal Limb Length", - "Immunodeficiency", - "Epistaxis", - "Melena", - "Skin Burning Sensation", - "Unspecified Vascular Problem", - "Unspecified Heart Problem", - "Hyperbilirubinemia", - "Localized Skin Lesion", - "Breast Implant Associated Anaplastic Large Cell Lymphoma (BIA ALCL)", - "Deposits", - "Sjogren''s Syndrome", - "Infection, Pyrogenic", - "Strangulation", - "Uremia", - "Lead(s), Burn(s) From", - "Drug Resistant Bacterial Infection", - "Bronchospasm", - "Vitreous Detachment", - "Painful stimulation", - "Respiratory Arrest", - "Bronchopneumonia", - "Connective Tissue Disease", - "Blister(s)", - "Convulsion, Tonic", - "Hypoesthesia, Foot/Leg", - "Cardiovascular Insufficiency", - "Unspecified Kidney or Urinary Problem", - "Fasciitis", - "Intervertebral Disc Compression or Protrusion", - "Vitritis", - "Breast Discomfort/Pain", - "Nodule", - "Unspecified Eye / Vision Problem", - "Intraocular Pressure Decreased", - "Tricuspid Insufficiency", - "Neovascularization", - "Increased Intra-Peritoneal Volume (IIPV)", - "Pulmonary Valve Insufficiency/ Regurgitation", - "Mitral Valve Prolapse", - "Seizures, Absence", - "Unspecified Blood or Lymphatic problem", - "Antibiotics, Reaction To", - "Asphyxia", - "Joint Contracture", - "Reaction to Medicinal Component of Device", - "Swollen Glands", - "Cerebral Ventriculomeglia", - "Stacking Breaths", - "Ruptured Aneurysm", - "Vitreous Hemorrhage", - "HIV, Human Immunodeficiency Virus", - "Scar Excision", - "Speech Disorder", - "Unspecified Mental, Emotional or Behavioural Problem", - "ST Segment Depression", - "Clouding, Central Corneal", - "Liver Laceration(s)", - "Myocarditis", - "Teratogenic Effects", - "Fibromyositis", - "Nasal Obstruction", - "Nipple Sensation Changes", - "Seizures, Focal", - "Idioventricular Rhythm", - "Radiation Underdose", - "Infection, Indirect", - "Protrusion", - "Contact Dermatitis", - "Respiratory Insufficiency", - "Fetal Distress", - "Quadriplegia", - "Suffocation", - "Neuralgia", - "Muscular Tics", - "Leak(s), perivalvular", - "Left Ventricular Hypertrophy", - "Subluxation", - "Unspecified Gastrointestinal Problem", - "Surgical procedure aborted/stopped", - "Balance Problems", - "Bruxism", - "Eye Infections", - "Positive antinuclear antibodies (ANA)", - "Ventricular Flutter", - "Encephalitis", - "Hypoesthesia, Arm/Hand", - "Liver Failure", - "Osteomyelitis", - "Retroperitoneal Hemorrhage", - "Surgical incision, enlargement of", - "Vertebral Fracture", - "Seizures, Focal Motor", - "Pancreatitis", - "Scleroderma", - "Solid Tumour", - "Cancer, Other"]) + assert_count_top( + query="/device/event.json?count=patient.patient_problems.exact&limit=1000", + N=1000, + expected_terms=[ + "No Known Impact Or Consequence To Patient", + "No Consequences Or Impact To Patient", + "No Clinical Signs, Symptoms or Conditions", + "No Patient Involvement", + "Pain", + "Failure of Implant", + "Hyperglycemia", + "No Code Available", + "Unspecified Infection", + "No Information", + "Tissue Damage", + "Injury", + "Hypoglycemia", + "Death", + "Insufficient Information", + "Inadequate Pain Relief", + "Discomfort", + 
"Hemorrhage/Bleeding", + "Inflammation", + "Erosion", + "Peritonitis", + "Fibrosis", + "Therapeutic Effects, Unexpected", + "Therapeutic Response, Decreased", + "Swelling", + "Complaint, Ill-Defined", + "Diabetic Ketoacidosis", + "Foreign Body In Patient", + "Capsular Contracture", + "Surgical procedure", + "Blood Loss", + "Device Embedded In Tissue or Plaque", + "Abdominal Pain", + "Disability", + "Foreign Body Reaction", + "Shock from Patient Lead(s)", + "Fall", + "Not Applicable", + "Nausea", + "Inadequate Osseointegration", + "Vomiting", + "Bone Fracture(s)", + "Reaction", + "Numbness", + "Headache", + "Osteolysis", + "Burning Sensation", + "Dizziness", + "Bacterial Infection", + "Joint Dislocation", + "Incontinence", + "Ambulation Difficulties", + "Adhesion(s)", + "Fatigue", + "Hernia", + "Other (for use when an appropriate patient code cannot be identified)", + "Host-Tissue Reaction", + "Hypersensitivity/Allergic reaction", + "Skin Irritation", + "Hematoma", + "Appropriate Clinical Signs, Symptoms, Conditions Term / Code Not Available", + "Cardiac Arrest", + "Electric Shock", + "Sepsis", + "Myocardial Infarction", + "Thrombosis", + "Dyspnea", + "Loss of consciousness", + "Abscess", + "Test Result", + "Fistula", + "Undesired Nerve Stimulation", + "Low Blood Pressure/ Hypotension", + "Perforation", + "Loss of Range of Motion", + "Post Operative Wound Infection", + "Thrombus", + "Scar Tissue", + "Blurred Vision", + "Chest Pain", + "Syncope", + "Erythema", + "Limited Mobility Of The Implanted Joint", + "Stroke/CVA", + "Occlusion", + "Rash", + "Toxicity", + "Perforation of Vessels", + "Seizures", + "Weakness", + "Shaking/Tremors", + "Wound Dehiscence", + "Cardiac Perforation", + "Anxiety", + "Fever", + "Non-union Bone Fracture", + "Impaired Healing", + "Seroma", + "Urinary Tract Infection", + "Swelling/ Edema", + "Patient Problem/Medical Problem", + "Itching Sensation", + "Pericardial Effusion", + "Heavier Menses", + "Stenosis", + "Edema", + "Sweating", + "Weight Changes", + "Device Overstimulation of Tissue", + "Neurological Deficit/Dysfunction", + "Cardiac Tamponade", + "Muscle Stimulation", + "Visual Impairment", + "Burn(s)", + "Urinary Frequency", + "Arrhythmia", + "Pocket Erosion", + "Deformity/ Disfigurement", + "Aortic Valve Stenosis", + "Necrosis", + "Emotional Changes", + "Obstruction/Occlusion", + "Overdose", + "Bradycardia", + "Urinary Retention", + "Muscular Rigidity", + "Uterine Perforation", + "Irritation", + "Intimal Dissection", + "Vascular Dissection", + "Confusion/ Disorientation", + "Depression", + "Aortic Regurgitation", + "Internal Organ Perforation", + "Aneurysm", + "Bleeding", + "Fluid Discharge", + "Nerve Damage", + "Unspecified Tissue Injury", + "Purulent Discharge", + "Endocarditis", + "Rupture", + "Reocclusion", + "Scarring", + "Osteopenia/ Osteoporosis", + "Malaise", + "Atrial Fibrillation", + "Menstrual Irregularities", + "Muscle Spasm(s)", + "Hemolysis", + "Laceration(s)", + "Visual Disturbances", + "Hair Loss", + "Polydipsia", + "Mitral Regurgitation", + "Increased Sensitivity", + "Hearing Impairment", + "Tingling", + "Sleep Dysfunction", + "Ventricular Tachycardia", + "Cyst(s)", + "Complete Heart Block", + "Bruise/Contusion", + "Staphylococcus Aureus", + "Arthralgia", + "Cognitive Changes", + "Underdose", + "Angina", + "Twiddlers Syndrome", + "Cramp(s)", + "Ventricular Fibrillation", + "Cerebrospinal Fluid Leakage", + "Keratitis", + "Abdominal Distention", + "Heart Failure", + "High Blood Pressure/ Hypertension", + "Pulmonary Embolism", + "Diarrhea", + 
"Tachycardia", + "Renal Failure", + "Failure to Anastomose", + "Burn, Thermal", + "Joint Swelling", + "Ischemia", + "Discharge", + "Non specific EKG/ECG Changes", + "Joint Disorder", + "Needle Stick/Puncture", + "Vessel Or Plaque, Device Embedded In", + "Granuloma", + "Synovitis", + "Dehydration", + "Capsular Bag Tear", + "Muscle Weakness", + "Pneumothorax", + "Paralysis", + "Skin Inflammation/ Irritation", + "Micturition Urgency", + "Low Oxygen Saturation", + "Infarction, Cerebral", + "Palpitations", + "Ossification", + "Calcium Deposits/Calcification", + "Halo", + "Respiratory Distress", + "Partial thickness (Second Degree) Burn", + "Respiratory Failure", + "Extubate", + "Pneumonia", + "Memory Loss/Impairment", + "Embolism", + "Anemia", + "Mitral Valve Stenosis", + "Lethargy", + "Distress", + "Surgical procedure, additional", + "Abdominal Cramps", + "Aortic Insufficiency", + "Intraocular Pressure Increased", + "Skin Discoloration", + "Thrombosis/Thrombus", + "Shock", + "Dysphagia/ Odynophagia", + "Abnormal Vaginal Discharge", + "Hemorrhage, Cerebral", + "Deafness", + "Neuropathy", + "Hematuria", + "Constipation", + "Chills", + "Fainting", + "Cellulitis", + "Local Reaction", + "Coma", + "Joint Laxity", + "Vascular System (Circulation), Impaired", + "Chemical Exposure", + "Neck Pain", + "Dry Eye(s)", + "Cardiopulmonary Arrest", + "Alteration In Body Temperature", + "Dysphasia", + "Head Injury", + "Vitrectomy", + "Abrasion", + "Tissue Breakdown", + "Radiation Exposure, Unintended", + "Fungal Infection", + "Red Eye(s)", + "Pseudoaneurysm", + "Prolapse", + "Sinus Perforation", + "Eye Injury", + "Autoimmune Reaction", + "Intracranial Hemorrhage", + "Foreign body, removal of", + "Skin Inflammation", + "Loss of Vision", + "Congestive Heart Failure", + "Skin Erosion", + "Surgical procedure, repeated", + "Pelvic Inflammatory Disease", + "Cancer", + "Cardiac Enzyme Elevation", + "Pregnancy", + "Arthritis", + "Pleural Effusion", + "Hip Fracture", + "Transient Ischemic Attack", + "Irritability", + "Corneal Ulcer", + "Postoperative refraction, unexpected", + "Sedation", + "Regurgitation", + "Pressure Sores", + "Dysuria", + "Heart Failure/Congestive Heart Failure", + "Bowel Perforation", + "Ulcer", + "Extrusion", + "Cardiogenic Shock", + "Multiple Organ Failure", + "Therapy/non-surgical treatment, additional", + "Implant Pain", + "Unknown (for use when the patient''s condition is not known)", + "Air Embolism", + "Unspecified Respiratory Problem", + "Iatrogenic Source", + "Treatment with medication(s)", + "Thromboembolism", + "Autoimmune Disorder", + "Corneal Edema", + "Hearing Loss", + "Exposure to Body Fluids", + "Hypoxia", + "Extravasation", + "Swollen Lymph Nodes", + "Skin Infection", + "Twitching", + "Paresis", + "Syncope/Fainting", + "Diaphoresis", + "Foreign Body Sensation in Eye", + "Urticaria", + "Metal Related Pathology", + "Septic Shock", + "Skin Tears", + "Hemorrhage, Subarachnoid", + "Tinnitus", + "Vertigo", + "Full thickness (Third Degree) Burn", + "Great Vessel Perforation", + "ST Segment Elevation", + "Endophthalmitis", + "Headache, Lumbar Puncture", + "Lymphoma", + "Hot Flashes/Flushes", + "Aortic Dissection", + "Exit Block", + "Claudication", + "Vasoconstriction", + "Coagulation Disorder", + "Hydrocephalus", + "Sudden Cardiac Death", + "Eye Burn", + "Spinal Column Injury", + "Superficial (First Degree) Burn", + "Excessive Tear Production", + "Aspiration/Inhalation", + "Chest Tightness/Pressure", + "Pulmonary Valve Stenosis", + "Apnea", + "Wrinkling", + "Embolus", + "Corneal Pannus", + 
"Brain Injury", + "Intermenstrual Bleeding", + "Meningitis", + "Breast Mass", + "Heart Block", + "Corneal Abrasion", + "Pulmonary Edema", + "Hypervolemia", + "Atrial Perforation", + "Thyroid Problems", + "Convulsion/Seizure", + "Collapse", + "Tooth Fracture", + "Ascites", + "Tricuspid Regurgitation", + "Flatus", + "Myalgia", + "No Patient involvement", + "Anaplastic Large Cell Lymphoma", + "Genital Bleeding", + "Aortic Valve Insufficiency/ Regurgitation", + "Urinary Incontinence", + "Ectopic Pregnancy", + "Right Ventricular Failure", + "Corneal Clouding/Hazing", + "Sensitivity of Teeth", + "Asthma", + "Choking", + "Hypoesthesia", + "Atrial Flutter", + "Pyrosis/Heartburn", + "Insufficiency, Valvular", + "Infiltration into Tissue", + "Loss Of Pulse", + "Gastrointestinal Hemorrhage", + "Physical Entrapment", + "Intraocular Pressure, Delayed, Uncontrolled", + "Cardiomyopathy", + "Ulceration", + "Hospitalization required", + "Respiratory Tract Infection", + "Conjunctivitis", + "Liver Damage/Dysfunction", + "Asystole", + "Misdiagnosis", + "Corneal Infiltrates", + "Pulmonary Regurgitation", + "Menorrhagia", + "Dyskinesia", + "Encephalopathy", + "Organ Dehiscence", + "Cataract", + "Caustic/Chemical Burns", + "Mitral Valve Insufficiency/ Regurgitation", + "Sore Throat", + "Miscarriage", + "Damage to Ligament(s)", + "Hyperplasia", + "Nonresorbable materials, unretrieved in body", + "Physical Asymmetry", + "Electrolyte Imbalance", + "Neck Stiffness", + "Hemoptysis", + "Unintended Extubation", + "Awareness during Anaesthesia", + "Hemothorax", + "Hemostasis", + "Ecchymosis", + "Spinal Cord Injury", + "Atrial Tachycardia", + "Regurgitation, Valvular", + "Reaction, Injection Site", + "Anaphylactic Shock", + "Airway Obstruction", + "Peeling", + "Missed Dose", + "Fracture, Arm", + "Contusion", + "Paraplegia", + "Uveitis", + "Fallopian Tube Perforation", + "Missing Value Reason", + "Convulsion, Clonic", + "Viral Infection", + "Cramp(s) /Muscle Spasm(s)", + "Low Cardiac Output", + "Pallor", + "Hyphema", + "Iritis", + "Exsanguination", + "Limb Fracture", + "Valvular Insufficiency/ Regurgitation", + "Facial Nerve Paralysis", + "Cataract, Induced", + "Ptosis", + "Vitreous Floaters", + "Blister", + "Swollen Lymph Nodes/Glands", + "Cyanosis", + "Rheumatoid Arthritis", + "Venipuncture", + "Hemorrhage, Subdural", + "Surgery, prolonged", + "Mitral Insufficiency", + "Macular Edema", + "Therapeutic Response, Increased", + "Colostomy", + "Tricuspid Valve Stenosis", + "Embolism/Embolus", + "Paresthesia", + "Chronic Obstructive Pulmonary Disease (COPD)", + "Hypovolemia", + "Diminished Pulse Pressure", + "Corneal Scar", + "Retinal Detachment", + "Oversedation", + "Perforation of Esophagus", + "Necrosis Of Flap Tissue", + "Bronchitis", + "Phlebitis", + "Muscle/Tendon Damage", + "Decreased Respiratory Rate", + "Unintended Radiation Exposure", + "Bowel Burn", + "Hypopyon", + "Ventilator Dependent", + "Ischemia Stroke", + "Transfusion of blood products", + "Hyperemia", + "Cusp Tear", + "Concussion", + "Difficulty Chewing", + "Glaucoma", + "Extreme Exhaustion", + "Gastritis", + "Virus", + "Dementia", + "Cough", + "Debris, Bone Shedding", + "Malunion of Bone", + "Restenosis", + "Hypothermia", + "Unspecified Nervous System Problem", + "Lupus", + "Feeding Problem", + "Decreased Appetite", + "Hemorrhagic Stroke", + "Premature Labor", + "Hypoventilation", + "Vitreous Loss", + "Hypovolemic Shock", + "Breast Cancer", + "Seizures, Grand-Mal", + "Laceration(s) of Esophagus", + "Atherosclerosis", + "Vaso-Vagal Response", + "Radiation 
Overdose", + "Allergic reaction", + "Right Ventricular Dysfunction", + "Crushing Injury", + "Nervous System Injury", + "Sprain", + "Jaundice", + "Overcorrection", + "Pulmonary Insufficiency", + "Abortion", + "Nerve Proximity Nos (Not Otherwise Specified)", + "Nicks, cuts or tears of dura or other tissues by device", + "Toxic Shock Syndrome", + "Intraocular Infection", + "Disc Impingement", + "Pupillary Block", + "Intraoperative Pain", + "Toxic Anterior Segment Syndrome (TASS)", + "Dyspareunia", + "Hepatitis", + "Electro-Mechanical Dissociation", + "Congenital Defect/Deformity", + "Lactate Dehydrogenase Increased", + "Wheal(s)", + "Renal Disease, End Stage", + "Eye Pain", + "Post Traumatic Wound Infection", + "Increased Respiratory Rate", + "Hiccups", + "Ischemic Heart Disease", + "Valvular Stenosis", + "Peroneal Nerve Palsy", + "Diplopia", + "Subcutaneous Nodule", + "Laparotomy", + "Death, Intrauterine Fetal", + "Gangrene", + "Drainage", + "Raynauds Phenomenon", + "Retinal Tear", + "Vaginal Mucosa Damage", + "Anoxia", + "Irregular Pulse", + "Abnormal Blood Gases", + "Foreign Body Embolism", + "Peripheral Vascular Disease", + "Pulmonary Emphysema", + "Zonular Dehiscence", + "Anaphylactoid", + "Respiratory Acidosis", + "Shock, Insulin", + "Herpes", + "Acanthameba Keratitis", + "Surgical procedure, delayed", + "Impotence", + "Pulmonary Dysfunction", + "Adult Respiratory Distress Syndrome", + "Decreased Sensitivity", + "Migration", + "Radiation Burn", + "Corneal Decompensation", + "Hemorrhage, Intraventricular", + "Phototoxicity", + "Presyncope", + "Retinal Injury", + "Fungus", + "Left Ventricular Failure", + "Skin Disorders", + "Left Ventricular Dysfunction", + "Hemolytic Anemia", + "Hyperthermia", + "Hyperventilation", + "Disseminated Intravascular Coagulation (DIC)", + "Flashers", + "Pregnancy with a Contraceptive Device", + "Sneezing", + "Suture Abrasion", + "Renal Impairment", + "Alteration in Body Temperature", + "Corneal Perforation", + "Tricuspid Valve Insufficiency/ Regurgitation", + "Electrocution", + "Vasodilatation", + "Torsades-de-Pointes", + "Skull Fracture", + "Arachnoiditis, Spinal", + "Unequal Limb Length", + "Immunodeficiency", + "Epistaxis", + "Melena", + "Skin Burning Sensation", + "Unspecified Vascular Problem", + "Unspecified Heart Problem", + "Hyperbilirubinemia", + "Localized Skin Lesion", + "Breast Implant Associated Anaplastic Large Cell Lymphoma (BIA ALCL)", + "Deposits", + "Sjogren''s Syndrome", + "Infection, Pyrogenic", + "Strangulation", + "Uremia", + "Lead(s), Burn(s) From", + "Drug Resistant Bacterial Infection", + "Bronchospasm", + "Vitreous Detachment", + "Painful stimulation", + "Respiratory Arrest", + "Bronchopneumonia", + "Connective Tissue Disease", + "Blister(s)", + "Convulsion, Tonic", + "Hypoesthesia, Foot/Leg", + "Cardiovascular Insufficiency", + "Unspecified Kidney or Urinary Problem", + "Fasciitis", + "Intervertebral Disc Compression or Protrusion", + "Vitritis", + "Breast Discomfort/Pain", + "Nodule", + "Unspecified Eye / Vision Problem", + "Intraocular Pressure Decreased", + "Tricuspid Insufficiency", + "Neovascularization", + "Increased Intra-Peritoneal Volume (IIPV)", + "Pulmonary Valve Insufficiency/ Regurgitation", + "Mitral Valve Prolapse", + "Seizures, Absence", + "Unspecified Blood or Lymphatic problem", + "Antibiotics, Reaction To", + "Asphyxia", + "Joint Contracture", + "Reaction to Medicinal Component of Device", + "Swollen Glands", + "Cerebral Ventriculomeglia", + "Stacking Breaths", + "Ruptured Aneurysm", + "Vitreous Hemorrhage", + 
"HIV, Human Immunodeficiency Virus", + "Scar Excision", + "Speech Disorder", + "Unspecified Mental, Emotional or Behavioural Problem", + "ST Segment Depression", + "Clouding, Central Corneal", + "Liver Laceration(s)", + "Myocarditis", + "Teratogenic Effects", + "Fibromyositis", + "Nasal Obstruction", + "Nipple Sensation Changes", + "Seizures, Focal", + "Idioventricular Rhythm", + "Radiation Underdose", + "Infection, Indirect", + "Protrusion", + "Contact Dermatitis", + "Respiratory Insufficiency", + "Fetal Distress", + "Quadriplegia", + "Suffocation", + "Neuralgia", + "Muscular Tics", + "Leak(s), perivalvular", + "Left Ventricular Hypertrophy", + "Subluxation", + "Unspecified Gastrointestinal Problem", + "Surgical procedure aborted/stopped", + "Balance Problems", + "Bruxism", + "Eye Infections", + "Positive antinuclear antibodies (ANA)", + "Ventricular Flutter", + "Encephalitis", + "Hypoesthesia, Arm/Hand", + "Liver Failure", + "Osteomyelitis", + "Retroperitoneal Hemorrhage", + "Surgical incision, enlargement of", + "Vertebral Fracture", + "Seizures, Focal Motor", + "Pancreatitis", + "Scleroderma", + "Solid Tumour", + "Cancer, Other", + ], + ) def test_date(): - assert_total('/device/event.json?search=date_received:20090217', 1) + assert_total("/device/event.json?search=date_received:20090217", 1) def test_date_range(): - assert_total('/device/event.json?search=date_received:([20090217+TO+20091231])', 1) + assert_total("/device/event.json?search=date_received:([20090217+TO+20091231])", 1) def test_openfda(): - assert_total('/device/event.json?search=device.openfda.regulation_number:892.1650', 1) + assert_total( + "/device/event.json?search=device.openfda.regulation_number:892.1650", 1 + ) def test_single_product_problem(): - meta, results = fetch( - '/device/event.json?search=mdr_report_key:7014707') - eq_(len(results), 1) - event = results[0] - eq_(event["product_problems"], ["Device Displays Incorrect Message"]) + meta, results = fetch("/device/event.json?search=mdr_report_key:7014707") + eq_(len(results), 1) + event = results[0] + eq_(event["product_problems"], ["Device Displays Incorrect Message"]) - meta, results = fetch( - '/device/event.json?search=report_number:"0001313525-2021-00110"') - eq_(len(results), 1) - event = results[0] - problems = sorted(event["product_problems"], key=lambda k: k) - eq_(problems, ['Adverse Event Without Identified Device or Use Problem', 'Insufficient Information']) + meta, results = fetch( + '/device/event.json?search=report_number:"0001313525-2021-00110"' + ) + eq_(len(results), 1) + event = results[0] + problems = sorted(event["product_problems"], key=lambda k: k) + eq_( + problems, + [ + "Adverse Event Without Identified Device or Use Problem", + "Insufficient Information", + ], + ) - meta, results = fetch( - '/device/event.json?search=report_number:"1057985-2021-00146"') - eq_(len(results), 1) - event = results[0] - eq_(event["product_problems"], ["Adverse Event Without Identified Device or Use Problem"]) + meta, results = fetch( + '/device/event.json?search=report_number:"1057985-2021-00146"' + ) + eq_(len(results), 1) + event = results[0] + eq_( + event["product_problems"], + ["Adverse Event Without Identified Device or Use Problem"], + ) def test_multiple_product_problems(): - meta, results = fetch( - '/device/event.json?search=mdr_report_key:7014702') - eq_(len(results), 1) + meta, results = fetch("/device/event.json?search=mdr_report_key:7014702") + eq_(len(results), 1) - event = results[0] - problems = sorted(event["product_problems"], 
key=lambda k: k) - eq_(problems, ["Ambient Noise Problem", "Fracture", "High impedance", "Over-Sensing", "Pacing Problem"]) + event = results[0] + problems = sorted(event["product_problems"], key=lambda k: k) + eq_( + problems, + [ + "Ambient Noise Problem", + "Fracture", + "High impedance", + "Over-Sensing", + "Pacing Problem", + ], + ) def test_multiple_patient_problems(): - meta, results = fetch( - '/device/event.json?search=mdr_report_key:4102973') - eq_(len(results), 1) - event = results[0] - eq_(len(event["patient"]), 1) - problems = sorted(event["patient"][0]["patient_problems"], key=lambda k: k) - eq_(problems, ["Cardiogenic Shock", "Myocardial Infarction", "Thrombosis"]) + meta, results = fetch("/device/event.json?search=mdr_report_key:4102973") + eq_(len(results), 1) + event = results[0] + eq_(len(event["patient"]), 1) + problems = sorted(event["patient"][0]["patient_problems"], key=lambda k: k) + eq_(problems, ["Cardiogenic Shock", "Myocardial Infarction", "Thrombosis"]) - meta, results = fetch( - '/device/event.json?search=report_number:"0001313525-2021-00110"') - eq_(len(results), 1) - event = results[0] - eq_(len(event["patient"]), 1) - problems = sorted(event["patient"][0]["patient_problems"], key=lambda k: k) - eq_(problems, ['Corneal Infiltrates', 'Corneal Ulcer', 'Keratitis']) + meta, results = fetch( + '/device/event.json?search=report_number:"0001313525-2021-00110"' + ) + eq_(len(results), 1) + event = results[0] + eq_(len(event["patient"]), 1) + problems = sorted(event["patient"][0]["patient_problems"], key=lambda k: k) + eq_(problems, ["Corneal Infiltrates", "Corneal Ulcer", "Keratitis"]) - meta, results = fetch( - '/device/event.json?search=report_number:"1057985-2021-00146"') - eq_(len(results), 1) - event = results[0] - eq_(len(event["patient"]), 1) - problems = sorted(event["patient"][0]["patient_problems"], key=lambda k: k) - eq_(problems, - ["Blurred Vision", "Excessive Tear Production", "Eye Pain", "Foreign Body Sensation in Eye", "Itching Sensation", - "Red Eye(s)"]) + meta, results = fetch( + '/device/event.json?search=report_number:"1057985-2021-00146"' + ) + eq_(len(results), 1) + event = results[0] + eq_(len(event["patient"]), 1) + problems = sorted(event["patient"][0]["patient_problems"], key=lambda k: k) + eq_( + problems, + [ + "Blurred Vision", + "Excessive Tear Production", + "Eye Pain", + "Foreign Body Sensation in Eye", + "Itching Sensation", + "Red Eye(s)", + ], + ) def test_multiple_patients_with_same_problem(): - meta, results = fetch( - '/device/event.json?search=mdr_report_key:4718251') - eq_(len(results), 1) + meta, results = fetch("/device/event.json?search=mdr_report_key:4718251") + eq_(len(results), 1) - event = results[0] - eq_(len(event["patient"]), 4) + event = results[0] + eq_(len(event["patient"]), 4) - for idx, patient in enumerate(sorted(event["patient"], key=lambda k: k["patient_sequence_number"])): - eq_(patient['patient_sequence_number'], str(idx + 1)) - eq_(sorted(patient['patient_problems'], key=lambda k: k), ["Injury", "Tissue Damage"]) + for idx, patient in enumerate( + sorted(event["patient"], key=lambda k: k["patient_sequence_number"]) + ): + eq_(patient["patient_sequence_number"], str(idx + 1)) + eq_( + sorted(patient["patient_problems"], key=lambda k: k), + ["Injury", "Tissue Damage"], + ) def test_multiple_patient_problems_without_patient_record_issue_179(): - meta, results = fetch( - '/device/event.json?search=mdr_report_key:11339476') - eq_(len(results), 1) - event = results[0] - eq_(len(event["patient"]), 1) - problems 
= sorted(event["patient"][0]["patient_problems"], key=lambda k: k) - eq_(problems, ["Blurred Vision", "Visual Disturbances"]) - - meta, results = fetch( - '/device/event.json?search=mdr_report_key:11131671') - eq_(len(results), 1) - event = results[0] - eq_(len(event["patient"]), 1) - problems = sorted(event["patient"][0]["patient_problems"], key=lambda k: k) - eq_(problems, ["Visual Disturbances"]) + meta, results = fetch("/device/event.json?search=mdr_report_key:11339476") + eq_(len(results), 1) + event = results[0] + eq_(len(event["patient"]), 1) + problems = sorted(event["patient"][0]["patient_problems"], key=lambda k: k) + eq_(problems, ["Blurred Vision", "Visual Disturbances"]) + meta, results = fetch("/device/event.json?search=mdr_report_key:11131671") + eq_(len(results), 1) + event = results[0] + eq_(len(event["patient"]), 1) + problems = sorted(event["patient"][0]["patient_problems"], key=lambda k: k) + eq_(problems, ["Visual Disturbances"]) def test_foitext_present(): - meta, results = fetch( - '/device/event.json?search=mdr_report_key:10131013') - eq_(len(results), 1) + meta, results = fetch("/device/event.json?search=mdr_report_key:10131013") + eq_(len(results), 1) - event = results[0] - mdrtext = sorted(event["mdr_text"], key=lambda k: k["mdr_text_key"]) - eq_(len(mdrtext), 2) - eq_(mdrtext[0]["text"], - u"INVESTIGATION COMPLETED ON A SMITHS MEDICAL FLUID WARMING/LEVEL 1 HOTLINE LOW FLOW SYSTEMS - HL-90 COMPLAINT OF TEMPERATURE FLUCTUATIONS WAS VERIFIED DURING TESTING. THIS WAS ISOLATED TO A FAULTY PCB (PRIMARY CIRCUIT BOARD). THE DEVICE WAS FOUND TO HAVE WEAR AND TEAR UPON ARRIVAL FOR INVESTIGATION. FRONT COVER, TANK AND LINE CORD. OUTDATED PCB AND POWER SWITCH. NO ACTION WAS TAKEN TO REPAIR DEVICE AS DEEMED BEYOND REPAIR AND WILL BE SCRAPPED. NO CAUSE OF EVENT WAS ESTABLISHED.") - eq_(mdrtext[1]["text"], - u"INFORMATION RECEIVED A FLUID WARMING/LEVEL 1 HOTLINE LOW FLOW SYSTEMS - HL-90 TEMPERATURE WAS BOUNCING AROUND AND SHOWING AT 45 DEGREES CELSIUS BUT WAS AT 20 DEGREES CELSIUS. NO PATIENT ADVERSE EVENTS REPORTED.") + event = results[0] + mdrtext = sorted(event["mdr_text"], key=lambda k: k["mdr_text_key"]) + eq_(len(mdrtext), 2) + eq_( + mdrtext[0]["text"], + "INVESTIGATION COMPLETED ON A SMITHS MEDICAL FLUID WARMING/LEVEL 1 HOTLINE LOW FLOW SYSTEMS - HL-90 COMPLAINT OF TEMPERATURE FLUCTUATIONS WAS VERIFIED DURING TESTING. THIS WAS ISOLATED TO A FAULTY PCB (PRIMARY CIRCUIT BOARD). THE DEVICE WAS FOUND TO HAVE WEAR AND TEAR UPON ARRIVAL FOR INVESTIGATION. FRONT COVER, TANK AND LINE CORD. OUTDATED PCB AND POWER SWITCH. NO ACTION WAS TAKEN TO REPAIR DEVICE AS DEEMED BEYOND REPAIR AND WILL BE SCRAPPED. NO CAUSE OF EVENT WAS ESTABLISHED.", + ) + eq_( + mdrtext[1]["text"], + "INFORMATION RECEIVED A FLUID WARMING/LEVEL 1 HOTLINE LOW FLOW SYSTEMS - HL-90 TEMPERATURE WAS BOUNCING AROUND AND SHOWING AT 45 DEGREES CELSIUS BUT WAS AT 20 DEGREES CELSIUS. 
NO PATIENT ADVERSE EVENTS REPORTED.", + ) diff --git a/openfda/deploy/tests/devicepma/test_endpoint.py b/openfda/deploy/tests/devicepma/test_endpoint.py index aa785bc..c4c9542 100644 --- a/openfda/deploy/tests/devicepma/test_endpoint.py +++ b/openfda/deploy/tests/devicepma/test_endpoint.py @@ -1,25 +1,32 @@ - import requests from openfda.tests.api_test_helpers import * - def test_not_analyzed_0(): - assert_total('/device/pma.json?search=openfda.regulation_number:876.5270', 290) + assert_total("/device/pma.json?search=openfda.regulation_number:876.5270", 290) def test_exact_count_0(): - assert_count_top('/device/pma.json?count=supplement_type.exact', ['30-Day Notice', 'Normal 180 Day Track', 'Real-Time Process', '', '135 Review Track For 30-Day Notice']) + assert_count_top( + "/device/pma.json?count=supplement_type.exact", + [ + "30-Day Notice", + "Normal 180 Day Track", + "Real-Time Process", + "", + "135 Review Track For 30-Day Notice", + ], + ) def test_date_1(): - assert_total('/device/pma.json?search=decision_date:20131122', 11) + assert_total("/device/pma.json?search=decision_date:20131122", 11) def test_date_range_2(): - assert_total('/device/pma.json?search=decision_date:([20131122+TO+20131231])', 305) + assert_total("/device/pma.json?search=decision_date:([20131122+TO+20131231])", 305) def test_openfda_3(): - assert_total('/device/pma.json?search=openfda.registration_number:3006630150', 836) + assert_total("/device/pma.json?search=openfda.registration_number:3006630150", 836) diff --git a/openfda/deploy/tests/devicerecall/test_endpoint.py b/openfda/deploy/tests/devicerecall/test_endpoint.py index 9454025..854aad0 100644 --- a/openfda/deploy/tests/devicerecall/test_endpoint.py +++ b/openfda/deploy/tests/devicerecall/test_endpoint.py @@ -1,132 +1,179 @@ - import requests from openfda.tests.api_test_helpers import * - def test_pma_parsed_out(): - meta, results = fetch('/device/recall.json?search=product_res_number.exact:Z-0567-2013') - eq_(sorted(results[0]['pma_numbers']), ['P960009S007', 'P960009S051', 'P960009S134']) + meta, results = fetch( + "/device/recall.json?search=product_res_number.exact:Z-0567-2013" + ) + eq_( + sorted(results[0]["pma_numbers"]), ["P960009S007", "P960009S051", "P960009S134"] + ) def test_510k_parsed_out(): - meta, results = fetch('/device/recall.json?search=product_res_number.exact:Z-1535-2009') - eq_(sorted(results[0]['k_numbers']), ['K003205', 'K984357', 'K990848', 'K991723']) + meta, results = fetch( + "/device/recall.json?search=product_res_number.exact:Z-1535-2009" + ) + eq_(sorted(results[0]["k_numbers"]), ["K003205", "K984357", "K990848", "K991723"]) def test_product_code_ooo(): - meta, results = fetch('/device/recall.json?search=product_res_number.exact:Z-2153-2014') - eq_(results[0]['product_code'], 'N/A') + meta, results = fetch( + "/device/recall.json?search=product_res_number.exact:Z-2153-2014" + ) + eq_(results[0]["product_code"], "N/A") def test_product_code_na(): - meta, results = fetch('/device/recall.json?search=product_res_number.exact:Z-0094-2021') - eq_(results[0]['product_code'], 'N/A') + meta, results = fetch( + "/device/recall.json?search=product_res_number.exact:Z-0094-2021" + ) + eq_(results[0]["product_code"], "N/A") def test_old_product_code(): - meta, results = fetch('/device/recall.json?search=product_res_number.exact:Z-0351-03+AND+res_event_number:25150') - eq_(results[0]['product_code'], '74') + meta, results = fetch( + "/device/recall.json?search=product_res_number.exact:Z-0351-03+AND+res_event_number:25150" + ) + 
eq_(results[0]["product_code"], "74") def test_other_submission_description(): - meta, results = fetch('/device/recall.json?search=product_res_number.exact:Z-2153-2014') - eq_(results[0]['other_submission_description'], 'Exempt') + meta, results = fetch( + "/device/recall.json?search=product_res_number.exact:Z-2153-2014" + ) + eq_(results[0]["other_submission_description"], "Exempt") - meta, results = fetch('/device/recall.json?search=product_res_number.exact:Z-1650-2010') - eq_(results[0]['other_submission_description'], 'Illegally Marketed') + meta, results = fetch( + "/device/recall.json?search=product_res_number.exact:Z-1650-2010" + ) + eq_(results[0]["other_submission_description"], "Illegally Marketed") def test_unparseable_address_flattened(): - meta, results = fetch('/device/recall.json?search=product_res_number.exact:Z-0432-2021') - eq_(results[0]['recalling_firm'], 'Inpeco S.A. Via San Gottardo 10 Lugano Switzerland') - eq_(results[0].get('address_1'), None) - eq_(results[0].get('address_2'), None) - eq_(results[0].get('city'), None) - eq_(results[0].get('state'), None) - eq_(results[0].get('postal_code'), None) - eq_(results[0].get('country'), None) + meta, results = fetch( + "/device/recall.json?search=product_res_number.exact:Z-0432-2021" + ) + eq_( + results[0]["recalling_firm"], + "Inpeco S.A. Via San Gottardo 10 Lugano Switzerland", + ) + eq_(results[0].get("address_1"), None) + eq_(results[0].get("address_2"), None) + eq_(results[0].get("city"), None) + eq_(results[0].get("state"), None) + eq_(results[0].get("postal_code"), None) + eq_(results[0].get("country"), None) def test_one_complete_record(): - meta, results = fetch('/device/recall.json?search=product_res_number.exact:Z-0164-2014') - eq_(results[0]['cfres_id'], '122136') - eq_(results[0]['res_event_number'], '66351') - eq_(results[0]['product_res_number'], 'Z-0164-2014') - eq_(results[0]['firm_fei_number'], '1217052') - eq_(results[0]['event_date_terminated'], '2016-08-22') - eq_(results[0]['event_date_initiated'], '2013-09-23') - eq_(results[0]['event_date_posted'], '2013-11-05') - eq_(results[0]['recall_status'], 'Terminated') - eq_(results[0]['product_description'], - 'Portex¿ Saddleblock Tray with Drugs 25g Quincke; 4795-20\n\nThe Regional Anesthesia family of products is comprised of sterile (unless otherwise indicated in the product scope), single-use devices designed to perform epidural, spinal, combined epidural/spinal, nerve block, lumbar puncture and regional anesthesia procedures. The spinal products are comprised of spinal needles, introducer needles and accessories required to perform a spinal procedure.') - eq_(results[0]['code_info'], 'Lot: 2455104') - eq_(results[0]['action'], - "Smiths Medical sent an Urgent Recall Notice dated September 23, 2013, to all affected customers. The notice identified the product, the problem, and the action to be taken by the customer. Customers were instructed to inspect their inventory for the affected product and remove from use. Complete the Confirmation Form and return by fax to 603-358-1017 or by email to spinal@smiths-medical.com. Upon receipt of the completed form, a customer service representative would contact them to arrange for exchange of their unused affected devices for credit or replacement. Customers were also instructed to forward the notice to all personnel who need to be aware within their organization. 
Customers with questions were instructed to contact Smiths Medical's Customer Service Department at 800-258-5361.\r\nFor questions regarding this recall call 800-258-5361.") - eq_(results[0]['product_quantity'], '222') - eq_(results[0]['distribution_pattern'], - 'Nationwide Distribution including AK, AR, AZ, CA, CT, FL, GA, HI, IA, ID, IL, IN, KS, KY, LA, MA, MD, ME, MI, MN, MO, MS, NC, NE, NJ, NY, OH, OK, OR, PA, RI, SC, TX, UT, VA, VT, WA, WI, WY.') - eq_(results[0]['recalling_firm'], 'Smiths Medical ASD, Inc.') - eq_(results[0]['address_1'], '10 Bowman Dr') - eq_(results[0].get('address_2'), None) - eq_(results[0]['city'], 'Keene') - eq_(results[0]['state'], 'NH') - eq_(results[0]['postal_code'], '03431-5043') - eq_(results[0]['additional_info_contact'], '800-258-5361') - eq_(results[0]['root_cause_description'], 'Material/Component Contamination') - eq_(results[0]['reason_for_recall'], - 'Visual particulate in the glass ampules of 5% Lidocaine HCL and 7.5% Dextrose Injection, USP, 2 mL Ampoule, NDC # 0409-4712-01, Hospira Lot Number 23-227-DK. These ampules are included in certain Portex Spinal Anaesthesia Trays.') - eq_(results[0]['product_code'], 'CAZ') - eq_(results[0].get('k_numbers'), None) - eq_(results[0]['pma_numbers'], ['D028296']) - eq_(results[0].get('other_submission_description'), None) - - eq_(results[0]['openfda']['device_name'], 'Anesthesia Conduction Kit') - eq_(results[0]['openfda']['medical_specialty_description'], 'Anesthesiology') - eq_(results[0]['openfda']['device_class'], '2') - eq_(results[0]['openfda']['regulation_number'], '868.5140') + meta, results = fetch( + "/device/recall.json?search=product_res_number.exact:Z-0164-2014" + ) + eq_(results[0]["cfres_id"], "122136") + eq_(results[0]["res_event_number"], "66351") + eq_(results[0]["product_res_number"], "Z-0164-2014") + eq_(results[0]["firm_fei_number"], "1217052") + eq_(results[0]["event_date_terminated"], "2016-08-22") + eq_(results[0]["event_date_initiated"], "2013-09-23") + eq_(results[0]["event_date_posted"], "2013-11-05") + eq_(results[0]["recall_status"], "Terminated") + eq_( + results[0]["product_description"], + "Portex¿ Saddleblock Tray with Drugs 25g Quincke; 4795-20\n\nThe Regional Anesthesia family of products is comprised of sterile (unless otherwise indicated in the product scope), single-use devices designed to perform epidural, spinal, combined epidural/spinal, nerve block, lumbar puncture and regional anesthesia procedures. The spinal products are comprised of spinal needles, introducer needles and accessories required to perform a spinal procedure.", + ) + eq_(results[0]["code_info"], "Lot: 2455104") + eq_( + results[0]["action"], + "Smiths Medical sent an Urgent Recall Notice dated September 23, 2013, to all affected customers. The notice identified the product, the problem, and the action to be taken by the customer. Customers were instructed to inspect their inventory for the affected product and remove from use. Complete the Confirmation Form and return by fax to 603-358-1017 or by email to spinal@smiths-medical.com. Upon receipt of the completed form, a customer service representative would contact them to arrange for exchange of their unused affected devices for credit or replacement. Customers were also instructed to forward the notice to all personnel who need to be aware within their organization. 
Customers with questions were instructed to contact Smiths Medical's Customer Service Department at 800-258-5361.\r\nFor questions regarding this recall call 800-258-5361.", + ) + eq_(results[0]["product_quantity"], "222") + eq_( + results[0]["distribution_pattern"], + "Nationwide Distribution including AK, AR, AZ, CA, CT, FL, GA, HI, IA, ID, IL, IN, KS, KY, LA, MA, MD, ME, MI, MN, MO, MS, NC, NE, NJ, NY, OH, OK, OR, PA, RI, SC, TX, UT, VA, VT, WA, WI, WY.", + ) + eq_(results[0]["recalling_firm"], "Smiths Medical ASD, Inc.") + eq_(results[0]["address_1"], "10 Bowman Dr") + eq_(results[0].get("address_2"), None) + eq_(results[0]["city"], "Keene") + eq_(results[0]["state"], "NH") + eq_(results[0]["postal_code"], "03431-5043") + eq_(results[0]["additional_info_contact"], "800-258-5361") + eq_(results[0]["root_cause_description"], "Material/Component Contamination") + eq_( + results[0]["reason_for_recall"], + "Visual particulate in the glass ampules of 5% Lidocaine HCL and 7.5% Dextrose Injection, USP, 2 mL Ampoule, NDC # 0409-4712-01, Hospira Lot Number 23-227-DK. These ampules are included in certain Portex Spinal Anaesthesia Trays.", + ) + eq_(results[0]["product_code"], "CAZ") + eq_(results[0].get("k_numbers"), None) + eq_(results[0]["pma_numbers"], ["D028296"]) + eq_(results[0].get("other_submission_description"), None) + + eq_(results[0]["openfda"]["device_name"], "Anesthesia Conduction Kit") + eq_(results[0]["openfda"]["medical_specialty_description"], "Anesthesiology") + eq_(results[0]["openfda"]["device_class"], "2") + eq_(results[0]["openfda"]["regulation_number"], "868.5140") def test_fei_number_in_place(): - assert_total('/device/recall.json?search=firm_fei_number:3001236905', 12) + assert_total("/device/recall.json?search=firm_fei_number:3001236905", 12) def test_grouping_by_event_number(): - assert_total('/device/recall.json?search=res_event_number:81166', 20) + assert_total("/device/recall.json?search=res_event_number:81166", 20) def test_exact_count_0(): - assert_count_top('/device/recall.json?count=root_cause_description.exact', - ['Device Design', 'Nonconforming Material/Component', 'Process control', 'Software design', - 'Other']) + assert_count_top( + "/device/recall.json?count=root_cause_description.exact", + [ + "Device Design", + "Nonconforming Material/Component", + "Process control", + "Software design", + "Other", + ], + ) def test_date_1(): - assert_total('/device/recall.json?search=event_date_terminated:20050922', 7) + assert_total("/device/recall.json?search=event_date_terminated:20050922", 7) def test_date_range_2(): - assert_total('/device/recall.json?search=event_date_terminated:([20050922+TO+20051231])', 600) + assert_total( + "/device/recall.json?search=event_date_terminated:([20050922+TO+20051231])", 600 + ) def test_openfda_3(): - assert_total('/device/recall.json?search=openfda.regulation_number:878.4370', 700) + assert_total("/device/recall.json?search=openfda.regulation_number:878.4370", 700) def test_dupe_removal_fda_392(): - assert_total_exact('/device/recall.json?search=product_res_number.exact:Z-1844-2017', 1) - assert_total_exact('/device/recall.json?search=product_res_number.exact:Z-0592-2011', 1) - meta, results = fetch('/device/recall.json?search=product_res_number.exact:Z-0592-2011') - eq_(results[0]['res_event_number'], '56864') - eq_(results[0]['product_res_number'], 'Z-0592-2011') - eq_(results[0]['firm_fei_number'], '3000206154') - eq_(results[0]['event_date_terminated'], '2010-12-14') - eq_(results[0]['root_cause_description'], 'Error in 
labeling') - eq_(results[0]['product_code'], 'KKX') - eq_(results[0].get('pma_numbers'), None) - eq_(results[0]['k_numbers'], ['K050322']) - eq_(results[0]['openfda']['device_name'], 'Drape, Surgical') - eq_(results[0]['openfda']['medical_specialty_description'], 'General, Plastic Surgery') - eq_(results[0]['openfda']['device_class'], '2') - eq_(results[0]['openfda']['regulation_number'], '878.4370') + assert_total_exact( + "/device/recall.json?search=product_res_number.exact:Z-1844-2017", 1 + ) + assert_total_exact( + "/device/recall.json?search=product_res_number.exact:Z-0592-2011", 1 + ) + meta, results = fetch( + "/device/recall.json?search=product_res_number.exact:Z-0592-2011" + ) + eq_(results[0]["res_event_number"], "56864") + eq_(results[0]["product_res_number"], "Z-0592-2011") + eq_(results[0]["firm_fei_number"], "3000206154") + eq_(results[0]["event_date_terminated"], "2010-12-14") + eq_(results[0]["root_cause_description"], "Error in labeling") + eq_(results[0]["product_code"], "KKX") + eq_(results[0].get("pma_numbers"), None) + eq_(results[0]["k_numbers"], ["K050322"]) + eq_(results[0]["openfda"]["device_name"], "Drape, Surgical") + eq_( + results[0]["openfda"]["medical_specialty_description"], + "General, Plastic Surgery", + ) + eq_(results[0]["openfda"]["device_class"], "2") + eq_(results[0]["openfda"]["regulation_number"], "878.4370") diff --git a/openfda/deploy/tests/devicereglist/test_endpoint.py b/openfda/deploy/tests/devicereglist/test_endpoint.py index a1ed31b..eb469a6 100644 --- a/openfda/deploy/tests/devicereglist/test_endpoint.py +++ b/openfda/deploy/tests/devicereglist/test_endpoint.py @@ -1,29 +1,50 @@ - import requests from openfda.tests.api_test_helpers import * - def test_not_analyzed_0(): - assert_total('/device/registrationlisting.json?search=products.openfda.device_class:2', 111749) + assert_total( + "/device/registrationlisting.json?search=products.openfda.device_class:2", + 111749, + ) def test_exact_count_0(): - assert_count_top('/device/registrationlisting.json?count=registration.name.exact', ['Sterigenics U.S., LLC', 'Isomedix Operations, Inc.', 'Sterigenics US LLC', 'ISOMEDIX OPERATIONS INC.', 'C.R. BARD, INC.']) + assert_count_top( + "/device/registrationlisting.json?count=registration.name.exact", + [ + "Sterigenics U.S., LLC", + "Isomedix Operations, Inc.", + "Sterigenics US LLC", + "ISOMEDIX OPERATIONS INC.", + "C.R. 
BARD, INC.", + ], + ) def test_exact_count_1(): - assert_count_top('/device/registrationlisting.json?count=proprietary_name.exact', ['Integra', 'Miltex', 'PILLING', 'Aesculap', 'Jarit']) + assert_count_top( + "/device/registrationlisting.json?count=proprietary_name.exact", + ["Integra", "Miltex", "PILLING", "Aesculap", "Jarit"], + ) def test_date_1(): - assert_total('/device/registrationlisting.json?search=products.created_date:20080117', 150) + assert_total( + "/device/registrationlisting.json?search=products.created_date:20080117", 150 + ) def test_date_range_2(): - assert_total('/device/registrationlisting.json?search=products.created_date:([20080117+TO+20081231])', 14000) + assert_total( + "/device/registrationlisting.json?search=products.created_date:([20080117+TO+20081231])", + 14000, + ) def test_openfda_3(): - assert_total('/device/registrationlisting.json?search=products.openfda.regulation_number:876.1500', 3912) + assert_total( + "/device/registrationlisting.json?search=products.openfda.regulation_number:876.1500", + 3912, + ) diff --git a/openfda/deploy/tests/deviceudi/test_endpoint.py b/openfda/deploy/tests/deviceudi/test_endpoint.py index 621d236..33c58b0 100644 --- a/openfda/deploy/tests/deviceudi/test_endpoint.py +++ b/openfda/deploy/tests/deviceudi/test_endpoint.py @@ -5,310 +5,354 @@ from openfda.tests.api_test_helpers import * - def test_package_type(): - meta, results = fetch( - '/device/udi.json?search=identifiers.id:M20609070') - eq_(len(results), 1) + meta, results = fetch("/device/udi.json?search=identifiers.id:M20609070") + eq_(len(results), 1) + + udi = results[0] + eq_(udi["identifiers"][1]["package_type"], "Box") + eq_(udi["identifiers"][1]["type"], "Package") - udi = results[0] - eq_(udi["identifiers"][1]["package_type"], "Box") - eq_(udi["identifiers"][1]["type"], "Package") def test_size_value_as_string_fda82(): - meta, results = fetch( - '/device/udi.json?search=identifiers.id:10801741065450') - eq_(len(results), 1) + meta, results = fetch("/device/udi.json?search=identifiers.id:10801741065450") + eq_(len(results), 1) - udi = results[0] - eq_(udi["device_sizes"][0]["unit"], "Inch") - eq_(udi["device_sizes"][0]["value"], "3/16") - eq_(udi["device_sizes"][0]["type"], "Outer Diameter") + udi = results[0] + eq_(udi["device_sizes"][0]["unit"], "Inch") + eq_(udi["device_sizes"][0]["value"], "3/16") + eq_(udi["device_sizes"][0]["type"], "Outer Diameter") def test_booleans(): - # identifiers.package_status.exact - meta, results = fetch( - '/device/udi.json?search=is_direct_marking_exempt:true') - ok_(meta["results"]["total"] >= 51 + 10777 + 13555 + 2447 + 17088 + 3955 + 3383 + 6036 + 5168 + 2265 + 913) - + # identifiers.package_status.exact + meta, results = fetch("/device/udi.json?search=is_direct_marking_exempt:true") + ok_( + meta["results"]["total"] + >= 51 + 10777 + 13555 + 2447 + 17088 + 3955 + 3383 + 6036 + 5168 + 2265 + 913 + ) def test_exact_fields(): - # identifiers.package_status.exact - meta, results = fetch( - '/device/udi.json?search=identifiers.package_status.exact:"In Commercial Distribution"') - ok_(meta["results"]["total"] >= 36642) + # identifiers.package_status.exact + meta, results = fetch( + '/device/udi.json?search=identifiers.package_status.exact:"In Commercial Distribution"' + ) + ok_(meta["results"]["total"] >= 36642) - meta, results = fetch( - '/device/udi.json?search=identifiers.package_status.exact:"Not in Commercial Distribution"') - ok_(meta["results"]["total"] >= 79) + meta, results = fetch( + 
'/device/udi.json?search=identifiers.package_status.exact:"Not in Commercial Distribution"' + ) + ok_(meta["results"]["total"] >= 79) - meta, results = fetch( - '/device/udi.json?search=identifiers.package_status.exact:Commercial DDistribution') - eq_(meta, None) - eq_(results, None) + meta, results = fetch( + "/device/udi.json?search=identifiers.package_status.exact:Commercial DDistribution" + ) + eq_(meta, None) + eq_(results, None) def test_date_fields(): - # package_discontinue_date - meta, results = fetch( - '/device/udi.json?search=identifiers.package_discontinue_date:20170430&limit=1') - eq_(meta["results"]["total"], 6) - meta, results = fetch( - '/device/udi.json?search=identifiers.package_discontinue_date:"2017-04-30"&limit=1') - eq_(meta["results"]["total"], 6) - - udi = results[0] - ok_(udi["identifiers"][0]["id"] in ["50699753498278", - "00699753498273", - "00850490007061", - "00850490007054", - "00850490007085", - "00850490007078", - "00863101000306", - "10863101000303", - "20863101000300", - "50699753473626", - "00751774077704", - "00850490007078", - "00492450303419", - "00492450302887", - "00699753473621"]) - - # publish_date - meta, results = fetch( - '/device/udi.json?search=publish_date:[2014-12-31+TO+2015-01-03]&limit=100') - eq_(meta["results"]["total"], 4) - meta, results = fetch( - '/device/udi.json?search=publish_date:[20141231+TO+20150103]&limit=100') - eq_(meta["results"]["total"], 4) - - - meta, results = fetch( - '/device/udi.json?search=publish_date:[2014-05-01+TO+2014-06-30]&limit=100') - eq_(meta["results"]["total"], 3) - meta, results = fetch( - '/device/udi.json?search=publish_date:[20140501+TO+20140630]&limit=100') - eq_(meta["results"]["total"], 3) - meta, results = fetch( - '/device/udi.json?search=publish_date:[20140501%20TO%2020140630]&limit=100') - eq_(meta["results"]["total"], 3) - meta, results = fetch( - '/device/udi.json?search=publish_date:[20140501 TO 20140630]&limit=100') - eq_(meta["results"]["total"], 3) - meta, results = fetch( - '/device/udi.json?search=publish_date:([19900501 TO 20180630])+AND+commercial_distribution_end_date:["19900101" TO "20191231"]&limit=100') - ok_(meta["results"]["total"] > 13000) - - - - # commercial_distribution_end_date - meta, results = fetch( - '/device/udi.json?search=commercial_distribution_end_date:[2014-01-01+TO+2014-12-31]&limit=100') - eq_(meta["results"]["total"], 145) + # package_discontinue_date + meta, results = fetch( + "/device/udi.json?search=identifiers.package_discontinue_date:20170430&limit=1" + ) + eq_(meta["results"]["total"], 6) + meta, results = fetch( + '/device/udi.json?search=identifiers.package_discontinue_date:"2017-04-30"&limit=1' + ) + eq_(meta["results"]["total"], 6) + + udi = results[0] + ok_( + udi["identifiers"][0]["id"] + in [ + "50699753498278", + "00699753498273", + "00850490007061", + "00850490007054", + "00850490007085", + "00850490007078", + "00863101000306", + "10863101000303", + "20863101000300", + "50699753473626", + "00751774077704", + "00850490007078", + "00492450303419", + "00492450302887", + "00699753473621", + ] + ) + + # publish_date + meta, results = fetch( + "/device/udi.json?search=publish_date:[2014-12-31+TO+2015-01-03]&limit=100" + ) + eq_(meta["results"]["total"], 4) + meta, results = fetch( + "/device/udi.json?search=publish_date:[20141231+TO+20150103]&limit=100" + ) + eq_(meta["results"]["total"], 4) + + meta, results = fetch( + "/device/udi.json?search=publish_date:[2014-05-01+TO+2014-06-30]&limit=100" + ) + eq_(meta["results"]["total"], 3) + meta, results = 
fetch( + "/device/udi.json?search=publish_date:[20140501+TO+20140630]&limit=100" + ) + eq_(meta["results"]["total"], 3) + meta, results = fetch( + "/device/udi.json?search=publish_date:[20140501%20TO%2020140630]&limit=100" + ) + eq_(meta["results"]["total"], 3) + meta, results = fetch( + "/device/udi.json?search=publish_date:[20140501 TO 20140630]&limit=100" + ) + eq_(meta["results"]["total"], 3) + meta, results = fetch( + '/device/udi.json?search=publish_date:([19900501 TO 20180630])+AND+commercial_distribution_end_date:["19900101" TO "20191231"]&limit=100' + ) + ok_(meta["results"]["total"] > 13000) + + # commercial_distribution_end_date + meta, results = fetch( + "/device/udi.json?search=commercial_distribution_end_date:[2014-01-01+TO+2014-12-31]&limit=100" + ) + eq_(meta["results"]["total"], 145) def test_identifiers(): - meta, results = fetch( - '/device/udi.json?search=brand_name:"Cardiovascular Procedure Kit"+AND+version_or_model_number.exact:"66483"') - eq_(len(results), 1) - - udi = results[0] - eq_(udi["identifiers"][0]["id"], "50699753498278") - eq_(udi["identifiers"][0]["type"], "Package") - eq_(udi["identifiers"][0]["issuing_agency"], "GS1") - eq_(udi["identifiers"][0]["unit_of_use_id"], "00699753498273") - eq_(udi["identifiers"][0]["quantity_per_package"], "4") - eq_(udi["identifiers"][0]["package_discontinue_date"], "2017-05-01") - eq_(udi["identifiers"][0]["package_status"], "Not in Commercial Distribution") - - eq_(udi["identifiers"][1]["id"], "00699753498273") - eq_(udi["identifiers"][1]["type"], "Primary") - eq_(udi["identifiers"][1]["issuing_agency"], "GS1") - eq_(udi["identifiers"][1].get("unit_of_use_id"), None) - eq_(udi["identifiers"][1].get("quantity_per_package"), None) - eq_(udi["identifiers"][1].get("package_discontinue_date"), None) - eq_(udi["identifiers"][1].get("package_status"), None) + meta, results = fetch( + '/device/udi.json?search=brand_name:"Cardiovascular Procedure Kit"+AND+version_or_model_number.exact:"66483"' + ) + eq_(len(results), 1) + + udi = results[0] + eq_(udi["identifiers"][0]["id"], "50699753498278") + eq_(udi["identifiers"][0]["type"], "Package") + eq_(udi["identifiers"][0]["issuing_agency"], "GS1") + eq_(udi["identifiers"][0]["unit_of_use_id"], "00699753498273") + eq_(udi["identifiers"][0]["quantity_per_package"], "4") + eq_(udi["identifiers"][0]["package_discontinue_date"], "2017-05-01") + eq_(udi["identifiers"][0]["package_status"], "Not in Commercial Distribution") + + eq_(udi["identifiers"][1]["id"], "00699753498273") + eq_(udi["identifiers"][1]["type"], "Primary") + eq_(udi["identifiers"][1]["issuing_agency"], "GS1") + eq_(udi["identifiers"][1].get("unit_of_use_id"), None) + eq_(udi["identifiers"][1].get("quantity_per_package"), None) + eq_(udi["identifiers"][1].get("package_discontinue_date"), None) + eq_(udi["identifiers"][1].get("package_status"), None) def test_openfda(): - meta, results = fetch( - '/device/udi.json?search=identifiers.id:00085412096094') - eq_(len(results), 1) - - udi = results[0] - - # Fei and premarket data is supressed for initial release. 
- eq_(udi.get("fei_number"), None) - eq_(udi.get("pma_submissions"), None) - eq_(udi.get("pma_submissions"), None) - eq_(udi.get("pma_submissions"), None) - - eq_(udi["product_codes"][0]["code"], "LMF") - eq_(udi["product_codes"][0]["name"], "Agent, Absorbable Hemostatic, Collagen Based") - eq_(udi["product_codes"][0]["openfda"]["device_class"], "3") - eq_(udi["product_codes"][0]["openfda"]["device_name"].lower(), "agent, absorbable hemostatic, collagen based") - eq_(udi["product_codes"][0]["openfda"]["medical_specialty_description"], "General, Plastic Surgery") - eq_(udi["product_codes"][0]["openfda"]["regulation_number"], "878.4490") - eq_(udi["product_codes"][0]["openfda"].get("registration_number"), None) - eq_(udi["product_codes"][0]["openfda"].get("fei_number"), None) - eq_(udi["product_codes"][0]["openfda"].get("pma_number"), None) + meta, results = fetch("/device/udi.json?search=identifiers.id:00085412096094") + eq_(len(results), 1) + + udi = results[0] + + # Fei and premarket data is supressed for initial release. + eq_(udi.get("fei_number"), None) + eq_(udi.get("pma_submissions"), None) + eq_(udi.get("pma_submissions"), None) + eq_(udi.get("pma_submissions"), None) + + eq_(udi["product_codes"][0]["code"], "LMF") + eq_(udi["product_codes"][0]["name"], "Agent, Absorbable Hemostatic, Collagen Based") + eq_(udi["product_codes"][0]["openfda"]["device_class"], "3") + eq_( + udi["product_codes"][0]["openfda"]["device_name"].lower(), + "agent, absorbable hemostatic, collagen based", + ) + eq_( + udi["product_codes"][0]["openfda"]["medical_specialty_description"], + "General, Plastic Surgery", + ) + eq_(udi["product_codes"][0]["openfda"]["regulation_number"], "878.4490") + eq_(udi["product_codes"][0]["openfda"].get("registration_number"), None) + eq_(udi["product_codes"][0]["openfda"].get("fei_number"), None) + eq_(udi["product_codes"][0]["openfda"].get("pma_number"), None) def test_storage(): - meta, results = fetch( - '/device/udi.json?search=identifiers.id:00887517614384') - eq_(len(results), 1) + meta, results = fetch("/device/udi.json?search=identifiers.id:00887517614384") + eq_(len(results), 1) - udi = results[0] - eq_(udi["storage"][0]["type"], "Storage Environment Temperature") - eq_(udi["storage"][0]["high"]["unit"], "Degrees Celsius") - eq_(udi["storage"][0]["high"]["value"], "30") - eq_(udi["storage"][0]["low"]["unit"], "Degrees Celsius") - eq_(udi["storage"][0]["low"]["value"], "15") + udi = results[0] + eq_(udi["storage"][0]["type"], "Storage Environment Temperature") + eq_(udi["storage"][0]["high"]["unit"], "Degrees Celsius") + eq_(udi["storage"][0]["high"]["value"], "30") + eq_(udi["storage"][0]["low"]["unit"], "Degrees Celsius") + eq_(udi["storage"][0]["low"]["value"], "15") - # KEEP DRY - meta, results = fetch( - '/device/udi.json?search=identifiers.id:00886874103951') - eq_(len(results), 1) + # KEEP DRY + meta, results = fetch("/device/udi.json?search=identifiers.id:00886874103951") + eq_(len(results), 1) - udi = results[0] - eq_(udi["storage"][0]["special_conditions"], "KEEP DRY") + udi = results[0] + eq_(udi["storage"][0]["special_conditions"], "KEEP DRY") def test_commercial_distribution_end_date(): - meta, results = fetch( - '/device/udi.json?search=commercial_distribution_status:"Not+in+Commercial+Distribution"+AND+brand_name:DURACUFF+AND+identifiers.id:04051948012811') - eq_(len(results), 1) + meta, results = fetch( + '/device/udi.json?search=commercial_distribution_status:"Not+in+Commercial+Distribution"+AND+brand_name:DURACUFF+AND+identifiers.id:04051948012811' + 
) + eq_(len(results), 1) - udi = results[0] - eq_(udi.get("commercial_distribution_end_date"), "2021-10-12") - eq_(udi["commercial_distribution_status"], "Not in Commercial Distribution") + udi = results[0] + eq_(udi.get("commercial_distribution_end_date"), "2021-10-12") + eq_(udi["commercial_distribution_status"], "Not in Commercial Distribution") def test_catalog_number(): - meta, results = fetch( - '/device/udi.json?search=catalog_number:521554BE2') - eq_(len(results), 1) + meta, results = fetch("/device/udi.json?search=catalog_number:521554BE2") + eq_(len(results), 1) - udi = results[0] - eq_(udi["catalog_number"], "14-521554BE2") + udi = results[0] + eq_(udi["catalog_number"], "14-521554BE2") - # Make sure exact search also works. - meta, results = fetch( - '/device/udi.json?search=catalog_number.exact:14-521554BE2') - eq_(len(results), 1) + # Make sure exact search also works. + meta, results = fetch("/device/udi.json?search=catalog_number.exact:14-521554BE2") + eq_(len(results), 1) - udi = results[0] - eq_(udi["catalog_number"], "14-521554BE2") + udi = results[0] + eq_(udi["catalog_number"], "14-521554BE2") def test_device_sizes(): - meta, results = fetch( - '/device/udi.json?search=identifiers.id:00886874104361') - eq_(len(results), 1) + meta, results = fetch("/device/udi.json?search=identifiers.id:00886874104361") + eq_(len(results), 1) - udi = results[0] - eq_(udi["device_sizes"][0]["unit"], "Millimeter") - eq_(udi["device_sizes"][0]["value"], "4.3") - eq_(udi["device_sizes"][0]["type"], "Depth") + udi = results[0] + eq_(udi["device_sizes"][0]["unit"], "Millimeter") + eq_(udi["device_sizes"][0]["value"], "4.3") + eq_(udi["device_sizes"][0]["type"], "Depth") - meta, results = fetch( - '/device/udi.json?search=identifiers.id:00888480590095') - eq_(len(results), 1) + meta, results = fetch("/device/udi.json?search=identifiers.id:00888480590095") + eq_(len(results), 1) - udi = results[0] - eq_(udi["device_sizes"][0].get("unit"), None) - eq_(udi["device_sizes"][0].get("value"), None) - eq_(udi["device_sizes"][0]["type"], "Device Size Text, specify") - eq_(udi["device_sizes"][0]["text"], "4.5 X 24MM BLUNT FXD SCREW 2PK") + udi = results[0] + eq_(udi["device_sizes"][0].get("unit"), None) + eq_(udi["device_sizes"][0].get("value"), None) + eq_(udi["device_sizes"][0]["type"], "Device Size Text, specify") + eq_(udi["device_sizes"][0]["text"], "4.5 X 24MM BLUNT FXD SCREW 2PK") def test_CoRoent(): - meta, results = fetch( - '/device/udi.json?search=identifiers.id:00887517567062+AND+identifiers.type:Primary+AND+identifiers.issuing_agency.exact:GS1') - eq_(len(results), 1) - - udi = results[0] - eq_(udi["record_status"], "Published") - eq_(udi["publish_date"], "2015-10-24") - eq_(udi.get("commercial_distribution_end_date"), None) - eq_(udi["commercial_distribution_status"], "In Commercial Distribution") - eq_(udi["identifiers"][0]["id"], "00887517567062") - eq_(udi["identifiers"][0]["type"], "Primary") - eq_(udi["identifiers"][0]["issuing_agency"], "GS1") - eq_(udi["brand_name"], "CoRoent") - eq_(udi["version_or_model_number"], "5163284") - eq_(udi.get("catalog_number"), None) - eq_(udi["device_count_in_base_package"], "1") - eq_(udi["device_description"], str("CoRoent Ant TLIF PEEK, 16x13x28mm 4°")) - eq_(udi["is_direct_marking_exempt"], "false") - eq_(udi["is_pm_exempt"], "false") - eq_(udi["is_hct_p"], "false") - eq_(udi["is_kit"], "false") - eq_(udi["is_combination_product"], "false") - eq_(udi["is_single_use"], "false") - eq_(udi["has_lot_or_batch_number"], "true") - 
eq_(udi["has_serial_number"], "false") - eq_(udi["has_manufacturing_date"], "false") - eq_(udi["has_expiration_date"], "false") - eq_(udi["has_donation_id_number"], "false") - eq_(udi["is_labeled_as_nrl"], "false") - eq_(udi["is_labeled_as_no_nrl"], "false") - eq_(udi["mri_safety"], "Labeling does not contain MRI Safety Information") - eq_(udi["is_rx"], "true") - eq_(udi["is_otc"], "false") - eq_(udi["customer_contacts"][0]["phone"], "+1(858)909-1800") - eq_(udi["customer_contacts"][0]["email"], "RA_UDI@nuvasive.com") - eq_(udi["customer_contacts"][0].get("ext"), None) - eq_(udi["gmdn_terms"][0]["name"], "Metallic spinal fusion cage, non-sterile") - eq_(udi["gmdn_terms"][0]["definition"], - "A non-sterile device intended to help fuse segments of the spine to treat anatomical abnormalities of the vertebrae, typically due to degenerative intervertebral disks [i.e., degenerative disc disease (DDD)]. The device is typically designed as a small, hollow and/or porous, threaded or fenestrated cylinder (or other geometric form) made of metal [usually titanium (Ti)] that is implanted between the bones or bone grafts of the spine, to provide mechanical stability and sufficient space for therapeutic spinal bone fusion to occur. Disposable devices associated with implantation may be included. This device must be sterilized prior to use.") - - eq_(udi["product_codes"][0]["code"], "MAX") - eq_(udi["product_codes"][0]["name"], "Intervertebral fusion device with bone graft, lumbar") - eq_(udi["product_codes"][0]["openfda"]["device_class"], "2") - eq_(udi["product_codes"][0]["openfda"]["device_name"], "Intervertebral Fusion Device With Bone Graft, Lumbar") - eq_(udi["product_codes"][0]["openfda"]["medical_specialty_description"], "Orthopedic") - eq_(udi["product_codes"][0]["openfda"]["regulation_number"], "888.3080") - - eq_(udi["product_codes"][1]["code"], "MQP") - eq_(udi["product_codes"][1]["name"], "SPINAL VERTEBRAL BODY REPLACEMENT DEVICE") - eq_(udi["product_codes"][1]["openfda"]["device_class"], "2") - eq_(udi["product_codes"][1]["openfda"]["device_name"], "Spinal Vertebral Body Replacement Device") - eq_(udi["product_codes"][1]["openfda"]["medical_specialty_description"], "Orthopedic") - eq_(udi["product_codes"][1]["openfda"]["regulation_number"], "888.3060") - - eq_(udi.get("device_sizes"), None) - eq_(udi.get("storage"), None) - eq_(udi["sterilization"]["is_sterile"], "false") - eq_(udi["sterilization"]["is_sterilization_prior_use"], "true") - eq_(udi["sterilization"]["sterilization_methods"], "Moist Heat or Steam Sterilization") - - # Fei and premarket data is supressed for initial release. 
- eq_(udi.get("fei_number"), None) - eq_(udi.get("pma_submissions"), None) - eq_(udi.get("pma_submissions"), None) - eq_(udi.get("pma_submissions"), None) + meta, results = fetch( + "/device/udi.json?search=identifiers.id:00887517567062+AND+identifiers.type:Primary+AND+identifiers.issuing_agency.exact:GS1" + ) + eq_(len(results), 1) + + udi = results[0] + eq_(udi["record_status"], "Published") + eq_(udi["publish_date"], "2015-10-24") + eq_(udi.get("commercial_distribution_end_date"), None) + eq_(udi["commercial_distribution_status"], "In Commercial Distribution") + eq_(udi["identifiers"][0]["id"], "00887517567062") + eq_(udi["identifiers"][0]["type"], "Primary") + eq_(udi["identifiers"][0]["issuing_agency"], "GS1") + eq_(udi["brand_name"], "CoRoent") + eq_(udi["version_or_model_number"], "5163284") + eq_(udi.get("catalog_number"), None) + eq_(udi["device_count_in_base_package"], "1") + eq_(udi["device_description"], str("CoRoent Ant TLIF PEEK, 16x13x28mm 4°")) + eq_(udi["is_direct_marking_exempt"], "false") + eq_(udi["is_pm_exempt"], "false") + eq_(udi["is_hct_p"], "false") + eq_(udi["is_kit"], "false") + eq_(udi["is_combination_product"], "false") + eq_(udi["is_single_use"], "false") + eq_(udi["has_lot_or_batch_number"], "true") + eq_(udi["has_serial_number"], "false") + eq_(udi["has_manufacturing_date"], "false") + eq_(udi["has_expiration_date"], "false") + eq_(udi["has_donation_id_number"], "false") + eq_(udi["is_labeled_as_nrl"], "false") + eq_(udi["is_labeled_as_no_nrl"], "false") + eq_(udi["mri_safety"], "Labeling does not contain MRI Safety Information") + eq_(udi["is_rx"], "true") + eq_(udi["is_otc"], "false") + eq_(udi["customer_contacts"][0]["phone"], "+1(858)909-1800") + eq_(udi["customer_contacts"][0]["email"], "RA_UDI@nuvasive.com") + eq_(udi["customer_contacts"][0].get("ext"), None) + eq_(udi["gmdn_terms"][0]["name"], "Metallic spinal fusion cage, non-sterile") + eq_( + udi["gmdn_terms"][0]["definition"], + "A non-sterile device intended to help fuse segments of the spine to treat anatomical abnormalities of the vertebrae, typically due to degenerative intervertebral disks [i.e., degenerative disc disease (DDD)]. The device is typically designed as a small, hollow and/or porous, threaded or fenestrated cylinder (or other geometric form) made of metal [usually titanium (Ti)] that is implanted between the bones or bone grafts of the spine, to provide mechanical stability and sufficient space for therapeutic spinal bone fusion to occur. Disposable devices associated with implantation may be included. 
This device must be sterilized prior to use.", + ) + + eq_(udi["product_codes"][0]["code"], "MAX") + eq_( + udi["product_codes"][0]["name"], + "Intervertebral fusion device with bone graft, lumbar", + ) + eq_(udi["product_codes"][0]["openfda"]["device_class"], "2") + eq_( + udi["product_codes"][0]["openfda"]["device_name"], + "Intervertebral Fusion Device With Bone Graft, Lumbar", + ) + eq_( + udi["product_codes"][0]["openfda"]["medical_specialty_description"], + "Orthopedic", + ) + eq_(udi["product_codes"][0]["openfda"]["regulation_number"], "888.3080") + + eq_(udi["product_codes"][1]["code"], "MQP") + eq_(udi["product_codes"][1]["name"], "SPINAL VERTEBRAL BODY REPLACEMENT DEVICE") + eq_(udi["product_codes"][1]["openfda"]["device_class"], "2") + eq_( + udi["product_codes"][1]["openfda"]["device_name"], + "Spinal Vertebral Body Replacement Device", + ) + eq_( + udi["product_codes"][1]["openfda"]["medical_specialty_description"], + "Orthopedic", + ) + eq_(udi["product_codes"][1]["openfda"]["regulation_number"], "888.3060") + + eq_(udi.get("device_sizes"), None) + eq_(udi.get("storage"), None) + eq_(udi["sterilization"]["is_sterile"], "false") + eq_(udi["sterilization"]["is_sterilization_prior_use"], "true") + eq_( + udi["sterilization"]["sterilization_methods"], + "Moist Heat or Steam Sterilization", + ) + + # Fei and premarket data is supressed for initial release. + eq_(udi.get("fei_number"), None) + eq_(udi.get("pma_submissions"), None) + eq_(udi.get("pma_submissions"), None) + eq_(udi.get("pma_submissions"), None) def test_total_count(): - assert_total('/device/udi.json', 533467) + assert_total("/device/udi.json", 533467) def test_product_code_count(): - assert_count_top('/device/udi.json?count=product_codes.code.exact', - ['MNI', 'MNH', 'KWP', 'KWQ']) + assert_count_top( + "/device/udi.json?count=product_codes.code.exact", ["MNI", "MNH", "KWP", "KWQ"] + ) def test_mri_safety_count(): - assert_count_top('/device/udi.json?count=mri_safety.exact', - ['Labeling does not contain MRI Safety Information', 'MR Conditional', 'MR Unsafe', 'MR Safe']) - - -if __name__ == '__main__': - all_functions = inspect.getmembers(sys.modules[__name__], inspect.isfunction) - for key, func in all_functions: - if key.find("test_") > -1: - func() + assert_count_top( + "/device/udi.json?count=mri_safety.exact", + [ + "Labeling does not contain MRI Safety Information", + "MR Conditional", + "MR Unsafe", + "MR Safe", + ], + ) + + +if __name__ == "__main__": + all_functions = inspect.getmembers(sys.modules[__name__], inspect.isfunction) + for key, func in all_functions: + if key.find("test_") > -1: + func() diff --git a/openfda/deploy/tests/downloadstats/test_endpoint.py b/openfda/deploy/tests/downloadstats/test_endpoint.py index 1edc3e6..3385602 100644 --- a/openfda/deploy/tests/downloadstats/test_endpoint.py +++ b/openfda/deploy/tests/downloadstats/test_endpoint.py @@ -2,32 +2,33 @@ import inspect import sys + def test_download_counts(): - status = api_test_helpers.json('/usage.json') - assert status["downloadStats"]["deviceudi"] >= 111000 - assert status["downloadStats"]["deviceenforcement"] >= 288800 - assert status["downloadStats"]["devicereglist"] >= 1999834 - assert status["downloadStats"]["othernsde"] >= 9215 - assert status["downloadStats"]["druglabel"] >= 225705 - assert status["downloadStats"]["covid19serology"] >= 2753 - assert status["downloadStats"]["ndc"] >= 124783 - assert status["downloadStats"]["devicepma"] >= 9783 - assert status["downloadStats"]["deviceclearance"] >= 20863 - assert 
status["downloadStats"]["tobaccoproblem"] >= 761 - assert status["downloadStats"]["foodenforcement"] >= 80278 - assert status["downloadStats"]["drugsfda"] >= 15972 - assert status["downloadStats"]["foodevent"] >= 16823 - assert status["downloadStats"]["deviceevent"] >= 878275 - assert status["downloadStats"]["animalandveterinarydrugevent"] >= 325266 - assert status["downloadStats"]["othersubstance"] >= 2491 - assert status["downloadStats"]["deviceclass"] >= 91209 - assert status["downloadStats"]["devicerecall"] >= 33674 - assert status["downloadStats"]["drugenforcement"] >= 38296 - assert status["downloadStats"]["drugevent"] >= 3093777 + status = api_test_helpers.json("/usage.json") + assert status["downloadStats"]["deviceudi"] >= 111000 + assert status["downloadStats"]["deviceenforcement"] >= 288800 + assert status["downloadStats"]["devicereglist"] >= 1999834 + assert status["downloadStats"]["othernsde"] >= 9215 + assert status["downloadStats"]["druglabel"] >= 225705 + assert status["downloadStats"]["covid19serology"] >= 2753 + assert status["downloadStats"]["ndc"] >= 124783 + assert status["downloadStats"]["devicepma"] >= 9783 + assert status["downloadStats"]["deviceclearance"] >= 20863 + assert status["downloadStats"]["tobaccoproblem"] >= 761 + assert status["downloadStats"]["foodenforcement"] >= 80278 + assert status["downloadStats"]["drugsfda"] >= 15972 + assert status["downloadStats"]["foodevent"] >= 16823 + assert status["downloadStats"]["deviceevent"] >= 878275 + assert status["downloadStats"]["animalandveterinarydrugevent"] >= 325266 + assert status["downloadStats"]["othersubstance"] >= 2491 + assert status["downloadStats"]["deviceclass"] >= 91209 + assert status["downloadStats"]["devicerecall"] >= 33674 + assert status["downloadStats"]["drugenforcement"] >= 38296 + assert status["downloadStats"]["drugevent"] >= 3093777 -if __name__ == '__main__': - all_functions = inspect.getmembers(sys.modules[__name__], inspect.isfunction) - for key, func in all_functions: - if key.find("test_") > -1: - func() +if __name__ == "__main__": + all_functions = inspect.getmembers(sys.modules[__name__], inspect.isfunction) + for key, func in all_functions: + if key.find("test_") > -1: + func() diff --git a/openfda/deploy/tests/drugevent/test_endpoint.py b/openfda/deploy/tests/drugevent/test_endpoint.py index 2739b9a..e407c2a 100644 --- a/openfda/deploy/tests/drugevent/test_endpoint.py +++ b/openfda/deploy/tests/drugevent/test_endpoint.py @@ -1,112 +1,145 @@ - import requests from openfda.tests.api_test_helpers import * -EXACT_FIELDS = ['openfda.application_number.exact', 'openfda.brand_name.exact', - 'openfda.generic_name.exact', 'openfda.manufacturer_name.exact', - 'openfda.nui.exact', 'openfda.pharm_class_cs.exact', - 'openfda.pharm_class_epc.exact', 'openfda.pharm_class_moa.exact', 'openfda.pharm_class_pe.exact', - 'openfda.product_type.exact', 'openfda.route.exact', 'openfda.rxcui.exact', - 'openfda.substance_name.exact', 'openfda.unii.exact'] +EXACT_FIELDS = [ + "openfda.application_number.exact", + "openfda.brand_name.exact", + "openfda.generic_name.exact", + "openfda.manufacturer_name.exact", + "openfda.nui.exact", + "openfda.pharm_class_cs.exact", + "openfda.pharm_class_epc.exact", + "openfda.pharm_class_moa.exact", + "openfda.pharm_class_pe.exact", + "openfda.product_type.exact", + "openfda.route.exact", + "openfda.rxcui.exact", + "openfda.substance_name.exact", + "openfda.unii.exact", +] + def test_exact_field_queries_after_fda_253(): - for field in EXACT_FIELDS: - print(field) - meta, 
counts = fetch_counts('/drug/event.json?count=patient.drug.%s&limit=1' % field) - eq_(len(counts), 1) + for field in EXACT_FIELDS: + print(field) + meta, counts = fetch_counts( + "/drug/event.json?count=patient.drug.%s&limit=1" % field + ) + eq_(len(counts), 1) + def test_not_analyzed_0(): - assert_total('/drug/event.json?search=patient.drug.openfda.spl_id:d08e6fbf-9162-47cd-9cf6-74ea24d48214', 65410) + assert_total( + "/drug/event.json?search=patient.drug.openfda.spl_id:d08e6fbf-9162-47cd-9cf6-74ea24d48214", + 65410, + ) def test_date_1(): - assert_total('/drug/event.json?search=receivedate:20040319', 2900) + assert_total("/drug/event.json?search=receivedate:20040319", 2900) def test_date_range_2(): - assert_total('/drug/event.json?search=receivedate:([20040319+TO+20041231])', 166000) + assert_total("/drug/event.json?search=receivedate:([20040319+TO+20041231])", 166000) def test_openfda_3(): - assert_total('/drug/event.json?search=patient.drug.openfda.unii:A4P49JAZ9H', 1463) + assert_total("/drug/event.json?search=patient.drug.openfda.unii:A4P49JAZ9H", 1463) def test_2018q3_present(): - assert_total('/drug/event.json?search=safetyreportid.exact:9995402', 1) + assert_total("/drug/event.json?search=safetyreportid.exact:9995402", 1) + def test_drug_event_unii(): - assert_unii('/drug/event.json?search=safetyreportid:10003301', ['RM1CE97Z4N', 'WK2XYI10QM']) + assert_unii( + "/drug/event.json?search=safetyreportid:10003301", ["RM1CE97Z4N", "WK2XYI10QM"] + ) + def test_drug_generic_name_count(): - assert_total('/drug/event.json?search=patient.drug.medicinalproduct:TOVIAZ', 7700) - assert_total('/drug/event.json?search=patient.drug.openfda.generic_name:%22dimethyl%20fumarate%22', 90000) + assert_total("/drug/event.json?search=patient.drug.medicinalproduct:TOVIAZ", 7700) + assert_total( + "/drug/event.json?search=patient.drug.openfda.generic_name:%22dimethyl%20fumarate%22", + 90000, + ) + def test_histogram(): - meta, counts = fetch_counts('/drug/event.json?search=receivedate:[20040101+TO+20180611]&count=receivedate') - eq_(counts[0].term, '20040101') - eq_(counts[0].count, 1) - eq_(counts[1].term, '20040102') - ok_(counts[1].count > 500) - eq_(counts[2].term, '20040103') - eq_(counts[2].count, 2) - assert_greater_equal(len(counts), 4920) + meta, counts = fetch_counts( + "/drug/event.json?search=receivedate:[20040101+TO+20180611]&count=receivedate" + ) + eq_(counts[0].term, "20040101") + eq_(counts[0].count, 1) + eq_(counts[1].term, "20040102") + ok_(counts[1].count > 500) + eq_(counts[2].term, "20040103") + eq_(counts[2].count, 2) + assert_greater_equal(len(counts), 4920) + # Uncomment once FDA-250 is fixed. 
+ def test_nonsteriodial_drug_reactions(): - assert_count_top('/drug/event.json?search=patient.drug.openfda.pharm_class_epc:' - '"nonsteroidal+anti-inflammatory+drug"&count=patient.reaction.reactionmeddrapt.exact', [ - 'DRUG INEFFECTIVE', - 'NAUSEA', - 'FATIGUE', - 'PAIN' - ]) + assert_count_top( + "/drug/event.json?search=patient.drug.openfda.pharm_class_epc:" + '"nonsteroidal+anti-inflammatory+drug"&count=patient.reaction.reactionmeddrapt.exact', + ["DRUG INEFFECTIVE", "NAUSEA", "FATIGUE", "PAIN"], + ) def test_nonsteriodial_drug_reactions(): - assert_count_top('/drug/event.json?search=patient.drug.openfda.pharm_class_epc:' - '"nonsteroidal+anti-inflammatory+drug"&count=patient.reaction.reactionmeddrapt.exact', [ - 'DIZZINESS', - 'NAUSEA', - 'FATIGUE' - ]) + assert_count_top( + "/drug/event.json?search=patient.drug.openfda.pharm_class_epc:" + '"nonsteroidal+anti-inflammatory+drug"&count=patient.reaction.reactionmeddrapt.exact', + ["DIZZINESS", "NAUSEA", "FATIGUE"], + ) - assert_count_contains('/drug/event.json?search=patient.drug.openfda.pharm_class_epc:' - '"nonsteroidal+anti-inflammatory+drug"&count=patient.reaction.reactionmeddrapt.exact&limit=1000', [ - 'HEADACHE', - 'HEART RATE INCREASED', - 'SEPSIS' - ]) + assert_count_contains( + "/drug/event.json?search=patient.drug.openfda.pharm_class_epc:" + '"nonsteroidal+anti-inflammatory+drug"&count=patient.reaction.reactionmeddrapt.exact&limit=1000', + ["HEADACHE", "HEART RATE INCREASED", "SEPSIS"], + ) def test_date_formats_after_migration(): - assert_total('/drug/event.json?search=safetyreportid:"10141822"+AND+' - 'receiptdate:20181008+AND+receiptdate:2018-10-08+AND+transmissiondate:' - '[20190203+TO+20190205]+AND+transmissiondate:[2019-02-03+TO+2019-02-05]', 1) - + assert_total( + '/drug/event.json?search=safetyreportid:"10141822"+AND+' + "receiptdate:20181008+AND+receiptdate:2018-10-08+AND+transmissiondate:" + "[20190203+TO+20190205]+AND+transmissiondate:[2019-02-03+TO+2019-02-05]", + 1, + ) def test_drugrecurrence_is_always_an_array(): - meta, results = fetch( - '/drug/event.json?search=safetyreportid:14340262&limit=1') - - event = results[0] - eq_(len(event['patient']['drug']), 1) - eq_(type(event['patient']['drug'][0]['drugrecurrence']), type([])) - eq_(len(event['patient']['drug'][0]['drugrecurrence']), 4) - - drugrecurrence = sorted(event['patient']['drug'][0]['drugrecurrence'], key=lambda k: k['drugrecuraction']) - eq_(drugrecurrence[0]['drugrecuraction'], 'Drug intolerance') - eq_(drugrecurrence[1]['drugrecuraction'], 'Dyspnoea') - eq_(drugrecurrence[2]['drugrecuraction'], 'Type III immune complex mediated reaction') - eq_(drugrecurrence[3]['drugrecuraction'], 'Urticaria') - - meta, results = fetch( - '/drug/event.json?search=safetyreportid:17394804&limit=1') - event = results[0] - eq_(len(event['patient']['drug']), 1) - eq_(type(event['patient']['drug'][0]['drugrecurrence']), type([])) - eq_(len(event['patient']['drug'][0]['drugrecurrence']), 1) - eq_(event['patient']['drug'][0]['drugrecurrence'][0]['drugrecuraction'], "Embolism venous") + meta, results = fetch("/drug/event.json?search=safetyreportid:14340262&limit=1") + + event = results[0] + eq_(len(event["patient"]["drug"]), 1) + eq_(type(event["patient"]["drug"][0]["drugrecurrence"]), type([])) + eq_(len(event["patient"]["drug"][0]["drugrecurrence"]), 4) + + drugrecurrence = sorted( + event["patient"]["drug"][0]["drugrecurrence"], + key=lambda k: k["drugrecuraction"], + ) + eq_(drugrecurrence[0]["drugrecuraction"], "Drug intolerance") + 
eq_(drugrecurrence[1]["drugrecuraction"], "Dyspnoea") + eq_( + drugrecurrence[2]["drugrecuraction"], + "Type III immune complex mediated reaction", + ) + eq_(drugrecurrence[3]["drugrecuraction"], "Urticaria") + + meta, results = fetch("/drug/event.json?search=safetyreportid:17394804&limit=1") + event = results[0] + eq_(len(event["patient"]["drug"]), 1) + eq_(type(event["patient"]["drug"][0]["drugrecurrence"]), type([])) + eq_(len(event["patient"]["drug"][0]["drugrecurrence"]), 1) + eq_( + event["patient"]["drug"][0]["drugrecurrence"][0]["drugrecuraction"], + "Embolism venous", + ) diff --git a/openfda/deploy/tests/druglabel/test_endpoint.py b/openfda/deploy/tests/druglabel/test_endpoint.py index 61d18fa..fa988d5 100644 --- a/openfda/deploy/tests/druglabel/test_endpoint.py +++ b/openfda/deploy/tests/druglabel/test_endpoint.py @@ -1,111 +1,156 @@ - import requests from openfda.tests.api_test_helpers import * -EXACT_FIELDS = ['openfda.application_number.exact', 'openfda.brand_name.exact', - 'openfda.generic_name.exact', 'openfda.is_original_packager.exact', 'openfda.manufacturer_name.exact', - 'openfda.nui.exact', 'openfda.package_ndc.exact', 'openfda.pharm_class_cs.exact', - 'openfda.pharm_class_epc.exact', 'openfda.pharm_class_moa.exact', 'openfda.pharm_class_pe.exact', - 'openfda.product_ndc.exact', 'openfda.product_type.exact', 'openfda.route.exact', 'openfda.rxcui.exact', - 'openfda.spl_id.exact', 'openfda.spl_set_id.exact', - 'openfda.substance_name.exact', 'openfda.unii.exact', 'openfda.upc.exact'] +EXACT_FIELDS = [ + "openfda.application_number.exact", + "openfda.brand_name.exact", + "openfda.generic_name.exact", + "openfda.is_original_packager.exact", + "openfda.manufacturer_name.exact", + "openfda.nui.exact", + "openfda.package_ndc.exact", + "openfda.pharm_class_cs.exact", + "openfda.pharm_class_epc.exact", + "openfda.pharm_class_moa.exact", + "openfda.pharm_class_pe.exact", + "openfda.product_ndc.exact", + "openfda.product_type.exact", + "openfda.route.exact", + "openfda.rxcui.exact", + "openfda.spl_id.exact", + "openfda.spl_set_id.exact", + "openfda.substance_name.exact", + "openfda.unii.exact", + "openfda.upc.exact", +] + def test_exact_field_queries_after_fda_253(): - for field in EXACT_FIELDS: - print(field) - meta, counts = fetch_counts('/drug/label.json?count=%s&limit=1' % field) - eq_(len(counts), 1) + for field in EXACT_FIELDS: + print(field) + meta, counts = fetch_counts("/drug/label.json?count=%s&limit=1" % field) + eq_(len(counts), 1) def test_not_analyzed_0(): - assert_total('/drug/label.json?search=id:d08e6fbf-9162-47cd-9cf6-74ea24d48214', 1) + assert_total("/drug/label.json?search=id:d08e6fbf-9162-47cd-9cf6-74ea24d48214", 1) def test_exact_count_0(): - assert_count_top('/drug/label.json?count=openfda.brand_name.exact', ['Oxygen', 'Ibuprofen', 'Gabapentin', 'Allergy Relief', 'Hand Sanitizer']) + assert_count_top( + "/drug/label.json?count=openfda.brand_name.exact", + ["Oxygen", "Ibuprofen", "Gabapentin", "Allergy Relief", "Hand Sanitizer"], + ) def test_date_1(): - assert_total('/drug/label.json?search=effective_time:20140320', 15) + assert_total("/drug/label.json?search=effective_time:20140320", 15) def test_date_range_2(): - assert_total('/drug/label.json?search=effective_time:([20140320+TO+20141231])', 3900) - assert_total('/drug/label.json?search=effective_time:([2014-03-20+TO+2014-12-31])', 3900) + assert_total( + "/drug/label.json?search=effective_time:([20140320+TO+20141231])", 3900 + ) + assert_total( + 
"/drug/label.json?search=effective_time:([2014-03-20+TO+2014-12-31])", 3900 + ) def test_openfda_3(): - assert_total('/drug/label.json?search=openfda.is_original_packager:true', 53000) + assert_total("/drug/label.json?search=openfda.is_original_packager:true", 53000) def test_package_ndc_unii(): - assert_unii('/drug/label.json?search=openfda.substance_name:Methoxsalen', ['U4VJ29L7BQ']) + assert_unii( + "/drug/label.json?search=openfda.substance_name:Methoxsalen", ["U4VJ29L7BQ"] + ) def test_drug_label_unii(): - assert_unii('/drug/label.json?search=openfda.substance_name:ofloxacin', ['A4P49JAZ9H']) + assert_unii( + "/drug/label.json?search=openfda.substance_name:ofloxacin", ["A4P49JAZ9H"] + ) def test_unii_multiple_substances(): - assert_unii('/drug/label.json?search=set_id:0002a6ef-6e5e-4a34-95d5-ebaab222496f', - [ - "39M11XPH03", "1G4GK01F67", "S7V92P67HO", "X7BCI5P86H", "6YR2608RSU" - ]) + assert_unii( + "/drug/label.json?search=set_id:0002a6ef-6e5e-4a34-95d5-ebaab222496f", + ["39M11XPH03", "1G4GK01F67", "S7V92P67HO", "X7BCI5P86H", "6YR2608RSU"], + ) + def test_count_after_es2_migration(): - assert_count_top('/drug/label.json?count=openfda.product_type.exact', ['HUMAN OTC DRUG','HUMAN PRESCRIPTION DRUG'], 2) + assert_count_top( + "/drug/label.json?count=openfda.product_type.exact", + ["HUMAN OTC DRUG", "HUMAN PRESCRIPTION DRUG"], + 2, + ) - meta, counts = fetch_counts('/drug/label.json?count=openfda.product_type.exact') - assert_greater(counts[0].count, counts[1].count) + meta, counts = fetch_counts("/drug/label.json?count=openfda.product_type.exact") + assert_greater(counts[0].count, counts[1].count) def test_multiple_upc(): - # Todo: fix the build - # meta, results = fetch( - # '/drug/label.json?search=openfda.spl_set_id.exact:0b0be196-0c62-461c-94f4-9a35339b4501') - meta, results = fetch( - '/drug/label.json?search=openfda.spl_set_id.exact:003161f8-aed4-4d04-b6a3-6afc56e2817a') - eq_(len(results), 1) - upcs = results[0]['openfda']['upc'] - upcs.sort() - eq_(['0860005203642', '0860005203659'], upcs) + # Todo: fix the build + # meta, results = fetch( + # '/drug/label.json?search=openfda.spl_set_id.exact:0b0be196-0c62-461c-94f4-9a35339b4501') + meta, results = fetch( + "/drug/label.json?search=openfda.spl_set_id.exact:003161f8-aed4-4d04-b6a3-6afc56e2817a" + ) + eq_(len(results), 1) + upcs = results[0]["openfda"]["upc"] + upcs.sort() + eq_(["0860005203642", "0860005203659"], upcs) + def test_single_upc(): - meta, results = fetch( - '/drug/label.json?search=openfda.spl_set_id.exact:00006ebc-ec2b-406c-96b7-a3cc422e933f') - eq_(len(results), 1) - upcs = results[0]['openfda']['upc'] - eq_(['0740640391006'], upcs) + meta, results = fetch( + "/drug/label.json?search=openfda.spl_set_id.exact:00006ebc-ec2b-406c-96b7-a3cc422e933f" + ) + eq_(len(results), 1) + upcs = results[0]["openfda"]["upc"] + eq_(["0740640391006"], upcs) + def test_pharma_classes(): - meta, results = fetch( - '/drug/label.json?search=openfda.spl_set_id.exact:15b0b56d-6188-4937-b541-902022e35b24') - eq_(len(results), 1) - openfda = results[0]['openfda'] - eq_(['IBUPROFEN'], openfda['generic_name']) - eq_(['WK2XYI10QM'], openfda['unii']) - eq_(['Cyclooxygenase Inhibitors [MoA]'], openfda['pharm_class_moa']) - eq_(['Anti-Inflammatory Agents, Non-Steroidal [CS]'], openfda['pharm_class_cs']) - eq_(['Nonsteroidal Anti-inflammatory Drug [EPC]'], openfda['pharm_class_epc']) - - - meta, results = fetch( - '/drug/label.json?search=openfda.spl_set_id.exact:143a5ba8-0ee9-43be-a0da-2c455c1a46bd') - eq_(len(results), 1) - openfda = 
results[0]['openfda'] - eq_(['BENZONATATE'], openfda['generic_name']) - eq_(['5P4DHS6ENR'], openfda['unii']) - eq_(['Decreased Tracheobronchial Stretch Receptor Activity [PE]'], openfda['pharm_class_pe']) - eq_(['Non-narcotic Antitussive [EPC]'], openfda['pharm_class_epc']) - - meta, results = fetch( - '/drug/label.json?search=openfda.unii.exact:10X0709Y6I&limit=1') - eq_(len(results), 1) - openfda = results[0]['openfda'] - eq_(['BISACODYL'], openfda['substance_name']) - eq_(['10X0709Y6I'], openfda['unii']) - eq_(['Increased Large Intestinal Motility [PE]', 'Stimulation Large Intestine Fluid/Electrolyte Secretion [PE]'], - sorted(openfda['pharm_class_pe'])) - eq_(['Stimulant Laxative [EPC]'], openfda['pharm_class_epc']) + meta, results = fetch( + "/drug/label.json?search=openfda.spl_set_id.exact:15b0b56d-6188-4937-b541-902022e35b24" + ) + eq_(len(results), 1) + openfda = results[0]["openfda"] + eq_(["IBUPROFEN"], openfda["generic_name"]) + eq_(["WK2XYI10QM"], openfda["unii"]) + eq_(["Cyclooxygenase Inhibitors [MoA]"], openfda["pharm_class_moa"]) + eq_(["Anti-Inflammatory Agents, Non-Steroidal [CS]"], openfda["pharm_class_cs"]) + eq_(["Nonsteroidal Anti-inflammatory Drug [EPC]"], openfda["pharm_class_epc"]) + + meta, results = fetch( + "/drug/label.json?search=openfda.spl_set_id.exact:143a5ba8-0ee9-43be-a0da-2c455c1a46bd" + ) + eq_(len(results), 1) + openfda = results[0]["openfda"] + eq_(["BENZONATATE"], openfda["generic_name"]) + eq_(["5P4DHS6ENR"], openfda["unii"]) + eq_( + ["Decreased Tracheobronchial Stretch Receptor Activity [PE]"], + openfda["pharm_class_pe"], + ) + eq_(["Non-narcotic Antitussive [EPC]"], openfda["pharm_class_epc"]) + + meta, results = fetch( + "/drug/label.json?search=openfda.unii.exact:10X0709Y6I&limit=1" + ) + eq_(len(results), 1) + openfda = results[0]["openfda"] + eq_(["BISACODYL"], openfda["substance_name"]) + eq_(["10X0709Y6I"], openfda["unii"]) + eq_( + [ + "Increased Large Intestinal Motility [PE]", + "Stimulation Large Intestine Fluid/Electrolyte Secretion [PE]", + ], + sorted(openfda["pharm_class_pe"]), + ) + eq_(["Stimulant Laxative [EPC]"], openfda["pharm_class_epc"]) diff --git a/openfda/deploy/tests/drugsfda/test_endpoint.py b/openfda/deploy/tests/drugsfda/test_endpoint.py index 31a501f..fe73aa5 100644 --- a/openfda/deploy/tests/drugsfda/test_endpoint.py +++ b/openfda/deploy/tests/drugsfda/test_endpoint.py @@ -4,231 +4,246 @@ from openfda.tests.api_test_helpers import * -EXACT_FIELDS = ['openfda.application_number.exact', 'openfda.brand_name.exact', - 'openfda.generic_name.exact', 'openfda.manufacturer_name.exact', - 'openfda.nui.exact', 'openfda.package_ndc.exact', 'openfda.pharm_class_cs.exact', - 'openfda.pharm_class_epc.exact', 'openfda.pharm_class_moa.exact', - 'openfda.product_ndc.exact', 'openfda.product_type.exact', 'openfda.route.exact', 'openfda.rxcui.exact', - 'openfda.spl_id.exact', 'openfda.spl_set_id.exact', - 'openfda.substance_name.exact', 'openfda.unii.exact'] +EXACT_FIELDS = [ + "openfda.application_number.exact", + "openfda.brand_name.exact", + "openfda.generic_name.exact", + "openfda.manufacturer_name.exact", + "openfda.nui.exact", + "openfda.package_ndc.exact", + "openfda.pharm_class_cs.exact", + "openfda.pharm_class_epc.exact", + "openfda.pharm_class_moa.exact", + "openfda.product_ndc.exact", + "openfda.product_type.exact", + "openfda.route.exact", + "openfda.rxcui.exact", + "openfda.spl_id.exact", + "openfda.spl_set_id.exact", + "openfda.substance_name.exact", + "openfda.unii.exact", +] + def 
test_exact_field_queries_after_fda_253(): - for field in EXACT_FIELDS: - meta, counts = fetch_counts('/drug/drugsfda.json?count=%s&limit=1' % field) - eq_(len(counts), 1) + for field in EXACT_FIELDS: + meta, counts = fetch_counts("/drug/drugsfda.json?count=%s&limit=1" % field) + eq_(len(counts), 1) + def test_total_count(): - assert_total('/drug/drugsfda.json', 24000) + assert_total("/drug/drugsfda.json", 24000) def test_malformed_dosage_in_products(): - meta, results = fetch( - '/drug/drugsfda.json?search=application_number:"ANDA206006"') - eq_(len(results), 1) + meta, results = fetch('/drug/drugsfda.json?search=application_number:"ANDA206006"') + eq_(len(results), 1) - app = results[0] - products = sorted(app['products'], key=lambda p: p['product_number']) - eq_(len(products), 1) + app = results[0] + products = sorted(app["products"], key=lambda p: p["product_number"]) + eq_(len(products), 1) - drug = products[0] - ingredients = drug['active_ingredients'] - eq_(len(ingredients), 3) - eq_(drug["brand_name"], "LANSOPRAZOLE, AMOXICILLIN AND CLARITHROMYCIN (COPACKAGED)") - eq_(drug["dosage_form"], "CAPSULE, TABLET, CAPSULE, DELAYED REL PELLETS") - eq_(drug["product_number"], "001") - eq_(drug.get("route"), "ORAL") + drug = products[0] + ingredients = drug["active_ingredients"] + eq_(len(ingredients), 3) + eq_(drug["brand_name"], "LANSOPRAZOLE, AMOXICILLIN AND CLARITHROMYCIN (COPACKAGED)") + eq_(drug["dosage_form"], "CAPSULE, TABLET, CAPSULE, DELAYED REL PELLETS") + eq_(drug["product_number"], "001") + eq_(drug.get("route"), "ORAL") def test_unknowns_in_product(): - meta, results = fetch( - '/drug/drugsfda.json?search=application_number:"ANDA204845"') - eq_(len(results), 1) + meta, results = fetch('/drug/drugsfda.json?search=application_number:"ANDA204845"') + eq_(len(results), 1) - app = results[0] - products = sorted(app['products'], key=lambda p: p['product_number']) - eq_(len(products), 1) + app = results[0] + products = sorted(app["products"], key=lambda p: p["product_number"]) + eq_(len(products), 1) - drug = products[0] - ingredients = drug['active_ingredients'] - eq_(len(ingredients), 1) - eq_(ingredients[0]["name"], "FOSAPREPITANT DIMEGLUMINE") - eq_(ingredients[0]["strength"], "UNKNOWN") - eq_(drug["brand_name"], "FOSAPREPITANT DIMEGLUMINE") - eq_(drug["dosage_form"], "UNKNOWN") - eq_(drug["product_number"], "001") - eq_(drug["route"], "UNKNOWN") + drug = products[0] + ingredients = drug["active_ingredients"] + eq_(len(ingredients), 1) + eq_(ingredients[0]["name"], "FOSAPREPITANT DIMEGLUMINE") + eq_(ingredients[0]["strength"], "UNKNOWN") + eq_(drug["brand_name"], "FOSAPREPITANT DIMEGLUMINE") + eq_(drug["dosage_form"], "UNKNOWN") + eq_(drug["product_number"], "001") + eq_(drug["route"], "UNKNOWN") def test_orphan_submission(): - meta, results = fetch( - '/drug/drugsfda.json?search=application_number:"NDA008107"') - eq_(len(results), 1) + meta, results = fetch('/drug/drugsfda.json?search=application_number:"NDA008107"') + eq_(len(results), 1) - app = results[0] - submissions = sorted(app['submissions'], key=lambda p: int(p['submission_number'])) - sub = submissions[0] - eq_(sub["submission_property_type"][0]["code"], "Orphan") + app = results[0] + submissions = sorted(app["submissions"], key=lambda p: int(p["submission_number"])) + sub = submissions[0] + eq_(sub["submission_property_type"][0]["code"], "Orphan") def test_submission_public_notes(): - meta, results = fetch( - '/drug/drugsfda.json?search=application_number:"NDA020229"') - eq_(len(results), 1) + meta, results = 
fetch('/drug/drugsfda.json?search=application_number:"NDA020229"') + eq_(len(results), 1) - app = results[0] - submissions = sorted(app['submissions'], key=lambda p: int(p['submission_number'])) - eq_(len(submissions), 6) + app = results[0] + submissions = sorted(app["submissions"], key=lambda p: int(p["submission_number"])) + eq_(len(submissions), 6) - sub = submissions[0] - eq_(sub["submission_public_notes"], "Withdrawn FR Effective 11/03/2016") + sub = submissions[0] + eq_(sub["submission_public_notes"], "Withdrawn FR Effective 11/03/2016") def test_multiple_application_docs(): - meta, results = fetch( - '/drug/drugsfda.json?search=_exists_:submissions.application_docs.title+AND+application_number:"ANDA076356"') - eq_(len(results), 1) - - app = results[0] - submissions = sorted(app['submissions'], key=lambda p: int(p['submission_number'])) - eq_(len(submissions), 19) - sub = submissions[0] - - docs = sorted(sub["application_docs"], key=lambda p: int(p['id'])) - eq_(len(docs), 2) - - doc = docs[0] - eq_(doc["id"], "23820") - eq_(doc["type"], "Other Important Information from FDA") - eq_(doc.get("title"), None) - eq_(doc["date"], "20050728") - eq_(doc["url"], - "http://www.fda.gov/Drugs/DrugSafety/PostmarketDrugSafetyInformationforPatientsandProviders/ucm094305.htm") - - doc = docs[1] - eq_(doc["id"], "24054") - eq_(doc["type"], "Other") - eq_(doc["title"], "REMS") - eq_(doc["date"], "20150702") - eq_(doc["url"], - "http://www.accessdata.fda.gov/scripts/cder/rems/index.cfm?event=RemsDetails.page&REMS=24") + meta, results = fetch( + '/drug/drugsfda.json?search=_exists_:submissions.application_docs.title+AND+application_number:"ANDA076356"' + ) + eq_(len(results), 1) + + app = results[0] + submissions = sorted(app["submissions"], key=lambda p: int(p["submission_number"])) + eq_(len(submissions), 19) + sub = submissions[0] + + docs = sorted(sub["application_docs"], key=lambda p: int(p["id"])) + eq_(len(docs), 2) + + doc = docs[0] + eq_(doc["id"], "23820") + eq_(doc["type"], "Other Important Information from FDA") + eq_(doc.get("title"), None) + eq_(doc["date"], "20050728") + eq_( + doc["url"], + "http://www.fda.gov/Drugs/DrugSafety/PostmarketDrugSafetyInformationforPatientsandProviders/ucm094305.htm", + ) + + doc = docs[1] + eq_(doc["id"], "24054") + eq_(doc["type"], "Other") + eq_(doc["title"], "REMS") + eq_(doc["date"], "20150702") + eq_( + doc["url"], + "http://www.accessdata.fda.gov/scripts/cder/rems/index.cfm?event=RemsDetails.page&REMS=24", + ) def test_application_docs(): - meta, results = fetch( - '/drug/drugsfda.json?search=_exists_:submissions.application_docs.title+AND+application_number:"ANDA083827"') - eq_(len(results), 1) - - app = results[0] - submissions = sorted(app['submissions'], key=lambda p: int(p['submission_number'])) - eq_(len(submissions), 1) - sub = submissions[0] - - docs = sub["application_docs"] - eq_(len(docs), 1) - - doc = docs[0] - eq_(doc["id"], "53918") - eq_(doc["type"], "Other") - eq_(doc["title"], "Safety Labeling Change Order Letter") - eq_(doc["date"], "20180501") - eq_(doc["url"], - "https://www.accessdata.fda.gov/drugsatfda_docs/appletter/slc/2018/086502, 086503, 086501, 086500, 086499, 086498,083827_SLCOrderLtr.pdf") + meta, results = fetch( + '/drug/drugsfda.json?search=_exists_:submissions.application_docs.title+AND+application_number:"ANDA083827"' + ) + eq_(len(results), 1) + + app = results[0] + submissions = sorted(app["submissions"], key=lambda p: int(p["submission_number"])) + eq_(len(submissions), 1) + sub = submissions[0] + + docs = 
sub["application_docs"] + eq_(len(docs), 1) + + doc = docs[0] + eq_(doc["id"], "53918") + eq_(doc["type"], "Other") + eq_(doc["title"], "Safety Labeling Change Order Letter") + eq_(doc["date"], "20180501") + eq_( + doc["url"], + "https://www.accessdata.fda.gov/drugsatfda_docs/appletter/slc/2018/086502, 086503, 086501, 086500, 086499, 086498,083827_SLCOrderLtr.pdf", + ) def test_one_record_in_detail(): - meta, results = fetch( - '/drug/drugsfda.json?search=application_number:"ANDA208705"&limit=100') - eq_(len(results), 1) - - app = results[0] - eq_(app["application_number"], "ANDA208705") - eq_(app["sponsor_name"], "ALEMBIC PHARMS LTD") - - products = sorted(app['products'], key=lambda p: p['product_number']) - eq_(len(products), 2) - - drug = products[0] - ingredients = drug['active_ingredients'] - eq_(len(ingredients), 1) - eq_(ingredients[0]["name"], "CHOLINE FENOFIBRATE") - eq_(ingredients[0]["strength"], "EQ 45MG FENOFIBRIC ACID") - eq_(drug["brand_name"], "FENOFIBRIC ACID") - eq_(drug["dosage_form"], "CAPSULE, DELAYED RELEASE") - eq_(drug["marketing_status"], "Prescription") - eq_(drug["product_number"], "001") - eq_(drug["reference_drug"], "No") - eq_(drug["reference_standard"], "No") - eq_(drug["route"], "ORAL") - eq_(drug["te_code"], "AB") - - drug = products[1] - ingredients = drug['active_ingredients'] - eq_(len(ingredients), 1) - eq_(ingredients[0]["name"], "CHOLINE FENOFIBRATE") - eq_(ingredients[0]["strength"], "EQ 135MG FENOFIBRIC ACID") - eq_(drug["brand_name"], "FENOFIBRIC ACID") - eq_(drug["dosage_form"], "CAPSULE, DELAYED RELEASE") - eq_(drug["marketing_status"], "Prescription") - eq_(drug["product_number"], "002") - eq_(drug["reference_drug"], "No") - eq_(drug["reference_standard"], "No") - eq_(drug["route"], "ORAL") - eq_(drug["te_code"], "AB") - - openfda = app["openfda"] - eq_(openfda["application_number"], ["ANDA208705"]) - eq_(openfda["brand_name"], ["FENOFIBRIC ACID"]) - - submissions = sorted(app['submissions'], key=lambda p: int(p['submission_number'])) - eq_(len(submissions), 4) - - sub = submissions[0] - eq_(sub["submission_class_code"], "UNKNOWN") - eq_(sub.get("submission_class_code_description"), None) - eq_(sub["submission_number"], "1") - eq_(sub.get("submission_public_notes"), None) - eq_(sub["submission_status"], "AP") - eq_(sub["submission_status_date"], "20170512") - eq_(sub["review_priority"], "STANDARD") - eq_(sub["submission_type"], "ORIG") - - - sub = submissions[1] - eq_(sub["submission_class_code"], "LABELING") - eq_(sub["submission_class_code_description"], "Labeling") - eq_(sub["submission_number"], "7") - eq_(sub.get("submission_public_notes"), None) - eq_(sub["submission_status"], "AP") - eq_(sub["submission_status_date"], "20190919") - eq_(sub["review_priority"], "STANDARD") - eq_(sub["submission_type"], "SUPPL") - - - sub = submissions[2] - eq_(sub["submission_class_code"], "LABELING") - eq_(sub["submission_class_code_description"], "Labeling") - eq_(sub["submission_number"], "9") - eq_(sub.get("submission_public_notes"), None) - eq_(sub["submission_status"], "AP") - eq_(sub["submission_status_date"], "20190919") - eq_(sub["review_priority"], "STANDARD") - eq_(sub["submission_type"], "SUPPL") - - - sub = submissions[3] - eq_(sub["submission_class_code"], "LABELING") - eq_(sub["submission_class_code_description"], "Labeling") - eq_(sub["submission_number"], "11") - eq_(sub.get("submission_public_notes"), None) - eq_(sub["submission_status"], "AP") - eq_(sub["submission_status_date"], "20190919") - eq_(sub["review_priority"], "STANDARD") - 
eq_(sub["submission_type"], "SUPPL") - - - -if __name__ == '__main__': - all_functions = inspect.getmembers(sys.modules[__name__], inspect.isfunction) - for key, func in all_functions: - if key.find("test_") > -1: - func() + meta, results = fetch( + '/drug/drugsfda.json?search=application_number:"ANDA208705"&limit=100' + ) + eq_(len(results), 1) + + app = results[0] + eq_(app["application_number"], "ANDA208705") + eq_(app["sponsor_name"], "ALEMBIC PHARMS LTD") + + products = sorted(app["products"], key=lambda p: p["product_number"]) + eq_(len(products), 2) + + drug = products[0] + ingredients = drug["active_ingredients"] + eq_(len(ingredients), 1) + eq_(ingredients[0]["name"], "CHOLINE FENOFIBRATE") + eq_(ingredients[0]["strength"], "EQ 45MG FENOFIBRIC ACID") + eq_(drug["brand_name"], "FENOFIBRIC ACID") + eq_(drug["dosage_form"], "CAPSULE, DELAYED RELEASE") + eq_(drug["marketing_status"], "Prescription") + eq_(drug["product_number"], "001") + eq_(drug["reference_drug"], "No") + eq_(drug["reference_standard"], "No") + eq_(drug["route"], "ORAL") + eq_(drug["te_code"], "AB") + + drug = products[1] + ingredients = drug["active_ingredients"] + eq_(len(ingredients), 1) + eq_(ingredients[0]["name"], "CHOLINE FENOFIBRATE") + eq_(ingredients[0]["strength"], "EQ 135MG FENOFIBRIC ACID") + eq_(drug["brand_name"], "FENOFIBRIC ACID") + eq_(drug["dosage_form"], "CAPSULE, DELAYED RELEASE") + eq_(drug["marketing_status"], "Prescription") + eq_(drug["product_number"], "002") + eq_(drug["reference_drug"], "No") + eq_(drug["reference_standard"], "No") + eq_(drug["route"], "ORAL") + eq_(drug["te_code"], "AB") + + openfda = app["openfda"] + eq_(openfda["application_number"], ["ANDA208705"]) + eq_(openfda["brand_name"], ["FENOFIBRIC ACID"]) + + submissions = sorted(app["submissions"], key=lambda p: int(p["submission_number"])) + eq_(len(submissions), 4) + + sub = submissions[0] + eq_(sub["submission_class_code"], "UNKNOWN") + eq_(sub.get("submission_class_code_description"), None) + eq_(sub["submission_number"], "1") + eq_(sub.get("submission_public_notes"), None) + eq_(sub["submission_status"], "AP") + eq_(sub["submission_status_date"], "20170512") + eq_(sub["review_priority"], "STANDARD") + eq_(sub["submission_type"], "ORIG") + + sub = submissions[1] + eq_(sub["submission_class_code"], "LABELING") + eq_(sub["submission_class_code_description"], "Labeling") + eq_(sub["submission_number"], "7") + eq_(sub.get("submission_public_notes"), None) + eq_(sub["submission_status"], "AP") + eq_(sub["submission_status_date"], "20190919") + eq_(sub["review_priority"], "STANDARD") + eq_(sub["submission_type"], "SUPPL") + + sub = submissions[2] + eq_(sub["submission_class_code"], "LABELING") + eq_(sub["submission_class_code_description"], "Labeling") + eq_(sub["submission_number"], "9") + eq_(sub.get("submission_public_notes"), None) + eq_(sub["submission_status"], "AP") + eq_(sub["submission_status_date"], "20190919") + eq_(sub["review_priority"], "STANDARD") + eq_(sub["submission_type"], "SUPPL") + + sub = submissions[3] + eq_(sub["submission_class_code"], "LABELING") + eq_(sub["submission_class_code_description"], "Labeling") + eq_(sub["submission_number"], "11") + eq_(sub.get("submission_public_notes"), None) + eq_(sub["submission_status"], "AP") + eq_(sub["submission_status_date"], "20190919") + eq_(sub["review_priority"], "STANDARD") + eq_(sub["submission_type"], "SUPPL") + + +if __name__ == "__main__": + all_functions = inspect.getmembers(sys.modules[__name__], inspect.isfunction) + for key, func in all_functions: + if 
key.find("test_") > -1: + func() diff --git a/openfda/deploy/tests/enforcement/test_endpoint.py b/openfda/deploy/tests/enforcement/test_endpoint.py index 2b46c88..ff23bf7 100644 --- a/openfda/deploy/tests/enforcement/test_endpoint.py +++ b/openfda/deploy/tests/enforcement/test_endpoint.py @@ -1,85 +1,122 @@ - import requests from openfda.tests.api_test_helpers import * +EXACT_FIELDS = [ + "openfda.application_number.exact", + "openfda.brand_name.exact", + "openfda.generic_name.exact", + "openfda.manufacturer_name.exact", + "openfda.nui.exact", + "openfda.package_ndc.exact", + "openfda.pharm_class_cs.exact", + "openfda.pharm_class_epc.exact", + "openfda.pharm_class_moa.exact", + "openfda.pharm_class_pe.exact", + "openfda.product_ndc.exact", + "openfda.product_type.exact", + "openfda.route.exact", + "openfda.rxcui.exact", + "openfda.spl_id.exact", + "openfda.spl_set_id.exact", + "openfda.substance_name.exact", + "openfda.unii.exact", +] -EXACT_FIELDS = ['openfda.application_number.exact', 'openfda.brand_name.exact', - 'openfda.generic_name.exact', 'openfda.manufacturer_name.exact', - 'openfda.nui.exact', 'openfda.package_ndc.exact', 'openfda.pharm_class_cs.exact', - 'openfda.pharm_class_epc.exact', 'openfda.pharm_class_moa.exact', 'openfda.pharm_class_pe.exact', - 'openfda.product_ndc.exact', 'openfda.product_type.exact', 'openfda.route.exact', 'openfda.rxcui.exact', - 'openfda.spl_id.exact', 'openfda.spl_set_id.exact', - 'openfda.substance_name.exact', 'openfda.unii.exact'] def test_exact_field_queries_after_fda_253(): - for field in EXACT_FIELDS: - print(field) - meta, counts = fetch_counts('/drug/enforcement.json?count=%s&limit=1' % field) - eq_(len(counts), 1) + for field in EXACT_FIELDS: + print(field) + meta, counts = fetch_counts("/drug/enforcement.json?count=%s&limit=1" % field) + eq_(len(counts), 1) + def test_no_reports_before_2012(): - assert_total('/drug/enforcement.json?search=report_date:[19600101+TO+20120630]', 0) - assert_total('/drug/enforcement.json?search=report_date:[1960-01-01+TO+2012-06-30]', 0) + assert_total("/drug/enforcement.json?search=report_date:[19600101+TO+20120630]", 0) + assert_total( + "/drug/enforcement.json?search=report_date:[1960-01-01+TO+2012-06-30]", 0 + ) def test_not_analyzed_0(): - assert_total('/drug/enforcement.json?search=openfda.spl_set_id.exact:fee7d073-0b99-48f2-7985-0d8cf970894b', 1) + assert_total( + "/drug/enforcement.json?search=openfda.spl_set_id.exact:fee7d073-0b99-48f2-7985-0d8cf970894b", + 1, + ) def test_date_1(): - assert_total('/food/enforcement.json?search=recall_initiation_date:20120910', 143) + assert_total("/food/enforcement.json?search=recall_initiation_date:20120910", 143) def test_date_2(): - assert_total('/drug/enforcement.json?search=recall_initiation_date:20140827', 120) + assert_total("/drug/enforcement.json?search=recall_initiation_date:20140827", 120) def test_date_3(): - assert_total('/device/enforcement.json?search=recall_initiation_date:20120910', 3) + assert_total("/device/enforcement.json?search=recall_initiation_date:20120910", 3) def test_date_range_4(): - assert_total('/device/enforcement.json?search=recall_initiation_date:([20120910+TO+20121231])', 607) + assert_total( + "/device/enforcement.json?search=recall_initiation_date:([20120910+TO+20121231])", + 607, + ) def test_date_range_5(): - assert_total('/drug/enforcement.json?search=recall_initiation_date:([20140827+TO+20141231])', 863) + assert_total( + "/drug/enforcement.json?search=recall_initiation_date:([20140827+TO+20141231])", + 863, + ) def 
test_date_range_6(): - assert_total('/food/enforcement.json?search=recall_initiation_date:([20120910+TO+20121231])', 140) + assert_total( + "/food/enforcement.json?search=recall_initiation_date:([20120910+TO+20121231])", + 140, + ) def test_openfda_7(): - assert_total('/drug/enforcement.json?search=openfda.is_original_packager:true', 885) + assert_total("/drug/enforcement.json?search=openfda.is_original_packager:true", 885) def test_drug_enforcement_unii(): - assert_unii('/drug/enforcement.json?search=recall_number.exact:"D-1616-2014"', - ['0E43V0BB57', 'QTG126297Q']) + assert_unii( + '/drug/enforcement.json?search=recall_number.exact:"D-1616-2014"', + ["0E43V0BB57", "QTG126297Q"], + ) def test_no_dupes(): - assert_total_exact('/device/enforcement.json?search=recall_number.exact:Z-0031-2018', 1) + assert_total_exact( + "/device/enforcement.json?search=recall_number.exact:Z-0031-2018", 1 + ) def test_no_whitespace_in_zip_codes(): - meta, results = fetch('/device/enforcement.json?search=recall_number.exact:Z-0031-2018') - eq_(results[0]['postal_code'], '91342-3577') + meta, results = fetch( + "/device/enforcement.json?search=recall_number.exact:Z-0031-2018" + ) + eq_(results[0]["postal_code"], "91342-3577") def test_multiple_more_code_info_handling_fda_347(): - meta, results = fetch('/device/enforcement.json?search=recall_number.exact:Z-2483-2021') - ok_('21)US919C1149 US919C1150' in results[0]['more_code_info']) - ok_('15B0470 N/A US215B0471 N/A US215B0472' in results[0]['more_code_info']) - ok_('B1987 (01)00884838047693(21)US818B1987' in results[0]['more_code_info']) - ok_('S418B0848 US419B0778 (01)00884838047693' in results[0]['more_code_info']) - ok_('4838088658(21)US119B1422' in results[0]['more_code_info']) - ok_('37 (01)00884838088658(21)US718B1337' in results[0]['more_code_info']) - ok_('(01)00884838088658(21)USD19B0624' in results[0]['more_code_info']) - ok_('USO20B0661 (01)00884838088658(21)USO20B0661' in results[0]['more_code_info']) - - meta, results = fetch('/device/enforcement.json?search=recall_number.exact:Z-2469-2021') - eq_(results[0]['more_code_info'], '') + meta, results = fetch( + "/device/enforcement.json?search=recall_number.exact:Z-2483-2021" + ) + ok_("21)US919C1149 US919C1150" in results[0]["more_code_info"]) + ok_("15B0470 N/A US215B0471 N/A US215B0472" in results[0]["more_code_info"]) + ok_("B1987 (01)00884838047693(21)US818B1987" in results[0]["more_code_info"]) + ok_("S418B0848 US419B0778 (01)00884838047693" in results[0]["more_code_info"]) + ok_("4838088658(21)US119B1422" in results[0]["more_code_info"]) + ok_("37 (01)00884838088658(21)US718B1337" in results[0]["more_code_info"]) + ok_("(01)00884838088658(21)USD19B0624" in results[0]["more_code_info"]) + ok_("USO20B0661 (01)00884838088658(21)USO20B0661" in results[0]["more_code_info"]) + + meta, results = fetch( + "/device/enforcement.json?search=recall_number.exact:Z-2469-2021" + ) + eq_(results[0]["more_code_info"], "") diff --git a/openfda/deploy/tests/foodevent/test_endpoint.py b/openfda/deploy/tests/foodevent/test_endpoint.py index 63a8689..7e951a3 100644 --- a/openfda/deploy/tests/foodevent/test_endpoint.py +++ b/openfda/deploy/tests/foodevent/test_endpoint.py @@ -2,82 +2,92 @@ def test_consumer_merge(): - meta, results = fetch( - '/food/event.json?search=report_number:65420') - eq_(len(results), 1) + meta, results = fetch("/food/event.json?search=report_number:65420") + eq_(len(results), 1) + + event = results[0] + eq_(event["consumer"]["gender"], "M") + eq_(event["consumer"]["age"], "33") + 
eq_(event["consumer"]["age_unit"], "year(s)") - event = results[0] - eq_(event["consumer"]["gender"], "M") - eq_(event["consumer"]["age"], "33") - eq_(event["consumer"]["age_unit"], "year(s)") def test_consumer_merge_with_missing_data(): - meta, results = fetch( - '/food/event.json?search=report_number:65619') - eq_(len(results), 1) + meta, results = fetch("/food/event.json?search=report_number:65619") + eq_(len(results), 1) + + event = results[0] + eq_(event["consumer"]["gender"], "M") + eq_(event["consumer"]["age"], "70") + eq_(event["consumer"]["age_unit"], "year(s)") - event = results[0] - eq_(event["consumer"]["gender"], "M") - eq_(event["consumer"]["age"], "70") - eq_(event["consumer"]["age_unit"], "year(s)") def test_full_record(): - meta, results = fetch( - '/food/event.json?search=report_number:65619') - eq_(len(results), 1) - - event = results[0] - eq_(event["date_created"], "20040112") - eq_(event["date_started"], "20031222") - eq_(event["outcomes"], ["Disability"]) - - products = sorted(event["products"], key=lambda k: k['name_brand']) - eq_(len(products), 5) - eq_(products[0]["industry_code"], "54") - eq_(products[0]["industry_name"], "Vit/Min/Prot/Unconv Diet(Human/Animal)") - eq_(products[0]["name_brand"], "ACEYTL-L-CARNITINE") - eq_(products[0]["role"], "SUSPECT") - - eq_(products[1]["industry_code"], "54") - eq_(products[1]["industry_name"], "Vit/Min/Prot/Unconv Diet(Human/Animal)") - eq_(products[1]["name_brand"], "ALPHA LIPOIC") - eq_(products[1]["role"], "SUSPECT") - - eq_(products[2]["industry_code"], "54") - eq_(products[2]["industry_name"], "Vit/Min/Prot/Unconv Diet(Human/Animal)") - eq_(products[2]["name_brand"], "CALCIUM CALTRATE") - eq_(products[2]["role"], "SUSPECT") - - eq_(products[3]["industry_code"], "54") - eq_(products[3]["industry_name"], "Vit/Min/Prot/Unconv Diet(Human/Animal)") - eq_(products[3]["name_brand"], "MULTIVITAMIN") - eq_(products[3]["role"], "SUSPECT") - - eq_(products[4]["industry_code"], "54") - eq_(products[4]["industry_name"], "Vit/Min/Prot/Unconv Diet(Human/Animal)") - eq_(products[4]["name_brand"], "VITAMIN E") - eq_(products[4]["role"], "SUSPECT") - - eq_(sorted(event["reactions"], key=lambda k: k), - [u'ASTHENIA', u'DEPRESSED MOOD', u'DIZZINESS', u'IMPAIRED DRIVING ABILITY', u'LETHARGY', u'PHYSICAL EXAMINATION']) - eq_(event["report_number"], "65619") - - eq_(event["consumer"]["gender"], "M") - eq_(event["consumer"]["age"], "70") - eq_(event["consumer"]["age_unit"], "year(s)") + meta, results = fetch("/food/event.json?search=report_number:65619") + eq_(len(results), 1) + + event = results[0] + eq_(event["date_created"], "20040112") + eq_(event["date_started"], "20031222") + eq_(event["outcomes"], ["Disability"]) + + products = sorted(event["products"], key=lambda k: k["name_brand"]) + eq_(len(products), 5) + eq_(products[0]["industry_code"], "54") + eq_(products[0]["industry_name"], "Vit/Min/Prot/Unconv Diet(Human/Animal)") + eq_(products[0]["name_brand"], "ACEYTL-L-CARNITINE") + eq_(products[0]["role"], "SUSPECT") + + eq_(products[1]["industry_code"], "54") + eq_(products[1]["industry_name"], "Vit/Min/Prot/Unconv Diet(Human/Animal)") + eq_(products[1]["name_brand"], "ALPHA LIPOIC") + eq_(products[1]["role"], "SUSPECT") + + eq_(products[2]["industry_code"], "54") + eq_(products[2]["industry_name"], "Vit/Min/Prot/Unconv Diet(Human/Animal)") + eq_(products[2]["name_brand"], "CALCIUM CALTRATE") + eq_(products[2]["role"], "SUSPECT") + + eq_(products[3]["industry_code"], "54") + eq_(products[3]["industry_name"], "Vit/Min/Prot/Unconv 
Diet(Human/Animal)") + eq_(products[3]["name_brand"], "MULTIVITAMIN") + eq_(products[3]["role"], "SUSPECT") + + eq_(products[4]["industry_code"], "54") + eq_(products[4]["industry_name"], "Vit/Min/Prot/Unconv Diet(Human/Animal)") + eq_(products[4]["name_brand"], "VITAMIN E") + eq_(products[4]["role"], "SUSPECT") + + eq_( + sorted(event["reactions"], key=lambda k: k), + [ + "ASTHENIA", + "DEPRESSED MOOD", + "DIZZINESS", + "IMPAIRED DRIVING ABILITY", + "LETHARGY", + "PHYSICAL EXAMINATION", + ], + ) + eq_(event["report_number"], "65619") + + eq_(event["consumer"]["gender"], "M") + eq_(event["consumer"]["age"], "70") + eq_(event["consumer"]["age_unit"], "year(s)") def test_sort_by_date_created(): - meta, results = fetch( - '/food/event.json?search=date_created:[20170220+TO+20170225]+AND+reactions:OVARIAN+CANCER&sort=date_created:asc') - eq_(len(results), 1) + meta, results = fetch( + "/food/event.json?search=date_created:[20170220+TO+20170225]+AND+reactions:OVARIAN+CANCER&sort=date_created:asc" + ) + eq_(len(results), 1) - event = results[0] - eq_(event["date_created"], "20170221") + event = results[0] + eq_(event["date_created"], "20170221") - meta, results = fetch( - '/food/event.json?search=date_created:[20170220+TO+20170225]+AND+reactions:OVARIAN+CANCER&sort=date_created:desc') - eq_(len(results), 1) + meta, results = fetch( + "/food/event.json?search=date_created:[20170220+TO+20170225]+AND+reactions:OVARIAN+CANCER&sort=date_created:desc" + ) + eq_(len(results), 1) - event = results[0] - eq_(event["date_created"], "20170224") + event = results[0] + eq_(event["date_created"], "20170224") diff --git a/openfda/deploy/tests/general/test_endpoint.py b/openfda/deploy/tests/general/test_endpoint.py index bc3e999..3272c65 100644 --- a/openfda/deploy/tests/general/test_endpoint.py +++ b/openfda/deploy/tests/general/test_endpoint.py @@ -1,98 +1,119 @@ - - from openfda.tests import api_test_helpers from openfda.tests.api_test_helpers import * def assert_enough_results(query, count): - meta, results = api_test_helpers.fetch(query) - assert meta['results']['total'] > count,\ - 'Query %s returned fewer results than expected: %d vs %d' % ( - query, count, meta['results']['total']) + meta, results = api_test_helpers.fetch(query) + assert ( + meta["results"]["total"] > count + ), "Query %s returned fewer results than expected: %d vs %d" % ( + query, + count, + meta["results"]["total"], + ) + def test_device_pma_counts(): - assert_enough_results('/device/pma.json?search=_exists_:openfda', 30000) + assert_enough_results("/device/pma.json?search=_exists_:openfda", 30000) + def test_device_510k_counts(): - assert_enough_results('/device/510k.json?search=_exists_:openfda', 14000) + assert_enough_results("/device/510k.json?search=_exists_:openfda", 14000) + def test_food_enformancement_counts(): - assert_enough_results('/food/enforcement.json?search=_missing_:openfda', 8900) + assert_enough_results("/food/enforcement.json?search=_missing_:openfda", 8900) + def test_device_classification(): - assert_enough_results('/device/classification.json?search=_exists_:openfda', 4300) + assert_enough_results("/device/classification.json?search=_exists_:openfda", 4300) + def test_device_reg(): - assert_enough_results('/device/registrationlisting.json?search=_exists_:products.openfda', 21000) + assert_enough_results( + "/device/registrationlisting.json?search=_exists_:products.openfda", 21000 + ) + def test_device_event(): - assert_enough_results('/device/event.json?search=_exists_:device.openfda', 4450000) + 
assert_enough_results("/device/event.json?search=_exists_:device.openfda", 4450000) + def test_device_enforcement(): - assert_enough_results('/device/enforcement.json?search=_missing_:openfda', 8500) + assert_enough_results("/device/enforcement.json?search=_missing_:openfda", 8500) + def test_device_recall(): - assert_enough_results('/device/recall.json?search=_exists_:openfda', 42000) + assert_enough_results("/device/recall.json?search=_exists_:openfda", 42000) + def test_drug_event(): - assert_enough_results('/drug/event.json?search=_exists_:patient.drug.openfda', 10000000) + assert_enough_results( + "/drug/event.json?search=_exists_:patient.drug.openfda", 10000000 + ) + def test_drug_label(): - assert_enough_results('/drug/label.json?search=_exists_:openfda', 78000) + assert_enough_results("/drug/label.json?search=_exists_:openfda", 78000) + def test_drug_enforcement(): - assert_enough_results('/drug/enforcement.json?search=_exists_:openfda', 900) + assert_enough_results("/drug/enforcement.json?search=_exists_:openfda", 900) def test_zero_limit_handling(): - meta, results = fetch( - '/food/event.json?search=date_created:[20170220+TO+20170225]+AND+reactions:OVARIAN+CANCER&sort=date_created:asc') - eq_(len(results), 1) + meta, results = fetch( + "/food/event.json?search=date_created:[20170220+TO+20170225]+AND+reactions:OVARIAN+CANCER&sort=date_created:asc" + ) + eq_(len(results), 1) + + meta, results = fetch( + "/food/event.json?search=date_created:[20170220+TO+20170225]+AND+reactions:OVARIAN+CANCER&sort=date_created:asc&limit=0" + ) + eq_(len(results), 0) + ok_(meta["results"]["total"] > 0) - meta, results = fetch( - '/food/event.json?search=date_created:[20170220+TO+20170225]+AND+reactions:OVARIAN+CANCER&sort=date_created:asc&limit=0') - eq_(len(results), 0) - ok_(meta['results']['total'] > 0) -''' +""" # Commenting this test out. Elasticsearch 6 no longer throws parse exceptions for bad dates in queries. 
def test_detailed_error_message(): error = expect_error( '/food/enforcement.json?limit=1&search=report_date:[2000-10-01+TO+2013-09-31]') eq_(error['details'], "[parse_exception] failed to parse date field [2013-09-31] with format [basic_date||date||epoch_millis]") -''' - -def test_limitless_count(): - def test_limitless_field(field): - meta, results = fetch( - '/drug/label.json?count=%s' % (field)) - # Default is still 100 even for limitless fields unless specified explicitly - eq_(len(results), 100) +""" - meta, results = fetch( - '/drug/label.json?count=%s&limit=4000' % (field)) - eq_(len(results), 4000) - # 2147482647 is the max, so 2147482648 is an error - error = expect_error( - '/drug/label.json?count=%s&limit=2147482648' % (field)) - eq_(error['message'], "Limit cannot exceed 1000 results for count requests.") +def test_limitless_count(): + def test_limitless_field(field): + meta, results = fetch("/drug/label.json?count=%s" % (field)) + # Default is still 100 even for limitless fields unless specified explicitly + eq_(len(results), 100) - for field in ['openfda.generic_name.exact', 'openfda.brand_name.exact', 'openfda.substance_name.exact']: - test_limitless_field(field) + meta, results = fetch("/drug/label.json?count=%s&limit=4000" % (field)) + eq_(len(results), 4000) - # Other fields should still be subject to 1000 max limit - error = expect_error( - '/drug/label.json?count=active_ingredient&limit=1001') - eq_(error['message'], "Limit cannot exceed 1000 results for count requests.") + # 2147482647 is the max, so 2147482648 is an error + error = expect_error("/drug/label.json?count=%s&limit=2147482648" % (field)) + eq_(error["message"], "Limit cannot exceed 1000 results for count requests.") -def test_skip_limit_ranges(): - meta, results = fetch( - '/drug/label.json?skip=25000&limit=1000' ) - eq_(len(results), 1000) + for field in [ + "openfda.generic_name.exact", + "openfda.brand_name.exact", + "openfda.substance_name.exact", + ]: + test_limitless_field(field) - error = expect_error( - '/drug/label.json?skip=25000&limit=1001' ) - eq_(error['message'], "Limit cannot exceed 1000 results for search requests. Use the skip or search_after param to get additional results.") + # Other fields should still be subject to 1000 max limit + error = expect_error("/drug/label.json?count=active_ingredient&limit=1001") + eq_(error["message"], "Limit cannot exceed 1000 results for count requests.") +def test_skip_limit_ranges(): + meta, results = fetch("/drug/label.json?skip=25000&limit=1000") + eq_(len(results), 1000) + + error = expect_error("/drug/label.json?skip=25000&limit=1001") + eq_( + error["message"], + "Limit cannot exceed 1000 results for search requests. 
Use the skip or search_after param to get additional results.", + ) diff --git a/openfda/deploy/tests/issues/test_endpoint.py b/openfda/deploy/tests/issues/test_endpoint.py index 5839907..bad77b7 100644 --- a/openfda/deploy/tests/issues/test_endpoint.py +++ b/openfda/deploy/tests/issues/test_endpoint.py @@ -5,42 +5,48 @@ def test_uppercase_filter(): - assert_count_top( - '/drug/event.json?search=patient.drug.openfda.generic_name:ASPIRIN&limit=100&count=patient.reaction.reactionmeddrapt.exact&skip=0', - [ 'NAUSEA', - 'DYSPNOEA', - 'FATIGUE', - 'DRUG INEFFECTIVE', - 'DIZZINESS', - 'DIARRHOEA'] - ) + assert_count_top( + "/drug/event.json?search=patient.drug.openfda.generic_name:ASPIRIN&limit=100&count=patient.reaction.reactionmeddrapt.exact&skip=0", + ["NAUSEA", "DYSPNOEA", "FATIGUE", "DRUG INEFFECTIVE", "DIZZINESS", "DIARRHOEA"], + ) + def test_spl_harmonization(): - assert_total('/drug/label.json?search=openfda.brand_name:humira', 1) + assert_total("/drug/label.json?search=openfda.brand_name:humira", 1) + def test_exact(): - assert_total('/drug/label.json?search=openfda.brand_name.exact:Humira', 1) + assert_total("/drug/label.json?search=openfda.brand_name.exact:Humira", 1) + def test_missing_generic(): - assert_total( - '/drug/event.json?search=patient.drug.openfda.generic_name:(%22FUROSEMIDE%22)+AND+patient.reaction.reactionmeddrapt:(%22RENAL+FAILURE%22)+AND+receivedate:(%5b20120101+TO+20120131%5d)&limit=1&skip=0', - 5) + assert_total( + "/drug/event.json?search=patient.drug.openfda.generic_name:(%22FUROSEMIDE%22)+AND+patient.reaction.reactionmeddrapt:(%22RENAL+FAILURE%22)+AND+receivedate:(%5b20120101+TO+20120131%5d)&limit=1&skip=0", + 5, + ) + def test_missing_clause_es5(): - assert_total_exact('/device/recall.json?search=' - '_missing_:openfda+AND+root_cause_description:Design' - '+AND+_missing_:pma_numbers' - '+AND+event_date_terminated:[2012-01-8+TO+2012-03-30]' - '+AND+NOT+_missing_:firm_fei_number+AND+product_res_number.exact:Z-0866-2011', 1) + assert_total_exact( + "/device/recall.json?search=" + "_missing_:openfda+AND+root_cause_description:Design" + "+AND+_missing_:pma_numbers" + "+AND+event_date_terminated:[2012-01-8+TO+2012-03-30]" + "+AND+NOT+_missing_:firm_fei_number+AND+product_res_number.exact:Z-0866-2011", + 1, + ) def test_malformed_dates(): - meta, counts = fetch_counts('/device/registrationlisting.json?search=_exists_:(products.created_date)&count=products.created_date&skip=0') - import arrow - dt = arrow.parser.DateTimeParser() - # ensure our dates are all valid, - for count in counts: - try: - dt.parse(count.term, 'YYYYMMDD') - except: - assert False, 'Invalid date: %s' % count.term + meta, counts = fetch_counts( + "/device/registrationlisting.json?search=_exists_:(products.created_date)&count=products.created_date&skip=0" + ) + import arrow + + dt = arrow.parser.DateTimeParser() + # ensure our dates are all valid, + for count in counts: + try: + dt.parse(count.term, "YYYYMMDD") + except: + assert False, "Invalid date: %s" % count.term diff --git a/openfda/deploy/tests/ndc/test_endpoint.py b/openfda/deploy/tests/ndc/test_endpoint.py index 3ff4f43..cb33b57 100644 --- a/openfda/deploy/tests/ndc/test_endpoint.py +++ b/openfda/deploy/tests/ndc/test_endpoint.py @@ -5,128 +5,151 @@ from openfda.tests.api_test_helpers import * EXACT_FIELDS = [ - 'openfda.manufacturer_name.exact', - 'openfda.nui.exact', 'openfda.pharm_class_cs.exact', - 'openfda.pharm_class_epc.exact', 'openfda.pharm_class_moa.exact', 'openfda.pharm_class_pe.exact', - 'openfda.rxcui.exact', - 
'openfda.spl_set_id.exact', - 'openfda.unii.exact'] + "openfda.manufacturer_name.exact", + "openfda.nui.exact", + "openfda.pharm_class_cs.exact", + "openfda.pharm_class_epc.exact", + "openfda.pharm_class_moa.exact", + "openfda.pharm_class_pe.exact", + "openfda.rxcui.exact", + "openfda.spl_set_id.exact", + "openfda.unii.exact", +] def test_exact_field_queries_after_fda_253(): - for field in EXACT_FIELDS: - print(field) - meta, counts = fetch_counts('/drug/ndc.json?count=%s&limit=1' % field) - eq_(len(counts), 1) + for field in EXACT_FIELDS: + print(field) + meta, counts = fetch_counts("/drug/ndc.json?count=%s&limit=1" % field) + eq_(len(counts), 1) def test_total_count(): - assert_total('/drug/ndc.json', 100000) + assert_total("/drug/ndc.json", 100000) def test_record_with_multiple_packaging(): - meta, results = fetch( - '/drug/ndc.json?search=product_ndc:46122-062') - eq_(len(results), 1) - - ndc = results[0] - eq_(ndc["product_id"], "46122-062_e37f7dcd-aecc-49d4-881c-f2b490bdd5b2") - eq_(ndc["product_ndc"], "46122-062") - eq_(ndc["spl_id"], "e37f7dcd-aecc-49d4-881c-f2b490bdd5b2") - eq_(ndc["product_type"], "HUMAN OTC DRUG") - eq_(ndc["finished"], True) - eq_(ndc["brand_name"], "Acetaminophen") - eq_(ndc["brand_name_base"], "Acetaminophen") - eq_(ndc.get("brand_name_suffix"), None) - eq_(ndc["generic_name"], "Acetaminophen") - eq_(ndc["dosage_form"], "TABLET, FILM COATED, EXTENDED RELEASE") - eq_(ndc["route"][0], "ORAL") - eq_(ndc["marketing_start_date"], "20020430") - eq_(ndc.get("marketing_end_date"), None) - eq_(ndc["marketing_category"], "ANDA") - eq_(ndc["application_number"], "ANDA076200") - eq_(ndc["labeler_name"], "Amerisource Bergen") - eq_(ndc["active_ingredients"][0]['name'], "ACETAMINOPHEN") - eq_(ndc["active_ingredients"][0]['strength'], "650 mg/1") - eq_(ndc.get("pharm_class"), None) - eq_(ndc.get("dea_schedule"), None) - eq_(ndc["listing_expiration_date"], "20221231") - - packaging = sorted(ndc["packaging"], key=lambda k: k['package_ndc']) - eq_(len(packaging), 2) - eq_(packaging[0]["package_ndc"], "46122-062-71") - eq_(packaging[0]["description"], "50 TABLET, FILM COATED, EXTENDED RELEASE in 1 BOTTLE (46122-062-71)") - eq_(packaging[0]["marketing_start_date"], "20020430") - eq_(packaging[0]["sample"], False) - - eq_(packaging[1]["package_ndc"], "46122-062-78") - eq_(packaging[1]["description"], "100 TABLET, FILM COATED, EXTENDED RELEASE in 1 BOTTLE (46122-062-78)") - eq_(packaging[1]["marketing_start_date"], "20020430") - eq_(packaging[1]["sample"], False) - - openfda = ndc["openfda"] - eq_(openfda["is_original_packager"][0], True) - eq_(openfda["manufacturer_name"][0], "Amerisource Bergen") - eq_(openfda["rxcui"][0], "1148399") - eq_(openfda["spl_set_id"][0], "c0f4d7f1-aa17-4233-bc47-41c280fdd7ce") - eq_(openfda["unii"][0], "362O9ITL9D") - - # Updates to SPL made barcode unparseable; hence commenting out next line - # eq_(openfda["upc"][0], "0087701408991") + meta, results = fetch("/drug/ndc.json?search=product_ndc:46122-062") + eq_(len(results), 1) + + ndc = results[0] + eq_(ndc["product_id"], "46122-062_e37f7dcd-aecc-49d4-881c-f2b490bdd5b2") + eq_(ndc["product_ndc"], "46122-062") + eq_(ndc["spl_id"], "e37f7dcd-aecc-49d4-881c-f2b490bdd5b2") + eq_(ndc["product_type"], "HUMAN OTC DRUG") + eq_(ndc["finished"], True) + eq_(ndc["brand_name"], "Acetaminophen") + eq_(ndc["brand_name_base"], "Acetaminophen") + eq_(ndc.get("brand_name_suffix"), None) + eq_(ndc["generic_name"], "Acetaminophen") + eq_(ndc["dosage_form"], "TABLET, FILM COATED, EXTENDED RELEASE") + 
eq_(ndc["route"][0], "ORAL") + eq_(ndc["marketing_start_date"], "20020430") + eq_(ndc.get("marketing_end_date"), None) + eq_(ndc["marketing_category"], "ANDA") + eq_(ndc["application_number"], "ANDA076200") + eq_(ndc["labeler_name"], "Amerisource Bergen") + eq_(ndc["active_ingredients"][0]["name"], "ACETAMINOPHEN") + eq_(ndc["active_ingredients"][0]["strength"], "650 mg/1") + eq_(ndc.get("pharm_class"), None) + eq_(ndc.get("dea_schedule"), None) + eq_(ndc["listing_expiration_date"], "20221231") + + packaging = sorted(ndc["packaging"], key=lambda k: k["package_ndc"]) + eq_(len(packaging), 2) + eq_(packaging[0]["package_ndc"], "46122-062-71") + eq_( + packaging[0]["description"], + "50 TABLET, FILM COATED, EXTENDED RELEASE in 1 BOTTLE (46122-062-71)", + ) + eq_(packaging[0]["marketing_start_date"], "20020430") + eq_(packaging[0]["sample"], False) + + eq_(packaging[1]["package_ndc"], "46122-062-78") + eq_( + packaging[1]["description"], + "100 TABLET, FILM COATED, EXTENDED RELEASE in 1 BOTTLE (46122-062-78)", + ) + eq_(packaging[1]["marketing_start_date"], "20020430") + eq_(packaging[1]["sample"], False) + + openfda = ndc["openfda"] + eq_(openfda["is_original_packager"][0], True) + eq_(openfda["manufacturer_name"][0], "Amerisource Bergen") + eq_(openfda["rxcui"][0], "1148399") + eq_(openfda["spl_set_id"][0], "c0f4d7f1-aa17-4233-bc47-41c280fdd7ce") + eq_(openfda["unii"][0], "362O9ITL9D") + + # Updates to SPL made barcode unparseable; hence commenting out next line + # eq_(openfda["upc"][0], "0087701408991") def test_date_fields(): - assert_total('/drug/ndc.json?search=_exists_:marketing_start_date', 10000) - assert_total('/drug/ndc.json?search=_exists_:marketing_end_date', 3000) + assert_total("/drug/ndc.json?search=_exists_:marketing_start_date", 10000) + assert_total("/drug/ndc.json?search=_exists_:marketing_end_date", 3000) + def test_exact_fields(): - assert_total('/drug/ndc.json?search=openfda.brand_name.exact:Amoxicillin and Clavulanate Potassium', 1) + assert_total( + "/drug/ndc.json?search=openfda.brand_name.exact:Amoxicillin and Clavulanate Potassium", + 1, + ) + def test_minimum_CVS_brand_drugs(): - assert_total('/drug/ndc.json?search=brand_name_suffix:Rite Aid', 2) + assert_total("/drug/ndc.json?search=brand_name_suffix:Rite Aid", 2) + def test_openfda_fields(): - meta, results = fetch( - '/drug/ndc.json?search=_exists_:openfda.nui+AND+_exists_:openfda.rxcui+AND+_exists_:openfda.upc&limit=1') + meta, results = fetch( + "/drug/ndc.json?search=_exists_:openfda.nui+AND+_exists_:openfda.rxcui+AND+_exists_:openfda.upc&limit=1" + ) - eq_(len(results), 1) + eq_(len(results), 1) + + ndc = results[0] + ok_(ndc["openfda"]["upc"] is not None) + ok_(ndc["openfda"]["rxcui"] is not None) + ok_(ndc["openfda"]["nui"] is not None) - ndc = results[0] - ok_(ndc['openfda']['upc'] is not None) - ok_(ndc['openfda']['rxcui'] is not None) - ok_(ndc['openfda']['nui'] is not None) def test_pharm_class_exists(): - assert_total('/drug/ndc.json?search=_exists_:openfda.pharm_class_pe', 10) - assert_total('/drug/ndc.json?search=_exists_:openfda.pharm_class_moa', 10) - assert_total('/drug/ndc.json?search=_exists_:openfda.pharm_class_epc', 10) - assert_total('/drug/ndc.json?search=_exists_:openfda.pharm_class_cs', 10) + assert_total("/drug/ndc.json?search=_exists_:openfda.pharm_class_pe", 10) + assert_total("/drug/ndc.json?search=_exists_:openfda.pharm_class_moa", 10) + assert_total("/drug/ndc.json?search=_exists_:openfda.pharm_class_epc", 10) + assert_total("/drug/ndc.json?search=_exists_:openfda.pharm_class_cs", 
10) + def test_dea_scheduled_numbers(): - meta, results = fetch( - '/drug/ndc.json?count=dea_schedule') + meta, results = fetch("/drug/ndc.json?count=dea_schedule") + + eq_(len(results), 5) - eq_(len(results), 5) + for each_result in results: + ok_(each_result["count"] > 1) - for each_result in results: - ok_(each_result['count'] > 1) def test_lotion_products(): - meta, results = fetch( - '/drug/ndc.json?search=dosage_form.exact:"LOTION"&limit=1') + meta, results = fetch('/drug/ndc.json?search=dosage_form.exact:"LOTION"&limit=1') - eq_(len(results), 1) + eq_(len(results), 1) + + eq_(results[0]["dosage_form"], "LOTION") - eq_(results[0]["dosage_form"], "LOTION") def test_DENTAL_route_drugs(): - assert_total('/drug/ndc.json?search=route:"DENTAL"', 1000) + assert_total('/drug/ndc.json?search=route:"DENTAL"', 1000) + def test_drugs_expiring_in_date_range(): - assert_total('/drug/ndc.json?search=marketing_end_date:([20001001+TO+20251231])', 3900) + assert_total( + "/drug/ndc.json?search=marketing_end_date:([20001001+TO+20251231])", 3900 + ) + def test_drugs_with_original_packager(): - assert_total('/drug/ndc.json?search=openfda.is_original_packager:true', 1000) + assert_total("/drug/ndc.json?search=openfda.is_original_packager:true", 1000) + # def test_unfinished_record_with_multiple_packaging(): # meta, results = fetch( @@ -189,49 +212,48 @@ def test_drugs_with_original_packager(): # eq_(len(openfda["pharm_class_epc"]), 2) # eq_(len(openfda["nui"]), 6) + def test_drug_with_most_number_of_packaging(): - meta, results = fetch( - '/drug/ndc.json?search=product_ndc:66083-741') - eq_(len(results), 1) + meta, results = fetch("/drug/ndc.json?search=product_ndc:66083-741") + eq_(len(results), 1) - ndc = results[0] - packaging = sorted(ndc["packaging"], key=lambda k: k['package_ndc']) - eq_(len(packaging), 9) + ndc = results[0] + packaging = sorted(ndc["packaging"], key=lambda k: k["package_ndc"]) + eq_(len(packaging), 9) - eq_(packaging[8]["package_ndc"], "66083-741-12") - eq_(packaging[8]["description"], "640 L in 1 CYLINDER (66083-741-12)") - eq_(packaging[8]["marketing_start_date"], "20110214") - eq_(packaging[8]["sample"], False) + eq_(packaging[8]["package_ndc"], "66083-741-12") + eq_(packaging[8]["description"], "640 L in 1 CYLINDER (66083-741-12)") + eq_(packaging[8]["marketing_start_date"], "20110214") + eq_(packaging[8]["sample"], False) def test_active_ingredients(): - meta, results = fetch( - '/drug/ndc.json?search=product_ndc:0942-6316') + meta, results = fetch("/drug/ndc.json?search=product_ndc:0942-6316") - eq_(len(results), 1) + eq_(len(results), 1) - ndc = results[0] - active_ingredients = sorted(ndc["active_ingredients"], key=lambda k: k['name']) - eq_(len(active_ingredients), 5) + ndc = results[0] + active_ingredients = sorted(ndc["active_ingredients"], key=lambda k: k["name"]) + eq_(len(active_ingredients), 5) - eq_(active_ingredients[0]["name"], "ADENINE") - eq_(active_ingredients[0]["strength"], "19.3 mg/70mL") + eq_(active_ingredients[0]["name"], "ADENINE") + eq_(active_ingredients[0]["strength"], "19.3 mg/70mL") - eq_(active_ingredients[1]["name"], "ANHYDROUS CITRIC ACID") - eq_(active_ingredients[1]["strength"], "209 mg/70mL") + eq_(active_ingredients[1]["name"], "ANHYDROUS CITRIC ACID") + eq_(active_ingredients[1]["strength"], "209 mg/70mL") - eq_(active_ingredients[2]["name"], "DEXTROSE MONOHYDRATE") - eq_(active_ingredients[2]["strength"], "2.23 g/70mL") + eq_(active_ingredients[2]["name"], "DEXTROSE MONOHYDRATE") + eq_(active_ingredients[2]["strength"], "2.23 g/70mL") - 
eq_(active_ingredients[3]["name"], "SODIUM PHOSPHATE, MONOBASIC, MONOHYDRATE") - eq_(active_ingredients[3]["strength"], "155 mg/70mL") + eq_(active_ingredients[3]["name"], "SODIUM PHOSPHATE, MONOBASIC, MONOHYDRATE") + eq_(active_ingredients[3]["strength"], "155 mg/70mL") - eq_(active_ingredients[4]["name"], "TRISODIUM CITRATE DIHYDRATE") - eq_(active_ingredients[4]["strength"], "1.84 g/70mL") + eq_(active_ingredients[4]["name"], "TRISODIUM CITRATE DIHYDRATE") + eq_(active_ingredients[4]["strength"], "1.84 g/70mL") -if __name__ == '__main__': - all_functions = inspect.getmembers(sys.modules[__name__], inspect.isfunction) - for key, func in all_functions: - if key.find("test_") > -1: - func() +if __name__ == "__main__": + all_functions = inspect.getmembers(sys.modules[__name__], inspect.isfunction) + for key, func in all_functions: + if key.find("test_") > -1: + func() diff --git a/openfda/deploy/tests/othernsde/test_endpoint.py b/openfda/deploy/tests/othernsde/test_endpoint.py index 7597fa3..13241c6 100644 --- a/openfda/deploy/tests/othernsde/test_endpoint.py +++ b/openfda/deploy/tests/othernsde/test_endpoint.py @@ -6,77 +6,79 @@ def test_total(): - assert_total('/other/nsde.json', 440000) + assert_total("/other/nsde.json", 440000) def test_package_ndc(): - meta, results = fetch( - '/other/nsde.json?limit=1') - eq_(len(results), 1) + meta, results = fetch("/other/nsde.json?limit=1") + eq_(len(results), 1) - nsde = results[0] - ok_(nsde.get("package_ndc") is not None) + nsde = results[0] + ok_(nsde.get("package_ndc") is not None) def test_package_ndc11(): - meta, results = fetch( - '/other/nsde.json?limit=1') - eq_(len(results), 1) + meta, results = fetch("/other/nsde.json?limit=1") + eq_(len(results), 1) - nsde = results[0] - ok_(nsde.get("package_ndc11") is not None) + nsde = results[0] + ok_(nsde.get("package_ndc11") is not None) def test_date_fields(): - assert_total('/other/nsde.json?search=_exists_:marketing_start_date', 10) - assert_total('/other/nsde.json?search=_exists_:marketing_end_date', 10) + assert_total("/other/nsde.json?search=_exists_:marketing_start_date", 10) + assert_total("/other/nsde.json?search=_exists_:marketing_end_date", 10) def test_other_fields(): - meta, results = fetch( - '/other/nsde.json?search=_exists_:product_type+AND+_exists_:application_number_or_citation+AND+_exists_:marketing_category+AND+_exists_:dosage_form+AND+_exists_:proprietary_name&limit=1') - eq_(len(results), 1) + meta, results = fetch( + "/other/nsde.json?search=_exists_:product_type+AND+_exists_:application_number_or_citation+AND+_exists_:marketing_category+AND+_exists_:dosage_form+AND+_exists_:proprietary_name&limit=1" + ) + eq_(len(results), 1) - nsde = results[0] + nsde = results[0] + + ok_(nsde.get("proprietary_name") is not None) + ok_(nsde.get("product_type") is not None) + ok_(nsde.get("application_number_or_citation") is not None) + ok_(nsde.get("marketing_category") is not None) + ok_(nsde.get("dosage_form") is not None) - ok_(nsde.get("proprietary_name") is not None) - ok_(nsde.get("product_type") is not None) - ok_(nsde.get("application_number_or_citation") is not None) - ok_(nsde.get("marketing_category") is not None) - ok_(nsde.get("dosage_form") is not None) def test_activation_date_fields(): - meta, results = fetch( - '/other/nsde.json?search=package_ndc11:63029040101&sort=inactivation_date:desc') - eq_(len(results), 1) + meta, results = fetch( + "/other/nsde.json?search=package_ndc11:63029040101&sort=inactivation_date:desc" + ) + eq_(len(results), 1) - nsde = results[0] + nsde 
= results[0] - eq_(nsde.get("inactivation_date"), '20200131') - eq_(nsde.get("reactivation_date"), '20200626') + eq_(nsde.get("inactivation_date"), "20200131") + eq_(nsde.get("reactivation_date"), "20200626") def test_all_fields(): - meta, results = fetch( - '/other/nsde.json?search=package_ndc11:15338010014+AND+_exists_:marketing_end_date+AND+_exists_:application_number_or_citation+AND+_exists_:marketing_category+AND+_exists_:dosage_form+AND+_exists_:billing_unit') - eq_(len(results), 1) - - nsde = results[0] - - eq_(nsde.get("package_ndc"), '15338-100-14') - eq_(nsde.get("package_ndc11"), '15338010014') - eq_(nsde.get("marketing_category"), 'ANDA') - eq_(nsde.get("marketing_start_date"), '19840507') - eq_(nsde.get("marketing_end_date"), '20150531') - eq_(nsde.get("billing_unit"), 'EA') - eq_(nsde.get("proprietary_name"), 'Doxycycline Hyclate') - eq_(nsde.get("dosage_form"), 'CAPSULE, GELATIN COATED') - eq_(nsde.get("application_number_or_citation"), 'ANDA062396') - eq_(nsde.get("product_type"), 'HUMAN PRESCRIPTION DRUG') - - -if __name__ == '__main__': - all_functions = inspect.getmembers(sys.modules[__name__], inspect.isfunction) - for key, func in all_functions: - if key.find("test_") > -1: - func() + meta, results = fetch( + "/other/nsde.json?search=package_ndc11:15338010014+AND+_exists_:marketing_end_date+AND+_exists_:application_number_or_citation+AND+_exists_:marketing_category+AND+_exists_:dosage_form+AND+_exists_:billing_unit" + ) + eq_(len(results), 1) + + nsde = results[0] + + eq_(nsde.get("package_ndc"), "15338-100-14") + eq_(nsde.get("package_ndc11"), "15338010014") + eq_(nsde.get("marketing_category"), "ANDA") + eq_(nsde.get("marketing_start_date"), "19840507") + eq_(nsde.get("marketing_end_date"), "20150531") + eq_(nsde.get("billing_unit"), "EA") + eq_(nsde.get("proprietary_name"), "Doxycycline Hyclate") + eq_(nsde.get("dosage_form"), "CAPSULE, GELATIN COATED") + eq_(nsde.get("application_number_or_citation"), "ANDA062396") + eq_(nsde.get("product_type"), "HUMAN PRESCRIPTION DRUG") + + +if __name__ == "__main__": + all_functions = inspect.getmembers(sys.modules[__name__], inspect.isfunction) + for key, func in all_functions: + if key.find("test_") > -1: + func() diff --git a/openfda/deploy/tests/othersubstance/test_endpoint.py b/openfda/deploy/tests/othersubstance/test_endpoint.py index 6583320..8e32ae0 100644 --- a/openfda/deploy/tests/othersubstance/test_endpoint.py +++ b/openfda/deploy/tests/othersubstance/test_endpoint.py @@ -6,342 +6,371 @@ def test_total_count(): - assert_total('/other/substance.json', 109200) + assert_total("/other/substance.json", 109200) def test_top_level(): - meta, results = fetch( - '/other/substance.json?search=unii:X1468QSS0Q') - eq_(len(results), 1) + meta, results = fetch("/other/substance.json?search=unii:X1468QSS0Q") + eq_(len(results), 1) - substance = results[0] - eq_(substance["uuid"], "5c013332-a41e-49b8-a135-fbb0eaf7ee78") - eq_(substance["substance_class"], "structurallyDiverse") - eq_(substance["definition_type"], "PRIMARY") - eq_(substance["version"], "1") - eq_(substance["definition_level"], "COMPLETE") + substance = results[0] + eq_(substance["uuid"], "5c013332-a41e-49b8-a135-fbb0eaf7ee78") + eq_(substance["substance_class"], "structurallyDiverse") + eq_(substance["definition_type"], "PRIMARY") + eq_(substance["version"], "1") + eq_(substance["definition_level"], "COMPLETE") def test_codes(): - meta, results = fetch( - '/other/substance.json?search=unii:981Y8SX18M') - eq_(len(results), 1) + meta, results = 
fetch("/other/substance.json?search=unii:981Y8SX18M") + eq_(len(results), 1) - codes = sorted(results[0]["codes"], key=lambda code: code['code'])[0] - eq_(codes["code"], "1065221") - eq_(codes["uuid"], "7040e30b-b3d2-ae4b-d8d2-2ab72578fa6b") - eq_(codes["url"], "https://store.usp.org/product/1065221") - eq_(codes["references"], ["2bec6c3a-ce05-37bb-b4f3-8770aedd51f7"]) - eq_(codes["type"], "PRIMARY") - eq_(codes["code_system"], "RS_ITEM_NUM") + codes = sorted(results[0]["codes"], key=lambda code: code["code"])[0] + eq_(codes["code"], "1065221") + eq_(codes["uuid"], "7040e30b-b3d2-ae4b-d8d2-2ab72578fa6b") + eq_(codes["url"], "https://store.usp.org/product/1065221") + eq_(codes["references"], ["2bec6c3a-ce05-37bb-b4f3-8770aedd51f7"]) + eq_(codes["type"], "PRIMARY") + eq_(codes["code_system"], "RS_ITEM_NUM") def test_mixture(): - meta, results = fetch( - '/other/substance.json?search=unii:F5UN44K0LE') - eq_(len(results), 1) - - mixture = results[0]["mixture"] - eq_(mixture["uuid"], "dbbd1271-908f-47b3-bc49-5bd11316bf9a") - eq_(mixture["references"], ["b8fea4ad-ed4d-4ecc-b68f-32f0e2836b24", "bec2c10b-b21c-44c2-8efb-3e67ecc52d71"]) - - components = mixture["components"][0] - eq_(components["type"], "MUST_BE_PRESENT") - eq_(components["uuid"], "119a7d5c-7ab7-4e00-911e-441101a6bd50") - eq_(components["references"], []) - - substance = components["substance"] - eq_(substance["unii"], "Z3S58SB58N") - eq_(substance["uuid"], "e919e4ef-589f-411a-8bb6-d6c4558e14c1") - eq_(substance["substance_class"], "reference") - eq_(substance["refuuid"], "23ebfc5b-41d7-4798-b549-1402da529c1f") - eq_(substance["ref_pname"], "4-HEXENAL, (4Z)-") - eq_(substance["linking_id"], "Z3S58SB58N") - eq_(substance["name"], "4-HEXENAL, (4Z)-") + meta, results = fetch("/other/substance.json?search=unii:F5UN44K0LE") + eq_(len(results), 1) + + mixture = results[0]["mixture"] + eq_(mixture["uuid"], "dbbd1271-908f-47b3-bc49-5bd11316bf9a") + eq_( + mixture["references"], + [ + "b8fea4ad-ed4d-4ecc-b68f-32f0e2836b24", + "bec2c10b-b21c-44c2-8efb-3e67ecc52d71", + ], + ) + + components = mixture["components"][0] + eq_(components["type"], "MUST_BE_PRESENT") + eq_(components["uuid"], "119a7d5c-7ab7-4e00-911e-441101a6bd50") + eq_(components["references"], []) + + substance = components["substance"] + eq_(substance["unii"], "Z3S58SB58N") + eq_(substance["uuid"], "e919e4ef-589f-411a-8bb6-d6c4558e14c1") + eq_(substance["substance_class"], "reference") + eq_(substance["refuuid"], "23ebfc5b-41d7-4798-b549-1402da529c1f") + eq_(substance["ref_pname"], "4-HEXENAL, (4Z)-") + eq_(substance["linking_id"], "Z3S58SB58N") + eq_(substance["name"], "4-HEXENAL, (4Z)-") def test_modifications(): - meta, results = fetch( - '/other/substance.json?search=unii:X1468QSS0Q') - eq_(len(results), 1) + meta, results = fetch("/other/substance.json?search=unii:X1468QSS0Q") + eq_(len(results), 1) - modifications = results[0]["modifications"] - eq_(modifications["uuid"], "03623958-d42e-4645-a319-1d642d0624ce") - eq_(modifications["physical_modifications"][0]["uuid"], "5ea6be78-03f2-4870-80cc-cf4c3922dc63") + modifications = results[0]["modifications"] + eq_(modifications["uuid"], "03623958-d42e-4645-a319-1d642d0624ce") + eq_( + modifications["physical_modifications"][0]["uuid"], + "5ea6be78-03f2-4870-80cc-cf4c3922dc63", + ) - mod_parameters = modifications["physical_modifications"][0]["parameters"][0] - eq_(mod_parameters["uuid"], "f576adb6-d4fa-4cc5-979e-ebf93f28e465") - eq_(mod_parameters["parameter_name"], "HEAT") - eq_(mod_parameters["amount"]["units"], "C") - 
eq_(mod_parameters["amount"]["uuid"], "aa719c80-fc63-44a1-952c-8f736253e1f5") - eq_(mod_parameters["amount"]["low_limit"], 60) + mod_parameters = modifications["physical_modifications"][0]["parameters"][0] + eq_(mod_parameters["uuid"], "f576adb6-d4fa-4cc5-979e-ebf93f28e465") + eq_(mod_parameters["parameter_name"], "HEAT") + eq_(mod_parameters["amount"]["units"], "C") + eq_(mod_parameters["amount"]["uuid"], "aa719c80-fc63-44a1-952c-8f736253e1f5") + eq_(mod_parameters["amount"]["low_limit"], 60) def test_moieties(): - meta, results = fetch( - '/other/substance.json?search=unii:981Y8SX18M') - eq_(len(results), 1) - - moieties = results[0]["moieties"][0] - eq_(moieties["count"], 1) - eq_(moieties["smiles"], "Cl") - eq_(moieties["molfile"], "\n Marvin 01132110552D \n\n 1 0 0 0 0 0 999 V2000\n -2.3335 -4.9698 0.0000 Cl 0 0 0 0 0 0 0 0 0 0 0 0\nM END") - eq_(moieties["defined_stereo"], 0) - eq_(moieties["molecular_weight"], "36.461") - eq_(moieties["ez_centers"], 0) - eq_(moieties["charge"], 0) - eq_(moieties["stereo_centers"], 0) - eq_(moieties["formula"], "ClH") - eq_(moieties["optical_activity"], "NONE") - eq_(moieties["stereochemistry"], "ACHIRAL") - eq_(moieties["id"], "d34f8e44-d289-4bd2-b1a8-c96965fc4435") - - count_amount = moieties["count_amount"] - eq_(count_amount["units"], "MOL RATIO") - eq_(count_amount["average"], 1) - eq_(count_amount["type"], "MOL RATIO") - eq_(count_amount["uuid"], "6a62169b-deaf-4f47-971e-db7aa4ab68d6") + meta, results = fetch("/other/substance.json?search=unii:981Y8SX18M") + eq_(len(results), 1) + + moieties = results[0]["moieties"][0] + eq_(moieties["count"], 1) + eq_(moieties["smiles"], "Cl") + eq_( + moieties["molfile"], + "\n Marvin 01132110552D \n\n 1 0 0 0 0 0 999 V2000\n -2.3335 -4.9698 0.0000 Cl 0 0 0 0 0 0 0 0 0 0 0 0\nM END", + ) + eq_(moieties["defined_stereo"], 0) + eq_(moieties["molecular_weight"], "36.461") + eq_(moieties["ez_centers"], 0) + eq_(moieties["charge"], 0) + eq_(moieties["stereo_centers"], 0) + eq_(moieties["formula"], "ClH") + eq_(moieties["optical_activity"], "NONE") + eq_(moieties["stereochemistry"], "ACHIRAL") + eq_(moieties["id"], "d34f8e44-d289-4bd2-b1a8-c96965fc4435") + + count_amount = moieties["count_amount"] + eq_(count_amount["units"], "MOL RATIO") + eq_(count_amount["average"], 1) + eq_(count_amount["type"], "MOL RATIO") + eq_(count_amount["uuid"], "6a62169b-deaf-4f47-971e-db7aa4ab68d6") def test_names(): - meta, results = fetch( - '/other/substance.json?search=unii:X1468QSS0Q') - eq_(len(results), 1) + meta, results = fetch("/other/substance.json?search=unii:X1468QSS0Q") + eq_(len(results), 1) - names = results[0]["names"][0] - eq_(names["display_name"], True) - eq_(names["uuid"], "0ee6b739-e3ce-4cc5-8153-edeeb5858c1d") - eq_(names["preferred"], False) - eq_(names["languages"], ["en"]) - eq_(names["references"], ["fc0dc26c-e4a8-49a9-97b3-dc9d412bd96a"]) - eq_(names["type"], "cn") - eq_(names["name"], "DANIO RERIO, COOKED") + names = results[0]["names"][0] + eq_(names["display_name"], True) + eq_(names["uuid"], "0ee6b739-e3ce-4cc5-8153-edeeb5858c1d") + eq_(names["preferred"], False) + eq_(names["languages"], ["en"]) + eq_(names["references"], ["fc0dc26c-e4a8-49a9-97b3-dc9d412bd96a"]) + eq_(names["type"], "cn") + eq_(names["name"], "DANIO RERIO, COOKED") def test_notes(): - meta, results = fetch( - '/other/substance.json?search=unii:X5B33C49HP') - eq_(len(results), 1) + meta, results = fetch("/other/substance.json?search=unii:X5B33C49HP") + eq_(len(results), 1) - notes = results[0]["notes"][0] - eq_(notes["note"], "This is 
an incomplete polymer record. A more detailed definition may be available soon.") - eq_(notes["uuid"], None) - eq_(notes["references"], []) + notes = results[0]["notes"][0] + eq_( + notes["note"], + "This is an incomplete polymer record. A more detailed definition may be available soon.", + ) + eq_(notes["uuid"], None) + eq_(notes["references"], []) def test_nucleic_acid(): - meta, results = fetch( - '/other/substance.json?search=unii:3Z6W3S36X5') - eq_(len(results), 1) - - nucleic_acid = results[0]["nucleic_acid"] - eq_(nucleic_acid["nucleic_acid_type"], "OLIGONUCLEOTIDE") - eq_(nucleic_acid["uuid"], "2b219491-b23a-4e0c-b301-6297410c72c7") - eq_(nucleic_acid["references"], ["e1ba81ce-8fcd-4229-8683-ea33083fcef7", "fe33becb-5108-4398-8337-6df9d84f7fce"]) - - sugars = nucleic_acid["sugars"][0] - eq_(sugars["uuid"], "73318f42-0a0e-4da5-bf2d-c06ec5009216") - eq_(sugars["sugar"], "dR") - - sites = sugars["sites"][0] - eq_(sites["subunit_index"], 1) - eq_(sites["residue_index"], 1) - - subunits = nucleic_acid["subunits"][0] - eq_(subunits["subunit_index"], 1) - eq_(subunits["uuid"], "33ee4c10-2e33-49a4-b48a-e8f4fe28b3f1") - eq_(subunits["sequence"], "GCGTTTGCTCTTCTTCTTGCG") - - linkages = nucleic_acid["linkages"][0] - eq_(linkages["references"], []) - eq_(linkages["uuid"], "4d204101-896b-42a4-b72d-1f89b3ca6ea3") - eq_(linkages["linkage"], "nasP") - - linkage_sites = linkages["sites"][0] - eq_(linkage_sites["subunit_index"], 1) - eq_(linkage_sites["residue_index"], 2) + meta, results = fetch("/other/substance.json?search=unii:3Z6W3S36X5") + eq_(len(results), 1) + + nucleic_acid = results[0]["nucleic_acid"] + eq_(nucleic_acid["nucleic_acid_type"], "OLIGONUCLEOTIDE") + eq_(nucleic_acid["uuid"], "2b219491-b23a-4e0c-b301-6297410c72c7") + eq_( + nucleic_acid["references"], + [ + "e1ba81ce-8fcd-4229-8683-ea33083fcef7", + "fe33becb-5108-4398-8337-6df9d84f7fce", + ], + ) + + sugars = nucleic_acid["sugars"][0] + eq_(sugars["uuid"], "73318f42-0a0e-4da5-bf2d-c06ec5009216") + eq_(sugars["sugar"], "dR") + + sites = sugars["sites"][0] + eq_(sites["subunit_index"], 1) + eq_(sites["residue_index"], 1) + + subunits = nucleic_acid["subunits"][0] + eq_(subunits["subunit_index"], 1) + eq_(subunits["uuid"], "33ee4c10-2e33-49a4-b48a-e8f4fe28b3f1") + eq_(subunits["sequence"], "GCGTTTGCTCTTCTTCTTGCG") + + linkages = nucleic_acid["linkages"][0] + eq_(linkages["references"], []) + eq_(linkages["uuid"], "4d204101-896b-42a4-b72d-1f89b3ca6ea3") + eq_(linkages["linkage"], "nasP") + + linkage_sites = linkages["sites"][0] + eq_(linkage_sites["subunit_index"], 1) + eq_(linkage_sites["residue_index"], 2) def test_polymer(): - meta, results = fetch( - '/other/substance.json?search=unii:X5B33C49HP') - eq_(len(results), 1) - - polymer = results[0]["polymer"] - eq_(polymer["uuid"], "bcabb374-5683-4ec0-8d34-38c1eaceac31") - eq_(polymer["references"], ["703c5b0b-aa5d-40fb-8e01-557a2ab4749d", "a49d7af2-6db4-4d4b-9e32-59f5e037df07"]) - - classification = polymer["classification"] - eq_(classification["uuid"], "445904ab-14d8-415b-bc22-5c39f65a3bd8") - eq_(classification["polymer_class"], "HOMOPOLYMER") - eq_(classification["polymer_geometry"], "STAR") - - idealized_structure = polymer["idealized_structure"] - eq_(idealized_structure["count"], 1) - eq_(idealized_structure["id"], "3c203a89-60f6-43f6-8658-b73047f20485") - eq_(idealized_structure["references"], []) - - monomers = polymer["monomers"][0] - eq_(monomers["uuid"], "c87d3e1a-da0b-4152-9cd5-a74ec1524534") - eq_(monomers["type"], "MONOMER") - eq_(monomers["defining"], False) - - 
monomer_substance = monomers["monomer_substance"] - eq_(monomer_substance["unii"], "JJH7GNN18P") - eq_(monomer_substance["uuid"], "4ed5e189-c399-4811-aa87-dd879901db63") - eq_(monomer_substance["substance_class"], "reference") - eq_(monomer_substance["refuuid"], "5753e233-3266-499e-9c8f-1024eb5452dd") - eq_(monomer_substance["ref_pname"], "ETHYLENE OXIDE") - eq_(monomer_substance["linking_id"], "JJH7GNN18P") - eq_(monomer_substance["name"], "ETHYLENE OXIDE") - - amount = monomers["amount"] - eq_(amount["average"], 18) - eq_(amount["uuid"], "9e1540ef-c5c2-4fbb-a9ad-2a39ce4e1b74") - eq_(amount["type"], "MOLE RATIO") - - display_structure = polymer["display_structure"] - eq_(display_structure["count"], 1) - eq_(display_structure["id"], "c522848e-6de9-4b93-83f0-e00e0980fe56") + meta, results = fetch("/other/substance.json?search=unii:X5B33C49HP") + eq_(len(results), 1) + + polymer = results[0]["polymer"] + eq_(polymer["uuid"], "bcabb374-5683-4ec0-8d34-38c1eaceac31") + eq_( + polymer["references"], + [ + "703c5b0b-aa5d-40fb-8e01-557a2ab4749d", + "a49d7af2-6db4-4d4b-9e32-59f5e037df07", + ], + ) + + classification = polymer["classification"] + eq_(classification["uuid"], "445904ab-14d8-415b-bc22-5c39f65a3bd8") + eq_(classification["polymer_class"], "HOMOPOLYMER") + eq_(classification["polymer_geometry"], "STAR") + + idealized_structure = polymer["idealized_structure"] + eq_(idealized_structure["count"], 1) + eq_(idealized_structure["id"], "3c203a89-60f6-43f6-8658-b73047f20485") + eq_(idealized_structure["references"], []) + + monomers = polymer["monomers"][0] + eq_(monomers["uuid"], "c87d3e1a-da0b-4152-9cd5-a74ec1524534") + eq_(monomers["type"], "MONOMER") + eq_(monomers["defining"], False) + + monomer_substance = monomers["monomer_substance"] + eq_(monomer_substance["unii"], "JJH7GNN18P") + eq_(monomer_substance["uuid"], "4ed5e189-c399-4811-aa87-dd879901db63") + eq_(monomer_substance["substance_class"], "reference") + eq_(monomer_substance["refuuid"], "5753e233-3266-499e-9c8f-1024eb5452dd") + eq_(monomer_substance["ref_pname"], "ETHYLENE OXIDE") + eq_(monomer_substance["linking_id"], "JJH7GNN18P") + eq_(monomer_substance["name"], "ETHYLENE OXIDE") + + amount = monomers["amount"] + eq_(amount["average"], 18) + eq_(amount["uuid"], "9e1540ef-c5c2-4fbb-a9ad-2a39ce4e1b74") + eq_(amount["type"], "MOLE RATIO") + + display_structure = polymer["display_structure"] + eq_(display_structure["count"], 1) + eq_(display_structure["id"], "c522848e-6de9-4b93-83f0-e00e0980fe56") def test_properties(): - meta, results = fetch( - '/other/substance.json?search=unii:X5B33C49HP') - eq_(len(results), 1) + meta, results = fetch("/other/substance.json?search=unii:X5B33C49HP") + eq_(len(results), 1) - properties = results[0]["properties"][0] - eq_(properties["uuid"], "0437b524-7f20-4805-b1c1-3862016bb3d4") - eq_(properties["type"], "amount") - eq_(properties["property_type"], "CHEMICAL") - eq_(properties["defining"], False) - eq_(properties["name"], "MOL_WEIGHT:CALCULATED") + properties = results[0]["properties"][0] + eq_(properties["uuid"], "0437b524-7f20-4805-b1c1-3862016bb3d4") + eq_(properties["type"], "amount") + eq_(properties["property_type"], "CHEMICAL") + eq_(properties["defining"], False) + eq_(properties["name"], "MOL_WEIGHT:CALCULATED") - value = properties["value"] - eq_(value["units"], "Da") - eq_(value["average"], 1360) - eq_(value["type"], "CALCULATED") - eq_(value["uuid"], "81ef2ee8-a48c-4ea8-b4bc-ff14edcaa3b1") + value = properties["value"] + eq_(value["units"], "Da") + eq_(value["average"], 1360) + 
eq_(value["type"], "CALCULATED") + eq_(value["uuid"], "81ef2ee8-a48c-4ea8-b4bc-ff14edcaa3b1") def test_protein(): - meta, results = fetch( - '/other/substance.json?search=unii:M91TG242P2') - eq_(len(results), 1) + meta, results = fetch("/other/substance.json?search=unii:M91TG242P2") + eq_(len(results), 1) - protein = results[0]["protein"] - eq_(protein["uuid"], "32077660-b8bf-4983-bbbb-35182654f11a") - eq_(protein["protein_sub_type"], "") - eq_(protein["sequence_type"], "COMPLETE") + protein = results[0]["protein"] + eq_(protein["uuid"], "32077660-b8bf-4983-bbbb-35182654f11a") + eq_(protein["protein_sub_type"], "") + eq_(protein["sequence_type"], "COMPLETE") - glycosylation = protein["glycosylation"] - eq_(glycosylation["glycosylation_type"], "MAMMALIAN") - eq_(glycosylation["uuid"], "54160485-e8d0-42e7-8cc1-9f51022f5dc8") + glycosylation = protein["glycosylation"] + eq_(glycosylation["glycosylation_type"], "MAMMALIAN") + eq_(glycosylation["uuid"], "54160485-e8d0-42e7-8cc1-9f51022f5dc8") - n_glycosylation_sites = glycosylation["n_glycosylation_sites"][0] - eq_(n_glycosylation_sites["subunit_index"], 1) - eq_(n_glycosylation_sites["residue_index"], 84) + n_glycosylation_sites = glycosylation["n_glycosylation_sites"][0] + eq_(n_glycosylation_sites["subunit_index"], 1) + eq_(n_glycosylation_sites["residue_index"], 84) - disulfide_links_sites = protein["disulfide_links"][0]['sites'][0] - eq_(disulfide_links_sites["subunit_index"], 1) - eq_(disulfide_links_sites["residue_index"], 6) + disulfide_links_sites = protein["disulfide_links"][0]["sites"][0] + eq_(disulfide_links_sites["subunit_index"], 1) + eq_(disulfide_links_sites["residue_index"], 6) - subunits = protein["subunits"][0] - eq_(subunits["subunit_index"], 1) - eq_(subunits["uuid"], "f721a8f4-75f5-4455-abd9-8c59a70ac2e5") + subunits = protein["subunits"][0] + eq_(subunits["subunit_index"], 1) + eq_(subunits["uuid"], "f721a8f4-75f5-4455-abd9-8c59a70ac2e5") - other_links = protein["other_links"][0] - eq_(other_links["linkage_type"], "COORDINATION COMPOUND") - eq_(other_links["uuid"], "427c87f4-6005-4797-958e-1e098cb2c2b1") + other_links = protein["other_links"][0] + eq_(other_links["linkage_type"], "COORDINATION COMPOUND") + eq_(other_links["uuid"], "427c87f4-6005-4797-958e-1e098cb2c2b1") - other_links_sites = other_links["sites"][0] - eq_(other_links_sites["subunit_index"], 1) - eq_(other_links_sites["residue_index"], 23) + other_links_sites = other_links["sites"][0] + eq_(other_links_sites["subunit_index"], 1) + eq_(other_links_sites["residue_index"], 23) -def test_references(): - meta, results = fetch( - '/other/substance.json?search=unii:X1468QSS0Q') - eq_(len(results), 1) - references = sorted(results[0]["references"], key=lambda k: k['uuid']) - eq_(references[2]["citation"], "NCBI") - eq_(references[2]["uuid"], "ff641620-26d9-4cf9-81b1-0f8f95bc7ce9") - eq_(references[2]["tags"], ["NOMEN"]) - eq_(references[2]["public_domain"], True) - eq_(references[0]["doc_type"], "SRS") - eq_(references[0]["url"], "http://fdasis.nlm.nih.gov/srs/srsdirect.jsp?regno=X1468QSS0Q") - eq_(references[0]["document_date"], 1493391472000) +def test_references(): + meta, results = fetch("/other/substance.json?search=unii:X1468QSS0Q") + eq_(len(results), 1) + + references = sorted(results[0]["references"], key=lambda k: k["uuid"]) + eq_(references[2]["citation"], "NCBI") + eq_(references[2]["uuid"], "ff641620-26d9-4cf9-81b1-0f8f95bc7ce9") + eq_(references[2]["tags"], ["NOMEN"]) + eq_(references[2]["public_domain"], True) + eq_(references[0]["doc_type"], "SRS") 
+ eq_( + references[0]["url"], + "http://fdasis.nlm.nih.gov/srs/srsdirect.jsp?regno=X1468QSS0Q", + ) + eq_(references[0]["document_date"], 1493391472000) def test_relationships(): - meta, results = fetch( - '/other/substance.json?search=unii:981Y8SX18M') - eq_(len(results), 1) + meta, results = fetch("/other/substance.json?search=unii:981Y8SX18M") + eq_(len(results), 1) - rel = sorted(results[0]["relationships"], key=lambda k: k['uuid'])[1] - eq_(rel["type"], "PARENT->SALT/SOLVATE") - eq_(rel["uuid"], "0d717420-619e-4e45-aa3e-d13593ea90c9") + rel = sorted(results[0]["relationships"], key=lambda k: k["uuid"])[1] + eq_(rel["type"], "PARENT->SALT/SOLVATE") + eq_(rel["uuid"], "0d717420-619e-4e45-aa3e-d13593ea90c9") - related_substance = rel["related_substance"] - eq_(related_substance["unii"], "9266D9P3PQ") - eq_(related_substance["uuid"], "294bf8e1-4639-4fc6-81ef-01e97dc51826") - eq_(related_substance["substance_class"], "reference") - eq_(related_substance["refuuid"], "9b2310d6-22b1-40d5-ae6e-145673ad5866") - eq_(related_substance["ref_pname"], "BENDAMUSTINE") - eq_(related_substance["linking_id"], "9266D9P3PQ") - eq_(related_substance["name"], "BENDAMUSTINE") + related_substance = rel["related_substance"] + eq_(related_substance["unii"], "9266D9P3PQ") + eq_(related_substance["uuid"], "294bf8e1-4639-4fc6-81ef-01e97dc51826") + eq_(related_substance["substance_class"], "reference") + eq_(related_substance["refuuid"], "9b2310d6-22b1-40d5-ae6e-145673ad5866") + eq_(related_substance["ref_pname"], "BENDAMUSTINE") + eq_(related_substance["linking_id"], "9266D9P3PQ") + eq_(related_substance["name"], "BENDAMUSTINE") def test_structurally_diverse(): - meta, results = fetch( - '/other/substance.json?search=unii:X1468QSS0Q') - eq_(len(results), 1) - - structurally_diverse = results[0]["structurally_diverse"] - eq_(structurally_diverse["source_material_type"], "BONY FISH") - eq_(structurally_diverse["uuid"], "b1ab3af6-4a14-4d28-8f01-8084cc58f0aa") - eq_(structurally_diverse["source_material_class"], "ORGANISM") - eq_(structurally_diverse["part"], ["MUSCLE"]) - eq_(structurally_diverse["references"], ["ff641620-26d9-4cf9-81b1-0f8f95bc7ce9", - "0578e457-dff1-41e3-ae2e-c61cb9308af1"]) - - parent_substance = structurally_diverse["parent_substance"] - eq_(parent_substance["unii"], "48HJA7IOQP") - eq_(parent_substance["uuid"], "8c40fcc9-0950-45f8-857b-e9140d8dc692") - eq_(parent_substance["substance_class"], "reference") - eq_(parent_substance["refuuid"], "0bfddc8e-ff36-43d7-b549-301b9e161343") - eq_(parent_substance["ref_pname"], "DANIO RERIO WHOLE") - eq_(parent_substance["linking_id"], "48HJA7IOQP") - eq_(parent_substance["name"], "DANIO RERIO WHOLE") + meta, results = fetch("/other/substance.json?search=unii:X1468QSS0Q") + eq_(len(results), 1) + + structurally_diverse = results[0]["structurally_diverse"] + eq_(structurally_diverse["source_material_type"], "BONY FISH") + eq_(structurally_diverse["uuid"], "b1ab3af6-4a14-4d28-8f01-8084cc58f0aa") + eq_(structurally_diverse["source_material_class"], "ORGANISM") + eq_(structurally_diverse["part"], ["MUSCLE"]) + eq_( + structurally_diverse["references"], + [ + "ff641620-26d9-4cf9-81b1-0f8f95bc7ce9", + "0578e457-dff1-41e3-ae2e-c61cb9308af1", + ], + ) + + parent_substance = structurally_diverse["parent_substance"] + eq_(parent_substance["unii"], "48HJA7IOQP") + eq_(parent_substance["uuid"], "8c40fcc9-0950-45f8-857b-e9140d8dc692") + eq_(parent_substance["substance_class"], "reference") + eq_(parent_substance["refuuid"], "0bfddc8e-ff36-43d7-b549-301b9e161343") + 
eq_(parent_substance["ref_pname"], "DANIO RERIO WHOLE") + eq_(parent_substance["linking_id"], "48HJA7IOQP") + eq_(parent_substance["name"], "DANIO RERIO WHOLE") def test_structure(): - meta, results = fetch( - '/other/substance.json?search=unii:981Y8SX18M') - eq_(len(results), 1) - - structure = results[0]["structure"] - eq_(structure["count"], 1) - eq_(structure["smiles"], "Cl.CN1C(CCCC(O)=O)=NC2=CC(=CC=C12)N(CCCl)CCCl") - eq_(structure["defined_stereo"], 0) - eq_(structure["molecular_weight"], "394.721") - eq_(structure["atropisomerism"], "No") - eq_(structure["ez_centers"], 0) - eq_(structure["charge"], 0) - eq_(structure["references"], ["b3f2c801-a26d-4a3e-aaa6-59d3874874ab", "4a37244b-6353-43d8-a8e9-384ba6686d00"]) - eq_(structure["stereo_centers"], 0) - eq_(structure["stereochemistry"], "ACHIRAL") - eq_(structure["optical_activity"], "NONE") - eq_(structure["formula"], "C16H21Cl2N3O2.ClH") - eq_(structure["id"], "8a878d3f-096b-4bfe-8e4a-13b62ff7179a") + meta, results = fetch("/other/substance.json?search=unii:981Y8SX18M") + eq_(len(results), 1) + + structure = results[0]["structure"] + eq_(structure["count"], 1) + eq_(structure["smiles"], "Cl.CN1C(CCCC(O)=O)=NC2=CC(=CC=C12)N(CCCl)CCCl") + eq_(structure["defined_stereo"], 0) + eq_(structure["molecular_weight"], "394.721") + eq_(structure["atropisomerism"], "No") + eq_(structure["ez_centers"], 0) + eq_(structure["charge"], 0) + eq_( + structure["references"], + [ + "b3f2c801-a26d-4a3e-aaa6-59d3874874ab", + "4a37244b-6353-43d8-a8e9-384ba6686d00", + ], + ) + eq_(structure["stereo_centers"], 0) + eq_(structure["stereochemistry"], "ACHIRAL") + eq_(structure["optical_activity"], "NONE") + eq_(structure["formula"], "C16H21Cl2N3O2.ClH") + eq_(structure["id"], "8a878d3f-096b-4bfe-8e4a-13b62ff7179a") def test_date_fields(): - assert_total('/other/substance.json?search=_exists_:references.document_date', 90000) + assert_total( + "/other/substance.json?search=_exists_:references.document_date", 90000 + ) -if __name__ == '__main__': - all_functions = inspect.getmembers(sys.modules[__name__], inspect.isfunction) - for key, func in all_functions: - if key.find("test_") > -1: - func() +if __name__ == "__main__": + all_functions = inspect.getmembers(sys.modules[__name__], inspect.isfunction) + for key, func in all_functions: + if key.find("test_") > -1: + func() diff --git a/openfda/deploy/tests/serology/test_endpoint.py b/openfda/deploy/tests/serology/test_endpoint.py index e8171f8..bb406f4 100644 --- a/openfda/deploy/tests/serology/test_endpoint.py +++ b/openfda/deploy/tests/serology/test_endpoint.py @@ -6,75 +6,88 @@ def test_total_count(): - assert_total('/device/covid19serology.json', 6380) + assert_total("/device/covid19serology.json", 6380) def test_basic_search(): - meta, results = fetch( - '/device/covid19serology.json?search=manufacturer.exact:Euroimmun+AND+date_performed:([20200421+TO+20200421])' - '+AND+panel:"Panel 1"+AND+sample_id:C0001+AND+sample_no:[1+TO+1]+AND+lot_number:E200330DT') - eq_(len(results), 1) + meta, results = fetch( + "/device/covid19serology.json?search=manufacturer.exact:Euroimmun+AND+date_performed:([20200421+TO+20200421])" + '+AND+panel:"Panel 1"+AND+sample_id:C0001+AND+sample_no:[1+TO+1]+AND+lot_number:E200330DT' + ) + eq_(len(results), 1) - row = results[0] - # Verify base logic. 
- eq_('Euroimmun', row.get('manufacturer')) - eq_('SARS-COV-2 ELISA (IgG)', row.get('device')) - eq_('4/21/2020', row.get('date_performed')) - eq_('E200330DT', row.get('lot_number')) - eq_('Panel 1', row.get('panel')) - eq_('1', row.get('sample_no')) - eq_('C0001', row.get('sample_id')) - eq_('Serum', row.get('type')) - eq_('Negatives', row.get('group')) - eq_('NA', row.get('days_from_symptom')) - eq_('NA', row.get('igm_result')) - eq_('Negative', row.get('igg_result')) - eq_('NA', row.get('iga_result')) - eq_('NA', row.get('pan_result')) - eq_('NA', row.get('igm_igg_result')) - eq_('Pass', row.get('control')) - eq_('0', row.get('igm_titer')) - eq_('0', row.get('igg_titer')) - eq_('Negative', row.get('igm_truth')) - eq_('Negative', row.get('igg_truth')) - eq_('Negative', row.get('antibody_truth')) - eq_('NA', row.get('igm_agree')) - eq_('TN', row.get('igg_agree')) - eq_('NA', row.get('iga_agree')) - eq_('NA', row.get('pan_agree')) - eq_('NA', row.get('igm_igg_agree')) - eq_('NA', row.get('igm_iga_result')) - eq_('NA', row.get('igm_iga_agree')) - eq_('TN', row.get('antibody_agree')) + row = results[0] + # Verify base logic. + eq_("Euroimmun", row.get("manufacturer")) + eq_("SARS-COV-2 ELISA (IgG)", row.get("device")) + eq_("4/21/2020", row.get("date_performed")) + eq_("E200330DT", row.get("lot_number")) + eq_("Panel 1", row.get("panel")) + eq_("1", row.get("sample_no")) + eq_("C0001", row.get("sample_id")) + eq_("Serum", row.get("type")) + eq_("Negatives", row.get("group")) + eq_("NA", row.get("days_from_symptom")) + eq_("NA", row.get("igm_result")) + eq_("Negative", row.get("igg_result")) + eq_("NA", row.get("iga_result")) + eq_("NA", row.get("pan_result")) + eq_("NA", row.get("igm_igg_result")) + eq_("Pass", row.get("control")) + eq_("0", row.get("igm_titer")) + eq_("0", row.get("igg_titer")) + eq_("Negative", row.get("igm_truth")) + eq_("Negative", row.get("igg_truth")) + eq_("Negative", row.get("antibody_truth")) + eq_("NA", row.get("igm_agree")) + eq_("TN", row.get("igg_agree")) + eq_("NA", row.get("iga_agree")) + eq_("NA", row.get("pan_agree")) + eq_("NA", row.get("igm_igg_agree")) + eq_("NA", row.get("igm_iga_result")) + eq_("NA", row.get("igm_iga_agree")) + eq_("TN", row.get("antibody_agree")) def test_exact_fields(): - assert_total( - '/device/covid19serology.json?search=date_performed:[20200420+TO+20200422]', 110) - meta, results = fetch( - '/device/covid19serology.json?search=date_performed:[20221225+TO+20230422]') - eq_(results, None) + assert_total( + "/device/covid19serology.json?search=date_performed:[20200420+TO+20200422]", 110 + ) + meta, results = fetch( + "/device/covid19serology.json?search=date_performed:[20221225+TO+20230422]" + ) + eq_(results, None) def test_date_ranges(): - assert_total( - '/device/covid19serology.json?search=manufacturer.exact:Euroimmun+AND+device.exact:"SARS-COV-2 ELISA (IgG)"', 110) - assert_total( - '/device/covid19serology.json?search=manufacturer:Euroimmun+AND+device:"SARS-COV-2 ELISA (IgG)"', 110) - assert_total( - '/device/covid19serology.json?search=manufacturer:Euroimmun+AND+device:"SARS-COV-2"', 110) + assert_total( + '/device/covid19serology.json?search=manufacturer.exact:Euroimmun+AND+device.exact:"SARS-COV-2 ELISA (IgG)"', + 110, + ) + assert_total( + '/device/covid19serology.json?search=manufacturer:Euroimmun+AND+device:"SARS-COV-2 ELISA (IgG)"', + 110, + ) + assert_total( + '/device/covid19serology.json?search=manufacturer:Euroimmun+AND+device:"SARS-COV-2"', + 110, + ) - meta, results = fetch( - 
'/device/covid19serology.json?search=manufacturer.exact:Euroimmun+AND+device.exact:"SARS-COV-2"') - eq_(results, None) + meta, results = fetch( + '/device/covid19serology.json?search=manufacturer.exact:Euroimmun+AND+device.exact:"SARS-COV-2"' + ) + eq_(results, None) def test_sample_no_ranges(): - assert_total('/device/covid19serology.json?search=sample_no:[1+TO+5]&limit=100', 20) - assert_total('/device/covid19serology.json?search=sample_no:[1+TO+15]&limit=100', 60) + assert_total("/device/covid19serology.json?search=sample_no:[1+TO+5]&limit=100", 20) + assert_total( + "/device/covid19serology.json?search=sample_no:[1+TO+15]&limit=100", 60 + ) -if __name__ == '__main__': - all_functions = inspect.getmembers(sys.modules[__name__], inspect.isfunction) - for key, func in all_functions: - if key.find("test_") > -1: - func() + +if __name__ == "__main__": + all_functions = inspect.getmembers(sys.modules[__name__], inspect.isfunction) + for key, func in all_functions: + if key.find("test_") > -1: + func() diff --git a/openfda/deploy/tests/status/test_endpoint.py b/openfda/deploy/tests/status/test_endpoint.py index 2c7edda..d278698 100644 --- a/openfda/deploy/tests/status/test_endpoint.py +++ b/openfda/deploy/tests/status/test_endpoint.py @@ -2,33 +2,67 @@ def assert_enough_results(query, count): - meta, results = api_test_helpers.fetch(query) - assert meta['results']['total'] > count, \ - 'Query %s returned fewer results than expected: %d vs %d' % ( - query, count, meta['results']['total']) + meta, results = api_test_helpers.fetch(query) + assert ( + meta["results"]["total"] > count + ), "Query %s returned fewer results than expected: %d vs %d" % ( + query, + count, + meta["results"]["total"], + ) def test_all_green(): - status = api_test_helpers.json('/status') - print(status) - for ep in status: - assert ep['status'] == 'GREEN' + status = api_test_helpers.json("/status") + print(status) + for ep in status: + assert ep["status"] == "GREEN" def test_counts(): - status = api_test_helpers.json('/status') - assert (next(x for x in status if x['endpoint'] == 'deviceevent'))['documents'] >= 9400000 - assert (next(x for x in status if x['endpoint'] == 'devicerecall'))['documents'] >= 43000 - assert (next(x for x in status if x['endpoint'] == 'deviceclass'))['documents'] >= 6176 - assert (next(x for x in status if x['endpoint'] == 'devicereglist'))['documents'] >= 227602 - assert (next(x for x in status if x['endpoint'] == 'deviceclearance'))['documents'] >= 145930 - assert (next(x for x in status if x['endpoint'] == 'devicepma'))['documents'] >= 33000 - assert (next(x for x in status if x['endpoint'] == 'deviceudi'))['documents'] >= 2480000 - assert (next(x for x in status if x['endpoint'] == 'drugevent'))['documents'] >= 11512000 - assert (next(x for x in status if x['endpoint'] == 'druglabel'))['documents'] >= 160000 - assert (next(x for x in status if x['endpoint'] == 'foodevent'))['documents'] >= 83000 - assert (next(x for x in status if x['endpoint'] == 'deviceenforcement'))['documents'] >= 10000 - assert (next(x for x in status if x['endpoint'] == 'drugenforcement'))['documents'] >= 5000 - assert (next(x for x in status if x['endpoint'] == 'foodenforcement'))['documents'] >= 1000 - assert (next(x for x in status if x['endpoint'] == 'animalandveterinarydrugevent'))['documents'] >= 1040000 - assert (next(x for x in status if x['endpoint'] == 'drugsfda'))['documents'] >= 24000 + status = api_test_helpers.json("/status") + assert (next(x for x in status if x["endpoint"] == "deviceevent"))[ + 
"documents" + ] >= 9400000 + assert (next(x for x in status if x["endpoint"] == "devicerecall"))[ + "documents" + ] >= 43000 + assert (next(x for x in status if x["endpoint"] == "deviceclass"))[ + "documents" + ] >= 6176 + assert (next(x for x in status if x["endpoint"] == "devicereglist"))[ + "documents" + ] >= 227602 + assert (next(x for x in status if x["endpoint"] == "deviceclearance"))[ + "documents" + ] >= 145930 + assert (next(x for x in status if x["endpoint"] == "devicepma"))[ + "documents" + ] >= 33000 + assert (next(x for x in status if x["endpoint"] == "deviceudi"))[ + "documents" + ] >= 2480000 + assert (next(x for x in status if x["endpoint"] == "drugevent"))[ + "documents" + ] >= 11512000 + assert (next(x for x in status if x["endpoint"] == "druglabel"))[ + "documents" + ] >= 160000 + assert (next(x for x in status if x["endpoint"] == "foodevent"))[ + "documents" + ] >= 83000 + assert (next(x for x in status if x["endpoint"] == "deviceenforcement"))[ + "documents" + ] >= 10000 + assert (next(x for x in status if x["endpoint"] == "drugenforcement"))[ + "documents" + ] >= 5000 + assert (next(x for x in status if x["endpoint"] == "foodenforcement"))[ + "documents" + ] >= 1000 + assert (next(x for x in status if x["endpoint"] == "animalandveterinarydrugevent"))[ + "documents" + ] >= 1040000 + assert (next(x for x in status if x["endpoint"] == "drugsfda"))[ + "documents" + ] >= 24000 diff --git a/openfda/device_clearance/pipeline.py b/openfda/device_clearance/pipeline.py index e04c89b..c198c40 100755 --- a/openfda/device_clearance/pipeline.py +++ b/openfda/device_clearance/pipeline.py @@ -1,8 +1,8 @@ #!/usr/bin/python -''' 510k pipeline for downloading, transforming to JSON and loading into +""" 510k pipeline for downloading, transforming to JSON and loading into Elasticsearch. 
-''' +""" import glob import os @@ -17,107 +17,119 @@ from openfda import download_util from openfda.common import first_file_timestamp from openfda.device_clearance import transform -from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA, - DeviceAnnotateMapper) +from openfda.device_harmonization.pipeline import ( + Harmonized2OpenFDA, + DeviceAnnotateMapper, +) RUN_DIR = dirname(dirname(os.path.abspath(__file__))) # A directory for holding files that track Task state -META_DIR = config.data_dir('510k/meta') -RAW_DIR = config.data_dir('510k/raw') -common.shell_cmd('mkdir -p %s', META_DIR) +META_DIR = config.data_dir("510k/meta") +RAW_DIR = config.data_dir("510k/raw") +common.shell_cmd("mkdir -p %s", META_DIR) + +CLEARED_DEVICE_URL = ( + "https://www.fda.gov/medical-devices/510k-clearances/downloadable-510k-files" +) -CLEARED_DEVICE_URL = 'https://www.fda.gov/medical-devices/510k-clearances/downloadable-510k-files' class Download_510K(luigi.Task): - def requires(self): - return [] + def requires(self): + return [] - def output(self): - return luigi.LocalTarget(RAW_DIR) + def output(self): + return luigi.LocalTarget(RAW_DIR) - def run(self): - soup = BeautifulSoup(urlopen(CLEARED_DEVICE_URL).read(), 'lxml') - for a in soup.find_all(href=re.compile('.*.zip')): - if a.text.startswith('PMN') and a.text != 'PMNLSTMN.ZIP': - fileURL = a['href'] - common.download(fileURL, join(self.output().path, a['href'].split('/')[-1])) + def run(self): + soup = BeautifulSoup(urlopen(CLEARED_DEVICE_URL).read(), "lxml") + for a in soup.find_all(href=re.compile(".*.zip")): + if a.text.startswith("PMN") and a.text != "PMNLSTMN.ZIP": + fileURL = a["href"] + common.download( + fileURL, join(self.output().path, a["href"].split("/")[-1]) + ) class ExtractAndCleanDownloads510k(luigi.Task): - ''' Unzip each of the download files and remove all the non-UTF8 characters. - Unzip -p streams the data directly to iconv which then writes to disk. - ''' - def requires(self): - return Download_510K() + """Unzip each of the download files and remove all the non-UTF8 characters. + Unzip -p streams the data directly to iconv which then writes to disk. + """ + + def requires(self): + return Download_510K() - def output(self): - return luigi.LocalTarget(config.data_dir('510k/extracted')) + def output(self): + return luigi.LocalTarget(config.data_dir("510k/extracted")) - def run(self): - output_dir = self.output().path - common.shell_cmd('mkdir -p %s', output_dir) - input_dir = self.input().path - download_util.extract_and_clean(input_dir, 'ISO-8859-1', 'UTF-8', 'txt') + def run(self): + output_dir = self.output().path + common.shell_cmd("mkdir -p %s", output_dir) + input_dir = self.input().path + download_util.extract_and_clean(input_dir, "ISO-8859-1", "UTF-8", "txt") class Clearance2JSON(parallel.MRTask): - def map(self, key, value, output): - # TODO(hansnelsen): bring the `transform.py` logic into the mapper and - # remove the file. - new_value = transform.transform_device_clearance(value) - output.add(self.filename + ':' + key, new_value) + def map(self, key, value, output): + # TODO(hansnelsen): bring the `transform.py` logic into the mapper and + # remove the file. 
+ new_value = transform.transform_device_clearance(value) + output.add(self.filename + ":" + key, new_value) - def requires(self): - return ExtractAndCleanDownloads510k() + def requires(self): + return ExtractAndCleanDownloads510k() - def output(self): - return luigi.LocalTarget(config.data_dir('510k', 'json.db')) + def output(self): + return luigi.LocalTarget(config.data_dir("510k", "json.db")) - def mapreduce_inputs(self): - input_files = glob.glob(self.input().path + '/*.txt') - return parallel.Collection.from_glob( - input_files, parallel.CSVDictLineInput(delimiter='|', strip_str='\0')) + def mapreduce_inputs(self): + input_files = glob.glob(self.input().path + "/*.txt") + return parallel.Collection.from_glob( + input_files, parallel.CSVDictLineInput(delimiter="|", strip_str="\0") + ) class ClearanceAnnotateMapper(DeviceAnnotateMapper): - def filter(self, data): - product_code = data['product_code'] - harmonized = self.harmonized_db.get(product_code, None) - if harmonized: - # 510k should never have a PMA openfda key - if 'device_pma' in harmonized: - del harmonized['device_pma'] - if self.table in harmonized: - del harmonized[self.table] - return harmonized - return None + def filter(self, data): + product_code = data["product_code"] + harmonized = self.harmonized_db.get(product_code, None) + if harmonized: + # 510k should never have a PMA openfda key + if "device_pma" in harmonized: + del harmonized["device_pma"] + if self.table in harmonized: + del harmonized[self.table] + return harmonized + return None + class AnnotateDevice(luigi.Task): - def requires(self): - return [Harmonized2OpenFDA(), Clearance2JSON()] + def requires(self): + return [Harmonized2OpenFDA(), Clearance2JSON()] - def output(self): - return luigi.LocalTarget(config.data_dir('510k','annotate.db')) + def output(self): + return luigi.LocalTarget(config.data_dir("510k", "annotate.db")) - def run(self): - harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict() + def run(self): + harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict() - parallel.mapreduce( - parallel.Collection.from_sharded(self.input()[1].path), - mapper=ClearanceAnnotateMapper(harmonized_db=harmonized_db), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, - num_shards=10) + parallel.mapreduce( + parallel.Collection.from_sharded(self.input()[1].path), + mapper=ClearanceAnnotateMapper(harmonized_db=harmonized_db), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + num_shards=10, + ) class LoadJSON(index_util.LoadJSONBase): - index_name = 'deviceclearance' - mapping_file = './schemas/clearance_mapping.json' - data_source = AnnotateDevice() - use_checksum = False - optimize_index = True - last_update_date = lambda _: first_file_timestamp(RAW_DIR) - -if __name__ == '__main__': - luigi.run() + index_name = "deviceclearance" + mapping_file = "./schemas/clearance_mapping.json" + data_source = AnnotateDevice() + use_checksum = False + optimize_index = True + last_update_date = lambda _: first_file_timestamp(RAW_DIR) + + +if __name__ == "__main__": + luigi.run() diff --git a/openfda/device_clearance/transform.py b/openfda/device_clearance/transform.py index ffebeef..9fa7b00 100644 --- a/openfda/device_clearance/transform.py +++ b/openfda/device_clearance/transform.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -''' A set of utility functions for converting CDRH 510K Device Clearance data +""" A set of utility functions for converting CDRH 510K Device Clearance data from its native Pipe 
Separated Value (PSV) format into JSON so that it can be loaded into Elasticsearch. -''' +""" import collections import csv @@ -13,73 +13,81 @@ from openfda import common, device_common # Dates are special case for Elasticsearch, so we need a list of them. -DATE_KEYS = ['date_received', 'decision_date'] +DATE_KEYS = ["date_received", "decision_date"] RENAME_MAP = { - 'classadvisecomm': 'advisory_committee', - 'sspindicator': 'ssp_indicator', - 'knumber': 'k_number', - 'devicename': 'device_name', - 'zip': 'zip_code', - 'stateorsumm': 'statement_or_summary', - 'decisiondate': 'decision_date', - 'expeditedreview': 'expedited_review_flag', - 'type': 'clearance_type', - 'reviewadvisecomm': 'review_advisory_committee', - 'productcode': 'product_code', - 'thirdparty': 'third_party_flag', - 'street1': 'address_1', - 'street2': 'address_2', - 'datereceived': 'date_received', - 'decision': 'decision_code' + "classadvisecomm": "advisory_committee", + "sspindicator": "ssp_indicator", + "knumber": "k_number", + "devicename": "device_name", + "zip": "zip_code", + "stateorsumm": "statement_or_summary", + "decisiondate": "decision_date", + "expeditedreview": "expedited_review_flag", + "type": "clearance_type", + "reviewadvisecomm": "review_advisory_committee", + "productcode": "product_code", + "thirdparty": "third_party_flag", + "street1": "address_1", + "street2": "address_2", + "datereceived": "date_received", + "decision": "decision_code", } -IGNORE = ['sspindicator'] +IGNORE = ["sspindicator"] ADVISORY_COMMITTEE = device_common.MED_SPECIALTY_ADVISORY_COMMITTEE -code_file = './openfda/device_clearance/data/decision_codes.csv' -CODE_LIST = csv.reader(open(code_file, 'r')) +code_file = "./openfda/device_clearance/data/decision_codes.csv" +CODE_LIST = csv.reader(open(code_file, "r")) DECISION_CODES = collections.defaultdict(str) for row in CODE_LIST: - key, value = row[0], row[1] - DECISION_CODES[key] = value + key, value = row[0], row[1] + DECISION_CODES[key] = value + def get_description(data): - data = data.replace('SESE', 'SE') - if len(data) > 2: - data.replace('SE', '') - if data in DECISION_CODES: - return DECISION_CODES[data.strip()] + data = data.replace("SESE", "SE") + if len(data) > 2: + data.replace("SE", "") + if data in DECISION_CODES: + return DECISION_CODES[data.strip()] + + return "Unknown" - return 'Unknown' def _cleaner(k, v): - ''' A helper function that is used for renaming dictionary keys and - formatting dates. - ''' - if k is None: return None # malformed record most likely due to unescaped pipe character. - - k = k.lower() - if k in IGNORE: return None - if k in RENAME_MAP: - k = RENAME_MAP[k] - if v != None: - if k in DATE_KEYS: - try: - dt = arrow.get(v, 'MM/DD/YYYY').format('YYYY-MM-DD') - return (k, dt) - except arrow.parser.ParserMatchError: - logging.warning("Unparseable date likely due to a malformed CSV record: "+v) - return (k, v) - if k == 'decision_code': - ek, ev = 'decision_description', get_description(v) - return [(k, v.strip()), (ek, ev)] - if k == 'advisory_committee': - ek, ev = 'advisory_committee_description', ADVISORY_COMMITTEE.get(v, "") - return [(k, v.strip()), (ek, ev)] - else: return (k, v.strip()) - return (k, v) + """A helper function that is used for renaming dictionary keys and + formatting dates. + """ + if k is None: + return None # malformed record most likely due to unescaped pipe character. 
+ + k = k.lower() + if k in IGNORE: + return None + if k in RENAME_MAP: + k = RENAME_MAP[k] + if v != None: + if k in DATE_KEYS: + try: + dt = arrow.get(v, "MM/DD/YYYY").format("YYYY-MM-DD") + return (k, dt) + except arrow.parser.ParserMatchError: + logging.warning( + "Unparseable date likely due to a malformed CSV record: " + v + ) + return (k, v) + if k == "decision_code": + ek, ev = "decision_description", get_description(v) + return [(k, v.strip()), (ek, ev)] + if k == "advisory_committee": + ek, ev = "advisory_committee_description", ADVISORY_COMMITTEE.get(v, "") + return [(k, v.strip()), (ek, ev)] + else: + return (k, v.strip()) + return (k, v) + def transform_device_clearance(csv_dict): - return common.transform_dict(csv_dict, _cleaner) + return common.transform_dict(csv_dict, _cleaner) diff --git a/openfda/device_common.py b/openfda/device_common.py index b9c5efa..b6f5239 100644 --- a/openfda/device_common.py +++ b/openfda/device_common.py @@ -1,32 +1,32 @@ #!/usr/bin/env python -''' A common set of variables for device pipelines. -''' +""" A common set of variables for device pipelines. +""" # A description for device classification medical speciality AND # device PMA and 510K advisory committee (same thing, called two different) # things in three places. MED_SPECIALTY_ADVISORY_COMMITTEE = { - 'AN':'Anesthesiology', - 'CV':'Cardiovascular', - 'CH':'Clinical Chemistry', - 'DE':'Dental', - 'EN':'Ear, Nose, Throat', - 'GU':'Gastroenterology, Urology', - 'HO':'General Hospital', - 'HE':'Hematology', - 'IM':'Immunology', - 'MG':'Medical Genetics', - 'MI':'Microbiology', - 'NE':'Neurology', - 'OB':'Obstetrics/Gynecology', - 'OP':'Ophthalmic', - 'OR':'Orthopedic', - 'PA':'Pathology', - 'PM':'Physical Medicine', - 'RA':'Radiology', - 'SU':'General, Plastic Surgery', - 'TX':'Clinical Toxicology', - '': 'Unknown' + "AN": "Anesthesiology", + "CV": "Cardiovascular", + "CH": "Clinical Chemistry", + "DE": "Dental", + "EN": "Ear, Nose, Throat", + "GU": "Gastroenterology, Urology", + "HO": "General Hospital", + "HE": "Hematology", + "IM": "Immunology", + "MG": "Medical Genetics", + "MI": "Microbiology", + "NE": "Neurology", + "OB": "Obstetrics/Gynecology", + "OP": "Ophthalmic", + "OR": "Orthopedic", + "PA": "Pathology", + "PM": "Physical Medicine", + "RA": "Radiology", + "SU": "General, Plastic Surgery", + "TX": "Clinical Toxicology", + "": "Unknown", } diff --git a/openfda/device_harmonization/pipeline.py b/openfda/device_harmonization/pipeline.py index 0711c25..c4c5ec4 100644 --- a/openfda/device_harmonization/pipeline.py +++ b/openfda/device_harmonization/pipeline.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -''' A pipeline for joining together each of the device datasets into one +""" A pipeline for joining together each of the device datasets into one harmonization database to be used by each pipeline's annotation step. The output of this process is a leveldb that contains a key of @@ -24,7 +24,7 @@ the device_pma OR 510k is populated, since they each represent alternate business processes for the same effect, approval. 
-''' +""" import collections import logging @@ -39,280 +39,290 @@ from openfda.parallel import mapreduce, Collection, Mapper, PivotReducer RUN_DIR = dirname(dirname(os.path.abspath(__file__))) -DATA_DIR = config.data_dir('device_harmonization') -common.shell_cmd('mkdir -p %s', DATA_DIR) +DATA_DIR = config.data_dir("device_harmonization") +common.shell_cmd("mkdir -p %s", DATA_DIR) PLUCK_MAP = { - 'device_pma': [ - 'applicant', - 'trade_name', - 'generic_name', - 'decision_date', - 'advisory_committee', - 'pma_number', - 'product_code', - 'advisory_committee', - ], - '510k': [ - 'applicant', - 'decision_date', - 'clearance_type', - 'k_number', - 'device_name', - 'decision_description', - 'advisory_committee', - 'review_advisory_committee', - 'third_party_flag', - 'expedited_review_flag', - 'product_code' - ], - 'classification': [ - 'device_name', - 'implant_flag', - 'review_code', - 'gmp_exempt_flag', - 'submission_type_id', - 'review_panel', - 'medical_specialty', - 'medical_specialty_description', - 'third_party_flag', - 'life_sustain_support_flag', - 'regulation_number', - 'device_class', - 'product_code' - ], - 'registration': [ - 'proprietary_name', - 'establishment_type', - 'product_code', - 'created_date', - 'registration.owner_operator.firm_name', - 'registration.registration_number', - 'registration.fei_number', - 'registration.iso_country_code', - 'registration.status_code', - 'registration.name', - 'registration.reg_expiry_date_year' - ] + "device_pma": [ + "applicant", + "trade_name", + "generic_name", + "decision_date", + "advisory_committee", + "pma_number", + "product_code", + "advisory_committee", + ], + "510k": [ + "applicant", + "decision_date", + "clearance_type", + "k_number", + "device_name", + "decision_description", + "advisory_committee", + "review_advisory_committee", + "third_party_flag", + "expedited_review_flag", + "product_code", + ], + "classification": [ + "device_name", + "implant_flag", + "review_code", + "gmp_exempt_flag", + "submission_type_id", + "review_panel", + "medical_specialty", + "medical_specialty_description", + "third_party_flag", + "life_sustain_support_flag", + "regulation_number", + "device_class", + "product_code", + ], + "registration": [ + "proprietary_name", + "establishment_type", + "product_code", + "created_date", + "registration.owner_operator.firm_name", + "registration.registration_number", + "registration.fei_number", + "registration.iso_country_code", + "registration.status_code", + "registration.name", + "registration.reg_expiry_date_year", + ], } def pluck(pluck_list, data): - ''' A helper function for extracting a specific subset of keys from a - dictionary. - ''' - # TODO(hansnelsen): If there is collision on a document key names, a better - # way of naming needs to be found. It just so happens that - # there is no collision on a key name for any particular - # table. - result = {} - data = common.ObjectDict(data) - for key in pluck_list: - new_key = key.split('.')[-1] - try: - result[new_key] = data.get_nested(key) - except: - logging.warning('Could not get key: '+key) - return result + """A helper function for extracting a specific subset of keys from a + dictionary. + """ + # TODO(hansnelsen): If there is collision on a document key names, a better + # way of naming needs to be found. It just so happens that + # there is no collision on a key name for any particular + # table. 
+ result = {} + data = common.ObjectDict(data) + for key in pluck_list: + new_key = key.split(".")[-1] + try: + result[new_key] = data.get_nested(key) + except: + logging.warning("Could not get key: " + key) + return result class JoinMapper(Mapper): - ''' A mapper that extracts the `product_code` from each input and streams - out the value with `product_code` as the key. Each value object is - transformed via `pluck()` before it is streamed to the reducer. - - ''' - def map_shard(self, map_input, map_output): - self.filename = map_input.filename - self.table = basename(dirname(dirname(self.filename))) - - assert (self.table in PLUCK_MAP), \ - 'Could not find %s in PLUCK_MAP' % self.table - - self.pluck_list = PLUCK_MAP[self.table] - - return Mapper.map_shard(self, map_input, map_output) - - def map(self, key, value, output): - if not isinstance(value, list): value = [value] - for val in value: - # Registration is a special case, in that it contains many product codes - # that we want to stream out of the Mapper. - # As such, we want to hoist each product out and merge it with all of - # its respective registration data. - if self.table == 'registration': - products = val.get('products', []) - for product in products: - new_key = product['product_code'] - new_value = dict(list(val.items()) + list(product.items())) - if new_key: - output.add(new_key, (self.table, pluck(self.pluck_list, new_value))) - else: - new_key = val['product_code'] - if new_key: - output.add(new_key, (self.table, pluck(self.pluck_list, val))) + """A mapper that extracts the `product_code` from each input and streams + out the value with `product_code` as the key. Each value object is + transformed via `pluck()` before it is streamed to the reducer. + + """ + + def map_shard(self, map_input, map_output): + self.filename = map_input.filename + self.table = basename(dirname(dirname(self.filename))) + + assert self.table in PLUCK_MAP, "Could not find %s in PLUCK_MAP" % self.table + + self.pluck_list = PLUCK_MAP[self.table] + + return Mapper.map_shard(self, map_input, map_output) + + def map(self, key, value, output): + if not isinstance(value, list): + value = [value] + for val in value: + # Registration is a special case, in that it contains many product codes + # that we want to stream out of the Mapper. + # As such, we want to hoist each product out and merge it with all of + # its respective registration data. + if self.table == "registration": + products = val.get("products", []) + for product in products: + new_key = product["product_code"] + new_value = dict(list(val.items()) + list(product.items())) + if new_key: + output.add( + new_key, (self.table, pluck(self.pluck_list, new_value)) + ) + else: + new_key = val["product_code"] + if new_key: + output.add(new_key, (self.table, pluck(self.pluck_list, val))) class DeviceAnnotateMapper(parallel.Mapper): - def __init__(self, harmonized_db): - parallel.Mapper.__init__(self) - self.harmonized_db = harmonized_db - - def map_shard(self, map_input, map_output): - self.filename = map_input.filename - self.table = basename(dirname(dirname(self.filename))) - # TODO(hansnelsen): clean up the inconsistent naming of device_clearance, - # it should be device_clearance every where. - if self.table == 'device_clearance': self.table = '510k' - return parallel.Mapper.map_shard(self, map_input, map_output) - - def flatten(self, harmonized): - ''' Helper function which flattens a filtered harmonized db entry. 
+ def __init__(self, harmonized_db): + parallel.Mapper.__init__(self) + self.harmonized_db = harmonized_db + + def map_shard(self, map_input, map_output): + self.filename = map_input.filename + self.table = basename(dirname(dirname(self.filename))) + # TODO(hansnelsen): clean up the inconsistent naming of device_clearance, + # it should be device_clearance every where. + if self.table == "device_clearance": + self.table = "510k" + return parallel.Mapper.map_shard(self, map_input, map_output) + + def flatten(self, harmonized): + """Helper function which flattens a filtered harmonized db entry. Moves any key/values from the `classification` dictionary to the top-level and anything else gets added to a list of values. - ''' - result = collections.defaultdict(set) - for key, val in harmonized.items(): - if key == 'classification': - try: - for k, v in val[0].items(): - result[k] = v - except IndexError: - continue - else: - for row in val: - for k, v in row.items(): - result[k].add(v) - - return {k:list(v) if isinstance(v, set) else v for k, v in result.items()} - - def filter(self, data, lookup=None): - ''' This is the standard filter for Device Annnotation. It may be + """ + result = collections.defaultdict(set) + for key, val in harmonized.items(): + if key == "classification": + try: + for k, v in val[0].items(): + result[k] = v + except IndexError: + continue + else: + for row in val: + for k, v in row.items(): + result[k].add(v) + + return {k: list(v) if isinstance(v, set) else v for k, v in result.items()} + + def filter(self, data, lookup=None): + """This is the standard filter for Device Annnotation. It may be appropriate to override this when the data does not have a `product_code` and/or there is special logic for the filter. A filter should only filter out items from the harmonized_db entry, not alter it. The standard case is to remove the current table from the harmonized_db entry so there is not duplicate data in `openfda`. - ''' - product_code = data['product_code'] - harmonized = self.harmonized_db.get(product_code, None) - if harmonized: - if self.table in harmonized: - del harmonized[self.table] - return harmonized - return None - - def harmonize(self, data): - ''' A function for placing the `openfda` section in the appropriate place. + """ + product_code = data["product_code"] + harmonized = self.harmonized_db.get(product_code, None) + if harmonized: + if self.table in harmonized: + del harmonized[self.table] + return harmonized + return None + + def harmonize(self, data): + """A function for placing the `openfda` section in the appropriate place. The default is just to put it at the top level. If there are more unique requirements, this function should be over-ridden in a custom mapper. The `harmonize()` should call `self.filter()` and `self.flatten()` appropriately. 
- ''' - result = dict(data) - harmonized = self.filter(result) - if harmonized: - result['openfda'] = self.flatten(harmonized) - else: - result['openfda'] = {} - return result - - def map(self, key, value, output): - if not value: return - if not isinstance(value, list): value = [value] - for val in value: - new_value = self.harmonize(val) - output.add(self.filename + ':' + key, new_value) + """ + result = dict(data) + harmonized = self.filter(result) + if harmonized: + result["openfda"] = self.flatten(harmonized) + else: + result["openfda"] = {} + return result + + def map(self, key, value, output): + if not value: + return + if not isinstance(value, list): + value = [value] + for val in value: + new_value = self.harmonize(val) + output.add(self.filename + ":" + key, new_value) class DeviceHarmonization(luigi.Task): - ''' There will be some namespace collision since the inputs come from the - files listed below and the output of this process will be consumed by - each of these files during their respective annotation steps. It seems - cleaner to contain all the namespace ugliness in one place, here. - ''' - def requires(self): - from openfda.classification import pipeline as classification_pipeline - from openfda.device_clearance import pipeline as clearance_pipeline - from openfda.device_pma import pipeline as device_pma_pipeline - from openfda.registration import pipeline as registration_pipeline - return [classification_pipeline.Classification2JSON(), + """There will be some namespace collision since the inputs come from the + files listed below and the output of this process will be consumed by + each of these files during their respective annotation steps. It seems + cleaner to contain all the namespace ugliness in one place, here. + """ + + def requires(self): + from openfda.classification import pipeline as classification_pipeline + from openfda.device_clearance import pipeline as clearance_pipeline + from openfda.device_pma import pipeline as device_pma_pipeline + from openfda.registration import pipeline as registration_pipeline + + return [ + classification_pipeline.Classification2JSON(), clearance_pipeline.Clearance2JSON(), device_pma_pipeline.Pma2JSON(), - registration_pipeline.JoinAll()] + registration_pipeline.JoinAll(), + ] - def output(self): - return luigi.LocalTarget(join(DATA_DIR, 'harmonized.db')) + def output(self): + return luigi.LocalTarget(join(DATA_DIR, "harmonized.db")) - def run(self): - db_list = [s.path for s in self.input()] - mapreduce( - Collection.from_sharded_list(db_list), - mapper=JoinMapper(), - reducer=PivotReducer(), - output_prefix=self.output().path, - num_shards=10) + def run(self): + db_list = [s.path for s in self.input()] + mapreduce( + Collection.from_sharded_list(db_list), + mapper=JoinMapper(), + reducer=PivotReducer(), + output_prefix=self.output().path, + num_shards=10, + ) class Harmonized2OpenFDAMapper(parallel.Mapper): - ''' A simple mapper that strips out all unwanted keys from the harmonized.db - ''' - def __init__(self): - parallel.Mapper.__init__(self) - # openfda_map represents the fields that will end up in the openfda - # section of a document during the annotation step. 
- self.openfda_map = { - 'device_pma': [ - 'pma_number' - ], - '510k': [ - 'k_number' - ], - 'classification': [ - 'device_name', - 'regulation_number', - 'device_class', - 'medical_specialty_description' - ], - 'registration': [ - 'registration_number', - 'fei_number', - 'k_number', - 'pma_number' - ] - } - - def openfda_pluck(self, table, data): - ''' A helper function for extracting a specific subset of keys from a + """A simple mapper that strips out all unwanted keys from the harmonized.db""" + + def __init__(self): + parallel.Mapper.__init__(self) + # openfda_map represents the fields that will end up in the openfda + # section of a document during the annotation step. + self.openfda_map = { + "device_pma": ["pma_number"], + "510k": ["k_number"], + "classification": [ + "device_name", + "regulation_number", + "device_class", + "medical_specialty_description", + ], + "registration": [ + "registration_number", + "fei_number", + "k_number", + "pma_number", + ], + } + + def openfda_pluck(self, table, data): + """A helper function for extracting a specific subset of keys from a dictionary. - ''' - return {k: v for k, v in data.items() if k in self.openfda_map[table]} + """ + return {k: v for k, v in data.items() if k in self.openfda_map[table]} - def map(self, key, value, output): - new_value = collections.defaultdict(list) - for table_name, rows in value.items(): - for row in rows: - new_value[table_name].append(self.openfda_pluck(table_name, row)) - output.add(key, new_value) + def map(self, key, value, output): + new_value = collections.defaultdict(list) + for table_name, rows in value.items(): + for row in rows: + new_value[table_name].append(self.openfda_pluck(table_name, row)) + output.add(key, new_value) class Harmonized2OpenFDA(luigi.Task): - def requires(self): - return DeviceHarmonization() + def requires(self): + return DeviceHarmonization() + + def output(self): + return luigi.LocalTarget(join(DATA_DIR, "openfda.db")) - def output(self): - return luigi.LocalTarget(join(DATA_DIR, 'openfda.db')) + def run(self): + mapreduce( + Collection.from_sharded(self.input().path), + mapper=Harmonized2OpenFDAMapper(), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + num_shards=1, + ) - def run(self): - mapreduce(Collection.from_sharded(self.input().path), - mapper=Harmonized2OpenFDAMapper(), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, - num_shards=1) -if __name__ == '__main__': - luigi.run() +if __name__ == "__main__": + luigi.run() diff --git a/openfda/device_pma/pipeline.py b/openfda/device_pma/pipeline.py index ebaff6b..0f524a8 100755 --- a/openfda/device_pma/pipeline.py +++ b/openfda/device_pma/pipeline.py @@ -1,8 +1,8 @@ #!/usr/bin/python -''' Device Pre-market approval pipeline for downloading, converting to JSON and +""" Device Pre-market approval pipeline for downloading, converting to JSON and loading into elasticsearch. 
-''' +""" import glob import os @@ -12,107 +12,120 @@ from openfda import common, config, download_util, index_util, parallel from openfda.common import first_file_timestamp -from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA, - DeviceAnnotateMapper) +from openfda.device_harmonization.pipeline import ( + Harmonized2OpenFDA, + DeviceAnnotateMapper, +) from openfda.device_pma import transform RUN_DIR = dirname(dirname(os.path.abspath(__file__))) # A directory for holding files that track Task state -META_DIR = config.data_dir('device_pma/meta') -RAW_DIR = config.data_dir('device_pma/raw') -common.shell_cmd('mkdir -p %s', META_DIR) +META_DIR = config.data_dir("device_pma/meta") +RAW_DIR = config.data_dir("device_pma/raw") +common.shell_cmd("mkdir -p %s", META_DIR) + +DEVICE_PMA_ZIP = "https://www.accessdata.fda.gov/premarket/ftparea/pma.zip" -DEVICE_PMA_ZIP = 'https://www.accessdata.fda.gov/premarket/ftparea/pma.zip' class DownloadPMA(luigi.Task): - def requires(self): - return [] + def requires(self): + return [] + + def output(self): + return luigi.LocalTarget(RAW_DIR) - def output(self): - return luigi.LocalTarget(RAW_DIR) + def run(self): + output_filename = join(self.output().path, DEVICE_PMA_ZIP.split("/")[-1]) + common.download(DEVICE_PMA_ZIP, output_filename) - def run(self): - output_filename = join(self.output().path, DEVICE_PMA_ZIP.split('/')[-1]) - common.download(DEVICE_PMA_ZIP, output_filename) class ExtractAndCleanDownloadsPMA(luigi.Task): - ''' Unzip each of the download files and remove all the non-UTF8 characters. - Unzip -p streams the data directly to iconv which then writes to disk. - ''' - def requires(self): - return DownloadPMA() + """Unzip each of the download files and remove all the non-UTF8 characters. + Unzip -p streams the data directly to iconv which then writes to disk. + """ + + def requires(self): + return DownloadPMA() - def output(self): - return luigi.LocalTarget(config.data_dir('device_pma/extracted')) + def output(self): + return luigi.LocalTarget(config.data_dir("device_pma/extracted")) + + def run(self): + output_dir = self.output().path + common.shell_cmd("mkdir -p %s", output_dir) + input_dir = self.input().path + download_util.extract_and_clean(input_dir, "ISO-8859-1", "UTF-8", "txt") - def run(self): - output_dir = self.output().path - common.shell_cmd('mkdir -p %s', output_dir) - input_dir = self.input().path - download_util.extract_and_clean(input_dir, 'ISO-8859-1', 'UTF-8', 'txt') class PMAMapper(parallel.Mapper): - def map(self, key, value, output): - # TODO(hansnelsen): bring the `transform.py` logic into the mapper and - # remove the file. - new_value = transform.transform_device_pma(value) - output.add(key, new_value) + def map(self, key, value, output): + # TODO(hansnelsen): bring the `transform.py` logic into the mapper and + # remove the file. 
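
(Aside on the ExtractAndCleanDownloadsPMA task above: its docstring describes an `unzip -p` into `iconv` pipeline. The sketch below only illustrates that shell pattern under assumed paths and a hypothetical helper name; the real logic lives in openfda.download_util.extract_and_clean.)

import subprocess

def extract_and_clean_sketch(zip_path, out_txt_path):
    # "unzip -p" streams every archive member to stdout; iconv re-encodes the
    # stream from ISO-8859-1 to UTF-8 ("-c" silently drops anything it cannot
    # convert) and the shell redirect writes the cleaned text to disk.
    cmd = "unzip -p %s | iconv -f ISO-8859-1 -t UTF-8 -c > %s" % (zip_path, out_txt_path)
    subprocess.check_call(cmd, shell=True)

# Example (hypothetical paths): extract_and_clean_sketch("raw/pma.zip", "extracted/pma.txt")
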
+ new_value = transform.transform_device_pma(value) + output.add(key, new_value) + class Pma2JSON(luigi.Task): - def requires(self): - return ExtractAndCleanDownloadsPMA() - - def output(self): - return luigi.LocalTarget(config.data_dir('device_pma/json.db')) - - def run(self): - common.shell_cmd('mkdir -p %s', dirname(self.output().path)) - input_files = glob.glob(self.input().path + '/*.txt') - parallel.mapreduce( - parallel.Collection.from_glob( - input_files, parallel.CSVDictLineInput(delimiter='|', strip_str='\0')), - mapper=PMAMapper(), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path) + def requires(self): + return ExtractAndCleanDownloadsPMA() + + def output(self): + return luigi.LocalTarget(config.data_dir("device_pma/json.db")) + + def run(self): + common.shell_cmd("mkdir -p %s", dirname(self.output().path)) + input_files = glob.glob(self.input().path + "/*.txt") + parallel.mapreduce( + parallel.Collection.from_glob( + input_files, parallel.CSVDictLineInput(delimiter="|", strip_str="\0") + ), + mapper=PMAMapper(), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + ) + class PMAAnnotateMapper(DeviceAnnotateMapper): - def filter(self, data): - product_code = data['product_code'] - harmonized = self.harmonized_db.get(product_code, None) - if harmonized: - # PMA should never have a 510k openfda key - if '510k' in harmonized: - del harmonized['510k'] - if self.table in harmonized: - del harmonized[self.table] - return harmonized - return None + def filter(self, data): + product_code = data["product_code"] + harmonized = self.harmonized_db.get(product_code, None) + if harmonized: + # PMA should never have a 510k openfda key + if "510k" in harmonized: + del harmonized["510k"] + if self.table in harmonized: + del harmonized[self.table] + return harmonized + return None + class AnnotateDevice(luigi.Task): - def requires(self): - return [Harmonized2OpenFDA(), Pma2JSON()] + def requires(self): + return [Harmonized2OpenFDA(), Pma2JSON()] - def output(self): - return luigi.LocalTarget(config.data_dir('device_pma/annotate.db')) + def output(self): + return luigi.LocalTarget(config.data_dir("device_pma/annotate.db")) - def run(self): - harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict() + def run(self): + harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict() - parallel.mapreduce( - parallel.Collection.from_sharded(self.input()[1].path), - mapper=PMAAnnotateMapper(harmonized_db=harmonized_db), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, - num_shards=10) + parallel.mapreduce( + parallel.Collection.from_sharded(self.input()[1].path), + mapper=PMAAnnotateMapper(harmonized_db=harmonized_db), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + num_shards=10, + ) class LoadJSON(index_util.LoadJSONBase): - index_name = 'devicepma' - mapping_file = './schemas/pma_mapping.json' - data_source = AnnotateDevice() - use_checksum = False - optimize_index = True - last_update_date = lambda _: first_file_timestamp(RAW_DIR) - -if __name__ == '__main__': - luigi.run() + index_name = "devicepma" + mapping_file = "./schemas/pma_mapping.json" + data_source = AnnotateDevice() + use_checksum = False + optimize_index = True + last_update_date = lambda _: first_file_timestamp(RAW_DIR) + + +if __name__ == "__main__": + luigi.run() diff --git a/openfda/device_pma/transform.py b/openfda/device_pma/transform.py index 7e3a40d..67df405 100644 --- a/openfda/device_pma/transform.py +++ 
b/openfda/device_pma/transform.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -''' A set of functions for converting CDRH Pre-Market Approval data from Pipe +""" A set of functions for converting CDRH Pre-Market Approval data from Pipe Separated Value (PSV) format to JSON so that it can be loaded into Elasticsearch. -''' +""" import arrow import csv @@ -14,46 +14,50 @@ # A list of keys to ignore from the source file. IGNORE = [] # A list of date keys to reformat -DATE_KEYS = ['fedregnoticedate', 'decisiondate', 'datereceived'] +DATE_KEYS = ["fedregnoticedate", "decisiondate", "datereceived"] # Changing names to match the openFDA naming standard # key = source name, value = replacement name RENAME_MAP = { - 'productcode': 'product_code', - 'fedregnoticedate': 'fed_reg_notice_date', - 'reviewgrantedyn': 'expedited_review_flag', - 'advisorycommittee': 'advisory_committee', - 'decisiondate': 'decision_date', - 'supplementnumber': 'supplement_number', - 'genericname': 'generic_name', - 'supplementreason': 'supplement_reason', - 'pmanumber': 'pma_number', - 'docketnumber': 'docket_number', - 'tradename': 'trade_name', - 'supplementtype': 'supplement_type', - 'aostatement': 'ao_statement', - 'decisioncode': 'decision_code', - 'datereceived': 'date_received' + "productcode": "product_code", + "fedregnoticedate": "fed_reg_notice_date", + "reviewgrantedyn": "expedited_review_flag", + "advisorycommittee": "advisory_committee", + "decisiondate": "decision_date", + "supplementnumber": "supplement_number", + "genericname": "generic_name", + "supplementreason": "supplement_reason", + "pmanumber": "pma_number", + "docketnumber": "docket_number", + "tradename": "trade_name", + "supplementtype": "supplement_type", + "aostatement": "ao_statement", + "decisioncode": "decision_code", + "datereceived": "date_received", } ADVISORY_COMMITTEE = device_common.MED_SPECIALTY_ADVISORY_COMMITTEE + def _cleaner(k, v): - ''' A helper function that is used for removing and renaming dictionary keys. - Dates are also reformated for Elasticsearch and removed if null. - ''' - k = k.lower() - if k in IGNORE: return None - if k in DATE_KEYS: - if not v: return None - v = arrow.get(v, 'MM/DD/YYYY').format('YYYY-MM-DD') - if k in RENAME_MAP: - k = RENAME_MAP[k] - if k == 'advisory_committee': - ek, ev = 'advisory_committee_description', ADVISORY_COMMITTEE[v] - return [(k, v), (ek, ev)] - - return (k, v) + """A helper function that is used for removing and renaming dictionary keys. + Dates are also reformated for Elasticsearch and removed if null. + """ + k = k.lower() + if k in IGNORE: + return None + if k in DATE_KEYS: + if not v: + return None + v = arrow.get(v, "MM/DD/YYYY").format("YYYY-MM-DD") + if k in RENAME_MAP: + k = RENAME_MAP[k] + if k == "advisory_committee": + ek, ev = "advisory_committee_description", ADVISORY_COMMITTEE[v] + return [(k, v), (ek, ev)] + + return (k, v) + def transform_device_pma(csv_dict): - return common.transform_dict(csv_dict, _cleaner) + return common.transform_dict(csv_dict, _cleaner) diff --git a/openfda/device_recall/pipeline.py b/openfda/device_recall/pipeline.py index 91c370a..1d04f80 100644 --- a/openfda/device_recall/pipeline.py +++ b/openfda/device_recall/pipeline.py @@ -1,8 +1,8 @@ #!/usr/bin/python -''' Device pipeline for downloading, transforming to JSON and loading device +""" Device pipeline for downloading, transforming to JSON and loading device recall data into Elasticsearch. 
-'''
+"""
 import csv
 import glob
 import os
@@ -18,394 +18,480 @@
 from openfda import common, config, index_util, parallel
 from openfda.common import soup_with_retry
-from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA,
-                                                   DeviceAnnotateMapper)
+from openfda.device_harmonization.pipeline import (
+    Harmonized2OpenFDA,
+    DeviceAnnotateMapper,
+)
 from openfda.tasks import AlwaysRunTask, NoopTask

-DEVICE_RECALL_DATA_DIR = config.data_dir('device_recall')
-SUMMARY_DOWNLOAD_URL = "https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfRes/res.cfm?" \
-                       "start_search=1&event_id=&productdescriptiontxt=&productcode=&IVDProducts=&rootCauseText=&recallstatus=&centerclassificationtypetext=&recallnumber=&" \
-                       "postdatefrom=%s&postdateto=%s" \
-                       "&productshortreasontxt=&firmlegalnam=&PMA_510K_Num=&pnumber=&knumber=&PAGENUM=500"
+DEVICE_RECALL_DATA_DIR = config.data_dir("device_recall")
+SUMMARY_DOWNLOAD_URL = (
+    "https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfRes/res.cfm?"
+    "start_search=1&event_id=&productdescriptiontxt=&productcode=&IVDProducts=&rootCauseText=&recallstatus=&centerclassificationtypetext=&recallnumber=&"
+    "postdatefrom=%s&postdateto=%s"
+    "&productshortreasontxt=&firmlegalnam=&PMA_510K_Num=&pnumber=&knumber=&PAGENUM=500"
+)

-RECALL_RECORD_DOWNLOAD_URL = "https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfRes/res.cfm?id=%s"
+RECALL_RECORD_DOWNLOAD_URL = (
+    "https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfRes/res.cfm?id=%s"
+)

-'''
+"""
 Even though the supply of the CSV recall file has stopped (reasons unknown), we are still making use of
 the most recent file in S3 because the Web page we scrape does not have FEI numbers for manufacturers,
 and we can still get a good deal of FEI numbers from the most recent CSV file (albeit not a 100% complete list).
-''' -DEVICE_RECALL_BUCKET = 's3://openfda-device-recalls/' -DEVICE_RECALL_LOCAL_DIR = config.data_dir('device_recall/s3_sync') -DEVICE_RECALL_FILTERED_DIR = config.data_dir('device_recall/filtered') -DEVICE_CSV_TO_JSON_DIR = config.data_dir('device_recall/csv2json.db') +""" +DEVICE_RECALL_BUCKET = "s3://openfda-device-recalls/" +DEVICE_RECALL_LOCAL_DIR = config.data_dir("device_recall/s3_sync") +DEVICE_RECALL_FILTERED_DIR = config.data_dir("device_recall/filtered") +DEVICE_CSV_TO_JSON_DIR = config.data_dir("device_recall/csv2json.db") + def batch_dir(batch): - return arrow.get(batch[0]).strftime('%Y%m%d') + '_' + arrow.get(batch[1]).strftime('%Y%m%d') + return ( + arrow.get(batch[0]).strftime("%Y%m%d") + + "_" + + arrow.get(batch[1]).strftime("%Y%m%d") + ) + class SyncS3DeviceRecall(luigi.Task): - bucket = DEVICE_RECALL_BUCKET - local_dir = DEVICE_RECALL_LOCAL_DIR - - def output(self): - return luigi.LocalTarget(DEVICE_RECALL_LOCAL_DIR) - - def run(self): - common.cmd(['mkdir', '-p', self.local_dir]) - common.cmd(['aws', - '--profile=' + config.aws_profile(), - 's3', - 'sync', + bucket = DEVICE_RECALL_BUCKET + local_dir = DEVICE_RECALL_LOCAL_DIR + + def output(self): + return luigi.LocalTarget(DEVICE_RECALL_LOCAL_DIR) + + def run(self): + common.cmd(["mkdir", "-p", self.local_dir]) + common.cmd( + [ + "aws", + "--profile=" + config.aws_profile(), + "s3", + "sync", self.bucket, - self.local_dir]) + self.local_dir, + ] + ) class RemoveDuplicates(luigi.Task): - def requires(self): - return SyncS3DeviceRecall() - - def output(self): - return luigi.LocalTarget(join(DEVICE_RECALL_FILTERED_DIR, "filtered.csv")) - - def run(self): - common.cmd(['mkdir', '-p', DEVICE_RECALL_FILTERED_DIR]) - unfiltered = pd.read_csv(glob.glob(DEVICE_RECALL_LOCAL_DIR + '/*.csv')[0], index_col=False, - encoding='utf-8', - dtype=np.unicode, keep_default_na=False) - filtered = unfiltered.drop_duplicates() - filtered.to_csv(self.output().path, encoding='utf-8', index=False, quoting=csv.QUOTE_ALL) + def requires(self): + return SyncS3DeviceRecall() + + def output(self): + return luigi.LocalTarget(join(DEVICE_RECALL_FILTERED_DIR, "filtered.csv")) + + def run(self): + common.cmd(["mkdir", "-p", DEVICE_RECALL_FILTERED_DIR]) + unfiltered = pd.read_csv( + glob.glob(DEVICE_RECALL_LOCAL_DIR + "/*.csv")[0], + index_col=False, + encoding="utf-8", + dtype=np.unicode, + keep_default_na=False, + ) + filtered = unfiltered.drop_duplicates() + filtered.to_csv( + self.output().path, encoding="utf-8", index=False, quoting=csv.QUOTE_ALL + ) class CSV2JSON(parallel.MRTask): - input_dir = DEVICE_RECALL_LOCAL_DIR - - def map(self, key, value, output): - - RENAME_MAP = { - 'rcl products res number': 'product_res_number', - 'rcl firm fei number': 'firm_fei_number', - 'rcl products submission numbers': 'product_submission_numbers' - } - - def _cleaner(k, v): - ''' A helper function that is used for renaming dictionary keys and - formatting dates. - ''' - - def is_other(data): - if not common.get_p_number(data) and \ - not common.get_k_number(data) and \ - not data.startswith('G'): - return True - return False - - k = k.lower() - if k in RENAME_MAP: - k = RENAME_MAP[k] - else: - return None - - if k == 'product_submission_numbers': - # We take this single key, split it on space and then return three new - # keys and not the original. 
- submissions = [s for s in v.split(' ') if s] - k_numbers = [d for d in submissions if common.get_k_number(d)] - pma_numbers = [d for d in submissions if common.get_p_number(d)] - other = ' '.join([d.strip() for d in submissions if is_other(d)]) - return [('k_numbers', k_numbers), - ('pma_numbers', pma_numbers), - ('other_submission_description', other)] - - if v != None: - return (k, v.strip()) - return (k, v) - - new_value = common.transform_dict(value, _cleaner) - output.add(new_value['product_res_number'], new_value) - - def requires(self): - return RemoveDuplicates() - - def output(self): - return luigi.LocalTarget(DEVICE_CSV_TO_JSON_DIR) - - def mapreduce_inputs(self): - input_files = glob.glob(dirname(self.requires().output().path) + '/*.csv') - return parallel.Collection.from_glob(input_files, - parallel.CSVDictLineInput()) + input_dir = DEVICE_RECALL_LOCAL_DIR + + def map(self, key, value, output): + + RENAME_MAP = { + "rcl products res number": "product_res_number", + "rcl firm fei number": "firm_fei_number", + "rcl products submission numbers": "product_submission_numbers", + } + + def _cleaner(k, v): + """A helper function that is used for renaming dictionary keys and + formatting dates. + """ + + def is_other(data): + if ( + not common.get_p_number(data) + and not common.get_k_number(data) + and not data.startswith("G") + ): + return True + return False + + k = k.lower() + if k in RENAME_MAP: + k = RENAME_MAP[k] + else: + return None + + if k == "product_submission_numbers": + # We take this single key, split it on space and then return three new + # keys and not the original. + submissions = [s for s in v.split(" ") if s] + k_numbers = [d for d in submissions if common.get_k_number(d)] + pma_numbers = [d for d in submissions if common.get_p_number(d)] + other = " ".join([d.strip() for d in submissions if is_other(d)]) + return [ + ("k_numbers", k_numbers), + ("pma_numbers", pma_numbers), + ("other_submission_description", other), + ] + + if v != None: + return (k, v.strip()) + return (k, v) + + new_value = common.transform_dict(value, _cleaner) + output.add(new_value["product_res_number"], new_value) + + def requires(self): + return RemoveDuplicates() + + def output(self): + return luigi.LocalTarget(DEVICE_CSV_TO_JSON_DIR) + + def mapreduce_inputs(self): + input_files = glob.glob(dirname(self.requires().output().path) + "/*.csv") + return parallel.Collection.from_glob(input_files, parallel.CSVDictLineInput()) class DownloadWeeklyRecallSummary(luigi.Task): - batch = luigi.TupleParameter() - - def requires(self): - return [] - - def output(self): - file_name = '%s/recalls.csv' % batch_dir(self.batch) - return luigi.LocalTarget(os.path.join(DEVICE_RECALL_DATA_DIR, file_name)) - - def run(self): - output_dir = dirname(self.output().path) - common.shell_cmd('mkdir -p %s', output_dir) - - end = arrow.get(self.batch[1]) - start = arrow.get(self.batch[0]) - - id_list = self._fetch_ids(start, end) - if len(id_list) >= 500: - # We have tried to get the entire list of weekly results in one shot, but apparently there are more than 500 - # hits and CDRH recall search does not support more than 500 results per search and does not support paging either. - # This occurrence is rare, but when it does happen we need to re-retrieve the results day by day. 
- for r in arrow.Arrow.range('day', start, end): - id_list = id_list + self._fetch_ids(r, r) - - df = pd.DataFrame(data=list(set(id_list)), columns=['id']) - df.to_csv(self.output().path, index=False) - - def _fetch_ids(self, start, end): - id_list = [] - url = SUMMARY_DOWNLOAD_URL % (quote(start.format('MM/DD/YYYY'), ''), quote(end.format('MM/DD/YYYY'), '')) - soup = soup_with_retry(url) - regex = re.compile(r'\./res\.cfm\?id=(\d+)') - for a in soup.find_all(href=regex): - id_list.append(int(regex.fullmatch(a['href']).group(1))) - return id_list + batch = luigi.TupleParameter() + + def requires(self): + return [] + + def output(self): + file_name = "%s/recalls.csv" % batch_dir(self.batch) + return luigi.LocalTarget(os.path.join(DEVICE_RECALL_DATA_DIR, file_name)) + + def run(self): + output_dir = dirname(self.output().path) + common.shell_cmd("mkdir -p %s", output_dir) + + end = arrow.get(self.batch[1]) + start = arrow.get(self.batch[0]) + + id_list = self._fetch_ids(start, end) + if len(id_list) >= 500: + # We have tried to get the entire list of weekly results in one shot, but apparently there are more than 500 + # hits and CDRH recall search does not support more than 500 results per search and does not support paging either. + # This occurrence is rare, but when it does happen we need to re-retrieve the results day by day. + for r in arrow.Arrow.range("day", start, end): + id_list = id_list + self._fetch_ids(r, r) + + df = pd.DataFrame(data=list(set(id_list)), columns=["id"]) + df.to_csv(self.output().path, index=False) + + def _fetch_ids(self, start, end): + id_list = [] + url = SUMMARY_DOWNLOAD_URL % ( + quote(start.format("MM/DD/YYYY"), ""), + quote(end.format("MM/DD/YYYY"), ""), + ) + soup = soup_with_retry(url) + regex = re.compile(r"\./res\.cfm\?id=(\d+)") + for a in soup.find_all(href=regex): + id_list.append(int(regex.fullmatch(a["href"]).group(1))) + return id_list class RecallDownloaderAndMapper(parallel.Mapper): - TERMINATED_RE = re.compile(r'Terminated\s+on\s+(.+)') - - def __init__(self, csv2json_db): - parallel.Mapper.__init__(self) - self.csv2json_db = csv2json_db - - def field_val_str(self, soup, field_key): - for th in soup.find_all('th'): - if th.text == field_key: - td = next(x for x in list(th.next_siblings) if x.name == 'td') - if td.a and td.a.sup: - td.a.decompose() - for child in list(td.children): - if child.name == 'br': - child.replace_with('\n') - return td.text.strip() - return None - - def field_val_array(self, soup, field_key): - str_val = self.field_val_str(soup, field_key) - return str_val.split() if str_val is not None else None - - def field_val_date(self, soup, field_key): - str_val = self.field_val_str(soup, field_key) - return self.reformat_date(str_val) - - def reformat_date(self, str_val): - return arrow.get(str_val, 'MMMM DD, YYYY').format('YYYY-MM-DD') if str_val is not None else None - - def map(self, key, value, output): - - def _cleaner(k, v): - if v == None: + TERMINATED_RE = re.compile(r"Terminated\s+on\s+(.+)") + + def __init__(self, csv2json_db): + parallel.Mapper.__init__(self) + self.csv2json_db = csv2json_db + + def field_val_str(self, soup, field_key): + for th in soup.find_all("th"): + if th.text == field_key: + td = next(x for x in list(th.next_siblings) if x.name == "td") + if td.a and td.a.sup: + td.a.decompose() + for child in list(td.children): + if child.name == "br": + child.replace_with("\n") + return td.text.strip() return None - if k == 'recall_status': - fullmatch = self.TERMINATED_RE.fullmatch(v) - if fullmatch: - 
return [('recall_status', 'Terminated'), - ('event_date_terminated', self.reformat_date(fullmatch.group(1)))] - - if k == 'recalling_firm': - return parse_address(v) - - return (k, v) - - def parse_address(addr): - ''' - Attempt to parse the Recalling Firm/Manufacturer piece into firm name and address components using - usaddress library. If unable to parse, stuff everything back into the 'recalling_firm' element. - ''' - - def _concat(addr, part_keys): - parts = [] - for key in part_keys: - if addr.get(key) is not None: - parts.append(addr.get(key)) - return ' '.join(parts) - - if len(addr.splitlines()) >= 2: - try: - tagged = usaddress.tag(addr.replace('\n', ' ')) - return [x for x in [('recalling_firm', tagged[0].get('Recipient')), - ('address_1', _concat(tagged[0], ['AddressNumberPrefix', - 'AddressNumber', - 'AddressNumberSuffix', - 'StreetNamePreModifier', - 'StreetNamePreDirectional', - 'StreetNamePreType', - 'StreetName', - 'StreetNamePostType', - 'StreetNamePostDirectional', - 'USPSBoxType', - 'USPSBoxID', - 'USPSBoxGroupType', - 'USPSBoxGroupID'])), - ('address_2', _concat(tagged[0], ['SubaddressType', - 'SubaddressIdentifier', - 'BuildingName', - 'OccupancyType', - 'OccupancyIdentifier', - 'CornerOf', - 'LandmarkName', - 'IntersectionSeparator', - 'NotAddress'])), - ('city', tagged[0].get('PlaceName')), - ('state', tagged[0].get('StateName')), - ('postal_code', tagged[0].get('ZipCode')), - ('country', tagged[0].get('CountryName')) - ] if x[1] is not None and x[1] != ''] - except usaddress.RepeatedLabelError: - pass - - return 'recalling_firm', addr - - id = value['id'] - recall = {} - url = RECALL_RECORD_DOWNLOAD_URL % (id) - soup = soup_with_retry(url) - - recall['@id'] = id - recall['cfres_id'] = id - recall_number = self.field_val_str(soup, 'Recall Number') - recall['product_res_number'] = recall_number - recall['event_date_initiated'] = self.field_val_date(soup, 'Date Initiated by Firm') - recall['event_date_created'] = self.field_val_date(soup, 'Create Date') - recall['event_date_posted'] = self.field_val_date(soup, 'Date Posted') - recall['recall_status'] = self.field_val_str(soup, 'Recall Status1') - recall['res_event_number'] = self.field_val_str(soup, 'Recall Event ID') - - product_code = self.field_val_str(soup, 'Product Classification') - if product_code: - recall['product_code'] = re.search(r'Product Code\s+(\S+)', - product_code).group(1).replace('ooo', 'N/A') - - recall['k_numbers'] = self.field_val_array(soup, '510(K)Number') - recall['pma_numbers'] = self.field_val_array(soup, 'PMA Number') - recall['product_description'] = self.field_val_str(soup, 'Product') - recall['code_info'] = self.field_val_str(soup, 'Code Information') - recall['recalling_firm'] = self.field_val_str(soup, 'Recalling Firm/Manufacturer') - recall['additional_info_contact'] = self.field_val_str(soup, 'For Additional Information Contact') - recall['reason_for_recall'] = self.field_val_str(soup, 'Manufacturer Reasonfor Recall') - recall['root_cause_description'] = self.field_val_str(soup, 'FDA DeterminedCause 2') - recall['action'] = self.field_val_str(soup, 'Action') - recall['product_quantity'] = self.field_val_str(soup, 'Quantity in Commerce') - recall['distribution_pattern'] = self.field_val_str(soup, 'Distribution') - - # Add data from the "old" CSV file. 
- csv_record = self.csv2json_db.get(recall_number, None) - if csv_record is not None: - if len(csv_record['firm_fei_number']) > 1: - recall['firm_fei_number'] = csv_record['firm_fei_number'] - if len(csv_record['other_submission_description']) > 0: - recall['other_submission_description'] = csv_record['other_submission_description'] - if recall.get('k_numbers') is None and len(csv_record.get('k_numbers')) > 0: - recall['k_numbers'] = csv_record['k_numbers'] - if recall.get('pma_numbers') is None and len(csv_record.get('pma_numbers')) > 0: - recall['pma_numbers'] = csv_record['pma_numbers'] - - xformed_recall = common.transform_dict(recall, _cleaner) - output.add(key, xformed_recall) + def field_val_array(self, soup, field_key): + str_val = self.field_val_str(soup, field_key) + return str_val.split() if str_val is not None else None + + def field_val_date(self, soup, field_key): + str_val = self.field_val_str(soup, field_key) + return self.reformat_date(str_val) + + def reformat_date(self, str_val): + return ( + arrow.get(str_val, "MMMM DD, YYYY").format("YYYY-MM-DD") + if str_val is not None + else None + ) + + def map(self, key, value, output): + + def _cleaner(k, v): + if v == None: + return None + + if k == "recall_status": + fullmatch = self.TERMINATED_RE.fullmatch(v) + if fullmatch: + return [ + ("recall_status", "Terminated"), + ( + "event_date_terminated", + self.reformat_date(fullmatch.group(1)), + ), + ] + + if k == "recalling_firm": + return parse_address(v) + + return (k, v) + + def parse_address(addr): + """ + Attempt to parse the Recalling Firm/Manufacturer piece into firm name and address components using + usaddress library. If unable to parse, stuff everything back into the 'recalling_firm' element. + """ + + def _concat(addr, part_keys): + parts = [] + for key in part_keys: + if addr.get(key) is not None: + parts.append(addr.get(key)) + return " ".join(parts) + + if len(addr.splitlines()) >= 2: + try: + tagged = usaddress.tag(addr.replace("\n", " ")) + return [ + x + for x in [ + ("recalling_firm", tagged[0].get("Recipient")), + ( + "address_1", + _concat( + tagged[0], + [ + "AddressNumberPrefix", + "AddressNumber", + "AddressNumberSuffix", + "StreetNamePreModifier", + "StreetNamePreDirectional", + "StreetNamePreType", + "StreetName", + "StreetNamePostType", + "StreetNamePostDirectional", + "USPSBoxType", + "USPSBoxID", + "USPSBoxGroupType", + "USPSBoxGroupID", + ], + ), + ), + ( + "address_2", + _concat( + tagged[0], + [ + "SubaddressType", + "SubaddressIdentifier", + "BuildingName", + "OccupancyType", + "OccupancyIdentifier", + "CornerOf", + "LandmarkName", + "IntersectionSeparator", + "NotAddress", + ], + ), + ), + ("city", tagged[0].get("PlaceName")), + ("state", tagged[0].get("StateName")), + ("postal_code", tagged[0].get("ZipCode")), + ("country", tagged[0].get("CountryName")), + ] + if x[1] is not None and x[1] != "" + ] + except usaddress.RepeatedLabelError: + pass + + return "recalling_firm", addr + + id = value["id"] + recall = {} + url = RECALL_RECORD_DOWNLOAD_URL % (id) + soup = soup_with_retry(url) + + recall["@id"] = id + recall["cfres_id"] = id + recall_number = self.field_val_str(soup, "Recall Number") + recall["product_res_number"] = recall_number + recall["event_date_initiated"] = self.field_val_date( + soup, "Date Initiated by Firm" + ) + recall["event_date_created"] = self.field_val_date(soup, "Create Date") + recall["event_date_posted"] = self.field_val_date(soup, "Date Posted") + recall["recall_status"] = self.field_val_str(soup, "Recall Status1") + 
recall["res_event_number"] = self.field_val_str(soup, "Recall Event ID") + + product_code = self.field_val_str(soup, "Product Classification") + if product_code: + recall["product_code"] = ( + re.search(r"Product Code\s+(\S+)", product_code) + .group(1) + .replace("ooo", "N/A") + ) + + recall["k_numbers"] = self.field_val_array(soup, "510(K)Number") + recall["pma_numbers"] = self.field_val_array(soup, "PMA Number") + recall["product_description"] = self.field_val_str(soup, "Product") + recall["code_info"] = self.field_val_str(soup, "Code Information") + recall["recalling_firm"] = self.field_val_str( + soup, "Recalling Firm/Manufacturer" + ) + recall["additional_info_contact"] = self.field_val_str( + soup, "For Additional Information Contact" + ) + recall["reason_for_recall"] = self.field_val_str( + soup, "Manufacturer Reasonfor Recall" + ) + recall["root_cause_description"] = self.field_val_str( + soup, "FDA DeterminedCause 2" + ) + recall["action"] = self.field_val_str(soup, "Action") + recall["product_quantity"] = self.field_val_str(soup, "Quantity in Commerce") + recall["distribution_pattern"] = self.field_val_str(soup, "Distribution") + + # Add data from the "old" CSV file. + csv_record = self.csv2json_db.get(recall_number, None) + if csv_record is not None: + if len(csv_record["firm_fei_number"]) > 1: + recall["firm_fei_number"] = csv_record["firm_fei_number"] + if len(csv_record["other_submission_description"]) > 0: + recall["other_submission_description"] = csv_record[ + "other_submission_description" + ] + if recall.get("k_numbers") is None and len(csv_record.get("k_numbers")) > 0: + recall["k_numbers"] = csv_record["k_numbers"] + if ( + recall.get("pma_numbers") is None + and len(csv_record.get("pma_numbers")) > 0 + ): + recall["pma_numbers"] = csv_record["pma_numbers"] + + xformed_recall = common.transform_dict(recall, _cleaner) + output.add(key, xformed_recall) class ProcessWeeklyBatch(luigi.Task): - batch = luigi.TupleParameter() + batch = luigi.TupleParameter() - def requires(self): - return [DownloadWeeklyRecallSummary(self.batch), CSV2JSON()] + def requires(self): + return [DownloadWeeklyRecallSummary(self.batch), CSV2JSON()] - def output(self): - file_name = '%s/json.db' % batch_dir(self.batch) - return luigi.LocalTarget(os.path.join(DEVICE_RECALL_DATA_DIR, file_name)) + def output(self): + file_name = "%s/json.db" % batch_dir(self.batch) + return luigi.LocalTarget(os.path.join(DEVICE_RECALL_DATA_DIR, file_name)) - def run(self): - csv2json_db = parallel.ShardedDB.open(self.input()[1].path).as_dict() - parallel.mapreduce( - parallel.Collection.from_glob( - self.input()[0].path, parallel.CSVDictLineInput()), - mapper=RecallDownloaderAndMapper(csv2json_db=csv2json_db), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, map_workers=10, num_shards=10) # Do not hit fda.gov too hard here. + def run(self): + csv2json_db = parallel.ShardedDB.open(self.input()[1].path).as_dict() + parallel.mapreduce( + parallel.Collection.from_glob( + self.input()[0].path, parallel.CSVDictLineInput() + ), + mapper=RecallDownloaderAndMapper(csv2json_db=csv2json_db), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + map_workers=10, + num_shards=10, + ) # Do not hit fda.gov too hard here. 
class DeviceRecallAnnotateMapper(DeviceAnnotateMapper): - def filter(self, data, lookup=None): - ''' This is the standard filter copied from Device Harmonization with a small difference in that - it does not fail for missing product codes, which is the case for a small number of recall records. - ''' - product_code = data.get('product_code') - if product_code is not None: - harmonized = self.harmonized_db.get(product_code, None) - if harmonized: - if self.table in harmonized: - del harmonized[self.table] - return harmonized - return None + def filter(self, data, lookup=None): + """This is the standard filter copied from Device Harmonization with a small difference in that + it does not fail for missing product codes, which is the case for a small number of recall records. + """ + product_code = data.get("product_code") + if product_code is not None: + harmonized = self.harmonized_db.get(product_code, None) + if harmonized: + if self.table in harmonized: + del harmonized[self.table] + return harmonized + return None class AnnotateWeeklyBatch(luigi.Task): - batch = luigi.TupleParameter() + batch = luigi.TupleParameter() - def requires(self): - return [Harmonized2OpenFDA(), ProcessWeeklyBatch(self.batch)] + def requires(self): + return [Harmonized2OpenFDA(), ProcessWeeklyBatch(self.batch)] - def output(self): - file_name = '%s/annotate.db' % batch_dir(self.batch) - return luigi.LocalTarget(os.path.join(DEVICE_RECALL_DATA_DIR, file_name)) + def output(self): + file_name = "%s/annotate.db" % batch_dir(self.batch) + return luigi.LocalTarget(os.path.join(DEVICE_RECALL_DATA_DIR, file_name)) - def run(self): - harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict() + def run(self): + harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict() - parallel.mapreduce( - parallel.Collection.from_sharded(self.input()[1].path), - mapper=DeviceRecallAnnotateMapper(harmonized_db=harmonized_db), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path) + parallel.mapreduce( + parallel.Collection.from_sharded(self.input()[1].path), + mapper=DeviceRecallAnnotateMapper(harmonized_db=harmonized_db), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + ) class LoadJSON(index_util.LoadJSONBase): - batch = luigi.TupleParameter() - last_update_date = luigi.Parameter() - index_name = 'devicerecall' - mapping_file = './schemas/device_recall_mapping.json' - use_checksum = True - optimize_index = False - docid_key = 'product_res_number' + batch = luigi.TupleParameter() + last_update_date = luigi.Parameter() + index_name = "devicerecall" + mapping_file = "./schemas/device_recall_mapping.json" + use_checksum = True + optimize_index = False + docid_key = "product_res_number" - def _data(self): - return AnnotateWeeklyBatch(self.batch) + def _data(self): + return AnnotateWeeklyBatch(self.batch) class RunWeeklyProcess(AlwaysRunTask): - ''' Generates a date object that is passed through the pipeline tasks in order - to generate and load JSON documents for the weekly device recall batches. 
- ''' - - def requires(self): - run_date = arrow.utcnow() - start = arrow.get(2002, 12, 8) - end = arrow.get(run_date.year, run_date.month, run_date.day) - previous_task = NoopTask() - - for batch in arrow.Arrow.span_range('week', start, end, bounds='[]', exact=True): - task = LoadJSON(batch=(batch[0].format(), batch[1].format()), last_update_date=end.format('YYYY-MM-DD'), - previous_task=previous_task) - previous_task = task - yield task - - def _run(self): - index_util.optimize_index('devicerecall', wait_for_merge=0) - - -if __name__ == '__main__': - luigi.run() + """Generates a date object that is passed through the pipeline tasks in order + to generate and load JSON documents for the weekly device recall batches. + """ + + def requires(self): + run_date = arrow.utcnow() + start = arrow.get(2002, 12, 8) + end = arrow.get(run_date.year, run_date.month, run_date.day) + previous_task = NoopTask() + + for batch in arrow.Arrow.span_range( + "week", start, end, bounds="[]", exact=True + ): + task = LoadJSON( + batch=(batch[0].format(), batch[1].format()), + last_update_date=end.format("YYYY-MM-DD"), + previous_task=previous_task, + ) + previous_task = task + yield task + + def _run(self): + index_util.optimize_index("devicerecall", wait_for_merge=0) + + +if __name__ == "__main__": + luigi.run() diff --git a/openfda/device_udi/pipeline.py b/openfda/device_udi/pipeline.py index 557ed8c..0b61d11 100644 --- a/openfda/device_udi/pipeline.py +++ b/openfda/device_udi/pipeline.py @@ -15,291 +15,346 @@ import xmltodict from openfda import common, config, index_util, parallel from openfda.common import first_file_timestamp -from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA, - DeviceAnnotateMapper) +from openfda.device_harmonization.pipeline import ( + Harmonized2OpenFDA, + DeviceAnnotateMapper, +) -DEVICE_UDI_BUCKET = 's3://cdrh-data' -DEVICE_UDI_LOCAL_DIR = config.data_dir('device_udi/s3_sync') -BATCH = arrow.utcnow().floor('day').format('YYYYMMDD') -AWS_CLI = 'aws' +DEVICE_UDI_BUCKET = "s3://cdrh-data" +DEVICE_UDI_LOCAL_DIR = config.data_dir("device_udi/s3_sync") +BATCH = arrow.utcnow().floor("day").format("YYYYMMDD") +AWS_CLI = "aws" -class SyncS3DeviceUDI(luigi.Task): - bucket = DEVICE_UDI_BUCKET - local_dir = DEVICE_UDI_LOCAL_DIR - aws = AWS_CLI - - def output(self): - return luigi.LocalTarget(DEVICE_UDI_LOCAL_DIR) - def run(self): - common.cmd([self.aws, - '--profile=' + config.aws_profile(), - 's3', 'sync', +class SyncS3DeviceUDI(luigi.Task): + bucket = DEVICE_UDI_BUCKET + local_dir = DEVICE_UDI_LOCAL_DIR + aws = AWS_CLI + + def output(self): + return luigi.LocalTarget(DEVICE_UDI_LOCAL_DIR) + + def run(self): + common.cmd( + [ + self.aws, + "--profile=" + config.aws_profile(), + "s3", + "sync", self.bucket, - self.local_dir]) + self.local_dir, + ] + ) class ExtractXML(luigi.Task): - local_dir = DEVICE_UDI_LOCAL_DIR + local_dir = DEVICE_UDI_LOCAL_DIR - def requires(self): - return SyncS3DeviceUDI() + def requires(self): + return SyncS3DeviceUDI() - def output(self): - return luigi.LocalTarget(os.path.join(config.data_dir('device_udi/extracted'), BATCH)) + def output(self): + return luigi.LocalTarget( + os.path.join(config.data_dir("device_udi/extracted"), BATCH) + ) - def run(self): - output_dir = self.output().path - common.shell_cmd('mkdir -p %s', output_dir) - input_dir = self.local_dir - for zip_filename in glob.glob(input_dir + '/*.zip'): - common.shell_cmd_quiet('unzip -ou %s -d %s', zip_filename, output_dir) + def run(self): + output_dir = self.output().path + 
common.shell_cmd("mkdir -p %s", output_dir) + input_dir = self.local_dir + for zip_filename in glob.glob(input_dir + "/*.zip"): + common.shell_cmd_quiet("unzip -ou %s -d %s", zip_filename, output_dir) class XML2JSONMapper(parallel.Mapper): - def map_shard(self, map_input, map_output): - - # Changing names to match the openFDA naming standard - # key = source name, value = replacement name - RENAME_MAP = { - "publicDeviceRecordKey": "public_device_record_key", - "publicVersionStatus": "public_version_status", - "publicVersionNumber": "public_version_number", - "publicVersionDate": "public_version_date", - "dunsNumber": "labeler_duns_number", - "deviceRecordStatus": "record_status", - "devicePublishDate": "publish_date", - "deviceCommDistributionEndDate": "commercial_distribution_end_date", - "deviceCommDistributionStatus": "commercial_distribution_status", - "identifiers": "identifiers", - "identifier": "identifier", - "brandName": "brand_name", - "versionModelNumber": "version_or_model_number", - "catalogNumber": "catalog_number", - "companyName": "company_name", - "deviceCount": "device_count_in_base_package", - "deviceDescription": "device_description", - "DMExempt": "is_direct_marking_exempt", - "premarketExempt": "is_pm_exempt", - "deviceHCTP": "is_hct_p", - "deviceKit": "is_kit", - "deviceCombinationProduct": "is_combination_product", - "singleUse": "is_single_use", - "lotBatch": "has_lot_or_batch_number", - "serialNumber": "has_serial_number", - "manufacturingDate": "has_manufacturing_date", - "expirationDate": "has_expiration_date", - "donationIdNumber": "has_donation_id_number", - "labeledContainsNRL": "is_labeled_as_nrl", - "labeledNoNRL": "is_labeled_as_no_nrl", - "MRISafetyStatus": "mri_safety", - "rx": "is_rx", - "otc": "is_otc", - "contacts": "customer_contacts", - "customerContact": "customer_contact", - "gmdnTerms": "gmdn_terms", - "gmdn": "gmdn", - "productCodes": "product_codes", - "fdaProductCode": "code", - "deviceSizes": "device_sizes", - "deviceSize": "device_size", - "storageHandling": "storage_handling", - "deviceSterile": "is_sterile", - "sterilizationPriorToUse": "is_sterilization_prior_use", - "methodTypes": "sterilization_methods", - "deviceId": "id", - "deviceIdType": "type", - "deviceIdIssuingAgency": "issuing_agency", - "containsDINumber": "unit_of_use_id", - "pkgQuantity": "quantity_per_package", - "pkgDiscontinueDate": "package_discontinue_date", - "pkgStatus": "package_status", - "pkgType": "package_type", - "phone": "phone", - "phoneExtension": "ext", - "email": "email", - "gmdnPTName": "name", - "gmdnPTDefinition": "definition", - "productCode": "code", - "productCodeName": "name", - "sizeType": "type", - "size": "value", - "sizeText": "text", - "storageHandlingType": "type", - "storageHandlingHigh": "high", - "storageHandlingLow": "low", - "storageHandlingSpecialConditionText": "special_conditions", - "sterilizationMethod": "sterilization_method", - "feis": "fei_number", - "premarketSubmissions": "premarket_submissions", - "submissionNumber": "submission_number", - "supplementNumber": "supplement_number", - "submissionType": "submission_type", - "environmentalConditions": "storage" - } - - IGNORE = ['@xmlns'] - - FLATTEN = ["customer_contacts", "device_sizes", "gmdn_terms", "identifiers", "premarket_submissions", - "product_codes", "storage", "fei_number", "sterilization_methods"] - - logging.info('Extracting devices from %s...', map_input.filename) - - def transformer_fn(k, v): - """ A helper function used for removing and renaming dictionary keys. 
+ def map_shard(self, map_input, map_output): + + # Changing names to match the openFDA naming standard + # key = source name, value = replacement name + RENAME_MAP = { + "publicDeviceRecordKey": "public_device_record_key", + "publicVersionStatus": "public_version_status", + "publicVersionNumber": "public_version_number", + "publicVersionDate": "public_version_date", + "dunsNumber": "labeler_duns_number", + "deviceRecordStatus": "record_status", + "devicePublishDate": "publish_date", + "deviceCommDistributionEndDate": "commercial_distribution_end_date", + "deviceCommDistributionStatus": "commercial_distribution_status", + "identifiers": "identifiers", + "identifier": "identifier", + "brandName": "brand_name", + "versionModelNumber": "version_or_model_number", + "catalogNumber": "catalog_number", + "companyName": "company_name", + "deviceCount": "device_count_in_base_package", + "deviceDescription": "device_description", + "DMExempt": "is_direct_marking_exempt", + "premarketExempt": "is_pm_exempt", + "deviceHCTP": "is_hct_p", + "deviceKit": "is_kit", + "deviceCombinationProduct": "is_combination_product", + "singleUse": "is_single_use", + "lotBatch": "has_lot_or_batch_number", + "serialNumber": "has_serial_number", + "manufacturingDate": "has_manufacturing_date", + "expirationDate": "has_expiration_date", + "donationIdNumber": "has_donation_id_number", + "labeledContainsNRL": "is_labeled_as_nrl", + "labeledNoNRL": "is_labeled_as_no_nrl", + "MRISafetyStatus": "mri_safety", + "rx": "is_rx", + "otc": "is_otc", + "contacts": "customer_contacts", + "customerContact": "customer_contact", + "gmdnTerms": "gmdn_terms", + "gmdn": "gmdn", + "productCodes": "product_codes", + "fdaProductCode": "code", + "deviceSizes": "device_sizes", + "deviceSize": "device_size", + "storageHandling": "storage_handling", + "deviceSterile": "is_sterile", + "sterilizationPriorToUse": "is_sterilization_prior_use", + "methodTypes": "sterilization_methods", + "deviceId": "id", + "deviceIdType": "type", + "deviceIdIssuingAgency": "issuing_agency", + "containsDINumber": "unit_of_use_id", + "pkgQuantity": "quantity_per_package", + "pkgDiscontinueDate": "package_discontinue_date", + "pkgStatus": "package_status", + "pkgType": "package_type", + "phone": "phone", + "phoneExtension": "ext", + "email": "email", + "gmdnPTName": "name", + "gmdnPTDefinition": "definition", + "productCode": "code", + "productCodeName": "name", + "sizeType": "type", + "size": "value", + "sizeText": "text", + "storageHandlingType": "type", + "storageHandlingHigh": "high", + "storageHandlingLow": "low", + "storageHandlingSpecialConditionText": "special_conditions", + "sterilizationMethod": "sterilization_method", + "feis": "fei_number", + "premarketSubmissions": "premarket_submissions", + "submissionNumber": "submission_number", + "supplementNumber": "supplement_number", + "submissionType": "submission_type", + "environmentalConditions": "storage", + } + + IGNORE = ["@xmlns"] + + FLATTEN = [ + "customer_contacts", + "device_sizes", + "gmdn_terms", + "identifiers", + "premarket_submissions", + "product_codes", + "storage", + "fei_number", + "sterilization_methods", + ] + + logging.info("Extracting devices from %s...", map_input.filename) + + def transformer_fn(k, v): + """A helper function used for removing and renaming dictionary keys.""" + if k in IGNORE: + return None + if v is None: + return None + if type(v) == dict and len(list(v.keys())) == 0: + return None + if type(v) == list and len(v) == 1 and v[0] is None: + return None + if k in RENAME_MAP: + 
k = RENAME_MAP[k] + + if k in FLATTEN: + v = v[list(v.keys())[0]] + + if ( + type(v) == list + and len(v) == 1 + and type(v[0]) == dict + and len(list(v[0].keys())) == 0 + ): + return None + + return k, v + + def handle_device(_, xml): + """Transform the dictionary, which follows the structure of UDI XML files, to a new one that + matches the UDI mapping in ES. """ - if k in IGNORE: return None - if v is None: return None - if type(v) == dict and len(list(v.keys())) == 0: - return None - if type(v) == list and len(v) == 1 and v[0] is None: - return None - if k in RENAME_MAP: - k = RENAME_MAP[k] - if k in FLATTEN: - v = v[list(v.keys())[0]] - - if type(v) == list and len(v) == 1 and type(v[0]) == dict and len(list(v[0].keys())) == 0: - return None - - return k, v - - def handle_device(_, xml): - """Transform the dictionary, which follows the structure of UDI XML files, to a new one that - matches the UDI mapping in ES. - """ - - try: - - device = common.transform_dict(xml, transformer_fn) - - # Additional transformation of the dictionary not handled by the transformer_fn function. - if "id" in device: - del device["id"] - if "device_sizes" in device: - for size in device["device_sizes"]: - if "value" in size: - value_dict = size["value"] - del size["value"] - if "value" in value_dict: - size["value"] = value_dict["value"] - if "unit" in value_dict: - size["unit"] = value_dict["unit"] - - # print(json.dumps(device, indent=4 * ' ')) - - # @id will be a concatenation of primary identifier's issuer with the ID number itself. - for identifier, idenList in iter(xml["identifiers"].items()): - if type(idenList) == type([]): - for iden in idenList: - if 'deviceIdType' in iden and 'deviceIdIssuingAgency' in iden and 'Primary' == iden['deviceIdType']: - device["@id"] = (('' if 'deviceIdIssuingAgency' not in iden else iden['deviceIdIssuingAgency'] + "_") + \ - iden["deviceId"]).lower() - - map_output.add(device["@id"], device) - return True - except Exception: - logging.error(xml) - traceback.print_exc() - logging.error(sys.exc_info()[0]) - raise - - try: - xmltodict.parse(open(map_input.filename, 'rb'), encoding="utf-8", xml_attribs=True, - force_list=("identifier", "customerContact", "gmdn", "fdaProductCode", - "deviceSize", "storageHandling", "fei", "premarketSubmission"), - item_depth=2, item_callback=handle_device) - except Exception: - traceback.print_exc() - logging.error(sys.exc_info()[0]) - raise + try: + + device = common.transform_dict(xml, transformer_fn) + + # Additional transformation of the dictionary not handled by the transformer_fn function. + if "id" in device: + del device["id"] + if "device_sizes" in device: + for size in device["device_sizes"]: + if "value" in size: + value_dict = size["value"] + del size["value"] + if "value" in value_dict: + size["value"] = value_dict["value"] + if "unit" in value_dict: + size["unit"] = value_dict["unit"] + + # print(json.dumps(device, indent=4 * ' ')) + + # @id will be a concatenation of primary identifier's issuer with the ID number itself. 
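+                # Concrete example (values taken from the pipeline's test fixture): a
+                # Primary identifier issued by GS1 with deviceId 00844588018923 yields
+                # "@id" == "gs1_00844588018923", i.e. lowercased issuer + "_" + deviceId.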
+ for identifier, idenList in iter(xml["identifiers"].items()): + if type(idenList) == type([]): + for iden in idenList: + if ( + "deviceIdType" in iden + and "deviceIdIssuingAgency" in iden + and "Primary" == iden["deviceIdType"] + ): + device["@id"] = ( + ( + "" + if "deviceIdIssuingAgency" not in iden + else iden["deviceIdIssuingAgency"] + "_" + ) + + iden["deviceId"] + ).lower() + + map_output.add(device["@id"], device) + return True + except Exception: + logging.error(xml) + traceback.print_exc() + logging.error(sys.exc_info()[0]) + raise + + try: + xmltodict.parse( + open(map_input.filename, "rb"), + encoding="utf-8", + xml_attribs=True, + force_list=( + "identifier", + "customerContact", + "gmdn", + "fdaProductCode", + "deviceSize", + "storageHandling", + "fei", + "premarketSubmission", + ), + item_depth=2, + item_callback=handle_device, + ) + except Exception: + traceback.print_exc() + logging.error(sys.exc_info()[0]) + raise class XML2JSON(luigi.Task): - def requires(self): - return ExtractXML() + def requires(self): + return ExtractXML() - def output(self): - return luigi.LocalTarget(os.path.join(config.data_dir('device_udi/json.db'), BATCH)) + def output(self): + return luigi.LocalTarget( + os.path.join(config.data_dir("device_udi/json.db"), BATCH) + ) - def run(self): - input_shards = [] - input_dir = self.input().path - for xml_filename in glob.glob(input_dir + '/*.xml'): - input_shards.append(xml_filename) + def run(self): + input_shards = [] + input_dir = self.input().path + for xml_filename in glob.glob(input_dir + "/*.xml"): + input_shards.append(xml_filename) - parallel.mapreduce( - parallel.Collection.from_list(input_shards), - mapper=XML2JSONMapper(), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, - num_shards=len(input_shards)) + parallel.mapreduce( + parallel.Collection.from_list(input_shards), + mapper=XML2JSONMapper(), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + num_shards=len(input_shards), + ) class UDIAnnotateMapper(DeviceAnnotateMapper): - """ The UDI document has a unique placement requirement, so we need - to override the `harmonize()` method to place the `openfda` section on - each product code `product_codes` list within the document. - """ - - def harmonize(self, data): - result = dict(data) - - product_codes = [] - for row in result.get('product_codes', []): - d = dict(row) - harmonized = self.filter(row) - if harmonized: - d['openfda'] = self.flatten(harmonized) - else: - d['openfda'] = {} - product_codes.append(d) - result['product_codes'] = product_codes - return result - - def filter(self, data): - product_code = data['code'] - harmonized = self.harmonized_db.get(product_code, None) - if harmonized: - # Remove k_number, fei_number and pma_number from the harmonization section because they are contained in the - # main record. - if harmonized.get("510k") is not None: - del harmonized["510k"] - if harmonized.get("device_pma") is not None: - del harmonized["device_pma"] - if harmonized.get("registration") is not None: - del harmonized["registration"] - - return harmonized - return None + """The UDI document has a unique placement requirement, so we need + to override the `harmonize()` method to place the `openfda` section on + each product code `product_codes` list within the document. 
+ """ + + def harmonize(self, data): + result = dict(data) + + product_codes = [] + for row in result.get("product_codes", []): + d = dict(row) + harmonized = self.filter(row) + if harmonized: + d["openfda"] = self.flatten(harmonized) + else: + d["openfda"] = {} + product_codes.append(d) + result["product_codes"] = product_codes + return result + + def filter(self, data): + product_code = data["code"] + harmonized = self.harmonized_db.get(product_code, None) + if harmonized: + # Remove k_number, fei_number and pma_number from the harmonization section because they are contained in the + # main record. + if harmonized.get("510k") is not None: + del harmonized["510k"] + if harmonized.get("device_pma") is not None: + del harmonized["device_pma"] + if harmonized.get("registration") is not None: + del harmonized["registration"] + + return harmonized + return None class AnnotateDevice(luigi.Task): - def requires(self): - return [Harmonized2OpenFDA(), XML2JSON()] + def requires(self): + return [Harmonized2OpenFDA(), XML2JSON()] - def output(self): - return luigi.LocalTarget(os.path.join(config.data_dir('device_udi/annotate.db'), BATCH)) + def output(self): + return luigi.LocalTarget( + os.path.join(config.data_dir("device_udi/annotate.db"), BATCH) + ) - def run(self): - harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict() + def run(self): + harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict() - parallel.mapreduce( - parallel.Collection.from_sharded(self.input()[1].path), - mapper=UDIAnnotateMapper(harmonized_db=harmonized_db), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, - num_shards=10) + parallel.mapreduce( + parallel.Collection.from_sharded(self.input()[1].path), + mapper=UDIAnnotateMapper(harmonized_db=harmonized_db), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + num_shards=10, + ) class LoadJSON(index_util.LoadJSONBase): - index_name = 'deviceudi' - mapping_file = './schemas/deviceudi_mapping.json' - data_source = AnnotateDevice() - use_checksum = False - optimize_index = False - last_update_date = lambda _: first_file_timestamp(DEVICE_UDI_LOCAL_DIR) - -if __name__ == '__main__': - luigi.run() + index_name = "deviceudi" + mapping_file = "./schemas/deviceudi_mapping.json" + data_source = AnnotateDevice() + use_checksum = False + optimize_index = False + last_update_date = lambda _: first_file_timestamp(DEVICE_UDI_LOCAL_DIR) + + +if __name__ == "__main__": + luigi.run() diff --git a/openfda/device_udi/tests/pipeline_test.py b/openfda/device_udi/tests/pipeline_test.py index 3217fa0..ae6b490 100644 --- a/openfda/device_udi/tests/pipeline_test.py +++ b/openfda/device_udi/tests/pipeline_test.py @@ -8,176 +8,214 @@ import luigi from mock import MagicMock -from openfda.device_udi.pipeline import SyncS3DeviceUDI, ExtractXML, XML2JSONMapper, UDIAnnotateMapper +from openfda.device_udi.pipeline import ( + SyncS3DeviceUDI, + ExtractXML, + XML2JSONMapper, + UDIAnnotateMapper, +) from openfda.tests.api_test_helpers import * - class UDIPipelineTests(unittest.TestCase): - def setUp(self): - self.test_dir = tempfile.mkdtemp() - - def tearDown(self): - shutil.rmtree(self.test_dir) - - def test_annotation(self): - harmonized_db = MagicMock() - harmonized_db.get = lambda product_code, _: {"510k": [{"k_number": "K094035"}], \ - "device_pma": [{"pma_number": "P950002"}], \ - "registration": [{"fei_number": "3001451451"}], \ - } if product_code == "OQG" else {} - - ann = UDIAnnotateMapper(harmonized_db) - mapper = 
XML2JSONMapper() - - def add_fn(id, json): - harmonized = ann.harmonize(json) - eq_("OQG", harmonized["product_codes"][0]["code"]) - eq_(None, harmonized["product_codes"][0]["openfda"].get("pma_number")) - eq_(None, harmonized["product_codes"][0]["openfda"].get("k_number")) - eq_(None, harmonized["product_codes"][0]["openfda"].get("fei_number")) - - eq_({}, harmonized["product_codes"][1].get("openfda")) - - map_input = MagicMock() - map_input.filename = os.path.join(dirname(os.path.abspath(__file__)), "test.xml") - map_output = MagicMock() - map_output.add = add_fn - mapper.map_shard(map_input, map_output) - - def test_xml_to_json(self): - mapper = XML2JSONMapper() - - map_input = MagicMock() - map_input.filename = os.path.join(dirname(os.path.abspath(__file__)), "test.xml") - - def add_fn(id, json): - eq_("gs1_00844588018923", id) - eq_("gs1_00844588018923", json["@id"]) - - eq_("00844588018923", json["identifiers"][0]["id"]) - eq_("Primary", json["identifiers"][0]["type"]) - eq_("GS1", json["identifiers"][0]["issuing_agency"]) - - eq_("00844868017264", json["identifiers"][1]["id"]) - eq_("Package", json["identifiers"][1]["type"]) - eq_("GS1", json["identifiers"][1]["issuing_agency"]) - eq_("00844868017288", json["identifiers"][1]["unit_of_use_id"]) - eq_("25", json["identifiers"][1]["quantity_per_package"]) - eq_("2015-11-11", json["identifiers"][1]["package_discontinue_date"]) - eq_("Not in Commercial Distribution", json["identifiers"][1]["package_status"]) - eq_("case", json["identifiers"][1]["package_type"]) - - eq_("Published", json["record_status"]) - eq_("2015-09-24", json["publish_date"]) - - eq_("New", json["public_version_status"]) - eq_("33e1d2ec-8555-43d0-8421-b208e7185d06", json["public_device_record_key"]) - eq_("1", json["public_version_number"]) - eq_("2017-12-19", json["public_version_date"]) - eq_("123456789", json["labeler_duns_number"]) - - eq_("2018-09-24", json["commercial_distribution_end_date"]) - eq_("In Commercial Distribution", json["commercial_distribution_status"]) - eq_("CS2 Acet. Cup Sys. - VitalitE", json["brand_name"]) - eq_("1107-0-3258", json["version_or_model_number"]) - eq_("1107-0-3258", json["catalog_number"]) - eq_("CONSENSUS ORTHOPEDICS, INC.", json["company_name"]) - eq_("1", json["device_count_in_base_package"]) - eq_("Acet. 
Insert, VitalitE", json["device_description"]) - eq_("true", json["is_direct_marking_exempt"]) - eq_("false", json["is_pm_exempt"]) - eq_("false", json["is_hct_p"]) - eq_("false", json["is_kit"]) - eq_("true", json["is_combination_product"]) - eq_("true", json["is_single_use"]) - eq_("true", json["has_lot_or_batch_number"]) - eq_("false", json["has_serial_number"]) - eq_("false", json["has_manufacturing_date"]) - eq_("true", json["has_expiration_date"]) - eq_("false", json["has_donation_id_number"]) - eq_("false", json["is_labeled_as_nrl"]) - eq_("true", json["is_labeled_as_no_nrl"]) - eq_("true", json["is_rx"]) - eq_("false", json["is_otc"]) - eq_("Labeling does not contain MRI Safety Information", json["mri_safety"]) - eq_("+1(916)355-7100", json["customer_contacts"][0]["phone"]) - eq_("ext555", json["customer_contacts"][0]["ext"]) - eq_("fang.wei@fda.hhs.gov", json["customer_contacts"][0]["email"]) - eq_("+1 (555) 555-5555", json["customer_contacts"][1]["phone"]) - eq_(None, json["customer_contacts"][1].get("ext")) - eq_("jdoe@example.com", json["customer_contacts"][1]["email"]) - - eq_("Non-constrained polyethylene acetabular liner", json["gmdn_terms"][0]["name"]) - eq_( - "A sterile, implantable component of a two-piece acetabulum prosthesis that is inserted\n into an acetabular shell prosthesis to provide the articulating surface with a femoral head\n prosthesis as part of a total hip arthroplasty (THA). It is made of polyethylene (includes hylamer,\n cross-linked polyethylene), and does not include a stabilizing component to limit the range of\n motion of the hip.", - json["gmdn_terms"][0]["definition"]) - eq_("Bone-screw internal spinal fixation system, non-sterile", json["gmdn_terms"][1]["name"]) - eq_( - "An assembly of non-sterile implantable devices intended to provide immobilization and\n stabilization of spinal segments in the treatment of various spinal instabilities or deformities,\n also used as an adjunct to spinal fusion [e.g., for degenerative disc disease (DDD)]. Otherwise\n known as a pedicle screw instrumentation system, it typically consists of a combination of anchors\n (e.g., bolts, hooks, pedicle screws or other types), interconnection mechanisms (incorporating nuts,\n screws, sleeves, or bolts), longitudinal members (e.g., plates, rods, plate/rod combinations),\n and/or transverse connectors. 
Non-sterile disposable devices associated with implantation may be\n included.", - json["gmdn_terms"][1]["definition"]) - eq_("OQG", json["product_codes"][0]["code"]) - eq_( - "Hip Prosthesis, semi-constrained, cemented, metal/polymer, + additive, porous,\n uncemented", - json["product_codes"][0]["name"]) - eq_("MAX", json["product_codes"][1]["code"]) - eq_("Intervertebral fusion device with bone graft, lumbar", json["product_codes"][1]["name"]) - - eq_("Millimeter", json["device_sizes"][0]["unit"]) - eq_("32/6", json["device_sizes"][0]["value"]) - eq_("Outer Diameter", json["device_sizes"][0]["type"]) - eq_("Size test here", json["device_sizes"][0]["text"]) - - eq_(None, json["device_sizes"][1].get("unit")) - eq_(None, json["device_sizes"][1].get("value")) - eq_("Device Size Text, specify", json["device_sizes"][1]["type"]) - eq_(str("SPACER LAT PEEK 8° 40L X 18W X 10H"), json["device_sizes"][1]["text"]) - - eq_("Storage Environment Temperature", json["storage"][0]["type"]) - eq_("Degrees Celsius", json["storage"][0]["high"]["unit"]) - eq_("8", json["storage"][0]["high"]["value"]) - eq_("Degrees Celsius", json["storage"][0]["low"]["unit"]) - eq_("-30", json["storage"][0]["low"]["value"]) - eq_(None, json["storage"][0].get("special_conditions")) - - eq_("Special Storage Condition, Specify", json["storage"][1]["type"]) - eq_("This device must be stored in a dry location away from temperature\n extremes", - json["storage"][1].get("special_conditions")) - eq_(None, json["storage"][1].get("high")) - eq_(None, json["storage"][1].get("low")) - - eq_("false", json["sterilization"]["is_sterile"]) - eq_("true", json["sterilization"]["is_sterilization_prior_use"]) - eq_("Moist Heat or Steam Sterilization", json["sterilization"]["sterilization_methods"]) - - eq_(["3001451451", "12223430908"], json["fei_number"]) - - eq_("K094035", json["premarket_submissions"][0]["submission_number"]) - eq_("000", json["premarket_submissions"][0]["supplement_number"]) - eq_("PMN", json["premarket_submissions"][0]["submission_type"]) - eq_("P950002", json["premarket_submissions"][1]["submission_number"]) - eq_("001", json["premarket_submissions"][1]["supplement_number"]) - eq_("PMA", json["premarket_submissions"][1]["submission_type"]) - - map_output = MagicMock() - map_output.add = add_fn - mapper.map_shard(map_input, map_output) - - def test_extract_xml(self): - shutil.copyfile(os.path.join(dirname(os.path.abspath(__file__)), "file.zip"), - os.path.join(self.test_dir, "file.zip")) - extract = ExtractXML() - extract.local_dir = self.test_dir - extract.output = MagicMock( - return_value=luigi.LocalTarget(self.test_dir)) - extract.run() - ok_(os.path.isfile(os.path.join(self.test_dir, "file1.xml"))) - + def setUp(self): + self.test_dir = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.test_dir) + + def test_annotation(self): + harmonized_db = MagicMock() + harmonized_db.get = lambda product_code, _: ( + { + "510k": [{"k_number": "K094035"}], + "device_pma": [{"pma_number": "P950002"}], + "registration": [{"fei_number": "3001451451"}], + } + if product_code == "OQG" + else {} + ) + + ann = UDIAnnotateMapper(harmonized_db) + mapper = XML2JSONMapper() + + def add_fn(id, json): + harmonized = ann.harmonize(json) + eq_("OQG", harmonized["product_codes"][0]["code"]) + eq_(None, harmonized["product_codes"][0]["openfda"].get("pma_number")) + eq_(None, harmonized["product_codes"][0]["openfda"].get("k_number")) + eq_(None, harmonized["product_codes"][0]["openfda"].get("fei_number")) + + eq_({}, 
harmonized["product_codes"][1].get("openfda")) + + map_input = MagicMock() + map_input.filename = os.path.join( + dirname(os.path.abspath(__file__)), "test.xml" + ) + map_output = MagicMock() + map_output.add = add_fn + mapper.map_shard(map_input, map_output) + + def test_xml_to_json(self): + mapper = XML2JSONMapper() + + map_input = MagicMock() + map_input.filename = os.path.join( + dirname(os.path.abspath(__file__)), "test.xml" + ) + + def add_fn(id, json): + eq_("gs1_00844588018923", id) + eq_("gs1_00844588018923", json["@id"]) + + eq_("00844588018923", json["identifiers"][0]["id"]) + eq_("Primary", json["identifiers"][0]["type"]) + eq_("GS1", json["identifiers"][0]["issuing_agency"]) + + eq_("00844868017264", json["identifiers"][1]["id"]) + eq_("Package", json["identifiers"][1]["type"]) + eq_("GS1", json["identifiers"][1]["issuing_agency"]) + eq_("00844868017288", json["identifiers"][1]["unit_of_use_id"]) + eq_("25", json["identifiers"][1]["quantity_per_package"]) + eq_("2015-11-11", json["identifiers"][1]["package_discontinue_date"]) + eq_( + "Not in Commercial Distribution", + json["identifiers"][1]["package_status"], + ) + eq_("case", json["identifiers"][1]["package_type"]) + + eq_("Published", json["record_status"]) + eq_("2015-09-24", json["publish_date"]) + + eq_("New", json["public_version_status"]) + eq_( + "33e1d2ec-8555-43d0-8421-b208e7185d06", json["public_device_record_key"] + ) + eq_("1", json["public_version_number"]) + eq_("2017-12-19", json["public_version_date"]) + eq_("123456789", json["labeler_duns_number"]) + + eq_("2018-09-24", json["commercial_distribution_end_date"]) + eq_("In Commercial Distribution", json["commercial_distribution_status"]) + eq_("CS2 Acet. Cup Sys. - VitalitE", json["brand_name"]) + eq_("1107-0-3258", json["version_or_model_number"]) + eq_("1107-0-3258", json["catalog_number"]) + eq_("CONSENSUS ORTHOPEDICS, INC.", json["company_name"]) + eq_("1", json["device_count_in_base_package"]) + eq_("Acet. Insert, VitalitE", json["device_description"]) + eq_("true", json["is_direct_marking_exempt"]) + eq_("false", json["is_pm_exempt"]) + eq_("false", json["is_hct_p"]) + eq_("false", json["is_kit"]) + eq_("true", json["is_combination_product"]) + eq_("true", json["is_single_use"]) + eq_("true", json["has_lot_or_batch_number"]) + eq_("false", json["has_serial_number"]) + eq_("false", json["has_manufacturing_date"]) + eq_("true", json["has_expiration_date"]) + eq_("false", json["has_donation_id_number"]) + eq_("false", json["is_labeled_as_nrl"]) + eq_("true", json["is_labeled_as_no_nrl"]) + eq_("true", json["is_rx"]) + eq_("false", json["is_otc"]) + eq_("Labeling does not contain MRI Safety Information", json["mri_safety"]) + eq_("+1(916)355-7100", json["customer_contacts"][0]["phone"]) + eq_("ext555", json["customer_contacts"][0]["ext"]) + eq_("fang.wei@fda.hhs.gov", json["customer_contacts"][0]["email"]) + eq_("+1 (555) 555-5555", json["customer_contacts"][1]["phone"]) + eq_(None, json["customer_contacts"][1].get("ext")) + eq_("jdoe@example.com", json["customer_contacts"][1]["email"]) + + eq_( + "Non-constrained polyethylene acetabular liner", + json["gmdn_terms"][0]["name"], + ) + eq_( + "A sterile, implantable component of a two-piece acetabulum prosthesis that is inserted\n into an acetabular shell prosthesis to provide the articulating surface with a femoral head\n prosthesis as part of a total hip arthroplasty (THA). 
It is made of polyethylene (includes hylamer,\n cross-linked polyethylene), and does not include a stabilizing component to limit the range of\n motion of the hip.", + json["gmdn_terms"][0]["definition"], + ) + eq_( + "Bone-screw internal spinal fixation system, non-sterile", + json["gmdn_terms"][1]["name"], + ) + eq_( + "An assembly of non-sterile implantable devices intended to provide immobilization and\n stabilization of spinal segments in the treatment of various spinal instabilities or deformities,\n also used as an adjunct to spinal fusion [e.g., for degenerative disc disease (DDD)]. Otherwise\n known as a pedicle screw instrumentation system, it typically consists of a combination of anchors\n (e.g., bolts, hooks, pedicle screws or other types), interconnection mechanisms (incorporating nuts,\n screws, sleeves, or bolts), longitudinal members (e.g., plates, rods, plate/rod combinations),\n and/or transverse connectors. Non-sterile disposable devices associated with implantation may be\n included.", + json["gmdn_terms"][1]["definition"], + ) + eq_("OQG", json["product_codes"][0]["code"]) + eq_( + "Hip Prosthesis, semi-constrained, cemented, metal/polymer, + additive, porous,\n uncemented", + json["product_codes"][0]["name"], + ) + eq_("MAX", json["product_codes"][1]["code"]) + eq_( + "Intervertebral fusion device with bone graft, lumbar", + json["product_codes"][1]["name"], + ) + + eq_("Millimeter", json["device_sizes"][0]["unit"]) + eq_("32/6", json["device_sizes"][0]["value"]) + eq_("Outer Diameter", json["device_sizes"][0]["type"]) + eq_("Size test here", json["device_sizes"][0]["text"]) + + eq_(None, json["device_sizes"][1].get("unit")) + eq_(None, json["device_sizes"][1].get("value")) + eq_("Device Size Text, specify", json["device_sizes"][1]["type"]) + eq_( + str("SPACER LAT PEEK 8° 40L X 18W X 10H"), + json["device_sizes"][1]["text"], + ) + + eq_("Storage Environment Temperature", json["storage"][0]["type"]) + eq_("Degrees Celsius", json["storage"][0]["high"]["unit"]) + eq_("8", json["storage"][0]["high"]["value"]) + eq_("Degrees Celsius", json["storage"][0]["low"]["unit"]) + eq_("-30", json["storage"][0]["low"]["value"]) + eq_(None, json["storage"][0].get("special_conditions")) + + eq_("Special Storage Condition, Specify", json["storage"][1]["type"]) + eq_( + "This device must be stored in a dry location away from temperature\n extremes", + json["storage"][1].get("special_conditions"), + ) + eq_(None, json["storage"][1].get("high")) + eq_(None, json["storage"][1].get("low")) + + eq_("false", json["sterilization"]["is_sterile"]) + eq_("true", json["sterilization"]["is_sterilization_prior_use"]) + eq_( + "Moist Heat or Steam Sterilization", + json["sterilization"]["sterilization_methods"], + ) + + eq_(["3001451451", "12223430908"], json["fei_number"]) + + eq_("K094035", json["premarket_submissions"][0]["submission_number"]) + eq_("000", json["premarket_submissions"][0]["supplement_number"]) + eq_("PMN", json["premarket_submissions"][0]["submission_type"]) + eq_("P950002", json["premarket_submissions"][1]["submission_number"]) + eq_("001", json["premarket_submissions"][1]["supplement_number"]) + eq_("PMA", json["premarket_submissions"][1]["submission_type"]) + + map_output = MagicMock() + map_output.add = add_fn + mapper.map_shard(map_input, map_output) + + def test_extract_xml(self): + shutil.copyfile( + os.path.join(dirname(os.path.abspath(__file__)), "file.zip"), + os.path.join(self.test_dir, "file.zip"), + ) + extract = ExtractXML() + extract.local_dir = self.test_dir + 
extract.output = MagicMock(return_value=luigi.LocalTarget(self.test_dir)) + extract.run() + ok_(os.path.isfile(os.path.join(self.test_dir, "file1.xml"))) def main(argv): - unittest.main(argv=argv) + unittest.main(argv=argv) -if __name__ == '__main__': - unittest.main() +if __name__ == "__main__": + unittest.main() diff --git a/openfda/download_util.py b/openfda/download_util.py index 2d5c9de..22a3a57 100644 --- a/openfda/download_util.py +++ b/openfda/download_util.py @@ -1,8 +1,8 @@ #!/usr/bin/env python -''' A set of utility functions used for downloading and extracting files used +""" A set of utility functions used for downloading and extracting files used as inputs for pipelines. -''' +""" import glob import logging @@ -11,18 +11,15 @@ from openfda import common + def extract_and_clean(input_dir, source_encoding, target_encoding, file_type): - ''' A utility function that extracts all of the zip files in a directory and - converts the files from a source encoding to a target encoding. - ''' - for zip_filename in glob.glob(input_dir + '/*.zip'): - txt_name = zip_filename.replace('zip', file_type) - txt_name = txt_name.replace('raw', 'extracted') - common.shell_cmd('mkdir -p %s', dirname(txt_name)) - cmd = 'unzip -p %s | iconv -f %s -t %s -c > %s' - logging.info('Unzipping and converting %s', zip_filename) - common.shell_cmd(cmd, - zip_filename, - source_encoding, - target_encoding, - txt_name) + """A utility function that extracts all of the zip files in a directory and + converts the files from a source encoding to a target encoding. + """ + for zip_filename in glob.glob(input_dir + "/*.zip"): + txt_name = zip_filename.replace("zip", file_type) + txt_name = txt_name.replace("raw", "extracted") + common.shell_cmd("mkdir -p %s", dirname(txt_name)) + cmd = "unzip -p %s | iconv -f %s -t %s -c > %s" + logging.info("Unzipping and converting %s", zip_filename) + common.shell_cmd(cmd, zip_filename, source_encoding, target_encoding, txt_name) diff --git a/openfda/downloadstats/pipeline.py b/openfda/downloadstats/pipeline.py index 258043b..4b3587c 100644 --- a/openfda/downloadstats/pipeline.py +++ b/openfda/downloadstats/pipeline.py @@ -1,7 +1,7 @@ #!/usr/bin/python -''' Pipeline for parsing all the access log files (S3 and CloudFront) for dataset downloads and calculating totals. -''' +""" Pipeline for parsing all the access log files (S3 and CloudFront) for dataset downloads and calculating totals. 
+""" import glob import gzip @@ -20,205 +20,266 @@ RUN_DIR = dirname(os.path.abspath(__file__)) -S3_ACCESS_LOGS_BUCKET = 's3://openfda-logs/download/' -S3_ACCESS_LOGS_DIR = config.data_dir('downloadstats/s3_logs_raw') -S3_STATS_DB_DIR = config.data_dir('downloadstats/s3_stats.db') -S3_ACCESS_LOGS_CUTOFF = arrow.get('2017-03-01') +S3_ACCESS_LOGS_BUCKET = "s3://openfda-logs/download/" +S3_ACCESS_LOGS_DIR = config.data_dir("downloadstats/s3_logs_raw") +S3_STATS_DB_DIR = config.data_dir("downloadstats/s3_stats.db") +S3_ACCESS_LOGS_CUTOFF = arrow.get("2017-03-01") -CF_ACCESS_LOGS_BUCKET = 's3://openfda-splash-logs/download-cf-logs/' -CF_ACCESS_LOGS_DIR = config.data_dir('downloadstats/cf_logs_raw') -CF_STATS_DB_DIR = config.data_dir('downloadstats/cf_stats.db') +CF_ACCESS_LOGS_BUCKET = "s3://openfda-splash-logs/download-cf-logs/" +CF_ACCESS_LOGS_DIR = config.data_dir("downloadstats/cf_logs_raw") +CF_STATS_DB_DIR = config.data_dir("downloadstats/cf_stats.db") -TOTAL_STATS_DB_DIR = config.data_dir('downloadstats/total_stats.db') +TOTAL_STATS_DB_DIR = config.data_dir("downloadstats/total_stats.db") ENDPOINT_INDEX_MAP = { - 'animalandveterinary/event': 'animalandveterinarydrugevent', - 'drug/event': 'drugevent', - 'drug/label': 'druglabel', - 'drug/enforcement': 'drugenforcement', - 'drug/ndc': 'ndc', - 'drug/drugsfda': 'drugsfda', - 'device/enforcement': 'deviceenforcement', - 'food/enforcement': 'foodenforcement', - 'food/event': 'foodevent', - 'device/event': 'deviceevent', - 'device/classification': 'deviceclass', - 'device/510k': 'deviceclearance', - 'device/pma': 'devicepma', - 'device/recall': 'devicerecall', - 'device/registrationlisting': 'devicereglist', - 'device/udi': 'deviceudi', - 'device/covid19serology': 'covid19serology', - 'other/nsde': 'othernsde', - 'other/substance': 'othersubstance', - 'tobacco/problem': 'tobaccoproblem' + "animalandveterinary/event": "animalandveterinarydrugevent", + "drug/event": "drugevent", + "drug/label": "druglabel", + "drug/enforcement": "drugenforcement", + "drug/ndc": "ndc", + "drug/drugsfda": "drugsfda", + "device/enforcement": "deviceenforcement", + "food/enforcement": "foodenforcement", + "food/event": "foodevent", + "device/event": "deviceevent", + "device/classification": "deviceclass", + "device/510k": "deviceclearance", + "device/pma": "devicepma", + "device/recall": "devicerecall", + "device/registrationlisting": "devicereglist", + "device/udi": "deviceudi", + "device/covid19serology": "covid19serology", + "other/nsde": "othernsde", + "other/substance": "othersubstance", + "tobacco/problem": "tobaccoproblem", } -BOT_USER_AGENTS = map(lambda o: o['pattern'], json.loads(open(join(RUN_DIR, 'crawler-user-agents.json'), 'r').read())) +BOT_USER_AGENTS = map( + lambda o: o["pattern"], + json.loads(open(join(RUN_DIR, "crawler-user-agents.json"), "r").read()), +) def isBot(ua): - return next((p for p in BOT_USER_AGENTS if re.search(p, ua)), False) != False + return next((p for p in BOT_USER_AGENTS if re.search(p, ua)), False) != False class SyncS3AccessLogs(AlwaysRunTask): - def _run(self): - common.cmd(['mkdir', '-p', S3_ACCESS_LOGS_DIR]) - common.cmd(['aws', - '--profile=' + config.aws_profile(), - 's3', - 'sync', + def _run(self): + common.cmd(["mkdir", "-p", S3_ACCESS_LOGS_DIR]) + common.cmd( + [ + "aws", + "--profile=" + config.aws_profile(), + "s3", + "sync", S3_ACCESS_LOGS_BUCKET, - S3_ACCESS_LOGS_DIR]) + S3_ACCESS_LOGS_DIR, + ] + ) - # Cut off S3 access logs at the point when CF logs became available - for file in 
glob.glob(join(S3_ACCESS_LOGS_DIR, '*')): - if arrow.get(os.path.split(file)[1][:10]) > S3_ACCESS_LOGS_CUTOFF: - os.remove(file) + # Cut off S3 access logs at the point when CF logs became available + for file in glob.glob(join(S3_ACCESS_LOGS_DIR, "*")): + if arrow.get(os.path.split(file)[1][:10]) > S3_ACCESS_LOGS_CUTOFF: + os.remove(file) - def output(self): - return luigi.LocalTarget(S3_ACCESS_LOGS_DIR) + def output(self): + return luigi.LocalTarget(S3_ACCESS_LOGS_DIR) class SyncCFAccessLogs(AlwaysRunTask): - def _run(self): - common.cmd(['mkdir', '-p', CF_ACCESS_LOGS_DIR]) - common.cmd(['aws', - '--profile=' + config.aws_profile(), - 's3', - 'sync', + def _run(self): + common.cmd(["mkdir", "-p", CF_ACCESS_LOGS_DIR]) + common.cmd( + [ + "aws", + "--profile=" + config.aws_profile(), + "s3", + "sync", CF_ACCESS_LOGS_BUCKET, - CF_ACCESS_LOGS_DIR]) + CF_ACCESS_LOGS_DIR, + ] + ) - def output(self): - return luigi.LocalTarget(CF_ACCESS_LOGS_DIR) + def output(self): + return luigi.LocalTarget(CF_ACCESS_LOGS_DIR) class CFAccessLogsStats(parallel.MRTask): - agg_stats = {} - - def requires(self): - return SyncCFAccessLogs() - - def output(self): - return luigi.LocalTarget(CF_STATS_DB_DIR) - - def mapreduce_inputs(self): - return parallel.Collection.from_glob(join(self.input().path, '*')) - - def map(self, log_file, value, output): - stats = {} - with gzip.open(log_file, 'rt', encoding="utf-8") as file: - df = pd.read_csv( - io.StringIO(file.read()), - sep='\t', skiprows=(0, 1), - names=['date', 'time', 'edge', 'bytes', 'ip', 'method', 'host', 'uri', - 'status', 'referer', 'ua'], - usecols=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - low_memory=False, na_values=[], keep_default_na=False, index_col=False, - engine='c', memory_map=False) - for row in df.itertuples(): - if row.method == 'GET' and (200 <= int(row.status) <= 299) and not isBot(row.ua): - for path, endpoint in ENDPOINT_INDEX_MAP.items(): - if row.uri.startswith('/' + path): - if endpoint in stats: - stats[endpoint] = stats[endpoint] + 1 - else: - stats[endpoint] = 1 - - if len(stats) > 0: - output.add('stats', stats) - - def reduce(self, key, values, output): - assert key == 'stats' - for value in values: - for endpoint, count in value.items(): - if endpoint in self.agg_stats: - self.agg_stats[endpoint] = self.agg_stats[endpoint] + count - else: - self.agg_stats[endpoint] = count - output.put(key, self.agg_stats) + agg_stats = {} + + def requires(self): + return SyncCFAccessLogs() + + def output(self): + return luigi.LocalTarget(CF_STATS_DB_DIR) + + def mapreduce_inputs(self): + return parallel.Collection.from_glob(join(self.input().path, "*")) + + def map(self, log_file, value, output): + stats = {} + with gzip.open(log_file, "rt", encoding="utf-8") as file: + df = pd.read_csv( + io.StringIO(file.read()), + sep="\t", + skiprows=(0, 1), + names=[ + "date", + "time", + "edge", + "bytes", + "ip", + "method", + "host", + "uri", + "status", + "referer", + "ua", + ], + usecols=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + low_memory=False, + na_values=[], + keep_default_na=False, + index_col=False, + engine="c", + memory_map=False, + ) + for row in df.itertuples(): + if ( + row.method == "GET" + and (200 <= int(row.status) <= 299) + and not isBot(row.ua) + ): + for path, endpoint in ENDPOINT_INDEX_MAP.items(): + if row.uri.startswith("/" + path): + if endpoint in stats: + stats[endpoint] = stats[endpoint] + 1 + else: + stats[endpoint] = 1 + + if len(stats) > 0: + output.add("stats", stats) + + def reduce(self, key, values, output): + assert key == "stats" + 
for value in values: + for endpoint, count in value.items(): + if endpoint in self.agg_stats: + self.agg_stats[endpoint] = self.agg_stats[endpoint] + count + else: + self.agg_stats[endpoint] = count + output.put(key, self.agg_stats) class S3AccessLogsStats(parallel.MRTask): - agg_stats = {} - - def requires(self): - return SyncS3AccessLogs() - - def output(self): - return luigi.LocalTarget(S3_STATS_DB_DIR) - - def mapreduce_inputs(self): - return parallel.Collection.from_glob(join(self.input().path, '*')) - - def map(self, log_file, value, output): - stats = {} - df = pd.read_csv( - log_file, - sep=" ", - names=['Owner', 'Bucket', 'Time', 'Tz', 'IP', 'Requester', - 'RequestID', - 'Operation', 'Key', 'URI', 'Status', 'ErrorCode', 'BytesSent', 'ObjectSize', - 'TotalTime', - 'TurnAroundTime', 'Referrer', 'UserAgent', 'VersionId', 'HostId'], - low_memory=False, na_values=[], keep_default_na=False, index_col=False, - engine='c', memory_map=True) - for row in df.itertuples(): - if row.Operation == 'REST.GET.OBJECT' and isinstance(row.Status, int) and (200 <= row.Status <= 299) and row.ErrorCode == '-' and not isBot( - row.UserAgent): - for path, endpoint in ENDPOINT_INDEX_MAP.items(): - if row.Key.startswith(path): - if endpoint in stats: - stats[endpoint] = stats[endpoint] + 1 - else: - stats[endpoint] = 1 - if len(stats) > 0: - output.add('stats', stats) - - def reduce(self, key, values, output): - assert key == 'stats' - for value in values: - for endpoint, count in value.items(): - if endpoint in self.agg_stats: - self.agg_stats[endpoint] = self.agg_stats[endpoint] + count - else: - self.agg_stats[endpoint] = count - output.put(key, self.agg_stats) + agg_stats = {} + + def requires(self): + return SyncS3AccessLogs() + + def output(self): + return luigi.LocalTarget(S3_STATS_DB_DIR) + + def mapreduce_inputs(self): + return parallel.Collection.from_glob(join(self.input().path, "*")) + + def map(self, log_file, value, output): + stats = {} + df = pd.read_csv( + log_file, + sep=" ", + names=[ + "Owner", + "Bucket", + "Time", + "Tz", + "IP", + "Requester", + "RequestID", + "Operation", + "Key", + "URI", + "Status", + "ErrorCode", + "BytesSent", + "ObjectSize", + "TotalTime", + "TurnAroundTime", + "Referrer", + "UserAgent", + "VersionId", + "HostId", + ], + low_memory=False, + na_values=[], + keep_default_na=False, + index_col=False, + engine="c", + memory_map=True, + ) + for row in df.itertuples(): + if ( + row.Operation == "REST.GET.OBJECT" + and isinstance(row.Status, int) + and (200 <= row.Status <= 299) + and row.ErrorCode == "-" + and not isBot(row.UserAgent) + ): + for path, endpoint in ENDPOINT_INDEX_MAP.items(): + if row.Key.startswith(path): + if endpoint in stats: + stats[endpoint] = stats[endpoint] + 1 + else: + stats[endpoint] = 1 + if len(stats) > 0: + output.add("stats", stats) + + def reduce(self, key, values, output): + assert key == "stats" + for value in values: + for endpoint, count in value.items(): + if endpoint in self.agg_stats: + self.agg_stats[endpoint] = self.agg_stats[endpoint] + count + else: + self.agg_stats[endpoint] = count + output.put(key, self.agg_stats) class TotalStats(parallel.MRTask): - agg_stats = {} + agg_stats = {} - def requires(self): - return [S3AccessLogsStats(), CFAccessLogsStats()] + def requires(self): + return [S3AccessLogsStats(), CFAccessLogsStats()] - def output(self): - return luigi.LocalTarget(TOTAL_STATS_DB_DIR) + def output(self): + return luigi.LocalTarget(TOTAL_STATS_DB_DIR) - def mapreduce_inputs(self): - return 
parallel.Collection.from_sharded_list([path.path for path in self.input()]) + def mapreduce_inputs(self): + return parallel.Collection.from_sharded_list( + [path.path for path in self.input()] + ) - def reduce(self, key, values, output): - assert key == 'stats' - for value in values: - for endpoint, count in value.items(): - if endpoint in self.agg_stats: - self.agg_stats[endpoint] = self.agg_stats[endpoint] + count - else: - self.agg_stats[endpoint] = count - output.put(key, self.agg_stats) + def reduce(self, key, values, output): + assert key == "stats" + for value in values: + for endpoint, count in value.items(): + if endpoint in self.agg_stats: + self.agg_stats[endpoint] = self.agg_stats[endpoint] + count + else: + self.agg_stats[endpoint] = count + output.put(key, self.agg_stats) class LoadJSON(index_util.LoadJSONBase): - index_name = 'downloadstats' - mapping_file = './schemas/downloadstats_mapping.json' - data_source = TotalStats() - use_checksum = False - optimize_index = True + index_name = "downloadstats" + mapping_file = "./schemas/downloadstats_mapping.json" + data_source = TotalStats() + use_checksum = False + optimize_index = True -if __name__ == '__main__': - luigi.run() +if __name__ == "__main__": + luigi.run() diff --git a/openfda/drugsfda/annotate.py b/openfda/drugsfda/annotate.py index 9350b96..ea5bfb8 100644 --- a/openfda/drugsfda/annotate.py +++ b/openfda/drugsfda/annotate.py @@ -8,115 +8,118 @@ def read_json_file(json_file): - for line in json_file: - yield json.loads(line) + for line in json_file: + yield json.loads(line) def read_harmonized_file(harmonized_file): - ''' - Create a dictionary that is keyed by NDA/ANDA application numbers. Used to join - against the application_number value. - ''' - return_dict = collections.defaultdict(list) - for row in read_json_file(harmonized_file): - return_dict[row['application_number'].strip().upper()].append(row) - return return_dict + """ + Create a dictionary that is keyed by NDA/ANDA application numbers. Used to join + against the application_number value. 
+ """ + return_dict = collections.defaultdict(list) + for row in read_json_file(harmonized_file): + return_dict[row["application_number"].strip().upper()].append(row) + return return_dict def _add_field(openfda, field, value): - if not isinstance(value, list): - value = [value] - - for v in value: - if not v: - continue - if field not in openfda: - openfda[field] = {} - openfda[field][v] = True - return + if not isinstance(value, list): + value = [value] + + for v in value: + if not v: + continue + if field not in openfda: + openfda[field] = {} + openfda[field][v] = True + return def AddHarmonizedRowToOpenfda(openfda, row): - if not row['is_original_packager']: - return - - if row['product_ndc'] in row['spl_product_ndc']: - _add_field(openfda, 'application_number', row['application_number']) - # Using the most precise possible name for brand_name - if row['brand_name_suffix']: - brand_name = row['brand_name'] + ' ' + row['brand_name_suffix'] - else: - brand_name = row['brand_name'] - - _add_field(openfda, 'brand_name', brand_name.rstrip().upper()) - _add_field(openfda, 'generic_name', row['generic_name'].upper()) - _add_field(openfda, 'manufacturer_name', row['manufacturer_name']) - _add_field(openfda, 'product_ndc', row['product_ndc']) - _add_field(openfda, 'product_ndc', row['spl_product_ndc']) - _add_field(openfda, 'product_type', row['product_type']) - - for route in row['route'].split(';'): - route = route.strip() - _add_field(openfda, 'route', route) - - for substance in row['substance_name'].split(';'): - substance = substance.strip() - _add_field(openfda, 'substance_name', substance) - - for rxnorm in row['rxnorm']: - _add_field(openfda, 'rxcui', rxnorm['rxcui']) - - _add_field(openfda, 'spl_id', row['id']) - _add_field(openfda, 'spl_set_id', row['spl_set_id']) - _add_field(openfda, 'package_ndc', row['package_ndc']) - - if row['unii_indexing'] != []: - unii_list = [] - for unii_row in row['unii_indexing']: - for key, value in unii_row.items(): - if key == 'unii': - unii_list.append(value) - if key == 'va': - for this_item in value: - for va_key, va_value in this_item.items(): - if va_key == 'name': - if va_value.find('[MoA]') != -1: - _add_field(openfda, 'pharm_class_moa', va_value) - if va_value.find('[Chemical/Ingredient]') != -1 or va_value.find('[CS]') != -1: - _add_field(openfda, 'pharm_class_cs', va_value) - if va_value.find('[PE]') != -1: - _add_field(openfda, 'pharm_class_pe', va_value) - if va_value.find('[EPC]') != -1: - _add_field(openfda, 'pharm_class_epc', va_value) - if va_key == 'number': - _add_field(openfda, 'nui', va_value) - _add_field(openfda, 'unii', unii_list) + if not row["is_original_packager"]: + return + + if row["product_ndc"] in row["spl_product_ndc"]: + _add_field(openfda, "application_number", row["application_number"]) + # Using the most precise possible name for brand_name + if row["brand_name_suffix"]: + brand_name = row["brand_name"] + " " + row["brand_name_suffix"] + else: + brand_name = row["brand_name"] + + _add_field(openfda, "brand_name", brand_name.rstrip().upper()) + _add_field(openfda, "generic_name", row["generic_name"].upper()) + _add_field(openfda, "manufacturer_name", row["manufacturer_name"]) + _add_field(openfda, "product_ndc", row["product_ndc"]) + _add_field(openfda, "product_ndc", row["spl_product_ndc"]) + _add_field(openfda, "product_type", row["product_type"]) + + for route in row["route"].split(";"): + route = route.strip() + _add_field(openfda, "route", route) + + for substance in row["substance_name"].split(";"): + substance 
= substance.strip() + _add_field(openfda, "substance_name", substance) + + for rxnorm in row["rxnorm"]: + _add_field(openfda, "rxcui", rxnorm["rxcui"]) + + _add_field(openfda, "spl_id", row["id"]) + _add_field(openfda, "spl_set_id", row["spl_set_id"]) + _add_field(openfda, "package_ndc", row["package_ndc"]) + + if row["unii_indexing"] != []: + unii_list = [] + for unii_row in row["unii_indexing"]: + for key, value in unii_row.items(): + if key == "unii": + unii_list.append(value) + if key == "va": + for this_item in value: + for va_key, va_value in this_item.items(): + if va_key == "name": + if va_value.find("[MoA]") != -1: + _add_field(openfda, "pharm_class_moa", va_value) + if ( + va_value.find("[Chemical/Ingredient]") != -1 + or va_value.find("[CS]") != -1 + ): + _add_field(openfda, "pharm_class_cs", va_value) + if va_value.find("[PE]") != -1: + _add_field(openfda, "pharm_class_pe", va_value) + if va_value.find("[EPC]") != -1: + _add_field(openfda, "pharm_class_epc", va_value) + if va_key == "number": + _add_field(openfda, "nui", va_value) + _add_field(openfda, "unii", unii_list) def annotate_drug(drug, harmonized_dict): - application_number = drug['application_number'] + application_number = drug["application_number"] - if application_number in harmonized_dict: - openfda = {} + if application_number in harmonized_dict: + openfda = {} - for row in harmonized_dict[application_number]: - AddHarmonizedRowToOpenfda(openfda, row) + for row in harmonized_dict[application_number]: + AddHarmonizedRowToOpenfda(openfda, row) - openfda_lists = {} - for field, value in openfda.items(): - openfda_lists[field] = [s for s in value.keys()] + openfda_lists = {} + for field, value in openfda.items(): + openfda_lists[field] = [s for s in value.keys()] - drug['openfda'] = openfda_lists + drug["openfda"] = openfda_lists class AnnotateMapper(parallel.Mapper): - def __init__(self, harmonized_file): - self.harmonized_file = harmonized_file + def __init__(self, harmonized_file): + self.harmonized_file = harmonized_file - def map_shard(self, map_input, map_output): - self.harmonized_dict = read_harmonized_file(open(self.harmonized_file)) - parallel.Mapper.map_shard(self, map_input, map_output) + def map_shard(self, map_input, map_output): + self.harmonized_dict = read_harmonized_file(open(self.harmonized_file)) + parallel.Mapper.map_shard(self, map_input, map_output) - def map(self, key, drug, out): - annotate_drug(drug, self.harmonized_dict) - out.add(key, drug) + def map(self, key, drug, out): + annotate_drug(drug, self.harmonized_dict) + out.add(key, drug) diff --git a/openfda/drugsfda/pipeline.py b/openfda/drugsfda/pipeline.py index 8fc8d28..21c5202 100644 --- a/openfda/drugsfda/pipeline.py +++ b/openfda/drugsfda/pipeline.py @@ -1,8 +1,8 @@ #!/usr/local/bin/python -''' +""" Pipeline for converting Drugs@FDA files to JSON and importing into Elasticsearch. 
-''' +""" import glob import os import re @@ -16,672 +16,759 @@ from openfda.common import first_file_timestamp from openfda.drugsfda.annotate import AnnotateMapper -DOWNLOAD_FILE = 'https://www.fda.gov/media/89850/download' -BASE_DIR = join(config.data_dir(), 'drugsfda') -EXTRACTED_DIR = join(BASE_DIR, 'extracted') -RAW_DATA_FILE = join(BASE_DIR, 'raw/drugsfda.zip') +DOWNLOAD_FILE = "https://www.fda.gov/media/89850/download" +BASE_DIR = join(config.data_dir(), "drugsfda") +EXTRACTED_DIR = join(BASE_DIR, "extracted") +RAW_DATA_FILE = join(BASE_DIR, "raw/drugsfda.zip") -PRODUCTS_DB = join(BASE_DIR, 'json/products.db') -APPLICATIONS_DB = join(BASE_DIR, 'json/applications.db') -APPLICATIONS_DOCS_DB = join(BASE_DIR, 'json/applicationsdocs.db') -SUBMISSIONS_DB = join(BASE_DIR, 'json/submissions.db') -SUBMISSION_PROPERTY_TYPE_DB = join(BASE_DIR, 'json/submissionpropertytype.db') -MARKETING_STATUS_DB = join(BASE_DIR, 'json/marketingstatus.db') -ANNOTATED_DB = join(BASE_DIR, 'json/annotated.db') -TE_DB = join(BASE_DIR, 'json/te.db') -MERGED_DB = join(BASE_DIR, 'json/merged.db') +PRODUCTS_DB = join(BASE_DIR, "json/products.db") +APPLICATIONS_DB = join(BASE_DIR, "json/applications.db") +APPLICATIONS_DOCS_DB = join(BASE_DIR, "json/applicationsdocs.db") +SUBMISSIONS_DB = join(BASE_DIR, "json/submissions.db") +SUBMISSION_PROPERTY_TYPE_DB = join(BASE_DIR, "json/submissionpropertytype.db") +MARKETING_STATUS_DB = join(BASE_DIR, "json/marketingstatus.db") +ANNOTATED_DB = join(BASE_DIR, "json/annotated.db") +TE_DB = join(BASE_DIR, "json/te.db") +MERGED_DB = join(BASE_DIR, "json/merged.db") class DownloadDrugsFDAFiles(luigi.Task): - def requires(self): - return [] + def requires(self): + return [] - def output(self): - return luigi.LocalTarget(RAW_DATA_FILE) + def output(self): + return luigi.LocalTarget(RAW_DATA_FILE) - def run(self): - common.download(DOWNLOAD_FILE, RAW_DATA_FILE) + def run(self): + common.download(DOWNLOAD_FILE, RAW_DATA_FILE) class ExtractDrugsFDAFiles(luigi.Task): - def requires(self): - return DownloadDrugsFDAFiles() + def requires(self): + return DownloadDrugsFDAFiles() - def output(self): - return luigi.LocalTarget(EXTRACTED_DIR) + def output(self): + return luigi.LocalTarget(EXTRACTED_DIR) - def run(self): - zip_filename = RAW_DATA_FILE - output_dir = self.output().path - os.system('unzip -o %(zip_filename)s -d %(output_dir)s' % locals()) + def run(self): + zip_filename = RAW_DATA_FILE + output_dir = self.output().path + os.system("unzip -o %(zip_filename)s -d %(output_dir)s" % locals()) class CleanDrugsFDAFiles(luigi.Task): - def requires(self): - return ExtractDrugsFDAFiles() - - def output(self): - return luigi.LocalTarget(EXTRACTED_DIR) - - def run(self): - for filename in glob.glob(self.input().path + '/ApplicationDocs.txt'): - logging.info('Pre-processing %s', filename) - filtered = filename + '.filtered' - out = open(filtered, 'w') - line_num = 0 - bad_lines = 0 - with open(filename, 'rU', errors='ignore') as fp: - for line in fp: - line = line.strip() - if line_num < 1: - # First line is usually the header - out.write(line) - else: - if len(line.strip()) > 0: - if re.search(r'^\d{1,}', line): - # Properly formatted line. Append it and move on. - out.write('\n' + line) - else: - # Bad line, most likely due to an unescaped carriage return. 
Tuck it onto the previous line - out.write(' ' + line) - bad_lines += 1 - - line_num += 1 - - logging.info('Issues found & fixed: %s', bad_lines) - out.close() - os.remove(filename) - os.rename(filtered, filename) - + def requires(self): + return ExtractDrugsFDAFiles() + + def output(self): + return luigi.LocalTarget(EXTRACTED_DIR) + + def run(self): + for filename in glob.glob(self.input().path + "/ApplicationDocs.txt"): + logging.info("Pre-processing %s", filename) + filtered = filename + ".filtered" + out = open(filtered, "w") + line_num = 0 + bad_lines = 0 + with open(filename, "rU", errors="ignore") as fp: + for line in fp: + line = line.strip() + if line_num < 1: + # First line is usually the header + out.write(line) + else: + if len(line.strip()) > 0: + if re.search(r"^\d{1,}", line): + # Properly formatted line. Append it and move on. + out.write("\n" + line) + else: + # Bad line, most likely due to an unescaped carriage return. Tuck it onto the previous line + out.write(" " + line) + bad_lines += 1 + + line_num += 1 + + logging.info("Issues found & fixed: %s", bad_lines) + out.close() + os.remove(filename) + os.rename(filtered, filename) class Applications2JSONMapper(parallel.Mapper): - rename_map = { - 'ApplNo': 'application_no', - 'ApplType': 'application_type', - 'ApplPublicNotes': 'application_public_notes', - 'SponsorName': 'sponsor_name' - } + rename_map = { + "ApplNo": "application_no", + "ApplType": "application_type", + "ApplPublicNotes": "application_public_notes", + "SponsorName": "sponsor_name", + } - def map(self, key, value, output): - def _cleaner(k, v): - ''' Helper function to rename keys and purge any keys that are not in - the map. - ''' - v = v.strip() if isinstance(v, str) else v - if k in self.rename_map and v is not None and v != '': - return (self.rename_map[k], v) + def map(self, key, value, output): + def _cleaner(k, v): + """Helper function to rename keys and purge any keys that are not in + the map. 
+ """ + v = v.strip() if isinstance(v, str) else v + if k in self.rename_map and v is not None and v != "": + return (self.rename_map[k], v) - json = common.transform_dict(value, _cleaner) + json = common.transform_dict(value, _cleaner) - if json.get('application_public_notes') != None: - del json['application_public_notes'] + if json.get("application_public_notes") != None: + del json["application_public_notes"] - if json.get('application_no') and json.get('application_type'): - json['application_number'] = json.get('application_type') + json.get('application_no') - del json['application_type'] - del json['application_no'] + if json.get("application_no") and json.get("application_type"): + json["application_number"] = json.get("application_type") + json.get( + "application_no" + ) + del json["application_type"] + del json["application_no"] - output.add(key, json) + output.add(key, json) class Product2JSONMapper(parallel.Mapper): - rename_map = { - 'ApplNo': 'application_number', - 'ProductNo': 'product_number', - 'Form': 'df_and_route', - 'Strength': 'strength', - 'ReferenceDrug': 'reference_drug', - 'DrugName': 'brand_name', - 'ActiveIngredient': 'active_ingredients', - 'ReferenceStandard': 'reference_standard' - } - - VALUE_MAPPINGS = { - "reference_drug": { - "0": "No", - "1": "Yes", - "2": "TBD" - }, - "reference_standard": { - "0": "No", - "1": "Yes" + rename_map = { + "ApplNo": "application_number", + "ProductNo": "product_number", + "Form": "df_and_route", + "Strength": "strength", + "ReferenceDrug": "reference_drug", + "DrugName": "brand_name", + "ActiveIngredient": "active_ingredients", + "ReferenceStandard": "reference_standard", } - } - - def map(self, key, value, output): - def _cleaner(k, v): - ''' Helper function to rename keys and purge any keys that are not in - the map. - ''' - v = v.strip() if isinstance(v, str) else v - if k in self.rename_map and v is not None and v != '': - new_key = self.rename_map[k] - if new_key in self.VALUE_MAPPINGS and v in self.VALUE_MAPPINGS[new_key]: - v = self.VALUE_MAPPINGS[new_key][v] - return (new_key, v) - - json = common.transform_dict(value, _cleaner) - - # Turn active ingredients into an array of objects as per the mapping. - if json.get('active_ingredients'): - ingredientList = re.sub(';\s+', ';', json['active_ingredients']).split(';') - json['active_ingredients'] = [] - - strengthList = re.sub(';\s+', ';', json['strength']).split(';') if json.get('strength') else [] - - for idx, name in enumerate(ingredientList): - ingredient = {'name': name} - if len(strengthList) > idx: - ingredient['strength'] = strengthList[idx] - json['active_ingredients'].append(ingredient) - else: - # Delete to avoid complaints from Elasticsearch. - if json.get('active_ingredients') is not None: - del json['active_ingredients'] - - if json.get('strength') is not None: - del json['strength'] - - # Split dosage and form into two distinct fields. - if json.get('df_and_route') and len(json['df_and_route'].split(';')) == 2: - json['dosage_form'] = json['df_and_route'].split(';')[0].strip() - json['route'] = json['df_and_route'].split(';')[1].strip() - # Sometimes the entire entry is Unknown. Indicate this for both df & route. - elif json.get('df_and_route') and "UNKNOWN" in json['df_and_route']: - json['dosage_form'] = json['df_and_route'] - json['route'] = json['df_and_route'] - # Sometimes the entire only contains dosage form. 
- else: - json['dosage_form'] = json['df_and_route'] - json['route'] = None - - # Delete the field either way - del json['df_and_route'] - - # Assign application number as the key, since all three drugs@FDA files can be joined by this key. - key = build_products_key(json['application_number'], json) - del json['application_number'] - - output.add(key, json) + + VALUE_MAPPINGS = { + "reference_drug": {"0": "No", "1": "Yes", "2": "TBD"}, + "reference_standard": {"0": "No", "1": "Yes"}, + } + + def map(self, key, value, output): + def _cleaner(k, v): + """Helper function to rename keys and purge any keys that are not in + the map. + """ + v = v.strip() if isinstance(v, str) else v + if k in self.rename_map and v is not None and v != "": + new_key = self.rename_map[k] + if new_key in self.VALUE_MAPPINGS and v in self.VALUE_MAPPINGS[new_key]: + v = self.VALUE_MAPPINGS[new_key][v] + return (new_key, v) + + json = common.transform_dict(value, _cleaner) + + # Turn active ingredients into an array of objects as per the mapping. + if json.get("active_ingredients"): + ingredientList = re.sub(";\s+", ";", json["active_ingredients"]).split(";") + json["active_ingredients"] = [] + + strengthList = ( + re.sub(";\s+", ";", json["strength"]).split(";") + if json.get("strength") + else [] + ) + + for idx, name in enumerate(ingredientList): + ingredient = {"name": name} + if len(strengthList) > idx: + ingredient["strength"] = strengthList[idx] + json["active_ingredients"].append(ingredient) + else: + # Delete to avoid complaints from Elasticsearch. + if json.get("active_ingredients") is not None: + del json["active_ingredients"] + + if json.get("strength") is not None: + del json["strength"] + + # Split dosage and form into two distinct fields. + if json.get("df_and_route") and len(json["df_and_route"].split(";")) == 2: + json["dosage_form"] = json["df_and_route"].split(";")[0].strip() + json["route"] = json["df_and_route"].split(";")[1].strip() + # Sometimes the entire entry is Unknown. Indicate this for both df & route. + elif json.get("df_and_route") and "UNKNOWN" in json["df_and_route"]: + json["dosage_form"] = json["df_and_route"] + json["route"] = json["df_and_route"] + # Sometimes the entire only contains dosage form. + else: + json["dosage_form"] = json["df_and_route"] + json["route"] = None + + # Delete the field either way + del json["df_and_route"] + + # Assign application number as the key, since all three drugs@FDA files can be joined by this key. + key = build_products_key(json["application_number"], json) + del json["application_number"] + + output.add(key, json) def build_products_key(app_number, json): - return ('%s-%s' % (app_number, json['product_number'])) + return "%s-%s" % (app_number, json["product_number"]) class MarketingStatus2JSONMapper(parallel.Mapper): - def __init__(self, doc_lookup): - parallel.Mapper.__init__(self) - self.doc_lookup = doc_lookup - - rename_map = { - 'MarketingStatusID': 'marketing_status_id', - 'ApplNo': 'application_number', - 'ProductNo': 'product_number' - } + def __init__(self, doc_lookup): + parallel.Mapper.__init__(self) + self.doc_lookup = doc_lookup + + rename_map = { + "MarketingStatusID": "marketing_status_id", + "ApplNo": "application_number", + "ProductNo": "product_number", + } - def map(self, key, value, output): - def _cleaner(k, v): - ''' Helper function to rename keys and purge any keys that are not in - the map. 
- ''' - v = v.strip() if isinstance(v, str) else v - if k in self.rename_map and v is not None and v != '': - return (self.rename_map[k], v) + def map(self, key, value, output): + def _cleaner(k, v): + """Helper function to rename keys and purge any keys that are not in + the map. + """ + v = v.strip() if isinstance(v, str) else v + if k in self.rename_map and v is not None and v != "": + return (self.rename_map[k], v) - json = common.transform_dict(value, _cleaner) + json = common.transform_dict(value, _cleaner) - if json.get('marketing_status_id'): - json['marketing_status'] = self.doc_lookup[json['marketing_status_id']] - del json['marketing_status_id'] + if json.get("marketing_status_id"): + json["marketing_status"] = self.doc_lookup[json["marketing_status_id"]] + del json["marketing_status_id"] - # Assign application number as the key, since all three drugs@FDA files can be joined by this key. - key = build_products_key(json['application_number'], json) - del json['application_number'], json['product_number'] + # Assign application number as the key, since all three drugs@FDA files can be joined by this key. + key = build_products_key(json["application_number"], json) + del json["application_number"], json["product_number"] - output.add(key, json) + output.add(key, json) class TE2JSONMapper(parallel.Mapper): - def __init__(self, doc_lookup): - parallel.Mapper.__init__(self) - self.doc_lookup = doc_lookup - - rename_map = { - 'ApplNo': 'application_number', - 'ProductNo': 'product_number', - 'MarketingStatusID': 'marketing_status_id', - 'TECode': 'te_code' - } + def __init__(self, doc_lookup): + parallel.Mapper.__init__(self) + self.doc_lookup = doc_lookup + + rename_map = { + "ApplNo": "application_number", + "ProductNo": "product_number", + "MarketingStatusID": "marketing_status_id", + "TECode": "te_code", + } - def map(self, key, value, output): - def _cleaner(k, v): - ''' Helper function to rename keys and purge any keys that are not in - the map. - ''' - v = v.strip() if isinstance(v, str) else v - if k in self.rename_map and v is not None and v != '': - return (self.rename_map[k], v) + def map(self, key, value, output): + def _cleaner(k, v): + """Helper function to rename keys and purge any keys that are not in + the map. + """ + v = v.strip() if isinstance(v, str) else v + if k in self.rename_map and v is not None and v != "": + return (self.rename_map[k], v) - json = common.transform_dict(value, _cleaner) + json = common.transform_dict(value, _cleaner) - if json.get('marketing_status_id'): - json['marketing_status'] = self.doc_lookup[json['marketing_status_id']] - del json['marketing_status_id'] + if json.get("marketing_status_id"): + json["marketing_status"] = self.doc_lookup[json["marketing_status_id"]] + del json["marketing_status_id"] - # Assign application number as the key, since all three drugs@FDA files can be joined by this key. - key = build_products_key(json['application_number'], json) - del json['application_number'], json['product_number'] + # Assign application number as the key, since all three drugs@FDA files can be joined by this key. 
+ key = build_products_key(json["application_number"], json) + del json["application_number"], json["product_number"] - output.add(key, json) + output.add(key, json) class Submissions2JSONMapper(parallel.Mapper): - def __init__(self, doc_lookup): - parallel.Mapper.__init__(self) - self.doc_lookup = doc_lookup - - rename_map = { - 'ApplNo': 'application_number', - 'SubmissionClassCodeID': 'submission_class_code_id', - 'SubmissionType': 'submission_type', - 'SubmissionNo': 'submission_number', - 'SubmissionStatus': 'submission_status', - 'SubmissionStatusDate': 'submission_status_date', - 'SubmissionsPublicNotes': 'submission_public_notes', - 'ReviewPriority': 'review_priority' - } - - def map(self, key, value, output): - def _cleaner(k, v): - ''' Helper function to rename keys and purge any keys that are not in - the map. - ''' - v = common.convert_unicode(v.strip()) if isinstance(v, str) else v - if k in self.rename_map and v is not None and v != '': - return (self.rename_map[k], v) - - json = common.transform_dict(value, _cleaner) - - if json.get('submission_class_code_id') and json.get('submission_class_code_id') is not None: - json['submission_class_code'] = self.doc_lookup[json['submission_class_code_id']][0] - descr = self.doc_lookup[json['submission_class_code_id']][1].rstrip() - if descr: - json['submission_class_code_description'] = descr - del json['submission_class_code_id'] - - # Convert date to format used throughout openFDA (yyyymmdd) - if json.get('submission_status_date'): - json['submission_status_date'] = arrow.get(json['submission_status_date']).strftime("%Y%m%d") - - # Assign application number as the key, since all three drugs@FDA files can be joined by this key. - key = build_submissions_key(json['application_number'], json) - del json['application_number'] - - output.add(key, json) + def __init__(self, doc_lookup): + parallel.Mapper.__init__(self) + self.doc_lookup = doc_lookup + + rename_map = { + "ApplNo": "application_number", + "SubmissionClassCodeID": "submission_class_code_id", + "SubmissionType": "submission_type", + "SubmissionNo": "submission_number", + "SubmissionStatus": "submission_status", + "SubmissionStatusDate": "submission_status_date", + "SubmissionsPublicNotes": "submission_public_notes", + "ReviewPriority": "review_priority", + } + + def map(self, key, value, output): + def _cleaner(k, v): + """Helper function to rename keys and purge any keys that are not in + the map. + """ + v = common.convert_unicode(v.strip()) if isinstance(v, str) else v + if k in self.rename_map and v is not None and v != "": + return (self.rename_map[k], v) + + json = common.transform_dict(value, _cleaner) + + if ( + json.get("submission_class_code_id") + and json.get("submission_class_code_id") is not None + ): + json["submission_class_code"] = self.doc_lookup[ + json["submission_class_code_id"] + ][0] + descr = self.doc_lookup[json["submission_class_code_id"]][1].rstrip() + if descr: + json["submission_class_code_description"] = descr + del json["submission_class_code_id"] + + # Convert date to format used throughout openFDA (yyyymmdd) + if json.get("submission_status_date"): + json["submission_status_date"] = arrow.get( + json["submission_status_date"] + ).strftime("%Y%m%d") + + # Assign application number as the key, since all three drugs@FDA files can be joined by this key. 
+ key = build_submissions_key(json["application_number"], json) + del json["application_number"] + + output.add(key, json) def build_submissions_key(app_number, json): - return ('%s-%s-%s' % (app_number, json['submission_type'], json['submission_number'])) + return "%s-%s-%s" % (app_number, json["submission_type"], json["submission_number"]) class SubmissionPropertyType2JSONMapper(parallel.Mapper): - rename_map = { - 'ApplNo': 'application_number', - 'SubmissionType': 'submission_type', - 'SubmissionNo': 'submission_number', - 'SubmissionPropertyTypeCode': 'code' - } - - def map(self, key, value, output): - def _cleaner(k, v): - ''' Helper function to rename keys and purge any keys that are not in - the map. - ''' - v = v.strip() if isinstance(v, str) else v - if k in self.rename_map and v is not None and v != '' and v != 'Null': - return (self.rename_map[k], v) - - json = common.transform_dict(value, _cleaner) - - # Assign application number as the key, since all three drugs@FDA files can be joined by this key. - key = build_submissions_key(json['application_number'], json) - del json['application_number'], json['submission_number'], json['submission_type'] - if json != {}: - output.add(key, json) + rename_map = { + "ApplNo": "application_number", + "SubmissionType": "submission_type", + "SubmissionNo": "submission_number", + "SubmissionPropertyTypeCode": "code", + } + + def map(self, key, value, output): + def _cleaner(k, v): + """Helper function to rename keys and purge any keys that are not in + the map. + """ + v = v.strip() if isinstance(v, str) else v + if k in self.rename_map and v is not None and v != "" and v != "Null": + return (self.rename_map[k], v) + + json = common.transform_dict(value, _cleaner) + + # Assign application number as the key, since all three drugs@FDA files can be joined by this key. + key = build_submissions_key(json["application_number"], json) + del ( + json["application_number"], + json["submission_number"], + json["submission_type"], + ) + if json != {}: + output.add(key, json) class ApplicationsDocs2JSONMapper(parallel.Mapper): - def __init__(self, doc_lookup): - parallel.Mapper.__init__(self) - self.doc_lookup = doc_lookup - - rename_map = { - 'ApplicationDocsID': 'id', - 'ApplicationDocsTypeID': 'type_id', - 'ApplNo': 'application_number', - 'SubmissionType': 'submission_type', - 'SubmissionNo': 'submission_number', - 'ApplicationDocsTitle': 'title', - 'ApplicationDocsURL': 'url', - 'ApplicationDocsDate': 'date' - } - - def map(self, key, value, output): - def _cleaner(k, v): - ''' Helper function to rename keys and purge any keys that are not in - the map. - ''' - v = v.strip() if isinstance(v, str) else v - if k in self.rename_map and v is not None and v != '': - new_key = self.rename_map[k] - if not (new_key == 'title' and v == '0'): - return (new_key, v) - - json = common.transform_dict(value, _cleaner) - - json['type'] = self.doc_lookup[json['type_id']] - del json['type_id'] - - # Convert date to format used throughout openFDA (yyyymmdd) - json['date'] = arrow.get(json['date']).strftime("%Y%m%d") if json.get('date') is not None else "" - json['url'] = common.convert_unicode(json['url']) if json.get('url') is not None else "" - - # Assign application number as the key, since all three drugs@FDA files can be joined by this key. 
- key = build_submissions_key(json['application_number'], json) - del json['application_number'], json['submission_number'], json['submission_type'] - - output.add(key, json) + def __init__(self, doc_lookup): + parallel.Mapper.__init__(self) + self.doc_lookup = doc_lookup + + rename_map = { + "ApplicationDocsID": "id", + "ApplicationDocsTypeID": "type_id", + "ApplNo": "application_number", + "SubmissionType": "submission_type", + "SubmissionNo": "submission_number", + "ApplicationDocsTitle": "title", + "ApplicationDocsURL": "url", + "ApplicationDocsDate": "date", + } + + def map(self, key, value, output): + def _cleaner(k, v): + """Helper function to rename keys and purge any keys that are not in + the map. + """ + v = v.strip() if isinstance(v, str) else v + if k in self.rename_map and v is not None and v != "": + new_key = self.rename_map[k] + if not (new_key == "title" and v == "0"): + return (new_key, v) + + json = common.transform_dict(value, _cleaner) + + json["type"] = self.doc_lookup[json["type_id"]] + del json["type_id"] + + # Convert date to format used throughout openFDA (yyyymmdd) + json["date"] = ( + arrow.get(json["date"]).strftime("%Y%m%d") + if json.get("date") is not None + else "" + ) + json["url"] = ( + common.convert_unicode(json["url"]) if json.get("url") is not None else "" + ) + + # Assign application number as the key, since all three drugs@FDA files can be joined by this key. + key = build_submissions_key(json["application_number"], json) + del ( + json["application_number"], + json["submission_number"], + json["submission_type"], + ) + + output.add(key, json) class Applications2JSON(luigi.Task): - def requires(self): - return CleanDrugsFDAFiles() + def requires(self): + return CleanDrugsFDAFiles() - def output(self): - return luigi.LocalTarget(APPLICATIONS_DB) + def output(self): + return luigi.LocalTarget(APPLICATIONS_DB) - def run(self): - parallel.mapreduce( - parallel.Collection.from_glob( - join(self.input().path, 'Applications.txt'), parallel.CSVDictLineInput(delimiter='\t')), - mapper=Applications2JSONMapper(), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path) + def run(self): + parallel.mapreduce( + parallel.Collection.from_glob( + join(self.input().path, "Applications.txt"), + parallel.CSVDictLineInput(delimiter="\t"), + ), + mapper=Applications2JSONMapper(), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + ) class Products2JSON(luigi.Task): - def requires(self): - return CleanDrugsFDAFiles() + def requires(self): + return CleanDrugsFDAFiles() - def output(self): - return luigi.LocalTarget(PRODUCTS_DB) + def output(self): + return luigi.LocalTarget(PRODUCTS_DB) - def run(self): - parallel.mapreduce( - parallel.Collection.from_glob( - join(self.input().path, 'Products.txt'), parallel.CSVDictLineInput(delimiter='\t')), - mapper=Product2JSONMapper(), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path) + def run(self): + parallel.mapreduce( + parallel.Collection.from_glob( + join(self.input().path, "Products.txt"), + parallel.CSVDictLineInput(delimiter="\t"), + ), + mapper=Product2JSONMapper(), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + ) class MarketingStatus2JSON(luigi.Task): - def requires(self): - return CleanDrugsFDAFiles() + def requires(self): + return CleanDrugsFDAFiles() - def output(self): - return luigi.LocalTarget(MARKETING_STATUS_DB) + def output(self): + return luigi.LocalTarget(MARKETING_STATUS_DB) - def run(self): - with 
open(join(EXTRACTED_DIR, 'MarketingStatus_Lookup.txt')) as fin: - rows = (line.split('\t') for line in fin) - doc_lookup = {row[0]: row[1] for row in rows} + def run(self): + with open(join(EXTRACTED_DIR, "MarketingStatus_Lookup.txt")) as fin: + rows = (line.split("\t") for line in fin) + doc_lookup = {row[0]: row[1] for row in rows} - parallel.mapreduce( - parallel.Collection.from_glob( - join(self.input().path, 'MarketingStatus.txt'), parallel.CSVDictLineInput(delimiter='\t')), - mapper=MarketingStatus2JSONMapper(doc_lookup=doc_lookup), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path) + parallel.mapreduce( + parallel.Collection.from_glob( + join(self.input().path, "MarketingStatus.txt"), + parallel.CSVDictLineInput(delimiter="\t"), + ), + mapper=MarketingStatus2JSONMapper(doc_lookup=doc_lookup), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + ) class TE2JSON(luigi.Task): - def requires(self): - return CleanDrugsFDAFiles() + def requires(self): + return CleanDrugsFDAFiles() - def output(self): - return luigi.LocalTarget(TE_DB) + def output(self): + return luigi.LocalTarget(TE_DB) - def run(self): - with open(join(EXTRACTED_DIR, 'MarketingStatus_Lookup.txt')) as fin: - rows = (line.split('\t') for line in fin) - doc_lookup = {row[0]: row[1] for row in rows} + def run(self): + with open(join(EXTRACTED_DIR, "MarketingStatus_Lookup.txt")) as fin: + rows = (line.split("\t") for line in fin) + doc_lookup = {row[0]: row[1] for row in rows} - parallel.mapreduce( - parallel.Collection.from_glob( - join(self.input().path, 'TE.txt'), parallel.CSVDictLineInput(delimiter='\t')), - mapper=TE2JSONMapper(doc_lookup=doc_lookup), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path) + parallel.mapreduce( + parallel.Collection.from_glob( + join(self.input().path, "TE.txt"), + parallel.CSVDictLineInput(delimiter="\t"), + ), + mapper=TE2JSONMapper(doc_lookup=doc_lookup), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + ) class Submissions2JSON(luigi.Task): - def requires(self): - return CleanDrugsFDAFiles() + def requires(self): + return CleanDrugsFDAFiles() - def output(self): - return luigi.LocalTarget(SUBMISSIONS_DB) + def output(self): + return luigi.LocalTarget(SUBMISSIONS_DB) - def run(self): - with open(join(EXTRACTED_DIR, 'SubmissionClass_Lookup.txt')) as fin: - rows = ( line.split('\t') for line in fin ) - doc_lookup = {row[0]: [row[1], row[2]] for row in rows} + def run(self): + with open(join(EXTRACTED_DIR, "SubmissionClass_Lookup.txt")) as fin: + rows = (line.split("\t") for line in fin) + doc_lookup = {row[0]: [row[1], row[2]] for row in rows} - parallel.mapreduce( - parallel.Collection.from_glob( - join(self.input().path, 'Submissions.txt'), parallel.CSVDictLineInput(delimiter='\t')), - mapper=Submissions2JSONMapper(doc_lookup=doc_lookup), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path) + parallel.mapreduce( + parallel.Collection.from_glob( + join(self.input().path, "Submissions.txt"), + parallel.CSVDictLineInput(delimiter="\t"), + ), + mapper=Submissions2JSONMapper(doc_lookup=doc_lookup), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + ) class SubmissionPropertyType2JSON(luigi.Task): - def requires(self): - return CleanDrugsFDAFiles() + def requires(self): + return CleanDrugsFDAFiles() - def output(self): - return luigi.LocalTarget(SUBMISSION_PROPERTY_TYPE_DB) + def output(self): + return luigi.LocalTarget(SUBMISSION_PROPERTY_TYPE_DB) - def 
run(self): - parallel.mapreduce( - parallel.Collection.from_glob( - join(self.input().path, 'SubmissionPropertyType.txt'), parallel.CSVDictLineInput(delimiter='\t')), - mapper=SubmissionPropertyType2JSONMapper(), - reducer=parallel.ListReducer(), - output_prefix=self.output().path) + def run(self): + parallel.mapreduce( + parallel.Collection.from_glob( + join(self.input().path, "SubmissionPropertyType.txt"), + parallel.CSVDictLineInput(delimiter="\t"), + ), + mapper=SubmissionPropertyType2JSONMapper(), + reducer=parallel.ListReducer(), + output_prefix=self.output().path, + ) class ApplicationsDocs2JSON(luigi.Task): - def requires(self): - return CleanDrugsFDAFiles() + def requires(self): + return CleanDrugsFDAFiles() - def output(self): - return luigi.LocalTarget(APPLICATIONS_DOCS_DB) + def output(self): + return luigi.LocalTarget(APPLICATIONS_DOCS_DB) - def run(self): - with open(join(EXTRACTED_DIR, 'ApplicationsDocsType_Lookup.txt')) as fin: - rows = (line.split('\t') for line in fin) - doc_lookup = {row[0]: row[1].rstrip() for row in rows} + def run(self): + with open(join(EXTRACTED_DIR, "ApplicationsDocsType_Lookup.txt")) as fin: + rows = (line.split("\t") for line in fin) + doc_lookup = {row[0]: row[1].rstrip() for row in rows} - parallel.mapreduce( - parallel.Collection.from_glob( - join(self.input().path, 'ApplicationDocs.txt'), parallel.CSVDictLineInput(delimiter='\t')), - mapper=ApplicationsDocs2JSONMapper(doc_lookup=doc_lookup), - reducer=parallel.ListReducer(), - output_prefix=self.output().path) + parallel.mapreduce( + parallel.Collection.from_glob( + join(self.input().path, "ApplicationDocs.txt"), + parallel.CSVDictLineInput(delimiter="\t"), + ), + mapper=ApplicationsDocs2JSONMapper(doc_lookup=doc_lookup), + reducer=parallel.ListReducer(), + output_prefix=self.output().path, + ) class MergeAllMapper(parallel.Mapper): - def __init__(self, applications_db_path, products_db_path, applications_docs_db_path, submissions_db_path, - submissions_property_type_db_path, marketing_status_path, te_db_path): - self.applications_db_path = applications_db_path - self.products_db_path = products_db_path - self.applications_docs_db_path = applications_docs_db_path - self.submissions_db_path = submissions_db_path - self.submissions_property_type_db_path = submissions_property_type_db_path - self.marketing_status_db_path = marketing_status_path - self.te_db_path = te_db_path - - def map_shard(self, map_input, map_output): - # Transform product DB into a dictionary keyed by application number - self.products_dict = {} - for key, product in parallel.ShardedDB.open(self.products_db_path).range_iter(None, None): - split = key.split('-') - app_key = split[0] - products_arr = [] if self.products_dict.get(app_key) is None else self.products_dict.get(app_key) - products_arr.append(product) - self.products_dict[app_key] = products_arr - - # Transform all sub-product DBs into a dictionary keyed by application number & product number - self.marketing_status_dict = {} - for key, value in parallel.ShardedDB.open(self.marketing_status_db_path).range_iter(None, None): - self.marketing_status_dict[key] = value - - self.te_dict = {} - for key, value in parallel.ShardedDB.open(self.te_db_path).range_iter(None, None): - self.te_dict[key] = value - - # Transform submissions DB into a dictionary keyed by application number - self.submissions_dict = {} - for key, submission in parallel.ShardedDB.open(self.submissions_db_path).range_iter(None, None): - split = key.split('-') - app_key = split[0] - submissions_arr = [] 
if self.submissions_dict.get(app_key) is None else self.submissions_dict.get(app_key) - submissions_arr.append(submission) - self.submissions_dict[app_key] = submissions_arr - - # Transform all sub-submission DBs into a dictionary keyed by application number & submission number - self.submissions_property_type_dict = {} - for key, value in parallel.ShardedDB.open(self.submissions_property_type_db_path).range_iter(None, None): - self.submissions_property_type_dict[key] = value - - self.applications_docs_dict = {} - for key, value in parallel.ShardedDB.open(self.applications_docs_db_path).range_iter(None, None): - self.applications_docs_dict[key] = value - - parallel.Mapper.map_shard(self, map_input, map_output) - - def map(self, key, application, out): - self.add_products(application) - self.add_submissions(application) - out.add(key, application) - - def add_products(self, application): - key = re.sub("[^0-9]", "", application['application_number']) - products = self.products_dict.get(key) - if products: - products = self.add_marketing_status(products, key) - products = self.add_te(products, key) - application['products'] = products - - def add_marketing_status(self, products, app_key): - for product in products: - key = build_products_key(app_key, product) - if key in self.marketing_status_dict: - marketing_json = self.marketing_status_dict.get(key) - product['marketing_status'] = marketing_json['marketing_status'].rstrip() - return products - - def add_te(self, products, app_key): - for product in products: - key = build_products_key(app_key, product) - if key in self.te_dict: - te_json = self.te_dict.get(key) - if te_json.get('te_code'): - product['te_code'] = te_json['te_code'].rstrip() - if not 'marketing_status' in product and 'marketing_status' in te_json: - product['marketing_status'] = te_json['marketing_status'].rstrip() - return products - - def add_submissions(self, application): - key = re.sub("[^0-9]", "", application['application_number']) - submissions = self.submissions_dict.get(key) - if submissions: - submissions = self.add_submissions_property_type(submissions, key) - submissions = self.add_applications_docs(submissions, key) - application['submissions'] = submissions - - def add_submissions_property_type(self, submissions, app_key): - for submission in submissions: - key = build_submissions_key(app_key, submission) - if key in self.submissions_property_type_dict: - prop_type = self.submissions_property_type_dict.get(key) - submission['submission_property_type'] = prop_type - return submissions - - def add_applications_docs(self, submissions, app_key): - for submission in submissions: - key = build_submissions_key(app_key, submission) - if key in self.applications_docs_dict: - submission['application_docs'] = self.applications_docs_dict.get(key) - return submissions + def __init__( + self, + applications_db_path, + products_db_path, + applications_docs_db_path, + submissions_db_path, + submissions_property_type_db_path, + marketing_status_path, + te_db_path, + ): + self.applications_db_path = applications_db_path + self.products_db_path = products_db_path + self.applications_docs_db_path = applications_docs_db_path + self.submissions_db_path = submissions_db_path + self.submissions_property_type_db_path = submissions_property_type_db_path + self.marketing_status_db_path = marketing_status_path + self.te_db_path = te_db_path + + def map_shard(self, map_input, map_output): + # Transform product DB into a dictionary keyed by application number + self.products_dict = {} + 
for key, product in parallel.ShardedDB.open(self.products_db_path).range_iter( + None, None + ): + split = key.split("-") + app_key = split[0] + products_arr = ( + [] + if self.products_dict.get(app_key) is None + else self.products_dict.get(app_key) + ) + products_arr.append(product) + self.products_dict[app_key] = products_arr + + # Transform all sub-product DBs into a dictionary keyed by application number & product number + self.marketing_status_dict = {} + for key, value in parallel.ShardedDB.open( + self.marketing_status_db_path + ).range_iter(None, None): + self.marketing_status_dict[key] = value + + self.te_dict = {} + for key, value in parallel.ShardedDB.open(self.te_db_path).range_iter( + None, None + ): + self.te_dict[key] = value + + # Transform submissions DB into a dictionary keyed by application number + self.submissions_dict = {} + for key, submission in parallel.ShardedDB.open( + self.submissions_db_path + ).range_iter(None, None): + split = key.split("-") + app_key = split[0] + submissions_arr = ( + [] + if self.submissions_dict.get(app_key) is None + else self.submissions_dict.get(app_key) + ) + submissions_arr.append(submission) + self.submissions_dict[app_key] = submissions_arr + + # Transform all sub-submission DBs into a dictionary keyed by application number & submission number + self.submissions_property_type_dict = {} + for key, value in parallel.ShardedDB.open( + self.submissions_property_type_db_path + ).range_iter(None, None): + self.submissions_property_type_dict[key] = value + + self.applications_docs_dict = {} + for key, value in parallel.ShardedDB.open( + self.applications_docs_db_path + ).range_iter(None, None): + self.applications_docs_dict[key] = value + + parallel.Mapper.map_shard(self, map_input, map_output) + + def map(self, key, application, out): + self.add_products(application) + self.add_submissions(application) + out.add(key, application) + + def add_products(self, application): + key = re.sub("[^0-9]", "", application["application_number"]) + products = self.products_dict.get(key) + if products: + products = self.add_marketing_status(products, key) + products = self.add_te(products, key) + application["products"] = products + + def add_marketing_status(self, products, app_key): + for product in products: + key = build_products_key(app_key, product) + if key in self.marketing_status_dict: + marketing_json = self.marketing_status_dict.get(key) + product["marketing_status"] = marketing_json[ + "marketing_status" + ].rstrip() + return products + + def add_te(self, products, app_key): + for product in products: + key = build_products_key(app_key, product) + if key in self.te_dict: + te_json = self.te_dict.get(key) + if te_json.get("te_code"): + product["te_code"] = te_json["te_code"].rstrip() + if not "marketing_status" in product and "marketing_status" in te_json: + product["marketing_status"] = te_json["marketing_status"].rstrip() + return products + + def add_submissions(self, application): + key = re.sub("[^0-9]", "", application["application_number"]) + submissions = self.submissions_dict.get(key) + if submissions: + submissions = self.add_submissions_property_type(submissions, key) + submissions = self.add_applications_docs(submissions, key) + application["submissions"] = submissions + + def add_submissions_property_type(self, submissions, app_key): + for submission in submissions: + key = build_submissions_key(app_key, submission) + if key in self.submissions_property_type_dict: + prop_type = self.submissions_property_type_dict.get(key) + 
submission["submission_property_type"] = prop_type + return submissions + + def add_applications_docs(self, submissions, app_key): + for submission in submissions: + key = build_submissions_key(app_key, submission) + if key in self.applications_docs_dict: + submission["application_docs"] = self.applications_docs_dict.get(key) + return submissions class MergeAll(luigi.Task): - def requires(self): - return [Applications2JSON(), Products2JSON(), ApplicationsDocs2JSON(), Submissions2JSON(), - SubmissionPropertyType2JSON(), MarketingStatus2JSON(), TE2JSON()] - - def output(self): - return luigi.LocalTarget(MERGED_DB) - - def run(self): - applications_db = self.input()[0].path - products_db = self.input()[1].path - applications_docs_db = self.input()[2].path - submissions_db = self.input()[3].path - submissions_property_type_db = self.input()[4].path - marketing_status = self.input()[5].path - te_db = self.input()[6].path - - parallel.mapreduce( - parallel.Collection.from_sharded(applications_db), - mapper=MergeAllMapper(applications_db, products_db, applications_docs_db, submissions_db, - submissions_property_type_db, marketing_status, te_db), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, - map_workers=1, - num_shards=1) # TODO: improve the code to avoid having to limit number of shards to one + def requires(self): + return [ + Applications2JSON(), + Products2JSON(), + ApplicationsDocs2JSON(), + Submissions2JSON(), + SubmissionPropertyType2JSON(), + MarketingStatus2JSON(), + TE2JSON(), + ] + + def output(self): + return luigi.LocalTarget(MERGED_DB) + + def run(self): + applications_db = self.input()[0].path + products_db = self.input()[1].path + applications_docs_db = self.input()[2].path + submissions_db = self.input()[3].path + submissions_property_type_db = self.input()[4].path + marketing_status = self.input()[5].path + te_db = self.input()[6].path + + parallel.mapreduce( + parallel.Collection.from_sharded(applications_db), + mapper=MergeAllMapper( + applications_db, + products_db, + applications_docs_db, + submissions_db, + submissions_property_type_db, + marketing_status, + te_db, + ), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + map_workers=1, + num_shards=1, + ) # TODO: improve the code to avoid having to limit number of shards to one + class AnnotateDrugsFDA(luigi.Task): - def requires(self): - return [MergeAll(), CombineHarmonization()] + def requires(self): + return [MergeAll(), CombineHarmonization()] - def output(self): - return luigi.LocalTarget(ANNOTATED_DB) + def output(self): + return luigi.LocalTarget(ANNOTATED_DB) - def run(self): - input_db = self.input()[0].path - harmonized_file = self.input()[1].path + def run(self): + input_db = self.input()[0].path + harmonized_file = self.input()[1].path - parallel.mapreduce( - parallel.Collection.from_sharded(input_db), - mapper=AnnotateMapper(harmonized_file), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, - num_shards=1) # TODO: improve the code to avoid having to limit number of shards to one + parallel.mapreduce( + parallel.Collection.from_sharded(input_db), + mapper=AnnotateMapper(harmonized_file), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + num_shards=1, + ) # TODO: improve the code to avoid having to limit number of shards to one class LoadJSON(index_util.LoadJSONBase): - index_name = 'drugsfda' - mapping_file = './schemas/drugsfda_mapping.json' - data_source = AnnotateDrugsFDA() - use_checksum = False - 
optimize_index = True - last_update_date = lambda _: first_file_timestamp(os.path.dirname(RAW_DATA_FILE)) + index_name = "drugsfda" + mapping_file = "./schemas/drugsfda_mapping.json" + data_source = AnnotateDrugsFDA() + use_checksum = False + optimize_index = True + last_update_date = lambda _: first_file_timestamp(os.path.dirname(RAW_DATA_FILE)) -if __name__ == '__main__': - luigi.run() +if __name__ == "__main__": + luigi.run() diff --git a/openfda/drugsfda/tests/pipeline_test.py b/openfda/drugsfda/tests/pipeline_test.py index 380f71f..ea9aaea 100644 --- a/openfda/drugsfda/tests/pipeline_test.py +++ b/openfda/drugsfda/tests/pipeline_test.py @@ -13,55 +13,75 @@ class DrugsFDAPipelineTests(unittest.TestCase): - def setUp(self): - self.test_dir = tempfile.mkdtemp() - openfda.drugsfda.pipeline.BASE_DIR = self.test_dir - openfda.annotation_table.pipeline.BASE_DIR = self.test_dir - openfda.drugsfda.pipeline.EXTRACTED_DIR = join(self.test_dir, 'extracted/') - openfda.drugsfda.pipeline.RAW_DATA_FILE = join(self.test_dir, 'raw/drugsatfda.zip') - openfda.drugsfda.pipeline.PRODUCTS_DB = join(self.test_dir, 'json/products.db') - openfda.drugsfda.pipeline.APPLICATIONS_DB = join(self.test_dir, 'json/applications.db') - openfda.drugsfda.pipeline.APPLICATIONS_DOCS_DB = join(self.test_dir, 'json/applicationsdocs.db') - openfda.drugsfda.pipeline.SUBMISSIONS_DB = join(self.test_dir, 'json/submissions.db') - openfda.drugsfda.pipeline.SUBMISSION_PROPERTY_TYPE_DB = join(self.test_dir, 'json/submissionpropertytype.db') - openfda.drugsfda.pipeline.MARKETING_STATUS_DB = join(self.test_dir, 'json/marketingstatus.db') - openfda.drugsfda.pipeline.TE_DB = join(self.test_dir, 'json/te.db') - openfda.drugsfda.pipeline.MERGED_DB = join(self.test_dir, 'json/merged.db') - openfda.drugsfda.pipeline.ANNOTATED_DB = join(self.test_dir, 'json/annotated.db') - print('Temp dir is ' + self.test_dir) + def setUp(self): + self.test_dir = tempfile.mkdtemp() + openfda.drugsfda.pipeline.BASE_DIR = self.test_dir + openfda.annotation_table.pipeline.BASE_DIR = self.test_dir + openfda.drugsfda.pipeline.EXTRACTED_DIR = join(self.test_dir, "extracted/") + openfda.drugsfda.pipeline.RAW_DATA_FILE = join( + self.test_dir, "raw/drugsatfda.zip" + ) + openfda.drugsfda.pipeline.PRODUCTS_DB = join(self.test_dir, "json/products.db") + openfda.drugsfda.pipeline.APPLICATIONS_DB = join( + self.test_dir, "json/applications.db" + ) + openfda.drugsfda.pipeline.APPLICATIONS_DOCS_DB = join( + self.test_dir, "json/applicationsdocs.db" + ) + openfda.drugsfda.pipeline.SUBMISSIONS_DB = join( + self.test_dir, "json/submissions.db" + ) + openfda.drugsfda.pipeline.SUBMISSION_PROPERTY_TYPE_DB = join( + self.test_dir, "json/submissionpropertytype.db" + ) + openfda.drugsfda.pipeline.MARKETING_STATUS_DB = join( + self.test_dir, "json/marketingstatus.db" + ) + openfda.drugsfda.pipeline.TE_DB = join(self.test_dir, "json/te.db") + openfda.drugsfda.pipeline.MERGED_DB = join(self.test_dir, "json/merged.db") + openfda.drugsfda.pipeline.ANNOTATED_DB = join( + self.test_dir, "json/annotated.db" + ) + print("Temp dir is " + self.test_dir) - os.mkdir(dirname(os.path.abspath(openfda.drugsfda.pipeline.RAW_DATA_FILE))) - shutil.copyfile(os.path.join(dirname(os.path.abspath(__file__)), "drugsatfda.zip"), - openfda.drugsfda.pipeline.RAW_DATA_FILE) + os.mkdir(dirname(os.path.abspath(openfda.drugsfda.pipeline.RAW_DATA_FILE))) + shutil.copyfile( + os.path.join(dirname(os.path.abspath(__file__)), "drugsatfda.zip"), + openfda.drugsfda.pipeline.RAW_DATA_FILE, + ) - zip_filename = 
os.path.join(dirname(os.path.abspath(__file__)), "harmonized.json.zip") - output_dir = self.test_dir - os.system('unzip -o %(zip_filename)s -d %(output_dir)s' % locals()) + zip_filename = os.path.join( + dirname(os.path.abspath(__file__)), "harmonized.json.zip" + ) + output_dir = self.test_dir + os.system("unzip -o %(zip_filename)s -d %(output_dir)s" % locals()) - def tearDown(self): - shutil.rmtree(self.test_dir) + def tearDown(self): + shutil.rmtree(self.test_dir) - def test_annotated_json(self): - ExtractDrugsFDAFiles().run() - Applications2JSON().run() - Products2JSON().run() - MarketingStatus2JSON().run() - TE2JSON().run() - Submissions2JSON().run() - SubmissionPropertyType2JSON().run() - ApplicationsDocs2JSON().run() - MergeAll().run() - AnnotateDrugsFDA().run() - ldb = leveldb.LevelDB(os.path.join(AnnotateDrugsFDA().output().path, 'shard-00000-of-00001.db')) - data = list(ldb.RangeIter()) + def test_annotated_json(self): + ExtractDrugsFDAFiles().run() + Applications2JSON().run() + Products2JSON().run() + MarketingStatus2JSON().run() + TE2JSON().run() + Submissions2JSON().run() + SubmissionPropertyType2JSON().run() + ApplicationsDocs2JSON().run() + MergeAll().run() + AnnotateDrugsFDA().run() + ldb = leveldb.LevelDB( + os.path.join(AnnotateDrugsFDA().output().path, "shard-00000-of-00001.db") + ) + data = list(ldb.RangeIter()) - # We should have exactly 23267 records in the DB - eq_(len(data), 23267) + # We should have exactly 23267 records in the DB + eq_(len(data), 23267) def main(argv): - unittest.main(argv=argv) + unittest.main(argv=argv) -if __name__ == '__main__': - unittest.main() +if __name__ == "__main__": + unittest.main() diff --git a/openfda/elasticsearch_requests.py b/openfda/elasticsearch_requests.py index 55b838a..c783eb2 100644 --- a/openfda/elasticsearch_requests.py +++ b/openfda/elasticsearch_requests.py @@ -1,5 +1,6 @@ -''' A bundle of ElasticSearch functions used throughout the pipelines -''' +""" A bundle of ElasticSearch functions used throughout the pipelines +""" + import logging import os from os.path import join, dirname @@ -7,70 +8,77 @@ import elasticsearch import simplejson as json -METADATA_INDEX = 'openfdametadata' -METADATA_TYPE = '_doc' +METADATA_INDEX = "openfdametadata" +METADATA_TYPE = "_doc" RUN_DIR = dirname(dirname(os.path.abspath(__file__))) -INDEX_SETTINGS = join(RUN_DIR, 'schemas/indexing.json') +INDEX_SETTINGS = join(RUN_DIR, "schemas/indexing.json") + def clear_and_load(es, index_name, type_name, mapping_file): - try: - es.indices.delete(index=index_name) - logging.info('Deleting index %s', index_name) - except elasticsearch.ElasticsearchException: - logging.info('%s does not exist, nothing to delete', index_name) + try: + es.indices.delete(index=index_name) + logging.info("Deleting index %s", index_name) + except elasticsearch.ElasticsearchException: + logging.info("%s does not exist, nothing to delete", index_name) - load_mapping(es, index_name, type_name, mapping_file) + load_mapping(es, index_name, type_name, mapping_file) def load_mapping(es, index_name, type_name, mapping_file_or_dict): - if es.indices.exists(index_name): - logging.info('Index %s already exists, skipping creation.', index_name) - return + if es.indices.exists(index_name): + logging.info("Index %s already exists, skipping creation.", index_name) + return - if not isinstance(mapping_file_or_dict, dict): - mapping = open(mapping_file_or_dict, 'r').read().strip() - mapping_dict = json.loads(mapping) - else: - mapping_dict = mapping_file_or_dict + if not 
isinstance(mapping_file_or_dict, dict): + mapping = open(mapping_file_or_dict, "r").read().strip() + mapping_dict = json.loads(mapping) + else: + mapping_dict = mapping_file_or_dict - try: - settings_dict = json.loads(open(INDEX_SETTINGS).read())['settings'] - es.indices.create(index=index_name, mappings=mapping_dict, - settings=settings_dict, - include_type_name=True) - es.indices.clear_cache(index=index_name) - except: - logging.fatal('Something has gone wrong making the mapping for %s', type_name, - exc_info=1) - raise + try: + settings_dict = json.loads(open(INDEX_SETTINGS).read())["settings"] + es.indices.create( + index=index_name, + mappings=mapping_dict, + settings=settings_dict, + include_type_name=True, + ) + es.indices.clear_cache(index=index_name) + except: + logging.fatal( + "Something has gone wrong making the mapping for %s", type_name, exc_info=1 + ) + raise def update_process_datetime(es, doc_id, last_run_date, last_update_date): - ''' Updates the last_run_date and last_update_date for the document id passed into function. + """Updates the last_run_date and last_update_date for the document id passed into function. The document id in will be the name of another index in the cluster. - ''' - _map = { - '_doc': { - 'properties': { - 'last_update_date': { - 'type': 'date', - 'format': 'strict_date_optional_time' - }, - 'last_run_date': { - 'type': 'date', - 'format': 'strict_date_optional_time' + """ + _map = { + "_doc": { + "properties": { + "last_update_date": { + "type": "date", + "format": "strict_date_optional_time", + }, + "last_run_date": { + "type": "date", + "format": "strict_date_optional_time", + }, + } } - } } - } - load_mapping(es, METADATA_INDEX, METADATA_TYPE, _map) - new_doc = {'last_update_date': last_update_date, 'last_run_date': last_run_date} - logging.info( - "Updating last_run_date & last_update_date - [index=%s, type=%s, id=%s, last_run_date=%s, last_update_date=%s]", - METADATA_INDEX, METADATA_TYPE, doc_id, last_run_date, last_update_date) - es.index(index=METADATA_INDEX, - doc_type=METADATA_TYPE, - id=doc_id, - body=new_doc) + load_mapping(es, METADATA_INDEX, METADATA_TYPE, _map) + new_doc = {"last_update_date": last_update_date, "last_run_date": last_run_date} + logging.info( + "Updating last_run_date & last_update_date - [index=%s, type=%s, id=%s, last_run_date=%s, last_update_date=%s]", + METADATA_INDEX, + METADATA_TYPE, + doc_id, + last_run_date, + last_update_date, + ) + es.index(index=METADATA_INDEX, doc_type=METADATA_TYPE, id=doc_id, body=new_doc) diff --git a/openfda/export/pipeline.py b/openfda/export/pipeline.py index 928e7e3..9aa9e3e 100644 --- a/openfda/export/pipeline.py +++ b/openfda/export/pipeline.py @@ -1,4 +1,3 @@ - import collections import shutil from itertools import tee @@ -15,494 +14,477 @@ RUN_DIR = dirname(dirname(dirname(os.path.abspath(__file__)))) -BASE_DIR = os.path.abspath(join(RUN_DIR, './data/export/')) -FILES_DIR = os.path.abspath(join(BASE_DIR, './files/')) -SCHEMA_DIR = os.path.abspath(join(RUN_DIR, './schemas/')) +BASE_DIR = os.path.abspath(join(RUN_DIR, "./data/export/")) +FILES_DIR = os.path.abspath(join(BASE_DIR, "./files/")) +SCHEMA_DIR = os.path.abspath(join(RUN_DIR, "./schemas/")) # Export output is spoken in the language of the API, meaning that the directory # structure needs to follow the API. As such, we need to map the API to the # index. Please note that some indices serve more than one endpoint. 
ENDPOINT_INDEX_MAP = { - '/animalandveterinary/event': 'animalandveterinarydrugevent', - '/drug/event': 'drugevent', - '/drug/label': 'druglabel', - '/drug/enforcement': 'recall', - '/drug/ndc': 'ndc', - '/drug/drugsfda': 'drugsfda', - '/device/enforcement': 'recall', - '/food/enforcement': 'recall', - '/food/event': 'foodevent', - '/device/event': 'deviceevent', - '/device/classification': 'deviceclass', - '/device/510k': 'deviceclearance', - '/device/pma': 'devicepma', - '/device/recall': 'devicerecall', - '/device/registrationlisting': 'devicereglist', - '/device/udi': 'deviceudi', - '/device/covid19serology': 'covid19serology', - '/other/nsde': 'othernsde', - '/other/substance': 'substancedata', - '/tobacco/problem': 'tobaccoproblem' - + "/animalandveterinary/event": "animalandveterinarydrugevent", + "/drug/event": "drugevent", + "/drug/label": "druglabel", + "/drug/enforcement": "recall", + "/drug/ndc": "ndc", + "/drug/drugsfda": "drugsfda", + "/device/enforcement": "recall", + "/food/enforcement": "recall", + "/food/event": "foodevent", + "/device/event": "deviceevent", + "/device/classification": "deviceclass", + "/device/510k": "deviceclearance", + "/device/pma": "devicepma", + "/device/recall": "devicerecall", + "/device/registrationlisting": "devicereglist", + "/device/udi": "deviceudi", + "/device/covid19serology": "covid19serology", + "/other/nsde": "othernsde", + "/other/substance": "substancedata", + "/tobacco/problem": "tobaccoproblem", } # A data structure that helps generate a distinct dataset from a shared index. # This structure tells us which key in the index and which value to query by in # order to get a distinct export for an endpoint. FILTERED_ENPOINT_MAP = { - '/drug/enforcement': { - 'key': 'product_type', - 'term': 'drugs' - }, - '/device/enforcement': { - 'key': 'product_type', - 'term': 'devices' - }, - '/food/enforcement': { - 'key': 'product_type', - 'term': 'food' - }, + "/drug/enforcement": {"key": "product_type", "term": "drugs"}, + "/device/enforcement": {"key": "product_type", "term": "devices"}, + "/food/enforcement": {"key": "product_type", "term": "food"}, } # For the larger endpoints, we want to separate the output by quarter, this # structure tells us the data range, the key to query as well as the chunk size # to use for a larger payload. 
RANGE_ENDPOINT_MAP = { - # The end_date should be set to the date of the latest data availability - # If you do not set this correctly, too much data collects in the `All other data` zip files - # Check here for drug event: - # https://www.fda.gov/drugs/guidancecomplianceregulatoryinformation/surveillance/adversedrugeffects/ucm082193.htm - '/drug/event': { - 'date_key': '@timestamp', - 'start_date': '2004-01-01', - 'end_date': '2021-10-01' - }, - # Check here for device event: - # https://www.fda.gov/MedicalDevices/DeviceRegulationandGuidance/PostmarketRequirements/ReportingAdverseEvents/ucm127891.htm - '/device/event': { - 'date_key': 'date_received', - 'start_date': '1991-10-01', - 'end_date': '2022-04-01' - }, - '/animalandveterinary/event': { - 'date_key': 'original_receive_date', - 'start_date': '1987-01-01', - 'end_date': '2021-10-01' - } + # The end_date should be set to the date of the latest data availability + # If you do not set this correctly, too much data collects in the `All other data` zip files + # Check here for drug event: + # https://www.fda.gov/drugs/guidancecomplianceregulatoryinformation/surveillance/adversedrugeffects/ucm082193.htm + "/drug/event": { + "date_key": "@timestamp", + "start_date": "2004-01-01", + "end_date": "2021-10-01", + }, + # Check here for device event: + # https://www.fda.gov/MedicalDevices/DeviceRegulationandGuidance/PostmarketRequirements/ReportingAdverseEvents/ucm127891.htm + "/device/event": { + "date_key": "date_received", + "start_date": "1991-10-01", + "end_date": "2022-04-01", + }, + "/animalandveterinary/event": { + "date_key": "original_receive_date", + "start_date": "1987-01-01", + "end_date": "2021-10-01", + }, } DEFAULT_CHUNKS = 250000 CUSTOM_CHUNKS = { - '/drug/label': 20000, - '/drug/event': 12000, - '/device/event': 100000, - '/device/udi': 100000 + "/drug/label": 20000, + "/drug/event": 12000, + "/device/event": 100000, + "/device/udi": 100000, } class EndpointExport(object): - ''' Object that holds the data required to do a query based export of an - endpoint. Also exposes some helper functions to assist in generating - both date range and term filter based queries. - ''' - def __init__(self, endpoint, chunks=250000, partition='', query=None): - self.chunks = chunks - self.endpoint = endpoint - self.index_name = ENDPOINT_INDEX_MAP[self.endpoint] - self.partition = partition - self.query = query - self.quarter_map = { - 1: 'q1', - 4: 'q2', - 7: 'q3', - 10: 'q4' - } - - @staticmethod - def build_term_filter(key, term): - return { - 'query': { - 'bool': { - 'must': {'match_all': {}}, - 'filter': { - 'term': { - key: term + """Object that holds the data required to do a query based export of an + endpoint. Also exposes some helper functions to assist in generating + both date range and term filter based queries. 
+ """ + + def __init__(self, endpoint, chunks=250000, partition="", query=None): + self.chunks = chunks + self.endpoint = endpoint + self.index_name = ENDPOINT_INDEX_MAP[self.endpoint] + self.partition = partition + self.query = query + self.quarter_map = {1: "q1", 4: "q2", 7: "q3", 10: "q4"} + + @staticmethod + def build_term_filter(key, term): + return { + "query": { + "bool": {"must": {"match_all": {}}, "filter": {"term": {key: term}}} } - } } - } - } - - @staticmethod - def build_date_range_query(key, start, end, negate=False): - query = { - 'query': { - 'bool': { - 'must': {'match_all': {}}, - 'filter': { - 'range': { - key: { - 'gte': start, - 'lt': end - } + + @staticmethod + def build_date_range_query(key, start, end, negate=False): + query = { + "query": { + "bool": { + "must": {"match_all": {}}, + "filter": {"range": {key: {"gte": start, "lt": end}}}, + } } - } } - } - } - # If negated, wrap condition with a `not` dictionary - if negate: - query['query']['bool']['must_not'] = query['query']['bool'].pop('filter') + # If negated, wrap condition with a `not` dictionary + if negate: + query["query"]["bool"]["must_not"] = query["query"]["bool"].pop("filter") - return dict(query) + return dict(query) - def build_quarters(self, start_date, end_date): - date_range = arrow.Arrow.range('month', start_date, end_date) + def build_quarters(self, start_date, end_date): + date_range = arrow.Arrow.range("month", start_date, end_date) - return [d for d in date_range if d.month in self.quarter_map] + return [d for d in date_range if d.month in self.quarter_map] def pairwise(iterable): - ''' Helper function taken from python docs to turn a list into a list of - pairwise tuples. + """Helper function taken from python docs to turn a list into a list of + pairwise tuples. - [s0, s1, s2, s3] -> [(s0,s1), (s1,s2), (s2, s3)] - ''' - a, b = tee(iterable) - next(b, None) + [s0, s1, s2, s3] -> [(s0,s1), (s1,s2), (s2, s3)] + """ + a, b = tee(iterable) + next(b, None) - return zip(a, b) + return zip(a, b) def walk_glob(file_pattern, crawl_dir): - # The directory layout is variable, depending upon the existence of - # partitions, so we have to walk everything from this point down. - results = [] + # The directory layout is variable, depending upon the existence of + # partitions, so we have to walk everything from this point down. + results = [] - for dirpath, dirs, filenames in os.walk(crawl_dir): - for name in filenames: - if name.startswith(file_pattern): - results.append(join(dirpath, name)) + for dirpath, dirs, filenames in os.walk(crawl_dir): + for name in filenames: + if name.startswith(file_pattern): + results.append(join(dirpath, name)) - return results + return results def _make_date_range_endpoint_batch(endpoint, params): - ''' Helper function to make the export quarters code more readable. This - function does two things: exports all data that is NOT in between two - dates (putting it in the all_other partition), and it exports every - quarter between two dates as a separate partition. - - Creates a list of EndpointExport objects, which get fed into the - exported. - ''' - - batch = [] - base = EndpointExport(endpoint) - quarter_map = base.quarter_map - - fmt = 'YYYY-MM-DD' - start_date = arrow.get(params['start_date'], fmt) - end_date = arrow.get(params['end_date'], fmt) - date_key = params['date_key'] - chunks = params['chunks'] - quarters = base.build_quarters(start_date, end_date) - - # First grab everything that is outside the range and put it in an `all_other` - # partition. 
- query = base.build_date_range_query(date_key, - start_date.format(fmt), - end_date.format(fmt), - negate=True) - - batch.append( - EndpointExport(endpoint, query=query, chunks=chunks, partition='all_other') - ) - - # Now iterate over each quarter as a range tuple. - # [(q1, q2), (q2, q3), ...] - for date_pair in pairwise(quarters): - start, end = date_pair - partition = str(start.year) + quarter_map[start.month] - query = base.build_date_range_query(date_key, - start.format(fmt), - end.format(fmt)) + """Helper function to make the export quarters code more readable. This + function does two things: exports all data that is NOT in between two + dates (putting it in the all_other partition), and it exports every + quarter between two dates as a separate partition. + + Creates a list of EndpointExport objects, which get fed into the + exported. + """ + + batch = [] + base = EndpointExport(endpoint) + quarter_map = base.quarter_map + + fmt = "YYYY-MM-DD" + start_date = arrow.get(params["start_date"], fmt) + end_date = arrow.get(params["end_date"], fmt) + date_key = params["date_key"] + chunks = params["chunks"] + quarters = base.build_quarters(start_date, end_date) + + # First grab everything that is outside the range and put it in an `all_other` + # partition. + query = base.build_date_range_query( + date_key, start_date.format(fmt), end_date.format(fmt), negate=True + ) + batch.append( - EndpointExport(endpoint, query=query, chunks=chunks, partition=partition) + EndpointExport(endpoint, query=query, chunks=chunks, partition="all_other") ) - return batch + # Now iterate over each quarter as a range tuple. + # [(q1, q2), (q2, q3), ...] + for date_pair in pairwise(quarters): + start, end = date_pair + partition = str(start.year) + quarter_map[start.month] + query = base.build_date_range_query( + date_key, start.format(fmt), end.format(fmt) + ) + batch.append( + EndpointExport(endpoint, query=query, chunks=chunks, partition=partition) + ) + + return batch def basic_cleaner(k, v): - ''' Cleaning function so that the output of Elasticsearch mimics the API. - That is, remove internal keys like `@field` and `field_exact`. - ''' - ignore = [ - 'baseline_510_k_exempt_flag', - 'baseline_510_k_flag', - 'baseline_510_k_number', - 'baseline_brand_name', - 'baseline_catalog_number', - 'baseline_date_ceased_marketing', - 'baseline_date_first_marketed', - 'baseline_device_family', - 'baseline_generic_name', - 'baseline_model_number', - 'baseline_other_id_number', - 'baseline_pma_flag', - 'baseline_pma_number', - 'baseline_preamendment_flag', - 'baseline_shelf_life_contained', - 'baseline_shelf_life_in_months', - 'baseline_transitional_flag' - ] - - if k in ignore: - return None - - if k.startswith('@'): - return None - - if k.endswith('_exact'): - return None - - return (k, v) + """Cleaning function so that the output of Elasticsearch mimics the API. + That is, remove internal keys like `@field` and `field_exact`. 
+ """ + ignore = [ + "baseline_510_k_exempt_flag", + "baseline_510_k_flag", + "baseline_510_k_number", + "baseline_brand_name", + "baseline_catalog_number", + "baseline_date_ceased_marketing", + "baseline_date_first_marketed", + "baseline_device_family", + "baseline_generic_name", + "baseline_model_number", + "baseline_other_id_number", + "baseline_pma_flag", + "baseline_pma_number", + "baseline_preamendment_flag", + "baseline_shelf_life_contained", + "baseline_shelf_life_in_months", + "baseline_transitional_flag", + ] + + if k in ignore: + return None + + if k.startswith("@"): + return None + + if k.endswith("_exact"): + return None + + return (k, v) def omit_internal_keys(data): - ''' Cleaner function to pass to the dump_index command and is used as a - json.load(..., object_hook=omit_internal_keys). - ''' - return common.transform_dict(data, basic_cleaner) + """Cleaner function to pass to the dump_index command and is used as a + json.load(..., object_hook=omit_internal_keys). + """ + return common.transform_dict(data, basic_cleaner) class MakeExportBatches(AlwaysRunTask): - def requires(self): - return [] - - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'batches')) - - def _run(self): - shutil.rmtree(self.output().path, ignore_errors=True) - os.makedirs(self.output().path) - # Get all of the endpoints served by this index - # Create an `EndpointExport` object for each endpoint in order to export - # each endpoint properly. - # - # Endpoint exports can be: - # date range based (quarterly output) - # filter based (index serves many endpoints) - # vanilla (endpoint is 1 to 1 with index and it is exported all at once) - for endpoint, index_name in ENDPOINT_INDEX_MAP.items(): - endpoint_batches = [] - chunks = CUSTOM_CHUNKS.get(endpoint, DEFAULT_CHUNKS) - if endpoint in RANGE_ENDPOINT_MAP: - params = RANGE_ENDPOINT_MAP[endpoint] - params['chunks'] = chunks - endpoint_batches = _make_date_range_endpoint_batch(endpoint, params) - elif endpoint in FILTERED_ENPOINT_MAP: - params = FILTERED_ENPOINT_MAP[endpoint] - query = EndpointExport.build_term_filter(**params) - endpoint_batches.append( - EndpointExport(endpoint, query=query, chunks=chunks) - ) - else: - endpoint_batches.append(EndpointExport(endpoint, chunks=chunks)) + def requires(self): + return [] + + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "batches")) + + def _run(self): + shutil.rmtree(self.output().path, ignore_errors=True) + os.makedirs(self.output().path) + # Get all of the endpoints served by this index + # Create an `EndpointExport` object for each endpoint in order to export + # each endpoint properly. 
+ # + # Endpoint exports can be: + # date range based (quarterly output) + # filter based (index serves many endpoints) + # vanilla (endpoint is 1 to 1 with index and it is exported all at once) + for endpoint, index_name in ENDPOINT_INDEX_MAP.items(): + endpoint_batches = [] + chunks = CUSTOM_CHUNKS.get(endpoint, DEFAULT_CHUNKS) + if endpoint in RANGE_ENDPOINT_MAP: + params = RANGE_ENDPOINT_MAP[endpoint] + params["chunks"] = chunks + endpoint_batches = _make_date_range_endpoint_batch(endpoint, params) + elif endpoint in FILTERED_ENPOINT_MAP: + params = FILTERED_ENPOINT_MAP[endpoint] + query = EndpointExport.build_term_filter(**params) + endpoint_batches.append( + EndpointExport(endpoint, query=query, chunks=chunks) + ) + else: + endpoint_batches.append(EndpointExport(endpoint, chunks=chunks)) + + # This is a hack to overcome the shortcoming of the parallel library of + # only having one mapper process for a tiny, single file input. Since we + # want to execute these endpoint batches in parallel, we write each task + # to its own file. It will create a mapper for each file. + for ep in endpoint_batches: + partition = ep.partition if ep.partition else "all" + + if "enforcement" in ep.endpoint: + partition = ep.endpoint.replace("enforcement", "").replace("/", "") + elif "label" in ep.endpoint: + partition = ep.endpoint.replace("label", "").replace("/", "") + + output_dir = join(self.output().path, index_name) + common.shell_cmd("mkdir -p %s", output_dir) + file_name = join(output_dir, partition + ".json") + + with open(file_name, "w") as json_out: + json_dict = json.dumps(ep.__dict__) + json_out.write(json_dict + "\n") - # This is a hack to overcome the shortcoming of the parallel library of - # only having one mapper process for a tiny, single file input. Since we - # want to execute these endpoint batches in parallel, we write each task - # to its own file. It will create a mapper for each file. - for ep in endpoint_batches: - partition = ep.partition if ep.partition else 'all' - if 'enforcement' in ep.endpoint: - partition = ep.endpoint.replace('enforcement', '').replace('/', '') - elif 'label' in ep.endpoint: - partition = ep.endpoint.replace('label', '').replace('/', '') +class ParallelExportMapper(parallel.Mapper): + def __init__(self, output_dir): + self.output_dir = output_dir + + def map(self, key, value, output): + es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=120) + ep = common.ObjectDict(value) + schema_file = join(SCHEMA_DIR, ep.index_name + "_schema.json") + endpoint_dir = join(self.output_dir, ep.endpoint[1:]) + target_dir = join(endpoint_dir, ep.partition) + common.shell_cmd("mkdir -p %s", target_dir) + if self.index_changed_since_last_export(es_client, ep.index_name, target_dir): + index_util.dump_index( + es_client, + ep.index_name, + ep.endpoint, + target_dir, + cleaner=omit_internal_keys, + query=ep.query, + chunks=ep.chunks, + ) + # Copy the current JSON schema to the zip location so that it is included + # in the sync to s3. flock is required to avoid a race condition when copying the schema file. 
+ common.shell_cmd_quiet( + "flock --verbose %s cp %s %s", schema_file, schema_file, endpoint_dir + ) - output_dir = join(self.output().path, index_name) - common.shell_cmd('mkdir -p %s', output_dir) - file_name = join(output_dir, partition + '.json') + def index_changed_since_last_export(self, es_client, index_name, target_dir): + manifest_path = join(target_dir, "manifest.json") + if os.path.isfile(manifest_path): + with open(manifest_path, "r") as manifest_file: + manifest = json.load(manifest_file) + if manifest.get("index_stamp", ""): + stamp = manifest["index_stamp"] + if stamp == index_util.get_stamp(es_client, index_name): + logging.info( + "Index %s has not changed since last export; skipping %s", + index_name, + target_dir, + ) + return False + return True - with open(file_name, 'w') as json_out: - json_dict = json.dumps(ep.__dict__) - json_out.write(json_dict + '\n') +class ParallelExport(AlwaysRunTask): + def requires(self): + return MakeExportBatches() + + def output(self): + return luigi.LocalTarget(FILES_DIR) + + def _run(self): + files = glob.glob(self.input().path + "/*/*.json") + parallel.mapreduce( + parallel.Collection.from_glob(files, parallel.JSONLineInput()), + mapper=ParallelExportMapper(output_dir=self.output().path), + reducer=parallel.NullReducer(), + output_prefix=join(BASE_DIR, "tmp"), + output_format=parallel.NullOutput(), + map_workers=12, + ) -class ParallelExportMapper(parallel.Mapper): - def __init__(self, output_dir): - self.output_dir = output_dir - - def map(self, key, value, output): - es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=120) - ep = common.ObjectDict(value) - schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json') - endpoint_dir = join(self.output_dir, ep.endpoint[1:]) - target_dir = join(endpoint_dir, ep.partition) - common.shell_cmd('mkdir -p %s', target_dir) - if self.index_changed_since_last_export(es_client, ep.index_name, target_dir): - index_util.dump_index(es_client, - ep.index_name, - ep.endpoint, - target_dir, - cleaner=omit_internal_keys, - query=ep.query, - chunks=ep.chunks) - # Copy the current JSON schema to the zip location so that it is included - # in the sync to s3. flock is required to avoid a race condition when copying the schema file. 
- common.shell_cmd_quiet('flock --verbose %s cp %s %s', schema_file, schema_file, endpoint_dir) - - def index_changed_since_last_export(self, es_client, index_name, target_dir): - manifest_path = join(target_dir, 'manifest.json') - if (os.path.isfile(manifest_path)): - with open(manifest_path, 'r') as manifest_file: - manifest = json.load(manifest_file) - if manifest.get('index_stamp', ''): - stamp = manifest['index_stamp'] - if stamp == index_util.get_stamp(es_client, index_name): - logging.info('Index %s has not changed since last export; skipping %s', index_name, target_dir) - return False - return True +class CopyIndexToS3(AlwaysRunTask): + date_str = luigi.Parameter() + download_bucket = luigi.Parameter() + + def requires(self): + return ParallelExport() + + def _run(self): + sync_path = FILES_DIR + target_bucket = "s3://%s/%s/" % (self.download_bucket, self.date_str) + s3_cmd = [ + "aws", + "--profile", + config.aws_profile(), + "s3", + "sync", + sync_path, + target_bucket, + '--exclude "*"', + '--include "*.zip"', + '--include "*schema.json"', + ] + + common.shell_cmd_quiet(" ".join(s3_cmd)) -class ParallelExport(AlwaysRunTask): - def requires(self): - return MakeExportBatches() +class CombineManifests(index_util.AlwaysRunTask): + date_str = luigi.Parameter() + download_bucket = luigi.Parameter() - def output(self): - return luigi.LocalTarget(FILES_DIR) + def requires(self): + return CopyIndexToS3( + date_str=self.date_str, download_bucket=self.download_bucket + ) - def _run(self): - files = glob.glob(self.input().path + '/*/*.json') - parallel.mapreduce( - parallel.Collection.from_glob(files, parallel.JSONLineInput()), - mapper=ParallelExportMapper(output_dir=self.output().path), - reducer=parallel.NullReducer(), - output_prefix=join(BASE_DIR, 'tmp'), - output_format=parallel.NullOutput(), - map_workers=12) + def output(self): + target_dir = join(FILES_DIR, "manifest/final_manifest.json") + return luigi.LocalTarget(target_dir) + def _run(self): + crawl_dir = dirname(dirname(self.output().path)) + common.shell_cmd("mkdir -p %s", dirname(self.output().path)) -class CopyIndexToS3(AlwaysRunTask): - date_str = luigi.Parameter() - download_bucket = luigi.Parameter() - - def requires(self): - return ParallelExport() - - def _run(self): - sync_path = FILES_DIR - target_bucket = 's3://%s/%s/' % (self.download_bucket, self.date_str) - s3_cmd = [ - 'aws', - '--profile', - config.aws_profile(), - 's3', - 'sync', - sync_path, - target_bucket, - '--exclude "*"', - '--include "*.zip"', - '--include "*schema.json"'] - - common.shell_cmd_quiet(' '.join(s3_cmd)) + manifests = walk_glob("manifest.json", crawl_dir) + records = [] + for file_name in manifests: + records.append(json.load(open(file_name))) -class CombineManifests(index_util.AlwaysRunTask): - date_str = luigi.Parameter() - download_bucket = luigi.Parameter() - - def requires(self): - return CopyIndexToS3(date_str=self.date_str, - download_bucket=self.download_bucket) - - def output(self): - target_dir = join(FILES_DIR, 'manifest/final_manifest.json') - return luigi.LocalTarget(target_dir) - - def _run(self): - crawl_dir = dirname(dirname(self.output().path)) - common.shell_cmd('mkdir -p %s', dirname(self.output().path)) - - manifests = walk_glob('manifest.json', crawl_dir) - - records = [] - for file_name in manifests: - records.append(json.load(open(file_name))) - - # Default data structure that creates the appropriate structure on the - # first put so that we can blindly use `+=` when appropriate. 
- combined = collections.defaultdict( - lambda: collections.defaultdict( - lambda: { - 'export_date': None, - 'partitions': [], - 'total_records': 0 - } - ) - ) + # Default data structure that creates the appropriate structure on the + # first put so that we can blindly use `+=` when appropriate. + combined = collections.defaultdict( + lambda: collections.defaultdict( + lambda: {"export_date": None, "partitions": [], "total_records": 0} + ) + ) - # Walk over all of the manifests and create a single dictionary - for row in records: - row.pop('index_stamp', None) - for domain, value in row.items(): - for sub, val in value.items(): - combined[domain][sub]['export_date'] = val.get('export_date', '') - combined[domain][sub]['partitions'] += val.get('partitions', []) - combined[domain][sub]['total_records'] += val.get('total_records', 0) + # Walk over all of the manifests and create a single dictionary + for row in records: + row.pop("index_stamp", None) + for domain, value in row.items(): + for sub, val in value.items(): + combined[domain][sub]["export_date"] = val.get("export_date", "") + combined[domain][sub]["partitions"] += val.get("partitions", []) + combined[domain][sub]["total_records"] += val.get( + "total_records", 0 + ) - with open(join(self.output().path), 'w') as json_out: - json.dump(combined, json_out, indent=2) + with open(join(self.output().path), "w") as json_out: + json.dump(combined, json_out, indent=2) class LoadDownloadJSON(index_util.LoadJSONBase): - date_str = luigi.Parameter() - download_bucket = luigi.Parameter(default='download.open.fda.gov') - - index_name = 'openfdadata' - mapping_file = './schemas/downloads_mapping.json' - use_checksum = True - delete_index = False - - def _data(self): - if not self.data_source: - self.data_source = CombineManifests(date_str=self.date_str, - download_bucket=self.download_bucket) - return self.data_source - - def _run(self): - json_data = json.load(open(self.input()['data'].path)) - date_str = basename(dirname(dirname(self.input()['data'].path))) - index_dict = { - 'index': self.index_name, - 'doc_type': self.type_name, - 'id': date_str, - 'body': json_data - } - - es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=60) - - # We put the same document into the index twice. - # Once with a key as its date - # Once with a key of `current` - # The API will serve the current key as its default response. - # How we serve archive records is TBD, but the data is preserved this way. - es_client.index(**index_dict) - index_dict['id'] = 'current' - es_client.index(**index_dict) - - elasticsearch_requests.update_process_datetime( - config.es_client(), self.index_name, arrow.utcnow().format('YYYY-MM-DD'), arrow.utcnow().format('YYYY-MM-DD') - ) + date_str = luigi.Parameter() + download_bucket = luigi.Parameter(default="download.open.fda.gov") + + index_name = "openfdadata" + mapping_file = "./schemas/downloads_mapping.json" + use_checksum = True + delete_index = False + + def _data(self): + if not self.data_source: + self.data_source = CombineManifests( + date_str=self.date_str, download_bucket=self.download_bucket + ) + return self.data_source + + def _run(self): + json_data = json.load(open(self.input()["data"].path)) + date_str = basename(dirname(dirname(self.input()["data"].path))) + index_dict = { + "index": self.index_name, + "doc_type": self.type_name, + "id": date_str, + "body": json_data, + } + + es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=60) + + # We put the same document into the index twice. 
+ # Once with a key as its date + # Once with a key of `current` + # The API will serve the current key as its default response. + # How we serve archive records is TBD, but the data is preserved this way. + es_client.index(**index_dict) + index_dict["id"] = "current" + es_client.index(**index_dict) + + elasticsearch_requests.update_process_datetime( + config.es_client(), + self.index_name, + arrow.utcnow().format("YYYY-MM-DD"), + arrow.utcnow().format("YYYY-MM-DD"), + ) -if __name__ == '__main__': - luigi.run() +if __name__ == "__main__": + luigi.run() diff --git a/openfda/faers/annotate.py b/openfda/faers/annotate.py index 7b01a92..d7089ab 100644 --- a/openfda/faers/annotate.py +++ b/openfda/faers/annotate.py @@ -7,155 +7,161 @@ import simplejson as json from openfda import parallel -STRIP_PUNCT_RE = re.compile('[^a-zA-Z -]+') +STRIP_PUNCT_RE = re.compile("[^a-zA-Z -]+") + + def normalize_product_name(product_name): - 'Simple drugname normalization: strip punctuation and whitespace and lowercase.' - product_name = product_name.strip().lower() - return STRIP_PUNCT_RE.sub('', product_name) + "Simple drugname normalization: strip punctuation and whitespace and lowercase." + product_name = product_name.strip().lower() + return STRIP_PUNCT_RE.sub("", product_name) + def read_json_file(json_file): - for line in json_file: - yield json.loads(line) + for line in json_file: + yield json.loads(line) def read_harmonized_file(harmonized_file): - ''' - Create a dictionary that is keyed by drug names. First brand_name, - then brand_name + brand_name_suffix, finally generic_name. Used to join - against the event medicinalproduct value. - ''' - return_dict = collections.defaultdict(list) - for row in read_json_file(harmonized_file): - drug = row['brand_name'].strip().lower() - generic_name = row['generic_name'].strip().lower() - suffix = row['brand_name_suffix'].strip().lower() - - return_dict[normalize_product_name(drug)].append(row) + """ + Create a dictionary that is keyed by drug names. First brand_name, + then brand_name + brand_name_suffix, finally generic_name. Used to join + against the event medicinalproduct value. 
+ """ + return_dict = collections.defaultdict(list) + for row in read_json_file(harmonized_file): + drug = row["brand_name"].strip().lower() + generic_name = row["generic_name"].strip().lower() + suffix = row["brand_name_suffix"].strip().lower() - if suffix: - return_dict[normalize_product_name(drug + ' ' + suffix)].append(row) + return_dict[normalize_product_name(drug)].append(row) - if generic_name: - return_dict[normalize_product_name(generic_name)].append(row) + if suffix: + return_dict[normalize_product_name(drug + " " + suffix)].append(row) - return return_dict + if generic_name: + return_dict[normalize_product_name(generic_name)].append(row) + return return_dict def _add_field(openfda, field, value): - if not isinstance(value, list): - value = [value] - - for v in value: - if not v: - continue - if field not in openfda: - openfda[field] = {} - openfda[field][v] = True - return + if not isinstance(value, list): + value = [value] + + for v in value: + if not v: + continue + if field not in openfda: + openfda[field] = {} + openfda[field][v] = True + return def AddHarmonizedRowToOpenfda(openfda, row): - if not row['is_original_packager']: - return + if not row["is_original_packager"]: + return + + if row["product_ndc"] in row["spl_product_ndc"]: + _add_field(openfda, "application_number", row["application_number"]) + # Using the most precise possible name for brand_name + if row["brand_name_suffix"]: + brand_name = row["brand_name"] + " " + row["brand_name_suffix"] + else: + brand_name = row["brand_name"] + + _add_field(openfda, "brand_name", brand_name.rstrip().upper()) + _add_field(openfda, "generic_name", row["generic_name"].upper()) + _add_field(openfda, "manufacturer_name", row["manufacturer_name"]) + _add_field(openfda, "product_ndc", row["product_ndc"]) + _add_field(openfda, "product_ndc", row["spl_product_ndc"]) + _add_field(openfda, "product_type", row["product_type"]) + + for route in row["route"].split(";"): + route = route.strip() + _add_field(openfda, "route", route) + + for substance in row["substance_name"].split(";"): + substance = substance.strip() + _add_field(openfda, "substance_name", substance) + + for rxnorm in row["rxnorm"]: + _add_field(openfda, "rxcui", rxnorm["rxcui"]) + + _add_field(openfda, "spl_id", row["id"]) + _add_field(openfda, "spl_set_id", row["spl_set_id"]) + _add_field(openfda, "package_ndc", row["package_ndc"]) + + if row["unii_indexing"] != []: + unii_list = [] + for unii_row in row["unii_indexing"]: + for key, value in unii_row.items(): + if key == "unii": + unii_list.append(value) + if key == "va": + for this_item in value: + for va_key, va_value in this_item.items(): + if va_key == "name": + if va_value.find("[MoA]") != -1: + _add_field(openfda, "pharm_class_moa", va_value) + if ( + va_value.find("[Chemical/Ingredient]") != -1 + or va_value.find("[CS]") != -1 + ): + _add_field(openfda, "pharm_class_cs", va_value) + if va_value.find("[PE]") != -1: + _add_field(openfda, "pharm_class_pe", va_value) + if va_value.find("[EPC]") != -1: + _add_field(openfda, "pharm_class_epc", va_value) + if va_key == "number": + _add_field(openfda, "nui", va_value) + _add_field(openfda, "unii", unii_list) - if row['product_ndc'] in row['spl_product_ndc']: - _add_field(openfda, 'application_number', row['application_number']) - # Using the most precise possible name for brand_name - if row['brand_name_suffix']: - brand_name = row['brand_name'] + ' ' + row['brand_name_suffix'] - else: - brand_name = row['brand_name'] - - _add_field(openfda, 'brand_name', 
brand_name.rstrip().upper()) - _add_field(openfda, 'generic_name', row['generic_name'].upper()) - _add_field(openfda, 'manufacturer_name', row['manufacturer_name']) - _add_field(openfda, 'product_ndc', row['product_ndc']) - _add_field(openfda, 'product_ndc', row['spl_product_ndc']) - _add_field(openfda, 'product_type', row['product_type']) - - for route in row['route'].split(';'): - route = route.strip() - _add_field(openfda, 'route', route) - - for substance in row['substance_name'].split(';'): - substance = substance.strip() - _add_field(openfda, 'substance_name', substance) - - for rxnorm in row['rxnorm']: - _add_field(openfda, 'rxcui', rxnorm['rxcui']) - - _add_field(openfda, 'spl_id', row['id']) - _add_field(openfda, 'spl_set_id', row['spl_set_id']) - _add_field(openfda, 'package_ndc', row['package_ndc']) - - if row['unii_indexing'] != []: - unii_list = [] - for unii_row in row['unii_indexing']: - for key, value in unii_row.items(): - if key == 'unii': - unii_list.append(value) - if key == 'va': - for this_item in value: - for va_key, va_value in this_item.items(): - if va_key == 'name': - if va_value.find('[MoA]') != -1: - _add_field(openfda, 'pharm_class_moa', va_value) - if va_value.find('[Chemical/Ingredient]') != -1 or va_value.find('[CS]') != -1: - _add_field(openfda, 'pharm_class_cs', va_value) - if va_value.find('[PE]') != -1: - _add_field(openfda, 'pharm_class_pe', va_value) - if va_value.find('[EPC]') != -1: - _add_field(openfda, 'pharm_class_epc', va_value) - if va_key == 'number': - _add_field(openfda, 'nui', va_value) - _add_field(openfda, 'unii', unii_list) def AnnotateDrug(drug, harmonized_dict): - if 'medicinalproduct' not in drug or not drug['medicinalproduct']: - return None + if "medicinalproduct" not in drug or not drug["medicinalproduct"]: + return None - medicinal_product = normalize_product_name(drug['medicinalproduct']) - if medicinal_product in harmonized_dict: - openfda = {} + medicinal_product = normalize_product_name(drug["medicinalproduct"]) + if medicinal_product in harmonized_dict: + openfda = {} - for row in harmonized_dict[medicinal_product]: - AddHarmonizedRowToOpenfda(openfda, row) + for row in harmonized_dict[medicinal_product]: + AddHarmonizedRowToOpenfda(openfda, row) - openfda_lists = {} - for field, value in openfda.items(): - openfda_lists[field] = [s for s in value.keys()] + openfda_lists = {} + for field, value in openfda.items(): + openfda_lists[field] = [s for s in value.keys()] - drug['openfda'] = openfda_lists + drug["openfda"] = openfda_lists def AnnotateEvent(event, version, harmonized_dict): - if 'patient' not in event: - return + if "patient" not in event: + return - for drug in event['patient']['drug']: - AnnotateDrug(drug, harmonized_dict) + for drug in event["patient"]["drug"]: + AnnotateDrug(drug, harmonized_dict) - try: - if not event['patient']['patientonsetage'].isdigit(): - event['patient']['patientonsetage'] = None - except: - pass + try: + if not event["patient"]["patientonsetage"].isdigit(): + event["patient"]["patientonsetage"] = None + except: + pass - date = event['receiptdate'] - event['@timestamp'] = date[0:4] + '-' + date[4:6] + '-' + date[6:8] - event['@version'] = version + date = event["receiptdate"] + event["@timestamp"] = date[0:4] + "-" + date[4:6] + "-" + date[6:8] + event["@version"] = version class AnnotateMapper(parallel.Mapper): - def __init__(self, harmonized_file): - self.harmonized_file = harmonized_file + def __init__(self, harmonized_file): + self.harmonized_file = harmonized_file - def map_shard(self, 
map_input, map_output): - harmonized_dict = read_harmonized_file(open(self.harmonized_file)) + def map_shard(self, map_input, map_output): + harmonized_dict = read_harmonized_file(open(self.harmonized_file)) - for case_number, (timestamp, event) in map_input: - # TODO(hansnelsen): Change all of the code to refer to the timestamp - # as 'version' - AnnotateEvent(event, timestamp, harmonized_dict) - map_output.add(case_number, event) + for case_number, (timestamp, event) in map_input: + # TODO(hansnelsen): Change all of the code to refer to the timestamp + # as 'version' + AnnotateEvent(event, timestamp, harmonized_dict) + map_output.add(case_number, event) diff --git a/openfda/faers/pipeline.py b/openfda/faers/pipeline.py index cada04c..2cab98d 100755 --- a/openfda/faers/pipeline.py +++ b/openfda/faers/pipeline.py @@ -28,198 +28,216 @@ # this should be a symlink to wherever the real data directory is RUN_DIR = dirname(dirname(os.path.abspath(__file__))) BASE_DIR = config.data_dir() -RAW_DIR = join(BASE_DIR, 'faers/raw') -FAERS_HISTORIC = 's3://openfda-data-faers-historical' -FAERS_CURRENT = 'https://fis.fda.gov/extensions/FPD-QDE-FAERS/FPD-QDE-FAERS.html' +RAW_DIR = join(BASE_DIR, "faers/raw") +FAERS_HISTORIC = "s3://openfda-data-faers-historical" +FAERS_CURRENT = "https://fis.fda.gov/extensions/FPD-QDE-FAERS/FPD-QDE-FAERS.html" MAX_RECORDS_PER_FILE = luigi.IntParameter(-1, is_global=True) + class DownloadDataset(luigi.Task): - ''' - This task downloads all datasets that have not yet been fetched. - ''' - def _fetch(self): - for page in [self._faers_current.find_all(href=re.compile('.*.zip'))]: - for a in page: - filename = a['href'].split('/')[-1] - yield filename, a['href'] - - def _download_with_retry(self, url, target_name): - if os.path.exists(target_name): - return - - for i in range(10): - try: - logging.info('Downloading: ' + url) - common.download(url, target_name) - subprocess.check_call('unzip -t %s' % target_name, shell=True) - return - except: - logging.info('Problem while unzipping[download URL:%s, zip file:%s], retrying...' , url, target_name) - logging.fatal('Zip File: %s from URL :%s is not valid, stop all processing', target_name, url) - - def output(self): - return luigi.LocalTarget(RAW_DIR) - - def run(self): - os.system('mkdir -p "%s"' % self.output().path) - self._faers_current = BeautifulSoup(urlopen(FAERS_CURRENT).read(), "lxml") - for filename, url in list(self._fetch()): - target_name = join(RAW_DIR, filename.lower()) - self._download_with_retry(url, target_name) + """ + This task downloads all datasets that have not yet been fetched. 
+ """ + + def _fetch(self): + for page in [self._faers_current.find_all(href=re.compile(".*.zip"))]: + for a in page: + filename = a["href"].split("/")[-1] + yield filename, a["href"] + + def _download_with_retry(self, url, target_name): + if os.path.exists(target_name): + return + + for i in range(10): + try: + logging.info("Downloading: " + url) + common.download(url, target_name) + subprocess.check_call("unzip -t %s" % target_name, shell=True) + return + except: + logging.info( + "Problem while unzipping[download URL:%s, zip file:%s], retrying...", + url, + target_name, + ) + logging.fatal( + "Zip File: %s from URL :%s is not valid, stop all processing", + target_name, + url, + ) + + def output(self): + return luigi.LocalTarget(RAW_DIR) + + def run(self): + os.system('mkdir -p "%s"' % self.output().path) + self._faers_current = BeautifulSoup(urlopen(FAERS_CURRENT).read(), "lxml") + for filename, url in list(self._fetch()): + target_name = join(RAW_DIR, filename.lower()) + self._download_with_retry(url, target_name) class ExtractZip(luigi.Task): - quarter = luigi.Parameter() - - def requires(self): - return DownloadDataset() - - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'faers/extracted/', self.quarter)) - - def run(self): - logging.info('Extracting: %s', self.quarter) - - src_dir = self.input().path - os.system('mkdir -p "%s"' % self.output().path) - pattern = join(src_dir, '*%s*.[Zz][Ii][Pp]' % self.quarter) - zip_files = glob.glob(pattern) - - if len(zip_files) == 0: - logging.warning( - 'Expected to find one or more files for quarter %s (searched for: %s)\n' - 'This may be a result of a bad download, or simply missing quarter data.', - self.quarter, pattern) - - extract_dir = self.output().path - for zip_file in zip_files: - os.system('unzip -o -d %(extract_dir)s %(zip_file)s' % locals()) - - # FAERS XML files aren't always well-formed due to invalid characters. E.g. 
ADR12Q4.xml - for xml in glob.glob(self.output().path + '/**/*.xml'): - logging.info('Cleaning XML file: %s', xml) - cleaned = os.path.splitext(xml)[0]+'_cleaned.xml' - os.system('iconv -f UTF-8 -t UTF-8 -c %(xml)s > %(cleaned)s' % locals()) - os.remove(xml) - - - # AERS SGM records don't always properly escape & - for sgm in glob.glob(self.output().path + '/*/sgml/*.SGM'): - logging.info('Escaping SGML file: %s', sgm) - os.system('LANG=C sed -i -e "s/&/&/" %s' % sgm) - - # Apply patches if they exist for this SGM - sgm_filename = sgm.split('/')[-1] - sgm_diff_glob = join(RUN_DIR, 'faers/diffs/*/*/%(sgm_filename)s.diff' % locals()) - for sgm_diff in glob.glob(sgm_diff_glob): - logging.info('Patching: %s', sgm) - cmd = 'patch -N %(sgm)s -i %(sgm_diff)s' % locals() - logging.info(cmd) - os.system(cmd) + quarter = luigi.Parameter() + + def requires(self): + return DownloadDataset() + + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "faers/extracted/", self.quarter)) + + def run(self): + logging.info("Extracting: %s", self.quarter) + + src_dir = self.input().path + os.system('mkdir -p "%s"' % self.output().path) + pattern = join(src_dir, "*%s*.[Zz][Ii][Pp]" % self.quarter) + zip_files = glob.glob(pattern) + + if len(zip_files) == 0: + logging.warning( + "Expected to find one or more files for quarter %s (searched for: %s)\n" + "This may be a result of a bad download, or simply missing quarter data.", + self.quarter, + pattern, + ) + + extract_dir = self.output().path + for zip_file in zip_files: + os.system("unzip -o -d %(extract_dir)s %(zip_file)s" % locals()) + + # FAERS XML files aren't always well-formed due to invalid characters. E.g. ADR12Q4.xml + for xml in glob.glob(self.output().path + "/**/*.xml"): + logging.info("Cleaning XML file: %s", xml) + cleaned = os.path.splitext(xml)[0] + "_cleaned.xml" + os.system("iconv -f UTF-8 -t UTF-8 -c %(xml)s > %(cleaned)s" % locals()) + os.remove(xml) + + # AERS SGM records don't always properly escape & + for sgm in glob.glob(self.output().path + "/*/sgml/*.SGM"): + logging.info("Escaping SGML file: %s", sgm) + os.system('LANG=C sed -i -e "s/&/&/" %s' % sgm) + + # Apply patches if they exist for this SGM + sgm_filename = sgm.split("/")[-1] + sgm_diff_glob = join( + RUN_DIR, "faers/diffs/*/*/%(sgm_filename)s.diff" % locals() + ) + for sgm_diff in glob.glob(sgm_diff_glob): + logging.info("Patching: %s", sgm) + cmd = "patch -N %(sgm)s -i %(sgm_diff)s" % locals() + logging.info(cmd) + os.system(cmd) + class XML2JSON(luigi.Task): - quarter = luigi.Parameter() - max_records_per_file = MAX_RECORDS_PER_FILE + quarter = luigi.Parameter() + max_records_per_file = MAX_RECORDS_PER_FILE - def requires(self): - return [ExtractZip(self.quarter)] + def requires(self): + return [ExtractZip(self.quarter)] - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'faers/json/', self.quarter)) + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "faers/json/", self.quarter)) - def run(self): - # AERS_SGML_2007q4.ZIP has files in sqml + def run(self): + # AERS_SGML_2007q4.ZIP has files in sqml - os.system('mkdir -p "%s"' % self.output().path) + os.system('mkdir -p "%s"' % self.output().path) - filenames = [] - for input in self.input(): - sgml_path = '/s[gq]ml/*.SGM' - xml_path = '/[Xx][Mm][Ll]/*.xml' - logging.info('Checking for inputs in: %s', input.path) - filenames.extend(glob.glob(input.path + sgml_path)) - filenames.extend(glob.glob(input.path + xml_path)) + filenames = [] + for input in self.input(): + sgml_path = "/s[gq]ml/*.SGM" + 
xml_path = "/[Xx][Mm][Ll]/*.xml" + logging.info("Checking for inputs in: %s", input.path) + filenames.extend(glob.glob(input.path + sgml_path)) + filenames.extend(glob.glob(input.path + xml_path)) - if len(filenames) == 0: - logging.warning('No files to process for quarter? %s', self.quarter) - return + if len(filenames) == 0: + logging.warning("No files to process for quarter? %s", self.quarter) + return - input_shards = [] - for filename in filenames: - if 'test' in filename.lower(): - continue - logging.info('Adding input file to pool: %s', filename) - input_shards.append(filename) + input_shards = [] + for filename in filenames: + if "test" in filename.lower(): + continue + logging.info("Adding input file to pool: %s", filename) + input_shards.append(filename) - report_counts = parallel.mapreduce( - parallel.Collection.from_list(input_shards), - xml_to_json.ExtractSafetyReportsMapper(), - xml_to_json.MergeSafetyReportsReducer(), - self.output().path, - num_shards=16) + report_counts = parallel.mapreduce( + parallel.Collection.from_list(input_shards), + xml_to_json.ExtractSafetyReportsMapper(), + xml_to_json.MergeSafetyReportsReducer(), + self.output().path, + num_shards=16, + ) - combined_counts = collections.defaultdict(int) - for rc in report_counts: - for timestamp, count in iter(rc.items()): - combined_counts[timestamp] += count + combined_counts = collections.defaultdict(int) + for rc in report_counts: + for timestamp, count in iter(rc.items()): + combined_counts[timestamp] += count - print('----REPORT COUNTS----') - for timestamp, count in sorted(combined_counts.items()): - print('>> ', timestamp, count) + print("----REPORT COUNTS----") + for timestamp, count in sorted(combined_counts.items()): + print(">> ", timestamp, count) class AnnotateJSON(DependencyTriggeredTask): - quarter = luigi.Parameter() + quarter = luigi.Parameter() - def requires(self): - return [CombineHarmonization(), XML2JSON(self.quarter)] + def requires(self): + return [CombineHarmonization(), XML2JSON(self.quarter)] - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'faers/annotated/', self.quarter)) + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "faers/annotated/", self.quarter)) - def run(self): - harmonized_file = self.input()[0].path - parallel.mapreduce( - parallel.Collection.from_sharded(self.input()[1].path), - annotate.AnnotateMapper(harmonized_file), - parallel.IdentityReducer(), - self.output().path) + def run(self): + harmonized_file = self.input()[0].path + parallel.mapreduce( + parallel.Collection.from_sharded(self.input()[1].path), + annotate.AnnotateMapper(harmonized_file), + parallel.IdentityReducer(), + self.output().path, + ) class LoadJSONQuarter(index_util.LoadJSONBase): - quarter = luigi.Parameter() - index_name = 'drugevent' - mapping_file = './schemas/faers_mapping.json' - docid_key='@case_number' - use_checksum = True - last_update_date = lambda _: newest_file_timestamp(RAW_DIR) + quarter = luigi.Parameter() + index_name = "drugevent" + mapping_file = "./schemas/faers_mapping.json" + docid_key = "@case_number" + use_checksum = True + last_update_date = lambda _: newest_file_timestamp(RAW_DIR) - # Optimize after all quarters are finished. - optimize_index = False + # Optimize after all quarters are finished. 
+ optimize_index = False - def _data(self): - return AnnotateJSON(self.quarter) + def _data(self): + return AnnotateJSON(self.quarter) class LoadJSON(AlwaysRunTask): - quarter = luigi.Parameter(default='all') - - def requires(self): - if self.quarter == 'all': - now = arrow.now() - previous_task = NoopTask() - for year in range(2004, now.year + 1): - for quarter in range(1, 5): - task = LoadJSONQuarter(quarter='%4dq%d' % (year, quarter), previous_task=previous_task) - previous_task = task - yield task - else: - yield LoadJSONQuarter(quarter=self.quarter) - - def _run(self): - index_util.optimize_index('drugevent', wait_for_merge=0) - - -if __name__ == '__main__': - luigi.run() + quarter = luigi.Parameter(default="all") + + def requires(self): + if self.quarter == "all": + now = arrow.now() + previous_task = NoopTask() + for year in range(2004, now.year + 1): + for quarter in range(1, 5): + task = LoadJSONQuarter( + quarter="%4dq%d" % (year, quarter), previous_task=previous_task + ) + previous_task = task + yield task + else: + yield LoadJSONQuarter(quarter=self.quarter) + + def _run(self): + index_util.optimize_index("drugevent", wait_for_merge=0) + + +if __name__ == "__main__": + luigi.run() diff --git a/openfda/faers/tests/annotate_test.py b/openfda/faers/tests/annotate_test.py index 6563245..d3dd15a 100644 --- a/openfda/faers/tests/annotate_test.py +++ b/openfda/faers/tests/annotate_test.py @@ -9,37 +9,37 @@ class AnnotateUnitTest(unittest.TestCase): - 'FAERS Annotate Unit Test' + "FAERS Annotate Unit Test" - def setUp(self): - pass + def setUp(self): + pass - def test_annotate_event(self): - version = 10000 - harmonized_file = open_data_file('harmonized.small.json') - harmonized_dict = annotate.read_harmonized_file(harmonized_file) + def test_annotate_event(self): + version = 10000 + harmonized_file = open_data_file("harmonized.small.json") + harmonized_dict = annotate.read_harmonized_file(harmonized_file) - event = json.load(open_data_file('event.preannotation.json')) - annotate.AnnotateEvent(event, version, harmonized_dict) + event = json.load(open_data_file("event.preannotation.json")) + annotate.AnnotateEvent(event, version, harmonized_dict) - # Note that assertMultiLineEqual doesn't give pretty diffs so we do dict - # comparison of the parsed JSON instead. - # - # To update event.postannotation.json, uncoment the following line and run: - # nosetests -v -s openfda/faers > openfda/faers/tests/data/event.postannotation.json - #print json.dumps(event, indent=2, sort_keys=True) + # Note that assertMultiLineEqual doesn't give pretty diffs so we do dict + # comparison of the parsed JSON instead. + # + # To update event.postannotation.json, uncoment the following line and run: + # nosetests -v -s openfda/faers > openfda/faers/tests/data/event.postannotation.json + # print json.dumps(event, indent=2, sort_keys=True) - expected = json.load(open_data_file('event.postannotation.json')) + expected = json.load(open_data_file("event.postannotation.json")) - # Setting maxDiff to None means that there is no maximum length of diffs. - self.maxDiff = None - self.assertDictEqual(expected, event) + # Setting maxDiff to None means that there is no maximum length of diffs. 
+ self.maxDiff = None + self.assertDictEqual(expected, event) - def test_normalization(self): - self.assertEqual(annotate.normalize_product_name('HUMIRa.'), 'humira') - self.assertEqual(annotate.normalize_product_name('HUMIRA'), 'humira') - self.assertEqual(annotate.normalize_product_name('FOLIC ACID'), 'folic acid') + def test_normalization(self): + self.assertEqual(annotate.normalize_product_name("HUMIRa."), "humira") + self.assertEqual(annotate.normalize_product_name("HUMIRA"), "humira") + self.assertEqual(annotate.normalize_product_name("FOLIC ACID"), "folic acid") -if __name__ == '__main__': - unittest.main() +if __name__ == "__main__": + unittest.main() diff --git a/openfda/faers/xml_to_json.py b/openfda/faers/xml_to_json.py index c2764fe..8ac148c 100644 --- a/openfda/faers/xml_to_json.py +++ b/openfda/faers/xml_to_json.py @@ -15,150 +15,162 @@ class MergeSafetyReportsReducer(parallel.Reducer): - def __init__(self): - self.report_counts = collections.defaultdict(int) - parallel.Reducer.__init__(self) + def __init__(self): + self.report_counts = collections.defaultdict(int) + parallel.Reducer.__init__(self) - def reduce(self, key, values, output): - # keys are case numbers, values are (timestamp, json_data) - timestamp, report = sorted(values, key=lambda v: v[0])[-1] - self.report_counts[timestamp] += 1 - output.put(key, (timestamp, report)) + def reduce(self, key, values, output): + # keys are case numbers, values are (timestamp, json_data) + timestamp, report = sorted(values, key=lambda v: v[0])[-1] + self.report_counts[timestamp] += 1 + output.put(key, (timestamp, report)) - def reduce_finished(self): - return self.report_counts + def reduce_finished(self): + return self.report_counts def timestamp_from_filename(filename): - '''Returns a timestamp corresponding to the year/quarter in `filename`. ''' - match = re.search(r'/([0-9]+)q([0-4])/', filename) - year, quarter = int(match.group(1)), int(match.group(2)) - return arrow.get('%04d.%02d' % (year, quarter), 'YYYY.MM').int_timestamp + """Returns a timestamp corresponding to the year/quarter in `filename`.""" + match = re.search(r"/([0-9]+)q([0-4])/", filename) + year, quarter = int(match.group(1)), int(match.group(2)) + return arrow.get("%04d.%02d" % (year, quarter), "YYYY.MM").int_timestamp def parse_demo_file(demo_filename): - '''Parse a FAERS demo file. - - Returns a dictionary mapping from safety report ID to case number. - ''' - result = {} - with open(demo_filename) as f: - f.readline() # skip header - for line in f.read().split('\n'): - if not line: - continue - parts = line.split('$') - safety_report_id = parts[0] - case_number = parts[1] - result[safety_report_id] = case_number - - return result + """Parse a FAERS demo file. + + Returns a dictionary mapping from safety report ID to case number. + """ + result = {} + with open(demo_filename) as f: + f.readline() # skip header + for line in f.read().split("\n"): + if not line: + continue + parts = line.split("$") + safety_report_id = parts[0] + case_number = parts[1] + result[safety_report_id] = case_number + + return result + def case_insensitive_glob(pattern): - def either(c): - return '[%s%s]'%(c.lower(),c.upper()) if c.isalpha() else c - return glob.glob(''.join(map(either,pattern))) + def either(c): + return "[%s%s]" % (c.lower(), c.upper()) if c.isalpha() else c + + return glob.glob("".join(map(either, pattern))) + class ExtractSafetyReportsMapper(parallel.Mapper): - '''Extract safety reports from ``input_filename``. 
- - This additionally looks up the case number for a given safety report ID - using the AERS/FAERS ascii files. - - The resulting reports are converted to JSON. - - For each report, a 3-tuple (timestamp, case_number, json_str) is - added to ``report_queue``. - ''' - def __init__(self, max_records_per_file=-1): - # For testing, this can be set to a number > 0, which will stop the - # extraction process early. - self.max_records_per_file = max_records_per_file - self._record_count = 0 - - def map_shard(self, map_input, map_output): - inputs = list(map_input) - assert len(inputs) == 1 - input_filename = inputs[0][0] - logging.info('Extracting reports from %s', input_filename) - file_timestamp = timestamp_from_filename(input_filename) - logging.info('File timestamp: %s', file_timestamp) - - # report id to case number conversion only needed for AERS SGM files - # not FAERS XML files - input_is_sgml = input_filename.lower().find('sgml') != -1 - id_to_case = None - - if input_is_sgml: - input_dir = dirname(dirname(input_filename)) - ascii_files = case_insensitive_glob('%s/*/demo*.txt' % (input_dir)) - if ascii_files: - logging.info('Found DEMO file %s', ascii_files[0]) - id_to_case = parse_demo_file(ascii_files[0]) - else: - logging.info('No DEMO file for input %s', input_filename) - - - def handle_safety_report(_, safety_report): - '''Handle a single safety_report entry.''' - try: - self._record_count += 1 - if self.max_records_per_file > 0 and self._record_count >= self.max_records_per_file: - return False - - # Skip the small number of records without a patient section - if 'patient' not in list(safety_report.keys()): - return True - - # Have drug and reaction in a list (even if they are just one element) - if type(safety_report['patient']['drug']) != type([]): - safety_report['patient']['drug'] = [safety_report['patient']['drug']] - if type(safety_report['patient']['reaction']) != type([]): - safety_report['patient']['reaction'] = [ - safety_report['patient']['reaction']] - - # Ensure drugrecurrence is always an array in response to https://github.com/FDA/openfda/issues/139 - for drug in safety_report['patient']['drug']: - if 'drugrecurrence' in drug: - if type(drug['drugrecurrence']) != type([]): - drug['drugrecurrence'] = [drug['drugrecurrence']] - - # add timestamp for kibana + """Extract safety reports from ``input_filename``. + + This additionally looks up the case number for a given safety report ID + using the AERS/FAERS ascii files. + + The resulting reports are converted to JSON. + + For each report, a 3-tuple (timestamp, case_number, json_str) is + added to ``report_queue``. + """ + + def __init__(self, max_records_per_file=-1): + # For testing, this can be set to a number > 0, which will stop the + # extraction process early. 
+ self.max_records_per_file = max_records_per_file + self._record_count = 0 + + def map_shard(self, map_input, map_output): + inputs = list(map_input) + assert len(inputs) == 1 + input_filename = inputs[0][0] + logging.info("Extracting reports from %s", input_filename) + file_timestamp = timestamp_from_filename(input_filename) + logging.info("File timestamp: %s", file_timestamp) + + # report id to case number conversion only needed for AERS SGM files + # not FAERS XML files + input_is_sgml = input_filename.lower().find("sgml") != -1 + id_to_case = None + + if input_is_sgml: + input_dir = dirname(dirname(input_filename)) + ascii_files = case_insensitive_glob("%s/*/demo*.txt" % (input_dir)) + if ascii_files: + logging.info("Found DEMO file %s", ascii_files[0]) + id_to_case = parse_demo_file(ascii_files[0]) + else: + logging.info("No DEMO file for input %s", input_filename) + + def handle_safety_report(_, safety_report): + """Handle a single safety_report entry.""" + try: + self._record_count += 1 + if ( + self.max_records_per_file > 0 + and self._record_count >= self.max_records_per_file + ): + return False + + # Skip the small number of records without a patient section + if "patient" not in list(safety_report.keys()): + return True + + # Have drug and reaction in a list (even if they are just one element) + if type(safety_report["patient"]["drug"]) != type([]): + safety_report["patient"]["drug"] = [ + safety_report["patient"]["drug"] + ] + if type(safety_report["patient"]["reaction"]) != type([]): + safety_report["patient"]["reaction"] = [ + safety_report["patient"]["reaction"] + ] + + # Ensure drugrecurrence is always an array in response to https://github.com/FDA/openfda/issues/139 + for drug in safety_report["patient"]["drug"]: + if "drugrecurrence" in drug: + if type(drug["drugrecurrence"]) != type([]): + drug["drugrecurrence"] = [drug["drugrecurrence"]] + + # add timestamp for kibana + try: + d = safety_report["receiptdate"] + if d: + safety_report["@timestamp"] = ( + d[0:4] + "-" + d[4:6] + "-" + d[6:8] + ) + except: + pass + + # print json.dumps(safety_report, sort_keys=True, + # indent=2, separators=(',', ':')) + report_id = safety_report["safetyreportid"] + + # strip "check" digit + report_id = report_id.split("-")[0] + if id_to_case and id_to_case.get(report_id): + case_number = id_to_case[report_id] + else: + case_number = report_id + + safety_report["@case_number"] = case_number + + map_output.add(case_number, (file_timestamp, safety_report)) + return True + except Exception: + # We sometimes encounter bad records. + # Ignore them and continue processing. + logging.error("Traceback in file: %s" % input_filename) + traceback.print_exc() + logging.error("Report was: %s", pprint.pformat(safety_report)) + raise + try: - d = safety_report['receiptdate'] - if d: - safety_report['@timestamp'] = d[0:4] + '-' + d[4:6] + '-' + d[6:8] + xmltodict.parse( + open(input_filename, mode="rb"), + item_depth=2, + item_callback=handle_safety_report, + ) except: - pass - - # print json.dumps(safety_report, sort_keys=True, - # indent=2, separators=(',', ':')) - report_id = safety_report['safetyreportid'] - - # strip "check" digit - report_id = report_id.split('-')[0] - if id_to_case and id_to_case.get(report_id): - case_number = id_to_case[report_id] - else: - case_number = report_id - - safety_report['@case_number'] = case_number - - map_output.add(case_number, (file_timestamp, safety_report)) - return True - except Exception: - # We sometimes encounter bad records. 
- # Ignore them and continue processing. - logging.error('Traceback in file: %s' % input_filename) - traceback.print_exc() - logging.error('Report was: %s', pprint.pformat(safety_report)) - raise - - try: - xmltodict.parse(open(input_filename, mode='rb'), - item_depth=2, - item_callback=handle_safety_report) - except: - traceback.print_exc() - raise - + traceback.print_exc() + raise diff --git a/openfda/index_util.py b/openfda/index_util.py index 2f7263f..cd7c469 100644 --- a/openfda/index_util.py +++ b/openfda/index_util.py @@ -1,8 +1,8 @@ #!/usr/bin/env python -''' +""" Helper functions for managing ElasticSearch indices. -''' +""" import contextlib import hashlib @@ -18,524 +18,564 @@ import elasticsearch.helpers import luigi import zipfile as zipfile -from elasticsearch.client.utils import _make_path +from elasticsearch.client.utils import _make_path from openfda import config, elasticsearch_requests, parallel from openfda.common import BatchHelper from openfda.tasks import AlwaysRunTask, NoopTask import simplejson as json + def refresh_index(index_name): - logging.info('Refreshing: %s', index_name) - resp = requests.post('http://%s/%s/_refresh' % - (config.es_host(), index_name), timeout=100000) - resp.raise_for_status() + logging.info("Refreshing: %s", index_name) + resp = requests.post( + "http://%s/%s/_refresh" % (config.es_host(), index_name), timeout=100000 + ) + resp.raise_for_status() def optimize_index(index_name, wait_for_merge=1): - # elasticsearch client throws a timeout error when waiting for an optimize - # command to finish, so we use requests instead - logging.info('Optimizing: %s (this may take a while)', index_name) - resp = requests.post('http://%s/%s/_forcemerge?max_num_segments=1' % - (config.es_host(), index_name), timeout=100000) - resp.raise_for_status() - - -def dump_index(es, - index, - endpoint, - target_dir, - cleaner=None, - query=None, - chunks=100000): - ''' - Dump the index `index` to disk as compressed JSON files. If a query is passed - into the function, then it is filters out those records before the dump. If - chunks is passed into the function, then a new file is written for every - chunk. - ''' - endpoint_list = endpoint[1:].split('/') - update_date = arrow.utcnow().format('YYYY-MM-DD') - - def _write_chunk(target_file, data): - ''' Helper function for writing out zip file chunks. - ''' - file_to_zip = basename(target_file).replace('.zip', '') - try: - with contextlib.closing(zipfile.ZipFile(target_file, 'w')) as out_f: - zip_info = zipfile.ZipInfo(file_to_zip, time.localtime()[0:6]) - zip_info.external_attr = 0o777 << 16 - zip_info.compress_type = zipfile.ZIP_DEFLATED - out_f.writestr(zip_info, json.dumps(data, indent=2) + '\n') - except: - logging.error("Failed to write a chunk to: "+target_file) - raise - - def _make_file_name(target_dir, idx, total): - ''' Helper function for making uniform file names. 
- ''' - prefix = '-'.join(endpoint_list) - return join(target_dir, prefix + '-%04d-of-%04d.json.zip' % (idx, total)) - - def _make_display_name(endpoint, name): - ''' Isolating the thorny naming logic for the display name in a helper + # elasticsearch client throws a timeout error when waiting for an optimize + # command to finish, so we use requests instead + logging.info("Optimizing: %s (this may take a while)", index_name) + resp = requests.post( + "http://%s/%s/_forcemerge?max_num_segments=1" % (config.es_host(), index_name), + timeout=100000, + ) + resp.raise_for_status() + + +def dump_index( + es, index, endpoint, target_dir, cleaner=None, query=None, chunks=100000 +): + """ + Dump the index `index` to disk as compressed JSON files. If a query is passed + into the function, then it is filters out those records before the dump. If + chunks is passed into the function, then a new file is written for every + chunk. + """ + endpoint_list = endpoint[1:].split("/") + update_date = arrow.utcnow().format("YYYY-MM-DD") + + def _write_chunk(target_file, data): + """Helper function for writing out zip file chunks.""" + file_to_zip = basename(target_file).replace(".zip", "") + try: + with contextlib.closing(zipfile.ZipFile(target_file, "w")) as out_f: + zip_info = zipfile.ZipInfo(file_to_zip, time.localtime()[0:6]) + zip_info.external_attr = 0o777 << 16 + zip_info.compress_type = zipfile.ZIP_DEFLATED + out_f.writestr(zip_info, json.dumps(data, indent=2) + "\n") + except: + logging.error("Failed to write a chunk to: " + target_file) + raise + + def _make_file_name(target_dir, idx, total): + """Helper function for making uniform file names.""" + prefix = "-".join(endpoint_list) + return join(target_dir, prefix + "-%04d-of-%04d.json.zip" % (idx, total)) + + def _make_display_name(endpoint, name): + """Isolating the thorny naming logic for the display name in a helper function. The naming is solely present for url cleanliness. 
Possible outputs (examples): No partition and no shards: `Device pma data` No partition but sharded: `Device pma (part 1 of 2)` Partition but no shards: `2014 Q1 (all)` Partition with shards: `2014 Q1 (part 1 of 2)` - ''' - partition = '' - parts = name.split('/') - file_name = parts[-1] - pieces = file_name.split('-of-') - part, total = int(pieces[0][-4:]), int(pieces[1][:4]) - part_text = ' (all)' - - if len(parts) > 1: - partition = ' '.join(parts[0:-1]) - - # Currently only support quartly partitions, so we can safely split on `q` - if partition: - if 'q' in partition: - partition = ' q'.join(partition.split('q')) - partition = ' {partition}'.format(partition=partition).title() - elif 'all' in partition.lower(): - partition = 'All other data' - - if total > 1: - part_text = ' (part {part} of {total})'.format(**locals()) - elif total == 1 and not partition: - part_text = ' data' - - if partition: - endpoint = '' - - return '{endpoint}{partition}{part_text}'.format(**locals()).strip() - - def _make_partition(file_name, endpoint, total): - ''' Helper function for making the partition object that is appended to + """ + partition = "" + parts = name.split("/") + file_name = parts[-1] + pieces = file_name.split("-of-") + part, total = int(pieces[0][-4:]), int(pieces[1][:4]) + part_text = " (all)" + + if len(parts) > 1: + partition = " ".join(parts[0:-1]) + + # Currently only support quartly partitions, so we can safely split on `q` + if partition: + if "q" in partition: + partition = " q".join(partition.split("q")) + partition = " {partition}".format(partition=partition).title() + elif "all" in partition.lower(): + partition = "All other data" + + if total > 1: + part_text = " (part {part} of {total})".format(**locals()) + elif total == 1 and not partition: + part_text = " data" + + if partition: + endpoint = "" + + return "{endpoint}{partition}{part_text}".format(**locals()).strip() + + def _make_partition(file_name, endpoint, total): + """Helper function for making the partition object that is appended to manifest.partitions list. - ''' - # http://download.open.fda.gov/device/event/1992q4/device-event-0001-of-0001.json.zip - # Sometimes the filename includes a partition, so we take everything after - # the endpoint and not just the basename - name = file_name.split(endpoint)[-1] - url = 'https://download.open.fda.gov' + endpoint + name - file_size = os.path.getsize(file_name)/1024/1024.0 - - return { - 'display_name': _make_display_name(endpoint, name), - 'file': url, - 'size_mb': '%0.2f' % file_size, - 'records': total + """ + # http://download.open.fda.gov/device/event/1992q4/device-event-0001-of-0001.json.zip + # Sometimes the filename includes a partition, so we take everything after + # the endpoint and not just the basename + name = file_name.split(endpoint)[-1] + url = "https://download.open.fda.gov" + endpoint + name + file_size = os.path.getsize(file_name) / 1024 / 1024.0 + + return { + "display_name": _make_display_name(endpoint, name), + "file": url, + "size_mb": "%0.2f" % file_size, + "records": total, + } + + # Fetch the total number of documents for the query + # This is used for both logging and the skip and limit values which are + # added to the dump as to simulate the API output. + total_docs = es.count(index=index, body=query) + total_docs = total_docs["count"] + total_zips = int(math.ceil(total_docs * 1.0 / chunks)) + domain, subdomain = endpoint_list + + # Tracking data structure that will eventually be written to ES for the + # downloads API. 
+ manifest = { + "index_stamp": get_stamp(es, index), + domain: { + subdomain: { + "export_date": update_date, + "total_records": total_docs, + "partitions": [], + } + }, } - # Fetch the total number of documents for the query - # This is used for both logging and the skip and limit values which are - # added to the dump as to simulate the API output. - total_docs = es.count(index=index, body=query) - total_docs = total_docs['count'] - total_zips = int(math.ceil(total_docs * 1.0/chunks)) - domain, subdomain = endpoint_list - - # Tracking data structure that will eventually be written to ES for the - # downloads API. - manifest = { - 'index_stamp': get_stamp(es, index), - domain: { - subdomain: { - 'export_date': update_date, - 'total_records': total_docs, - 'partitions': [] - } - } - } - - # All files are wrapped in the API format, so that the data interface remains - # constant between the API results and the download results. - disclaimer = ('Do not rely on openFDA to make decisions regarding medical care. ' - 'While we make every effort to ensure that data is accurate, you ' - 'should assume all results are unvalidated. We may limit or otherwise ' - 'restrict your access to the API in line with our Terms of Service.') - - # Bespoke disclaimer for caers API, if there are any more of these, then they - # should be externalized - caers_disclaimer = ('Do not rely on openFDA to make decisions regarding ' - 'medical care. While we make every effort to ensure that data is accurate, ' - 'you should assume all results are unvalidated. We may limit or otherwise ' - 'restrict your access to the API in line with our Terms of Service. ' - 'Submission of an adverse event report does not constitute an admission ' - 'that a product caused or contributed to an event. The information in ' - 'these reports has not been scientifically or otherwise verified as to a ' - 'cause and effect relationship and cannot be used to estimate incidence ' - '(occurrence rate) or to estimate risk.') - - return_dict = { - 'meta': { - 'disclaimer': caers_disclaimer if index == 'foodevent' else disclaimer, - 'terms': 'https://open.fda.gov/terms/', - 'license': 'https://open.fda.gov/license/', - 'last_updated': update_date, - 'results': { - 'skip': 0, - 'limit': chunks, - 'total': total_docs - } - }, - 'results': [] - } - - logging.info('Dumping %s (%d documents) to %s', index, total_docs, target_dir) - os.system('mkdir -p "%s"' % target_dir) - docs_written = 0 - file_idx = 1 - - target_file = _make_file_name(target_dir, file_idx, total_zips) - - results = [] - - query_with_sort = query.copy() if query is not None else None - if query_with_sort is not None: - query_with_sort['sort'] = ["_doc"] - - for result in elasticsearch.helpers.scan(es, - query=query_with_sort, - raise_on_error=True, - preserve_order=True, - scroll='30m', - index=index): - - # Grab a response from ES and clean it with the callback passed into the - # main function. This is so we can avoid exposing internal keys in the - # exported data. - source_data = json.loads(json.dumps(result['_source']), object_hook=cleaner) - results.append(source_data) - docs_written = len(results) - - # Write out a new file everytime we have fetched a chunks' worth. - if docs_written % chunks == 0: - chunks_written = int(return_dict['meta']['results']['skip'] + chunks) - total_written = (chunks_written * 1.0)/total_docs - logging.info('Writing %20s, %.2f%% done', index, 100. 
* total_written) - return_dict['results'] = results - - _write_chunk(target_file, return_dict) - manifest[domain][subdomain]['partitions'].append( - _make_partition(target_file, endpoint, len(results)) - ) - - results = [] - return_dict['meta']['results']['skip'] += chunks - file_idx += 1 - target_file = _make_file_name(target_dir, file_idx, total_zips) - - # Take care of any final results - if results: - return_dict['results'] = results - return_dict['meta']['results']['limit'] = len(results) - _write_chunk(target_file, return_dict) - manifest[domain][subdomain]['partitions'].append( - _make_partition(target_file, endpoint, len(results)) + # All files are wrapped in the API format, so that the data interface remains + # constant between the API results and the download results. + disclaimer = ( + "Do not rely on openFDA to make decisions regarding medical care. " + "While we make every effort to ensure that data is accurate, you " + "should assume all results are unvalidated. We may limit or otherwise " + "restrict your access to the API in line with our Terms of Service." ) - # Write out manifest file that will feed into the downloads API - with open(join(target_dir, 'manifest.json'), 'w') as manifest_out: - json.dump(manifest, manifest_out) + # Bespoke disclaimer for caers API, if there are any more of these, then they + # should be externalized + caers_disclaimer = ( + "Do not rely on openFDA to make decisions regarding " + "medical care. While we make every effort to ensure that data is accurate, " + "you should assume all results are unvalidated. We may limit or otherwise " + "restrict your access to the API in line with our Terms of Service. " + "Submission of an adverse event report does not constitute an admission " + "that a product caused or contributed to an event. The information in " + "these reports has not been scientifically or otherwise verified as to a " + "cause and effect relationship and cannot be used to estimate incidence " + "(occurrence rate) or to estimate risk." + ) + + return_dict = { + "meta": { + "disclaimer": caers_disclaimer if index == "foodevent" else disclaimer, + "terms": "https://open.fda.gov/terms/", + "license": "https://open.fda.gov/license/", + "last_updated": update_date, + "results": {"skip": 0, "limit": chunks, "total": total_docs}, + }, + "results": [], + } + + logging.info("Dumping %s (%d documents) to %s", index, total_docs, target_dir) + os.system('mkdir -p "%s"' % target_dir) + docs_written = 0 + file_idx = 1 + + target_file = _make_file_name(target_dir, file_idx, total_zips) + + results = [] + + query_with_sort = query.copy() if query is not None else None + if query_with_sort is not None: + query_with_sort["sort"] = ["_doc"] + + for result in elasticsearch.helpers.scan( + es, + query=query_with_sort, + raise_on_error=True, + preserve_order=True, + scroll="30m", + index=index, + ): + + # Grab a response from ES and clean it with the callback passed into the + # main function. This is so we can avoid exposing internal keys in the + # exported data. + source_data = json.loads(json.dumps(result["_source"]), object_hook=cleaner) + results.append(source_data) + docs_written = len(results) + + # Write out a new file everytime we have fetched a chunks' worth. 
+ if docs_written % chunks == 0: + chunks_written = int(return_dict["meta"]["results"]["skip"] + chunks) + total_written = (chunks_written * 1.0) / total_docs + logging.info("Writing %20s, %.2f%% done", index, 100.0 * total_written) + return_dict["results"] = results + + _write_chunk(target_file, return_dict) + manifest[domain][subdomain]["partitions"].append( + _make_partition(target_file, endpoint, len(results)) + ) + + results = [] + return_dict["meta"]["results"]["skip"] += chunks + file_idx += 1 + target_file = _make_file_name(target_dir, file_idx, total_zips) + + # Take care of any final results + if results: + return_dict["results"] = results + return_dict["meta"]["results"]["limit"] = len(results) + _write_chunk(target_file, return_dict) + manifest[domain][subdomain]["partitions"].append( + _make_partition(target_file, endpoint, len(results)) + ) + + # Write out manifest file that will feed into the downloads API + with open(join(target_dir, "manifest.json"), "w") as manifest_out: + json.dump(manifest, manifest_out) def index_without_checksum(es, index, doc_type, batch): - ''' - A function for bulk loading a batch into an index without versioning. - - This operation assumes that the index was newly created (or erased) prior to - indexing. - ''' - create_batch = [] - for doc in batch: - create_batch.append({ '_index' : index, - '_type' : doc_type, - '_source' : doc }) - - create_count, errors = elasticsearch.helpers.bulk(es, create_batch, - raise_on_error=False, - raise_on_exception=False) - for item in errors: - create_resp = item['index'] - if create_resp['status'] >= 400: - logging.error('Unexpected conflict in creating documents: %s', create_resp) - - if len(errors) > 0: - logging.warning('%s create batch complete: %s created %s errors', - index, create_count, len(errors)) + """ + A function for bulk loading a batch into an index without versioning. + + This operation assumes that the index was newly created (or erased) prior to + indexing. + """ + create_batch = [] + for doc in batch: + create_batch.append({"_index": index, "_type": doc_type, "_source": doc}) + + create_count, errors = elasticsearch.helpers.bulk( + es, create_batch, raise_on_error=False, raise_on_exception=False + ) + for item in errors: + create_resp = item["index"] + if create_resp["status"] >= 400: + logging.error("Unexpected conflict in creating documents: %s", create_resp) + + if len(errors) > 0: + logging.warning( + "%s create batch complete: %s created %s errors", + index, + create_count, + len(errors), + ) def index_with_checksum(es, index, doc_type, batch): - ''' - Index documents in `batch`, inserting them into `index`. - - As an optimization, a SHA1 checksum is computed for each document. If a - document already exists with the same checksum, we skip re-indexing the - document. - ''' - if not batch: - return - - # skip documents which have a matching checksum. the checksum of a document - # is the checksum of the sha1 string of the JSON representation (where the - # "@checksum" field is left blank. 
- batch_with_checksum = [] - for docid, doc in batch: - doc['@checksum'] = '' - checksum = hashlib.sha1(json.dumps(doc, sort_keys=True).encode('utf-8')).hexdigest() - doc['@checksum'] = checksum - batch_with_checksum.append((docid, checksum, doc)) - - # mapping from docid to checksum - existing_docs = {} - found_docs = es.transport.perform_request('GET', _make_path(index, doc_type, '_mget'), - params={'_source_includes': '@checksum'}, - body={'ids': [docid for (docid, _, _) in batch_with_checksum]}) - for match in found_docs['docs']: - if 'error' in match: - logging.warning('ERROR during query: %s', match['error']) - continue - - if not match['found']: - continue - - # some mappings convert integer ids into strings: we want to - # match the documents in either case, so we use a string id here - # explicitly - matching_id = str(match['_id']) - existing_docs[matching_id] = match['_source']['@checksum'] - - logging.debug('%d existing documents found', len(existing_docs)) - # filter out documents with matching checksums - filtered_batch = [] - skipped = 0 - for docid, checksum, doc in batch_with_checksum: - es_checksum = existing_docs.get(str(docid), None) - if es_checksum == checksum: - logging.debug('Skipping docid %s, already exists.', docid) - skipped += 1 - else: - logging.debug('Checksum mismatch for %s, [ %s vs %s ]', docid, checksum, es_checksum) - filtered_batch.append((docid, checksum, doc)) - - batch = filtered_batch - - index_batch = [] - for docid, checksum, doc in batch: - index_batch.append( - { '_op_type' : 'index', - '_id' : docid, - '_index' : index, - '_type' : doc_type, - '_source' : doc }) - - index_count, errors = elasticsearch.helpers.bulk(es, index_batch, - raise_on_error=False, - raise_on_exception=False) - - for item in errors: - update_resp = item['index'] - logging.error('Failed to index: %s', update_resp) - - logging.info('%s update batch complete: %s total, %s indexed, %s skipped, %s errors', - index, len(index_batch) + skipped, len(index_batch), skipped, len(errors)) - if len(errors) > 0: - raise Exception('Some of the documents failed to index: halting the pipeline!') + """ + Index documents in `batch`, inserting them into `index`. + + As an optimization, a SHA1 checksum is computed for each document. If a + document already exists with the same checksum, we skip re-indexing the + document. + """ + if not batch: + return + + # skip documents which have a matching checksum. the checksum of a document + # is the checksum of the sha1 string of the JSON representation (where the + # "@checksum" field is left blank. 
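The checksum trick described above is small enough to show on its own. A minimal sketch, assuming simplejson as used elsewhere in this module; doc_checksum is an illustrative helper, not the pipeline's code.

import hashlib
import simplejson as json

def doc_checksum(doc):
    # Blank "@checksum" before hashing so a previously stored checksum never
    # feeds back into the new hash; sort_keys makes the hash key-order independent.
    doc = dict(doc)
    doc["@checksum"] = ""
    return hashlib.sha1(json.dumps(doc, sort_keys=True).encode("utf-8")).hexdigest()

new_doc = {"docid": "123", "value": 42}
stored_doc = {"value": 42, "docid": "123", "@checksum": doc_checksum(new_doc)}
# An unchanged document hashes the same, so index_with_checksum would skip it.
assert doc_checksum(stored_doc) == doc_checksum(new_doc)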
+ batch_with_checksum = [] + for docid, doc in batch: + doc["@checksum"] = "" + checksum = hashlib.sha1( + json.dumps(doc, sort_keys=True).encode("utf-8") + ).hexdigest() + doc["@checksum"] = checksum + batch_with_checksum.append((docid, checksum, doc)) + + # mapping from docid to checksum + existing_docs = {} + found_docs = es.transport.perform_request( + "GET", + _make_path(index, doc_type, "_mget"), + params={"_source_includes": "@checksum"}, + body={"ids": [docid for (docid, _, _) in batch_with_checksum]}, + ) + for match in found_docs["docs"]: + if "error" in match: + logging.warning("ERROR during query: %s", match["error"]) + continue + + if not match["found"]: + continue + + # some mappings convert integer ids into strings: we want to + # match the documents in either case, so we use a string id here + # explicitly + matching_id = str(match["_id"]) + existing_docs[matching_id] = match["_source"]["@checksum"] + + logging.debug("%d existing documents found", len(existing_docs)) + # filter out documents with matching checksums + filtered_batch = [] + skipped = 0 + for docid, checksum, doc in batch_with_checksum: + es_checksum = existing_docs.get(str(docid), None) + if es_checksum == checksum: + logging.debug("Skipping docid %s, already exists.", docid) + skipped += 1 + else: + logging.debug( + "Checksum mismatch for %s, [ %s vs %s ]", docid, checksum, es_checksum + ) + filtered_batch.append((docid, checksum, doc)) + + batch = filtered_batch + + index_batch = [] + for docid, checksum, doc in batch: + index_batch.append( + { + "_op_type": "index", + "_id": docid, + "_index": index, + "_type": doc_type, + "_source": doc, + } + ) + + index_count, errors = elasticsearch.helpers.bulk( + es, index_batch, raise_on_error=False, raise_on_exception=False + ) + + for item in errors: + update_resp = item["index"] + logging.error("Failed to index: %s", update_resp) + + logging.info( + "%s update batch complete: %s total, %s indexed, %s skipped, %s errors", + index, + len(index_batch) + skipped, + len(index_batch), + skipped, + len(errors), + ) + if len(errors) > 0: + raise Exception("Some of the documents failed to index: halting the pipeline!") + def get_mapping(es, index): - ''' Get the Elasticsearch mapping for an index. - ''' - indices = es.indices + """Get the Elasticsearch mapping for an index.""" + indices = es.indices + + return indices.get_mapping(index=index) - return indices.get_mapping(index=index) def get_stamp(es, index): - ''' Generates a "stamp" (or a hash) for the ES index that could be used to determine whether or not any updates have been made to the index - since the last export. Elasticsearch itself does not offer such a capability -- see the discussion here for example https://github.com/elastic/elasticsearch/issues/13462 -- - and we don't want to introduce a new metadata field to track index updates. So we are going to use the _stats API and rely on - a combination of a number of fields that is very likely to change when any updates are made to the index. - ''' - stats = es.indices.stats(index=index) - uuid = stats['indices'][index]['uuid'] - total_docs = stats['indices'][index]['total']['docs']['count'] - total_deleted = stats['indices'][index]['total']['docs']['deleted'] - total_store = stats['indices'][index]['total']['store']['size_in_bytes'] - return f'{uuid}_{total_docs}_{total_deleted}_{total_store}' + """Generates a "stamp" (or a hash) for the ES index that could be used to determine whether or not any updates have been made to the index + since the last export. 
Elasticsearch itself does not offer such a capability -- see the discussion here for example https://github.com/elastic/elasticsearch/issues/13462 -- + and we don't want to introduce a new metadata field to track index updates. So we are going to use the _stats API and rely on + a combination of a number of fields that is very likely to change when any updates are made to the index. + """ + stats = es.indices.stats(index=index) + uuid = stats["indices"][index]["uuid"] + total_docs = stats["indices"][index]["total"]["docs"]["count"] + total_deleted = stats["indices"][index]["total"]["docs"]["deleted"] + total_store = stats["indices"][index]["total"]["store"]["size_in_bytes"] + return f"{uuid}_{total_docs}_{total_deleted}_{total_store}" def copy_mapping(es, source_index, target_index): - ''' - Copy the mappings and settings from `source_index` to `target_index`. + """ + Copy the mappings and settings from `source_index` to `target_index`. - `target_index` must not exist. This operation does not copy any data. - ''' - idx = es.indices - mapping = idx.get_mapping(index=source_index) - settings = idx.get_settings(index=source_index)[source_index] + `target_index` must not exist. This operation does not copy any data. + """ + idx = es.indices + mapping = idx.get_mapping(index=source_index) + settings = idx.get_settings(index=source_index)[source_index] - assert not es.indices.exists(target_index), ( - 'Trying to copy mapping to an already existing index: %s' % target_index) + assert not es.indices.exists(target_index), ( + "Trying to copy mapping to an already existing index: %s" % target_index + ) - idx.create(target_index, - body={ 'settings' : settings['settings'] }) + idx.create(target_index, body={"settings": settings["settings"]}) - for doc_type, schema in mapping[source_index]['mappings'].items(): - idx.put_mapping(index=target_index, doc_type=doc_type, body=schema) + for doc_type, schema in mapping[source_index]["mappings"].items(): + idx.put_mapping(index=target_index, doc_type=doc_type, body=schema) class ResetElasticSearch(AlwaysRunTask): - '''Create a new index with the given type and mapping.''' - target_host = luigi.Parameter() - target_index_name = luigi.Parameter() - target_type_name = luigi.Parameter() - target_mapping_file = luigi.Parameter() - delete_index = luigi.BoolParameter(default=False) - - def _run(self): - logging.info('Create index and loading mapping: %s/%s', - self.target_index_name, self.target_type_name) - es = elasticsearch.Elasticsearch(self.target_host, timeout=120) - if self.delete_index: - elasticsearch_requests.clear_and_load(es, - self.target_index_name, - self.target_type_name, - self.target_mapping_file) - else: - elasticsearch_requests.load_mapping(es, - self.target_index_name, - self.target_type_name, - self.target_mapping_file) + """Create a new index with the given type and mapping.""" + + target_host = luigi.Parameter() + target_index_name = luigi.Parameter() + target_type_name = luigi.Parameter() + target_mapping_file = luigi.Parameter() + delete_index = luigi.BoolParameter(default=False) + + def _run(self): + logging.info( + "Create index and loading mapping: %s/%s", + self.target_index_name, + self.target_type_name, + ) + es = elasticsearch.Elasticsearch(self.target_host, timeout=120) + if self.delete_index: + elasticsearch_requests.clear_and_load( + es, + self.target_index_name, + self.target_type_name, + self.target_mapping_file, + ) + else: + elasticsearch_requests.load_mapping( + es, + self.target_index_name, + self.target_type_name, + 
self.target_mapping_file, + ) class LoadJSONMapper(parallel.Mapper): - def __init__(self, es_host, index_name, type_name, docid_key, incremental): - self.index_name = index_name - self.type_name = type_name - self.docid_key = docid_key - self.incremental = incremental - self.es_host = es_host - - def _map_incremental(self, map_input): - es = elasticsearch.Elasticsearch(self.es_host, timeout=120) - with BatchHelper(index_with_checksum, es, self.index_name, self.type_name) as helper: - for key, doc in map_input: - assert self.docid_key in doc, 'Document is missing id field: %s' % json.dumps(doc, indent=2) - doc_id = doc[self.docid_key] - helper.add((doc_id, doc)) - - def _map_non_incremental(self, map_input): - es = elasticsearch.Elasticsearch(self.es_host, timeout=120) - with BatchHelper(index_without_checksum, es, self.index_name, self.type_name) as helper: - for key, doc in map_input: - helper.add(doc) - - def map_shard(self, map_input, map_output): - if self.incremental: - self._map_incremental(map_input) - else: - self._map_non_incremental(map_input) + def __init__(self, es_host, index_name, type_name, docid_key, incremental): + self.index_name = index_name + self.type_name = type_name + self.docid_key = docid_key + self.incremental = incremental + self.es_host = es_host + + def _map_incremental(self, map_input): + es = elasticsearch.Elasticsearch(self.es_host, timeout=120) + with BatchHelper( + index_with_checksum, es, self.index_name, self.type_name + ) as helper: + for key, doc in map_input: + assert ( + self.docid_key in doc + ), "Document is missing id field: %s" % json.dumps(doc, indent=2) + doc_id = doc[self.docid_key] + helper.add((doc_id, doc)) + + def _map_non_incremental(self, map_input): + es = elasticsearch.Elasticsearch(self.es_host, timeout=120) + with BatchHelper( + index_without_checksum, es, self.index_name, self.type_name + ) as helper: + for key, doc in map_input: + helper.add(doc) + + def map_shard(self, map_input, map_output): + if self.incremental: + self._map_incremental(map_input) + else: + self._map_non_incremental(map_input) class LoadJSONBase(AlwaysRunTask): - ''' - Load JSON into elasticsearch, using the given index, type and mapping. - ''' - - 'The index and type to load into.' - index_name = None - type_name = '_doc' - mapping_file = None - - 'Load data incrementally, using a checksum?' - use_checksum = False - - 'For checksum loads: the field to use for document ids.' - docid_key = 'docid' - - 'A task specifying the data source to load into ES.' - data_source = None - - 'If specified, this task will be ordered after `previous_task`' - previous_task = luigi.TaskParameter(default=NoopTask()) - - 'The number of processes to use when loading JSON into ES' - load_json_workers = luigi.IntParameter(default=4) - - 'Should we optimize the index after adding documents.' - optimize_index = False - - 'Last update date of the dataset (different from last_run_date which is merely when the pipeline last ran)' - last_update_date = arrow.utcnow().format('YYYY-MM-DD') - - def _data(self): - ''' - Return a task that supplies the data files to load into ES. 
- - By default, this is just `data_source`, but this can be overridden - for more complex situations (such as a depending on a parameter) - ''' - return self.data_source - - def _schema(self): - return ResetElasticSearch( - target_host=config.es_host(), - target_index_name=self.index_name, - target_type_name=self.type_name, - target_mapping_file=self.mapping_file, - delete_index=not self.use_checksum) - - def requires(self): - deps = { - 'schema': self._schema(), - 'data': self._data(), - } - - if self.previous_task: - deps['previous_task'] = self.previous_task - - return deps - - def __repr__(self): - # if we have an ordering on this set of tasks, the default representation - # can become overwhelmingly verbose; we truncate it here. - return 'LoadJSON(data=%s)' % self._data() - - def _run(self): - json_dir = self.input()['data'].path - - mapper = LoadJSONMapper( - config.es_host(), - index_name=self.index_name, - type_name=self.type_name, - docid_key=self.docid_key, - incremental=self.use_checksum - ) - - parallel.mapreduce( - parallel.Collection.from_sharded(json_dir), - mapper=mapper, - reducer=parallel.IdentityReducer(), - output_format=parallel.NullOutput(), - map_workers=self.load_json_workers, - num_shards=1, - output_prefix=config.tmp_dir('%s/load-json' % self.index_name) - ) - - # update metadata index - elasticsearch_requests.update_process_datetime( - config.es_client(), self.index_name, last_run_date=arrow.utcnow().format('YYYY-MM-DD'), - last_update_date=self.last_update_date() if callable(self.last_update_date) else self.last_update_date - ) - - # Refresh the index to make the documents visible to searches. - refresh_index(self.index_name) - - # optimize index, if requested - if self.optimize_index: - optimize_index(self.index_name, wait_for_merge=False) + """ + Load JSON into elasticsearch, using the given index, type and mapping. + """ + + "The index and type to load into." + index_name = None + type_name = "_doc" + mapping_file = None + + "Load data incrementally, using a checksum?" + use_checksum = False + + "For checksum loads: the field to use for document ids." + docid_key = "docid" + + "A task specifying the data source to load into ES." + data_source = None + + "If specified, this task will be ordered after `previous_task`" + previous_task = luigi.TaskParameter(default=NoopTask()) + + "The number of processes to use when loading JSON into ES" + load_json_workers = luigi.IntParameter(default=4) + + "Should we optimize the index after adding documents." + optimize_index = False + + "Last update date of the dataset (different from last_run_date which is merely when the pipeline last ran)" + last_update_date = arrow.utcnow().format("YYYY-MM-DD") + + def _data(self): + """ + Return a task that supplies the data files to load into ES. + + By default, this is just `data_source`, but this can be overridden + for more complex situations (such as a depending on a parameter) + """ + return self.data_source + + def _schema(self): + return ResetElasticSearch( + target_host=config.es_host(), + target_index_name=self.index_name, + target_type_name=self.type_name, + target_mapping_file=self.mapping_file, + delete_index=not self.use_checksum, + ) + + def requires(self): + deps = { + "schema": self._schema(), + "data": self._data(), + } + + if self.previous_task: + deps["previous_task"] = self.previous_task + + return deps + + def __repr__(self): + # if we have an ordering on this set of tasks, the default representation + # can become overwhelmingly verbose; we truncate it here. 
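# Illustrative sketch (not part of the patch) of the incremental-load checksum
# used when `use_checksum` is set: the loader blanks "@checksum", serializes the
# document with sorted keys, and compares a SHA-1 of the result against what is
# already stored in Elasticsearch. The sample document below is made up; only
# the hashing steps mirror the loader code.
import hashlib
import json

def compute_checksum(doc):
    doc = dict(doc)
    doc["@checksum"] = ""
    return hashlib.sha1(
        json.dumps(doc, sort_keys=True).encode("utf-8")
    ).hexdigest()

sample = {"docid": "12345", "brand_name": "EXAMPLE DEVICE"}
print(compute_checksum(sample))  # identical content always yields the same hash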
+ return "LoadJSON(data=%s)" % self._data() + + def _run(self): + json_dir = self.input()["data"].path + + mapper = LoadJSONMapper( + config.es_host(), + index_name=self.index_name, + type_name=self.type_name, + docid_key=self.docid_key, + incremental=self.use_checksum, + ) + + parallel.mapreduce( + parallel.Collection.from_sharded(json_dir), + mapper=mapper, + reducer=parallel.IdentityReducer(), + output_format=parallel.NullOutput(), + map_workers=self.load_json_workers, + num_shards=1, + output_prefix=config.tmp_dir("%s/load-json" % self.index_name), + ) + + # update metadata index + elasticsearch_requests.update_process_datetime( + config.es_client(), + self.index_name, + last_run_date=arrow.utcnow().format("YYYY-MM-DD"), + last_update_date=( + self.last_update_date() + if callable(self.last_update_date) + else self.last_update_date + ), + ) + + # Refresh the index to make the documents visible to searches. + refresh_index(self.index_name) + + # optimize index, if requested + if self.optimize_index: + optimize_index(self.index_name, wait_for_merge=False) diff --git a/openfda/maude/generate_maude_mapping.py b/openfda/maude/generate_maude_mapping.py index 3f0abf4..ec92aa9 100644 --- a/openfda/maude/generate_maude_mapping.py +++ b/openfda/maude/generate_maude_mapping.py @@ -1,7 +1,7 @@ #!/usr/bin/python -''' A Utility to generate a mapping file for the MAUDE dataset (device recalls) -''' +""" A Utility to generate a mapping file for the MAUDE dataset (device recalls) +""" from __future__ import print_function import os @@ -15,75 +15,79 @@ # TODO(hansnelsen): find a way to automate the generation of this file. RUN_DIR = dirname(dirname(os.path.abspath(__file__))) # Wide sample means that it includes all of the date fields, even if null -WIDE_SAMPLE_JSON = join(RUN_DIR, 'maude/data/wide_sample.json') -json_file = open(WIDE_SAMPLE_JSON, 'r').read().strip() +WIDE_SAMPLE_JSON = join(RUN_DIR, "maude/data/wide_sample.json") +json_file = open(WIDE_SAMPLE_JSON, "r").read().strip() json_list = [] json_list.append(json.loads(json_file)) -NESTED = ['mdr_text', 'patient', 'device'] +NESTED = ["mdr_text", "patient", "device"] # This is the base_mapping dictionary. Everything builds upon this dictionary. # Note: a pointer dictionary is used in spots for readability. 
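# For reference, the mapping built by the loop below ends up shaped roughly like
# this excerpt (real MAUDE field names, hand-trimmed for illustration): date
# fields become `date` types, plain string fields become `multi_field` entries
# with an `exact` (not_analyzed) sub-field.
sample_mapping_excerpt = {
    "maude": {
        "properties": {
            "date_received": {"type": "date", "format": "basic_date||date"},
            "report_number": {
                "type": "multi_field",
                "fields": {
                    "report_number": {"type": "string", "index": "analyzed"},
                    "exact": {"type": "string", "index": "not_analyzed"},
                },
            },
        }
    }
}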
-base_mapping = {"maude":{"properties": {}}} -base_mapping['maude']['_source'] = {"includes": ["*"], "excludes": ["*_exact"]} -base_mapping['maude']['_source']['excludes'].append('patient.*_exact') -base_mapping['maude']['_source']['excludes'].append('device.*_exact') -base_mapping['maude']['_source']['excludes'].append('mdr_text.*_exact') +base_mapping = {"maude": {"properties": {}}} +base_mapping["maude"]["_source"] = {"includes": ["*"], "excludes": ["*_exact"]} +base_mapping["maude"]["_source"]["excludes"].append("patient.*_exact") +base_mapping["maude"]["_source"]["excludes"].append("device.*_exact") +base_mapping["maude"]["_source"]["excludes"].append("mdr_text.*_exact") for json_dict in json_list: - for key_name, value in json_dict.items(): - # Resetting these types to avoid some pointer wonkiness - date_type = {"type": "date", "format": "basic_date||date"} - exact_type = {"type": "string", "index": "not_analyzed"} - multifield_type = {"type": "multi_field", "fields": {}} - string_type = {"type": "string", "index": "analyzed"} + for key_name, value in json_dict.items(): + # Resetting these types to avoid some pointer wonkiness + date_type = {"type": "date", "format": "basic_date||date"} + exact_type = {"type": "string", "index": "not_analyzed"} + multifield_type = {"type": "multi_field", "fields": {}} + string_type = {"type": "string", "index": "analyzed"} - # There are three logic branches we are accounting for: - # Dates, the subdocuments and everything else. - # Everything else breaks into lists vs multifield - if 'date' in key_name and 'flag' not in key_name: - base_mapping['maude']['properties'][key_name] = date_type + # There are three logic branches we are accounting for: + # Dates, the subdocuments and everything else. + # Everything else breaks into lists vs multifield + if "date" in key_name and "flag" not in key_name: + base_mapping["maude"]["properties"][key_name] = date_type - elif key_name in NESTED: - nested_type = {"properties": {}} - base_mapping['maude']['properties'][key_name] = nested_type - # Only need to do first one from this array to make the mapping - for k, v in value[0].items(): - # Dates - if 'date' in k and 'flag' not in k: - pointer = {} - pointer = base_mapping['maude']['properties'][key_name]['properties'] - pointer[k] = date_type - # Multifield - else: - new_key = {} - pointer = {} - multifield_type = {"type": "multi_field", "fields": {}} - pointer = base_mapping['maude']['properties'][key_name]['properties'] - if type(v) == type([]): - pointer[k] = exact_type - else: - pointer[k] = multifield_type - new_key[k] = string_type - new_key['exact'] = exact_type - pointer[k]['fields'] = new_key + elif key_name in NESTED: + nested_type = {"properties": {}} + base_mapping["maude"]["properties"][key_name] = nested_type + # Only need to do first one from this array to make the mapping + for k, v in value[0].items(): + # Dates + if "date" in k and "flag" not in k: + pointer = {} + pointer = base_mapping["maude"]["properties"][key_name][ + "properties" + ] + pointer[k] = date_type + # Multifield + else: + new_key = {} + pointer = {} + multifield_type = {"type": "multi_field", "fields": {}} + pointer = base_mapping["maude"]["properties"][key_name][ + "properties" + ] + if type(v) == type([]): + pointer[k] = exact_type + else: + pointer[k] = multifield_type + new_key[k] = string_type + new_key["exact"] = exact_type + pointer[k]["fields"] = new_key - else: - # Lists - if '_exact' in key_name or type(value) == type([]): - if key_name not in 
list(base_mapping['maude']['properties'].keys()): - new_key = {} - if '_exact' in key_name: - base_mapping['maude']['properties'][key_name] = exact_type - else: - base_mapping['maude']['properties'][key_name] = string_type - # Multifield - else: - if key_name not in list(base_mapping['maude']['properties'].keys()): - new_key = {} - base_mapping['maude']['properties'][key_name] = multifield_type - new_key[key_name] = string_type - new_key['exact'] = exact_type - base_mapping['maude']['properties'][key_name]['fields'] = new_key + else: + # Lists + if "_exact" in key_name or type(value) == type([]): + if key_name not in list(base_mapping["maude"]["properties"].keys()): + new_key = {} + if "_exact" in key_name: + base_mapping["maude"]["properties"][key_name] = exact_type + else: + base_mapping["maude"]["properties"][key_name] = string_type + # Multifield + else: + if key_name not in list(base_mapping["maude"]["properties"].keys()): + new_key = {} + base_mapping["maude"]["properties"][key_name] = multifield_type + new_key[key_name] = string_type + new_key["exact"] = exact_type + base_mapping["maude"]["properties"][key_name]["fields"] = new_key print(json.dumps(base_mapping, sort_keys=True, indent=4)) diff --git a/openfda/maude/pipeline.py b/openfda/maude/pipeline.py index 10b6096..d35778c 100755 --- a/openfda/maude/pipeline.py +++ b/openfda/maude/pipeline.py @@ -1,7 +1,7 @@ #!/usr/bin/python -''' MAUDE pipeline for downloading, joining and loading into elasticsearch -''' +""" MAUDE pipeline for downloading, joining and loading into elasticsearch +""" import collections import csv import glob @@ -19,787 +19,886 @@ from openfda import common, parallel, index_util from openfda import download_util from openfda.common import newest_file_timestamp -from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA, - DeviceAnnotateMapper) +from openfda.device_harmonization.pipeline import ( + Harmonized2OpenFDA, + DeviceAnnotateMapper, +) from openfda.tasks import AlwaysRunTask, DependencyTriggeredTask # Exceed default field_size limit, need to set to sys.maxsize csv.field_size_limit(sys.maxsize) RUN_DIR = dirname(dirname(os.path.abspath(__file__))) -BASE_DIR = './data/' -RAW_DIR = join(BASE_DIR, 'maude/raw/events') +BASE_DIR = "./data/" +RAW_DIR = join(BASE_DIR, "maude/raw/events") # See https://github.com/FDA/openfda/issues/27 # Files for resolving device problem codes -DEVICE_PROBLEM_CODES_FILE = join(BASE_DIR, 'maude/extracted/events/deviceproblemcodes.txt') -PATIENT_PROBLEM_CODES_FILE = join(BASE_DIR, 'maude/extracted/events/patientproblemdata.txt') +DEVICE_PROBLEM_CODES_FILE = join( + BASE_DIR, "maude/extracted/events/deviceproblemcodes.txt" +) +PATIENT_PROBLEM_CODES_FILE = join( + BASE_DIR, "maude/extracted/events/patientproblemdata.txt" +) # Use to ensure a standard naming of level db outputs is achieved across tasks. 
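# A sketch of that standard naming: tasks stamp their LevelDB outputs with the
# run week using DATE_FMT, just as CSV2JSON and MergeUpdates do further down.
# The resulting file name is illustrative only.
import arrow

run_date = arrow.utcnow().ceil("weeks").format("YYYY-MM-DD")  # DATE_FMT
print("-".join(["init", run_date, "json.db"]))  # e.g. init-<week-end-date>-json.db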
-DATE_FMT = 'YYYY-MM-DD' -CATEGORIES = ['foidevproblem', 'patientproblemcode', 'mdrfoi', 'patient', 'foidev', 'foitext', 'device'] -IGNORE_FILES = ['deviceproblemcodes', 'patientproblemdata', 'add', 'change'] +DATE_FMT = "YYYY-MM-DD" +CATEGORIES = [ + "foidevproblem", + "patientproblemcode", + "mdrfoi", + "patient", + "foidev", + "foitext", + "device", +] +IGNORE_FILES = ["deviceproblemcodes", "patientproblemdata", "add", "change"] -DEVICE_DOWNLOAD_PAGE = ('https://www.fda.gov/medical-devices/' - 'mandatory-reporting-requirements-manufacturers-importers-and-device-user-facilities/' - 'manufacturer-and-user-facility-device-experience-database-maude') +DEVICE_DOWNLOAD_PAGE = ( + "https://www.fda.gov/medical-devices/" + "mandatory-reporting-requirements-manufacturers-importers-and-device-user-facilities/" + "manufacturer-and-user-facility-device-experience-database-maude" +) -enum_file = join(RUN_DIR, 'maude/data/enums.csv') +enum_file = join(RUN_DIR, "maude/data/enums.csv") enum_csv = csv.DictReader(open(enum_file)) ENUM = collections.defaultdict(lambda: collections.defaultdict(dict)) for row in enum_csv: - key = row['key_name'] - code = row['code'] - desc = row['code_desc'] + key = row["key_name"] + code = row["code"] + desc = row["code_desc"] - ENUM[key][code] = desc + ENUM[key][code] = desc # patient and text records are missing header rows FILE_HEADERS = { - 'foidevproblem': [ - 'mdr_report_key', - 'problem_code'], - 'patientproblemcode': [ - 'mdr_report_key', - 'patient_sequence_number', - 'problem_code', - 'date_added', - 'date_changed'], - 'patient': [ - 'mdr_report_key', - 'patient_sequence_number', - 'date_received', - 'sequence_number_treatment', - 'sequence_number_outcome'], - 'foitext': [ - 'mdr_report_key', - 'mdr_text_key', - 'text_type_code', - 'patient_sequence_number', - 'date_report', - 'text'], - 'foidev': [ - 'mdr_report_key', - 'device_event_key', - 'implant_flag', - 'date_removed_flag', - 'device_sequence_number', - 'date_received', - 'brand_name', - 'generic_name', - 'manufacturer_d_name', - 'manufacturer_d_address_1', - 'manufacturer_d_address_2', - 'manufacturer_d_city', - 'manufacturer_d_state', - 'manufacturer_d_zip_code', - 'manufacturer_d_zip_code_ext', - 'manufacturer_d_country', - 'manufacturer_d_postal_code', - 'expiration_date_of_device', - 'model_number', - 'catalog_number', - 'lot_number', - 'other_id_number', - 'device_operator', - 'device_availability', - 'date_returned_to_manufacturer', - 'device_report_product_code', - 'device_age_text', - 'device_evaluated_by_manufacturer', - 'baseline_brand_name', - 'baseline_generic_name', - 'baseline_model_number', - 'baseline_catalog_number', - 'baseline_other_id_number', - 'baseline_device_family', - 'baseline_shelf_life_contained', - 'baseline_shelf_life_in_months', - 'baseline_pma_flag', - 'baseline_pma_number', - 'baseline_510_k__flag', - 'baseline_510_k__number', - 'baseline_preamendment_flag', - 'baseline_transitional_flag', - 'baseline_510_k__exempt_flag', - 'baseline_date_first_marketed', - 'baseline_date_ceased_marketing' - ], - 'device': [ - 'mdr_report_key', - 'device_event_key', - 'implant_flag', - 'date_removed_flag', - 'device_sequence_number', - 'date_received', - 'brand_name', - 'generic_name', - 'manufacturer_d_name', - 'manufacturer_d_address_1', - 'manufacturer_d_address_2', - 'manufacturer_d_city', - 'manufacturer_d_state', - 'manufacturer_d_zip_code', - 'manufacturer_d_zip_code_ext', - 'manufacturer_d_country', - 'manufacturer_d_postal_code', - 'device_operator', - 'expiration_date_of_device', 
- 'model_number', - 'catalog_number', - 'lot_number', - 'other_id_number', - 'device_availability', - 'date_returned_to_manufacturer', - 'device_report_product_code', - 'device_age_text', - 'device_evaluated_by_manufacturer', - 'combination_product_flag' - ], - 'mdrfoi': [ - 'mdr_report_key', - 'event_key', - 'report_number', - 'report_source_code', - 'manufacturer_link_flag', - 'number_devices_in_event', - 'number_patients_in_event', - 'date_received', - 'adverse_event_flag', - 'product_problem_flag', - 'date_report', - 'date_of_event', - 'reprocessed_and_reused_flag', - 'reporter_occupation_code', - 'health_professional', - 'initial_report_to_fda', - 'date_facility_aware', - 'report_date', - 'report_to_fda', - 'date_report_to_fda', - 'event_location', - 'date_report_to_manufacturer', - 'manufacturer_contact_t_name', - 'manufacturer_contact_f_name', - 'manufacturer_contact_l_name', - 'manufacturer_contact_address_1', - 'manufacturer_contact_address_2', - 'manufacturer_contact_city', - 'manufacturer_contact_state', - 'manufacturer_contact_zip_code', - 'manufacturer_contact_zip_ext', - 'manufacturer_contact_country', - 'manufacturer_contact_postal_code', - 'manufacturer_contact_area_code', - 'manufacturer_contact_exchange', - 'manufacturer_contact_phone_number', - 'manufacturer_contact_extension', - 'manufacturer_contact_pcountry', - 'manufacturer_contact_pcity', - 'manufacturer_contact_plocal', - 'manufacturer_g1_name', - 'manufacturer_g1_address_1', - 'manufacturer_g1_address_2', - 'manufacturer_g1_city', - 'manufacturer_g1_state', - 'manufacturer_g1_zip_code', - 'manufacturer_g1_zip_code_ext', - 'manufacturer_g1_country', - 'manufacturer_g1_postal_code', - 'date_manufacturer_received', - 'device_date_of_manufacturer', - 'single_use_flag', - 'remedial_action', - 'previous_use_code', - 'removal_correction_number', - 'event_type', - 'distributor_name', - 'distributor_address_1', - 'distributor_address_2', - 'distributor_city', - 'distributor_state', - 'distributor_zip_code', - 'distributor_zip_code_ext', - 'report_to_manufacturer', - 'manufacturer_name', - 'manufacturer_address_1', - 'manufacturer_address_2', - 'manufacturer_city', - 'manufacturer_state', - 'manufacturer_zip_code', - 'manufacturer_zip_code_ext', - 'manufacturer_country', - 'manufacturer_postal_code', - 'type_of_report', - 'source_type', - 'date_added', - 'date_changed', - 'reporter_country_code', - 'pma_pmn_number', - 'exemption_number', - 'summary_report_flag' - ] + "foidevproblem": ["mdr_report_key", "problem_code"], + "patientproblemcode": [ + "mdr_report_key", + "patient_sequence_number", + "problem_code", + "date_added", + "date_changed", + ], + "patient": [ + "mdr_report_key", + "patient_sequence_number", + "date_received", + "sequence_number_treatment", + "sequence_number_outcome", + ], + "foitext": [ + "mdr_report_key", + "mdr_text_key", + "text_type_code", + "patient_sequence_number", + "date_report", + "text", + ], + "foidev": [ + "mdr_report_key", + "device_event_key", + "implant_flag", + "date_removed_flag", + "device_sequence_number", + "date_received", + "brand_name", + "generic_name", + "manufacturer_d_name", + "manufacturer_d_address_1", + "manufacturer_d_address_2", + "manufacturer_d_city", + "manufacturer_d_state", + "manufacturer_d_zip_code", + "manufacturer_d_zip_code_ext", + "manufacturer_d_country", + "manufacturer_d_postal_code", + "expiration_date_of_device", + "model_number", + "catalog_number", + "lot_number", + "other_id_number", + "device_operator", + "device_availability", + 
"date_returned_to_manufacturer", + "device_report_product_code", + "device_age_text", + "device_evaluated_by_manufacturer", + "baseline_brand_name", + "baseline_generic_name", + "baseline_model_number", + "baseline_catalog_number", + "baseline_other_id_number", + "baseline_device_family", + "baseline_shelf_life_contained", + "baseline_shelf_life_in_months", + "baseline_pma_flag", + "baseline_pma_number", + "baseline_510_k__flag", + "baseline_510_k__number", + "baseline_preamendment_flag", + "baseline_transitional_flag", + "baseline_510_k__exempt_flag", + "baseline_date_first_marketed", + "baseline_date_ceased_marketing", + ], + "device": [ + "mdr_report_key", + "device_event_key", + "implant_flag", + "date_removed_flag", + "device_sequence_number", + "date_received", + "brand_name", + "generic_name", + "manufacturer_d_name", + "manufacturer_d_address_1", + "manufacturer_d_address_2", + "manufacturer_d_city", + "manufacturer_d_state", + "manufacturer_d_zip_code", + "manufacturer_d_zip_code_ext", + "manufacturer_d_country", + "manufacturer_d_postal_code", + "device_operator", + "expiration_date_of_device", + "model_number", + "catalog_number", + "lot_number", + "other_id_number", + "device_availability", + "date_returned_to_manufacturer", + "device_report_product_code", + "device_age_text", + "device_evaluated_by_manufacturer", + "combination_product_flag", + ], + "mdrfoi": [ + "mdr_report_key", + "event_key", + "report_number", + "report_source_code", + "manufacturer_link_flag", + "number_devices_in_event", + "number_patients_in_event", + "date_received", + "adverse_event_flag", + "product_problem_flag", + "date_report", + "date_of_event", + "reprocessed_and_reused_flag", + "reporter_occupation_code", + "health_professional", + "initial_report_to_fda", + "date_facility_aware", + "report_date", + "report_to_fda", + "date_report_to_fda", + "event_location", + "date_report_to_manufacturer", + "manufacturer_contact_t_name", + "manufacturer_contact_f_name", + "manufacturer_contact_l_name", + "manufacturer_contact_address_1", + "manufacturer_contact_address_2", + "manufacturer_contact_city", + "manufacturer_contact_state", + "manufacturer_contact_zip_code", + "manufacturer_contact_zip_ext", + "manufacturer_contact_country", + "manufacturer_contact_postal_code", + "manufacturer_contact_area_code", + "manufacturer_contact_exchange", + "manufacturer_contact_phone_number", + "manufacturer_contact_extension", + "manufacturer_contact_pcountry", + "manufacturer_contact_pcity", + "manufacturer_contact_plocal", + "manufacturer_g1_name", + "manufacturer_g1_address_1", + "manufacturer_g1_address_2", + "manufacturer_g1_city", + "manufacturer_g1_state", + "manufacturer_g1_zip_code", + "manufacturer_g1_zip_code_ext", + "manufacturer_g1_country", + "manufacturer_g1_postal_code", + "date_manufacturer_received", + "device_date_of_manufacturer", + "single_use_flag", + "remedial_action", + "previous_use_code", + "removal_correction_number", + "event_type", + "distributor_name", + "distributor_address_1", + "distributor_address_2", + "distributor_city", + "distributor_state", + "distributor_zip_code", + "distributor_zip_code_ext", + "report_to_manufacturer", + "manufacturer_name", + "manufacturer_address_1", + "manufacturer_address_2", + "manufacturer_city", + "manufacturer_state", + "manufacturer_zip_code", + "manufacturer_zip_code_ext", + "manufacturer_country", + "manufacturer_postal_code", + "type_of_report", + "source_type", + "date_added", + "date_changed", + "reporter_country_code", + "pma_pmn_number", + 
"exemption_number", + "summary_report_flag", + ], } DATE_KEYS = [ - 'date_received', - 'baseline_date_first_marketed', - 'date_returned_to_manufacturer', - 'date_report_to_fda', - 'baseline_date_ceased_marketing', - 'date_report_to_manufacturer', - 'expiration_date_of_device', - 'device_date_of_manufacturer', - 'date_facility_aware', - 'report_date', - 'date_report', - 'date_manufacturer_received', - 'date_of_event', - 'date_added', - 'date_changed' + "date_received", + "baseline_date_first_marketed", + "date_returned_to_manufacturer", + "date_report_to_fda", + "baseline_date_ceased_marketing", + "date_report_to_manufacturer", + "expiration_date_of_device", + "device_date_of_manufacturer", + "date_facility_aware", + "report_date", + "date_report", + "date_manufacturer_received", + "date_of_event", + "date_added", + "date_changed", ] # split these keys in an array on ';' -SPLIT_KEYS = ['sequence_number_treatment', 'sequence_number_outcome'] +SPLIT_KEYS = ["sequence_number_treatment", "sequence_number_outcome"] # multiple submits are separated by ',' need to split these keys on ',' -MULTI_SUBMIT = ['source_type', 'remedial_action', 'type_of_report'] +MULTI_SUBMIT = ["source_type", "remedial_action", "type_of_report"] # These keys have malformed integers in them: left-padded with a space and decimal point added. -MALFORMED_KEYS = ['mdr_report_key', 'device_event_key', 'device_sequence_number', 'patient_sequence_number'] +MALFORMED_KEYS = [ + "mdr_report_key", + "device_event_key", + "device_sequence_number", + "patient_sequence_number", +] + def _fix_date(input_date): - ''' Converts input dates for known formats to a standard format that is - Elasticsearch friendly. - Returns the input_date if it is not a known format - ''' - supported_formats = [ - 'DD-MMM-YY', - 'YYYY/MM/DD HH:mm:ss.SSS', - 'MM/DD/YYYY', - 'YYYYMMDD', - 'YYYY/MM/DD' - ] - - # arrow needs 3 char months to be sentence case: e.g. Dec not DEC - formated_date = input_date.title() - try: - date = arrow.get(formated_date, supported_formats).format('YYYYMMDD') - return date.format('YYYYMMDD') - except: - if input_date: - logging.info('unparseable date: %s with input %s', input_date, formated_date) - - return None + """Converts input dates for known formats to a standard format that is + Elasticsearch friendly. + Returns the input_date if it is not a known format + """ + supported_formats = [ + "DD-MMM-YY", + "YYYY/MM/DD HH:mm:ss.SSS", + "MM/DD/YYYY", + "YYYYMMDD", + "YYYY/MM/DD", + ] + + # arrow needs 3 char months to be sentence case: e.g. 
Dec not DEC + formated_date = input_date.title() + try: + date = arrow.get(formated_date, supported_formats).format("YYYYMMDD") + return date.format("YYYYMMDD") + except: + if input_date: + logging.info( + "unparseable date: %s with input %s", input_date, formated_date + ) + + return None def _split(key, value, sep): - ''' Helper function that splits a string into an array, swaps the encoded - values for more descriptive ones and then generates an _exact field - ''' - value = value.split(sep) + """Helper function that splits a string into an array, swaps the encoded + values for more descriptive ones and then generates an _exact field + """ + value = value.split(sep) - if key in ENUM: - value = [ENUM[key].get(val, val) for val in value] + if key in ENUM: + value = [ENUM[key].get(val, val) for val in value] - return key, value + return key, value class DownloadDeviceEvents(luigi.Task): - def requires(self): - return [] + def requires(self): + return [] - def output(self): - return luigi.LocalTarget(RAW_DIR) + def output(self): + return luigi.LocalTarget(RAW_DIR) - def run(self): - zip_urls = [] - soup = BeautifulSoup(urlopen(DEVICE_DOWNLOAD_PAGE).read(), "lxml") - for a in soup.find_all(href=re.compile('.*.zip')): - zip_urls.append(a['href']) - if not zip_urls: - logging.fatal('No MAUDE Zip Files Found At %s' % DEVICE_DOWNLOAD_PAGE) - for zip_url in zip_urls: - filename = zip_url.split('/')[-1] - common.download(zip_url, join(self.output().path, filename)) + def run(self): + zip_urls = [] + soup = BeautifulSoup(urlopen(DEVICE_DOWNLOAD_PAGE).read(), "lxml") + for a in soup.find_all(href=re.compile(".*.zip")): + zip_urls.append(a["href"]) + if not zip_urls: + logging.fatal("No MAUDE Zip Files Found At %s" % DEVICE_DOWNLOAD_PAGE) + for zip_url in zip_urls: + filename = zip_url.split("/")[-1] + common.download(zip_url, join(self.output().path, filename)) class ExtractAndCleanDownloadsMaude(luigi.Task): - ''' Unzip each of the download files and remove all the non-UTF8 characters. - Unzip -p streams the data directly to iconv which then writes to disk. - ''' - def requires(self): - return [DownloadDeviceEvents()] - - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'maude/extracted')) - - def run(self): - output_dir = self.output().path - common.shell_cmd('mkdir -p %s', output_dir) - for i in range(len(self.input())): - input_dir = self.input()[i].path - download_util.extract_and_clean(input_dir, - 'ISO-8859-1//TRANSLIT', - 'UTF-8', - 'txt') + """Unzip each of the download files and remove all the non-UTF8 characters. + Unzip -p streams the data directly to iconv which then writes to disk. + """ + + def requires(self): + return [DownloadDeviceEvents()] + + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "maude/extracted")) + + def run(self): + output_dir = self.output().path + common.shell_cmd("mkdir -p %s", output_dir) + for i in range(len(self.input())): + input_dir = self.input()[i].path + download_util.extract_and_clean( + input_dir, "ISO-8859-1//TRANSLIT", "UTF-8", "txt" + ) + # This task no longer works properly. Needs refactoring class PreprocessFilesToFixIssues(AlwaysRunTask): - ''' The pipe-separated MAUDE files come with issues: no escaping of special characters. - Many foitext files contain pipe characters that are part of field values and thus are - breaking the layout due to not being escaped properly. In other cases new line characters appear - unescaped and break single lines into multi-line chunks. 
This task attempts to deal with these - issues via regular expression search & replace. - ''' - def requires(self): - return ExtractAndCleanDownloadsMaude() - - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'maude/extracted')) - - def _run(self): - for filename in glob.glob(self.input().path + '/*/*foi*.txt') + glob.glob(self.input().path + '/*/device*.txt'): - logging.info('Pre-processing %s', filename) - - filtered = filename + '.filtered' - out = open(filtered, 'w') - line_num = 0 - bad_lines = 0 - with open(filename, 'rU') as fp: - for line in fp: - line = line.strip() - if line_num < 1: - # First line is usually the header - out.write(line) - else: - if len(line.strip()) > 0: - if re.search(r'^\d{2,}(\.\d)?\|', line): - # Properly formatted line. Append it and move on. - out.write('\n'+line) - else: - # Bad line, most likely due to an unescaped carriage return. Tuck it onto the previous line - out.write(' ' + line) - bad_lines += 1 - - line_num += 1 - - logging.info('Issues found & fixed: %s', bad_lines) - out.close() - os.remove(filename) - os.rename(filtered, filename) + """The pipe-separated MAUDE files come with issues: no escaping of special characters. + Many foitext files contain pipe characters that are part of field values and thus are + breaking the layout due to not being escaped properly. In other cases new line characters appear + unescaped and break single lines into multi-line chunks. This task attempts to deal with these + issues via regular expression search & replace. + """ + + def requires(self): + return ExtractAndCleanDownloadsMaude() + + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "maude/extracted")) + + def _run(self): + for filename in glob.glob(self.input().path + "/*/*foi*.txt") + glob.glob( + self.input().path + "/*/device*.txt" + ): + logging.info("Pre-processing %s", filename) + + filtered = filename + ".filtered" + out = open(filtered, "w") + line_num = 0 + bad_lines = 0 + with open(filename, "rU") as fp: + for line in fp: + line = line.strip() + if line_num < 1: + # First line is usually the header + out.write(line) + else: + if len(line.strip()) > 0: + if re.search(r"^\d{2,}(\.\d)?\|", line): + # Properly formatted line. Append it and move on. + out.write("\n" + line) + else: + # Bad line, most likely due to an unescaped carriage return. Tuck it onto the previous line + out.write(" " + line) + bad_lines += 1 + + line_num += 1 + + logging.info("Issues found & fixed: %s", bad_lines) + out.close() + os.remove(filename) + os.rename(filtered, filename) + class CSV2JSONMapper(parallel.Mapper): - def __init__(self, device_problem_codes_ref, patient_problem_codes_ref): - parallel.Mapper.__init__(self) - self.device_problem_codes_ref = device_problem_codes_ref - self.patient_problem_codes_ref = patient_problem_codes_ref - - def map_shard(self, map_input, map_output): - self.filename = map_input.filename - return parallel.Mapper.map_shard(self, map_input, map_output) - - @staticmethod - def cleaner(k, v): - if k is None: - return None - - if k in DATE_KEYS: - new_date = _fix_date(v) - return (k, new_date) if new_date else None - - if k in SPLIT_KEYS: - return _split(k, v, ';') - - if k in MULTI_SUBMIT: - return _split(k, v, ',') - - # The DEVICE files have mdr_report_key padded with a space and in decimal format with a ".0" at the end. 
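# Illustration of the PreprocessFilesToFixIssues heuristic above: a line is
# treated as the start of a new record only if it begins with a numeric
# mdr_report_key followed by a pipe; anything else is glued back onto the
# previous record. The sample lines are invented.
import re

RECORD_START = re.compile(r"^\d{2,}(\.\d)?\|")

for line in [
    "12345678|3|D|1|20200101|PATIENT REPORTED...",
    "text that spilled onto its own line after an unescaped newline",
]:
    print(bool(RECORD_START.search(line)))
# True  -> kept as a new record
# False -> appended to the previous line by the preprocessing task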
- if k in MALFORMED_KEYS: - v = v.strip().replace('.0', '') - - if k in ENUM: - if v in ENUM[k]: - if isinstance(v, list): - v = [ENUM[k].get(val, val) for val in v] - else: - v = ENUM[k][v] - - return (k, v) - - # We are seeing a large number of foitext rows not following the column definition and thus - # getting rejected by the reducer. The root cause is the fact that the last column (FOI_TEXT) contains - # text that includes one or more "pipe" | characters that have not been properly escaped - # in the file and thus are throwing column count off. - # We are dealing with that by merely concatenating the extra text columns into a single - # string and stripping out the bogus columns at the end. - def handle_oversized_foitext(self, value): - no_columns = len(FILE_HEADERS['foitext']) - combined_text = '|'.join([t for t in value[no_columns - 1:]]) - value[no_columns - 1] = combined_text[:-1] if combined_text.endswith("|") else combined_text - return value[0:no_columns] - - def map(self, key, value, output): - if len(value) < 1: - return - - mdr_key = self.cleaner('mdr_report_key', value[0])[1] - - # Some of the files have headers, we will apply our own, so row that starts - # with this value is safe to skip - if 'MDR_REPORT_KEY' in mdr_key: - return - - file_type = [s for s in CATEGORIES if s in self.filename][0] - - if file_type == 'foitext' and len(value) > len(FILE_HEADERS[file_type]): - value = self.handle_oversized_foitext(value) - - # We send all data anomalies to a reducer for each file type. - # These non-conforming data are written to a reject file for review. - # This file type as variable lengths over time, so it needs its own check - if file_type == 'foidev': - if len(value) not in [28, 45]: - logging.info('Does not conform to foidev structure. Skipping: %s, %s', - mdr_key, '#' * 5) - output.add(file_type, '%s: missing fields' % mdr_key + ':' + '|'.join(value)) - return - elif file_type == 'mdrfoi': - if len(value) not in [77, 81]: - logging.info('Does not conform to mdrfoi structure. Skipping: %s, %s', - mdr_key, '#' * 5) - output.add(file_type, '%s: missing fields' % mdr_key + ':' + '|'.join(value)) - return - elif len(value) != len(FILE_HEADERS[file_type]): - logging.info('Does not conform to %s structure. Skipping: %s, %s', - file_type, mdr_key, '#' * 5) - output.add(file_type, '%s: missing fields' % mdr_key + ':' + '|'.join(value)) - return - - if not isinstance(int(mdr_key), int): - logging.info('%s is not a number', mdr_key) - output.add(file_type, '%s: NaN' % mdr_key + ':' + '|'.join(value)) - return - - # If it makes it this far, it is a good record - new_value = dict(list(zip(FILE_HEADERS[file_type], value))) - new_value = common.transform_dict(new_value, self.cleaner) - - # https://github.com/FDA/openfda/issues/27 - # We need to see if device problem code is available for this report in the - # foidevproblem.txt file, resolve it to a problem description, and add it to the - # master record. - if file_type == 'foidevproblem': - product_problem = self.device_problem_codes_ref.get(new_value['problem_code']) - new_value['product_problem'] = product_problem - - # Same applies to patient problem codes. 
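# Toy illustration of handle_oversized_foitext above: unescaped pipes in the
# final FOI_TEXT column split one record into too many columns, so the extra
# columns are re-joined and the spill-over is dropped. Row values are made up.
headers = ["mdr_report_key", "mdr_text_key", "text_type_code",
           "patient_sequence_number", "date_report", "text"]
row = ["123", "456", "D", "1", "20200101", "text with an", "embedded|pipe"]

n = len(headers)
row[n - 1] = "|".join(row[n - 1:])   # -> "text with an|embedded|pipe"
row = row[:n]
print(dict(zip(headers, row)))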
- if file_type == 'patientproblemcode': - patient_problem = self.patient_problem_codes_ref.get(new_value['problem_code']) - new_value['patient_problem'] = patient_problem - - output.add(mdr_key, (file_type, new_value)) + def __init__(self, device_problem_codes_ref, patient_problem_codes_ref): + parallel.Mapper.__init__(self) + self.device_problem_codes_ref = device_problem_codes_ref + self.patient_problem_codes_ref = patient_problem_codes_ref + + def map_shard(self, map_input, map_output): + self.filename = map_input.filename + return parallel.Mapper.map_shard(self, map_input, map_output) + + @staticmethod + def cleaner(k, v): + if k is None: + return None + + if k in DATE_KEYS: + new_date = _fix_date(v) + return (k, new_date) if new_date else None + + if k in SPLIT_KEYS: + return _split(k, v, ";") + + if k in MULTI_SUBMIT: + return _split(k, v, ",") + + # The DEVICE files have mdr_report_key padded with a space and in decimal format with a ".0" at the end. + if k in MALFORMED_KEYS: + v = v.strip().replace(".0", "") + + if k in ENUM: + if v in ENUM[k]: + if isinstance(v, list): + v = [ENUM[k].get(val, val) for val in v] + else: + v = ENUM[k][v] + + return (k, v) + + # We are seeing a large number of foitext rows not following the column definition and thus + # getting rejected by the reducer. The root cause is the fact that the last column (FOI_TEXT) contains + # text that includes one or more "pipe" | characters that have not been properly escaped + # in the file and thus are throwing column count off. + # We are dealing with that by merely concatenating the extra text columns into a single + # string and stripping out the bogus columns at the end. + def handle_oversized_foitext(self, value): + no_columns = len(FILE_HEADERS["foitext"]) + combined_text = "|".join([t for t in value[no_columns - 1 :]]) + value[no_columns - 1] = ( + combined_text[:-1] if combined_text.endswith("|") else combined_text + ) + return value[0:no_columns] + + def map(self, key, value, output): + if len(value) < 1: + return + + mdr_key = self.cleaner("mdr_report_key", value[0])[1] + + # Some of the files have headers, we will apply our own, so row that starts + # with this value is safe to skip + if "MDR_REPORT_KEY" in mdr_key: + return + + file_type = [s for s in CATEGORIES if s in self.filename][0] + + if file_type == "foitext" and len(value) > len(FILE_HEADERS[file_type]): + value = self.handle_oversized_foitext(value) + + # We send all data anomalies to a reducer for each file type. + # These non-conforming data are written to a reject file for review. + # This file type as variable lengths over time, so it needs its own check + if file_type == "foidev": + if len(value) not in [28, 45]: + logging.info( + "Does not conform to foidev structure. Skipping: %s, %s", + mdr_key, + "#" * 5, + ) + output.add( + file_type, "%s: missing fields" % mdr_key + ":" + "|".join(value) + ) + return + elif file_type == "mdrfoi": + if len(value) not in [77, 81]: + logging.info( + "Does not conform to mdrfoi structure. Skipping: %s, %s", + mdr_key, + "#" * 5, + ) + output.add( + file_type, "%s: missing fields" % mdr_key + ":" + "|".join(value) + ) + return + elif len(value) != len(FILE_HEADERS[file_type]): + logging.info( + "Does not conform to %s structure. 
Skipping: %s, %s", + file_type, + mdr_key, + "#" * 5, + ) + output.add( + file_type, "%s: missing fields" % mdr_key + ":" + "|".join(value) + ) + return + + if not isinstance(int(mdr_key), int): + logging.info("%s is not a number", mdr_key) + output.add(file_type, "%s: NaN" % mdr_key + ":" + "|".join(value)) + return + + # If it makes it this far, it is a good record + new_value = dict(list(zip(FILE_HEADERS[file_type], value))) + new_value = common.transform_dict(new_value, self.cleaner) + + # https://github.com/FDA/openfda/issues/27 + # We need to see if device problem code is available for this report in the + # foidevproblem.txt file, resolve it to a problem description, and add it to the + # master record. + if file_type == "foidevproblem": + product_problem = self.device_problem_codes_ref.get( + new_value["problem_code"] + ) + new_value["product_problem"] = product_problem + + # Same applies to patient problem codes. + if file_type == "patientproblemcode": + patient_problem = self.patient_problem_codes_ref.get( + new_value["problem_code"] + ) + new_value["patient_problem"] = patient_problem + + output.add(mdr_key, (file_type, new_value)) class CSV2JSONJoinReducer(parallel.Reducer): - # File type to nested key name mapping - join_map = { - 'device': 'device', - 'foidev': 'device', - 'foitext': 'mdr_text', - 'patient': 'patient' - } - - def _join(self, key, values): - val = parallel.pivot_values(values) - - final = { - 'device': [], - 'mdr_text': [], - 'patient': [] + # File type to nested key name mapping + join_map = { + "device": "device", + "foidev": "device", + "foitext": "mdr_text", + "patient": "patient", } - if not val.get('mdrfoi', []): - # logging.info('MDR REPORT %s: Missing mdrfoi record, Skipping join', key) - return - - for i, main_report in enumerate(val.get('mdrfoi', [])): - final.update(main_report) - - try: - int(final.get('mdr_report_key', None)) - except TypeError: - logging.info('%s', '*' * 2400) - return - - for source_file, target_key in self.join_map.items(): - for row in val.get(source_file, []): - row.pop('mdr_report_key', 0) # No need to keep join key on nested data - if target_key != 'mdr_text' or len( - [txt for txt in final[target_key] if txt['mdr_text_key'] == row['mdr_text_key']]) == 0: - final[target_key].append(row) - - # Now tuck the device and patient problem codes onto the final record - if val.get('foidevproblem', []): - final['product_problems'] = list(map(lambda x: x['product_problem'], val['foidevproblem'])) - - # https://github.com/FDA/openfda/issues/179 - # In some cases we have patient problem codes without the actual patient. - # We create a 'placeholder' empty patient record in this case just to hold the problem codes. 
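# The join reducer above relies on parallel.pivot_values to regroup the
# (file_type, record) tuples the mapper emitted under one mdr_report_key.
# A rough, hand-written equivalent of how it is used here (not the library's
# actual implementation; records are toy values):
import collections

def pivot_values_sketch(values):
    pivoted = collections.defaultdict(list)
    for file_type, record in values:
        pivoted[file_type].append(record)
    return pivoted

values = [
    ("mdrfoi", {"mdr_report_key": "123", "report_number": "1234567-2020-00001"}),
    ("foitext", {"mdr_text_key": "1", "text_type_code": "D", "text": "..."}),
    ("foidevproblem", {"problem_code": "...", "product_problem": "..."}),
]
print(dict(pivot_values_sketch(values)))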
- for patient_problem in val.get('patientproblemcode', []): - if len([patient for patient in final['patient'] if - patient['patient_sequence_number'] == patient_problem['patient_sequence_number']]) == 0: - patient = {'patient_sequence_number': patient_problem['patient_sequence_number']} - final['patient'].append(patient) - - for patient in final['patient']: - for patient_problem in val.get('patientproblemcode', []): - if patient['patient_sequence_number'] == patient_problem['patient_sequence_number']: - patient['patient_problems'] = [patient_problem['patient_problem']] if patient.get( - 'patient_problems') is None else patient['patient_problems'] + [patient_problem['patient_problem']] - - return final - - def reduce(self, key, values, output): - # Write out the rejected records - if key in CATEGORIES: - with open(join(BASE_DIR, 'maude', key + '-rejects.txt'), 'a') as rejects: - for row in values: - rejects.write(row + '\n') - else: - output.put(key, self._join(key, values)) + def _join(self, key, values): + val = parallel.pivot_values(values) + + final = {"device": [], "mdr_text": [], "patient": []} + + if not val.get("mdrfoi", []): + # logging.info('MDR REPORT %s: Missing mdrfoi record, Skipping join', key) + return + + for i, main_report in enumerate(val.get("mdrfoi", [])): + final.update(main_report) + + try: + int(final.get("mdr_report_key", None)) + except TypeError: + logging.info("%s", "*" * 2400) + return + + for source_file, target_key in self.join_map.items(): + for row in val.get(source_file, []): + row.pop("mdr_report_key", 0) # No need to keep join key on nested data + if ( + target_key != "mdr_text" + or len( + [ + txt + for txt in final[target_key] + if txt["mdr_text_key"] == row["mdr_text_key"] + ] + ) + == 0 + ): + final[target_key].append(row) + + # Now tuck the device and patient problem codes onto the final record + if val.get("foidevproblem", []): + final["product_problems"] = list( + map(lambda x: x["product_problem"], val["foidevproblem"]) + ) + + # https://github.com/FDA/openfda/issues/179 + # In some cases we have patient problem codes without the actual patient. + # We create a 'placeholder' empty patient record in this case just to hold the problem codes. + for patient_problem in val.get("patientproblemcode", []): + if ( + len( + [ + patient + for patient in final["patient"] + if patient["patient_sequence_number"] + == patient_problem["patient_sequence_number"] + ] + ) + == 0 + ): + patient = { + "patient_sequence_number": patient_problem[ + "patient_sequence_number" + ] + } + final["patient"].append(patient) + + for patient in final["patient"]: + for patient_problem in val.get("patientproblemcode", []): + if ( + patient["patient_sequence_number"] + == patient_problem["patient_sequence_number"] + ): + patient["patient_problems"] = ( + [patient_problem["patient_problem"]] + if patient.get("patient_problems") is None + else patient["patient_problems"] + + [patient_problem["patient_problem"]] + ) + + return final + + def reduce(self, key, values, output): + # Write out the rejected records + if key in CATEGORIES: + with open(join(BASE_DIR, "maude", key + "-rejects.txt"), "a") as rejects: + for row in values: + rejects.write(row + "\n") + else: + output.put(key, self._join(key, values)) class CSV2JSON(luigi.Task): - ''' Task that loads different CSV files, depending upon what the value of - `loader_task`. - `init`: process all files except those listed in IGNORE_FILES. - `add`: process all files with the word `add` in the filename. 
- `changes`: process all files with the word `change` in the filename. - ''' - run_date = luigi.Parameter() - loader_task = luigi.Parameter() - - def requires(self): - return PreprocessFilesToFixIssues() - - def output(self): - file_name = '-'.join([self.loader_task, self.run_date, 'json.db']) - - return luigi.LocalTarget(join(BASE_DIR, 'maude', file_name)) - - def run(self): - files = glob.glob(self.input().path + '/*/*.txt') - device_problems = glob.glob(self.input().path + '/*/foidevproblem*.txt') - patient_problems = glob.glob(self.input().path + '/*/patientproblemcode*.txt') - - if self.loader_task == 'init': - input_files = [f for f in files if not any(i for i in IGNORE_FILES if i in f)] - else: - input_files = [f for f in files if self.loader_task in f] + device_problems + patient_problems - - # Load and cache device problem codes. - device_problem_codes_ref = {} - reader = csv.reader(open(DEVICE_PROBLEM_CODES_FILE), quoting=csv.QUOTE_NONE, delimiter='|') - for idx, line in enumerate(reader): - if len(line) > 1: - device_problem_codes_ref[line[0]] = line[1].strip() - - # Load and cache patient problem codes. - patient_problem_codes_ref = {} - reader = csv.reader(open(PATIENT_PROBLEM_CODES_FILE), quoting=csv.QUOTE_NONE, delimiter='|') - for idx, line in enumerate(reader): - if len(line) > 1: - patient_problem_codes_ref[line[0]] = line[1].strip() - - parallel.mapreduce( - parallel.Collection.from_glob( - input_files, parallel.CSVSplitLineInput(quoting=csv.QUOTE_NONE, delimiter='|')), - mapper=CSV2JSONMapper(device_problem_codes_ref=device_problem_codes_ref, - patient_problem_codes_ref=patient_problem_codes_ref - ), - reducer=CSV2JSONJoinReducer(), - output_prefix=self.output().path - ) - + """Task that loads different CSV files, depending upon what the value of + `loader_task`. + `init`: process all files except those listed in IGNORE_FILES. + `add`: process all files with the word `add` in the filename. + `changes`: process all files with the word `change` in the filename. + """ -class MergeUpdatesMapper(parallel.Mapper): - def map_shard(self, map_input, map_output): - self.filename = map_input.filename - self.table = basename(dirname(dirname(self.filename))) - return parallel.Mapper.map_shard(self, map_input, map_output) + run_date = luigi.Parameter() + loader_task = luigi.Parameter() - def map(self, key, value, output): - source_type = None - if 'init' in self.filename: - source_type = 'init' - if 'add' in self.filename: - source_type = 'add' - if 'change' in self.filename: - source_type = 'change' + def requires(self): + return PreprocessFilesToFixIssues() - assert source_type, 'Unable to continue for source type %s' % self.filename + def output(self): + file_name = "-".join([self.loader_task, self.run_date, "json.db"]) - output.add(key, (source_type, value)) + return luigi.LocalTarget(join(BASE_DIR, "maude", file_name)) + def run(self): + files = glob.glob(self.input().path + "/*/*.txt") + device_problems = glob.glob(self.input().path + "/*/foidevproblem*.txt") + patient_problems = glob.glob(self.input().path + "/*/patientproblemcode*.txt") -class MergeUpdatesReducer(parallel.Reducer): - ''' This step resolves conflicting data for the same key, which is the result - of merging the init, add and change pipeline outputs. 
+ if self.loader_task == "init": + input_files = [ + f for f in files if not any(i for i in IGNORE_FILES if i in f) + ] + else: + input_files = ( + [f for f in files if self.loader_task in f] + + device_problems + + patient_problems + ) + + # Load and cache device problem codes. + device_problem_codes_ref = {} + reader = csv.reader( + open(DEVICE_PROBLEM_CODES_FILE), quoting=csv.QUOTE_NONE, delimiter="|" + ) + for idx, line in enumerate(reader): + if len(line) > 1: + device_problem_codes_ref[line[0]] = line[1].strip() + + # Load and cache patient problem codes. + patient_problem_codes_ref = {} + reader = csv.reader( + open(PATIENT_PROBLEM_CODES_FILE), quoting=csv.QUOTE_NONE, delimiter="|" + ) + for idx, line in enumerate(reader): + if len(line) > 1: + patient_problem_codes_ref[line[0]] = line[1].strip() + + parallel.mapreduce( + parallel.Collection.from_glob( + input_files, + parallel.CSVSplitLineInput(quoting=csv.QUOTE_NONE, delimiter="|"), + ), + mapper=CSV2JSONMapper( + device_problem_codes_ref=device_problem_codes_ref, + patient_problem_codes_ref=patient_problem_codes_ref, + ), + reducer=CSV2JSONJoinReducer(), + output_prefix=self.output().path, + ) - Reducer that takes in an array of tuples: - [(source, value), (source, value), ...] - One and only one is selected by the reducer. +class MergeUpdatesMapper(parallel.Mapper): + def map_shard(self, map_input, map_output): + self.filename = map_input.filename + self.table = basename(dirname(dirname(self.filename))) + return parallel.Mapper.map_shard(self, map_input, map_output) - ''' - def reduce(self, key, values, output): - def _safe_get(value): - if isinstance(value, list): - if len(value) > 0: - value = value[0] - else: - return None - - return value - - # If there is only one value, then we use it. If there are many then - # then choose the right one in the order: changed, added, or existing. - # Remember, we are merging the additions and updates with last weeks run, - # which is where the existing come from. All of this is due to the fact - # that a record can exist in all three places, which is not ideal but is - # reality. - if len(values) == 1: - value = _safe_get(values[0][1]) - if value: - output.put(key, value) - elif len(values) > 1: - pivoted = parallel.pivot_values(values) - - change = _safe_get(pivoted.get('change', [])) - add = _safe_get(pivoted.get('add', [])) - init = _safe_get(pivoted.get('init', [])) - - if change: - output.put(key, change) - elif add: - output.put(key, add) - else: - output.put(key, init) + def map(self, key, value, output): + source_type = None + if "init" in self.filename: + source_type = "init" + if "add" in self.filename: + source_type = "add" + if "change" in self.filename: + source_type = "change" + assert source_type, "Unable to continue for source type %s" % self.filename -class MergeUpdates(luigi.Task): - ''' Task that takes all three loader_task (init, add, and change), streams - them into a reducer and picks one to write to the weekly output. + output.add(key, (source_type, value)) - Please note that the `init` process attempts to use last weeks init file - as input. If it does not exist, it will make it first. - ''' - run_date = luigi.Parameter() - def requires(self): - previous_run_date = arrow.get(self.run_date).shift(weeks=-1).format(DATE_FMT) +class MergeUpdatesReducer(parallel.Reducer): + """This step resolves conflicting data for the same key, which is the result + of merging the init, add and change pipeline outputs. 
+ + Reducer that takes in an array of tuples: + [(source, value), (source, value), ...] + + One and only one is selected by the reducer. + + """ + + def reduce(self, key, values, output): + def _safe_get(value): + if isinstance(value, list): + if len(value) > 0: + value = value[0] + else: + return None + + return value + + # If there is only one value, then we use it. If there are many then + # then choose the right one in the order: changed, added, or existing. + # Remember, we are merging the additions and updates with last weeks run, + # which is where the existing come from. All of this is due to the fact + # that a record can exist in all three places, which is not ideal but is + # reality. + if len(values) == 1: + value = _safe_get(values[0][1]) + if value: + output.put(key, value) + elif len(values) > 1: + pivoted = parallel.pivot_values(values) + + change = _safe_get(pivoted.get("change", [])) + add = _safe_get(pivoted.get("add", [])) + init = _safe_get(pivoted.get("init", [])) + + if change: + output.put(key, change) + elif add: + output.put(key, add) + else: + output.put(key, init) - return [ - CSV2JSON(loader_task='init', run_date=previous_run_date), - CSV2JSON(loader_task='add', run_date=self.run_date), - CSV2JSON(loader_task='change', run_date=self.run_date) - ] - def output(self): - file_name = '-'.join(['init', self.run_date, 'json.db']) +class MergeUpdates(luigi.Task): + """Task that takes all three loader_task (init, add, and change), streams + them into a reducer and picks one to write to the weekly output. - return luigi.LocalTarget(join(BASE_DIR, 'maude', file_name)) + Please note that the `init` process attempts to use last weeks init file + as input. If it does not exist, it will make it first. + """ - def run(self): - db_list = [s.path for s in self.input()] - parallel.mapreduce( - parallel.Collection.from_sharded_list(db_list), - mapper=MergeUpdatesMapper(), - reducer=MergeUpdatesReducer(), - output_prefix=self.output().path) + run_date = luigi.Parameter() + def requires(self): + previous_run_date = arrow.get(self.run_date).shift(weeks=-1).format(DATE_FMT) -class MaudeAnnotationMapper(DeviceAnnotateMapper): - def filter(self, data, lookup=None): - product_code = data['device_report_product_code'] - harmonized = self.harmonized_db.get(product_code, None) - if harmonized: - # Taking a very conservative approach to annotation to start. Only - # including the classification data and a matching registration. 
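# Sketch of the precedence MergeUpdatesReducer applies when the same
# mdr_report_key shows up in more than one weekly source (toy records):
pivoted = {
    "init":   [{"loaded_from": "init"}],
    "add":    [{"loaded_from": "add"}],
    "change": [{"loaded_from": "change"}],
}
chosen = (pivoted.get("change") or pivoted.get("add") or pivoted.get("init"))[0]
print(chosen)  # change beats add, add beats init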
- if '510k' in harmonized: - del harmonized['510k'] - if 'device_pma' in harmonized: - del harmonized['device_pma'] - registration = list(harmonized['registration']) - new_reg = [d for d in registration if d.get('registration_number') == lookup] - harmonized['registration'] = new_reg - return harmonized - return None + return [ + CSV2JSON(loader_task="init", run_date=previous_run_date), + CSV2JSON(loader_task="add", run_date=self.run_date), + CSV2JSON(loader_task="change", run_date=self.run_date), + ] - def harmonize(self, data): - result = dict(data) - report_number = data['report_number'] + def output(self): + file_name = "-".join(["init", self.run_date, "json.db"]) - if not report_number: - return result + return luigi.LocalTarget(join(BASE_DIR, "maude", file_name)) - registration_number = report_number.split('-')[0] - if not registration_number: - return result + def run(self): + db_list = [s.path for s in self.input()] + parallel.mapreduce( + parallel.Collection.from_sharded_list(db_list), + mapper=MergeUpdatesMapper(), + reducer=MergeUpdatesReducer(), + output_prefix=self.output().path, + ) - devices = [] - for row in result.get('device', []): - d = dict(row) - harmonized = self.filter(row, lookup=registration_number) - if harmonized: - d['openfda'] = self.flatten(harmonized) - else: - d['openfda'] = {} - devices.append(d) - result['device'] = devices - return result +class MaudeAnnotationMapper(DeviceAnnotateMapper): + def filter(self, data, lookup=None): + product_code = data["device_report_product_code"] + harmonized = self.harmonized_db.get(product_code, None) + if harmonized: + # Taking a very conservative approach to annotation to start. Only + # including the classification data and a matching registration. + if "510k" in harmonized: + del harmonized["510k"] + if "device_pma" in harmonized: + del harmonized["device_pma"] + registration = list(harmonized["registration"]) + new_reg = [ + d for d in registration if d.get("registration_number") == lookup + ] + harmonized["registration"] = new_reg + return harmonized + return None + + def harmonize(self, data): + result = dict(data) + report_number = data["report_number"] + + if not report_number: + return result + + registration_number = report_number.split("-")[0] + if not registration_number: + return result + + devices = [] + for row in result.get("device", []): + d = dict(row) + harmonized = self.filter(row, lookup=registration_number) + if harmonized: + d["openfda"] = self.flatten(harmonized) + else: + d["openfda"] = {} + devices.append(d) + result["device"] = devices + + return result class AnnotateReport(DependencyTriggeredTask): - run_date = luigi.Parameter() + run_date = luigi.Parameter() - def requires(self): - return [Harmonized2OpenFDA(), MergeUpdates(run_date=self.run_date)] + def requires(self): + return [Harmonized2OpenFDA(), MergeUpdates(run_date=self.run_date)] - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'maude', 'annotate.db')) + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "maude", "annotate.db")) - def run(self): - harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict() - db_list = [s.path for s in self.input()[1:]] - parallel.mapreduce( - parallel.Collection.from_sharded_list(db_list), - mapper=MaudeAnnotationMapper(harmonized_db=harmonized_db), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path) + def run(self): + harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict() + db_list = [s.path for s in self.input()[1:]] + 
parallel.mapreduce( + parallel.Collection.from_sharded_list(db_list), + mapper=MaudeAnnotationMapper(harmonized_db=harmonized_db), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + ) class LoadJSONByRunDate(index_util.LoadJSONBase): - run_date = luigi.Parameter() - index_name = 'deviceevent' - mapping_file = 'schemas/maude_mapping.json' - optimize_index = False - docid_key='mdr_report_key' - use_checksum = True - last_update_date = lambda _: newest_file_timestamp(RAW_DIR) + run_date = luigi.Parameter() + index_name = "deviceevent" + mapping_file = "schemas/maude_mapping.json" + optimize_index = False + docid_key = "mdr_report_key" + use_checksum = True + last_update_date = lambda _: newest_file_timestamp(RAW_DIR) - - def _data(self): - return AnnotateReport(run_date=self.run_date) + def _data(self): + return AnnotateReport(run_date=self.run_date) class LoadJSON(luigi.WrapperTask): - run_date = arrow.utcnow().ceil('weeks').format(DATE_FMT) + run_date = arrow.utcnow().ceil("weeks").format(DATE_FMT) + + def requires(self): + return LoadJSONByRunDate(run_date=self.run_date) - def requires(self): - return LoadJSONByRunDate(run_date=self.run_date) -if __name__ == '__main__': - luigi.run() +if __name__ == "__main__": + luigi.run() diff --git a/openfda/ndc/annotate.py b/openfda/ndc/annotate.py index 0997041..0940a3b 100644 --- a/openfda/ndc/annotate.py +++ b/openfda/ndc/annotate.py @@ -5,41 +5,45 @@ from openfda import common from openfda.spl.fix_date import validate_date + def read_json_file(json_file): - ''' - Reads an ElasticSearch style multi-json file: - Each line contains a single JSON record. - Yields one object for each line - ''' - for line in json_file: - yield json.loads(line) + """ + Reads an ElasticSearch style multi-json file: + Each line contains a single JSON record. + Yields one object for each line + """ + for line in json_file: + yield json.loads(line) + # TODO(hansnelsen): add to common library, is used in all annotation processing def read_harmonized_file(harmonized_file): - ''' - Create a dictionary that is keyed by spl_id. - ''' - harmonization_dict = {} - for row in read_json_file(harmonized_file): - spl_id = row['id'] - if spl_id in harmonization_dict: - harmonization_dict[spl_id].append(row) - else: - harmonization_dict[spl_id] = [row] - return harmonization_dict + """ + Create a dictionary that is keyed by spl_id. + """ + harmonization_dict = {} + for row in read_json_file(harmonized_file): + spl_id = row["id"] + if spl_id in harmonization_dict: + harmonization_dict[spl_id].append(row) + else: + harmonization_dict[spl_id] = [row] + return harmonization_dict + # TODO(hansnelsen): Add to a common library, since it is used by faers and res # annotate.py. _add_field() is identical in all files. 
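For context on the helper defined just below: _add_field stores each value as a key of a nested dictionary so duplicates collapse automatically, and annotate_drug later flattens those dictionaries into plain lists before attaching them as drug["openfda"]. A small illustration of that dedupe-then-flatten pattern, using invented field names and values:

def add_value(openfda, field, value):
    # Same idea as _add_field: a dict keyed by value drops duplicates for free.
    values = value if isinstance(value, list) else [value]
    for v in values:
        if v:
            openfda.setdefault(field, {})[v] = True

openfda = {}
add_value(openfda, "manufacturer_name", ["Acme Pharma", "Acme Pharma"])
add_value(openfda, "rxcui", "198440")

# Flatten to lists, as annotate_drug does before setting drug["openfda"].
openfda_lists = {field: list(values.keys()) for field, values in openfda.items()}
print(openfda_lists)  # {'manufacturer_name': ['Acme Pharma'], 'rxcui': ['198440']}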
def _add_field(openfda, field, value): - if type(value) != type([]): - value = [value] - for v in value: - if not v: - continue - if field not in openfda: - openfda[field] = {} - openfda[field][v] = True - return + if type(value) != type([]): + value = [value] + for v in value: + if not v: + continue + if field not in openfda: + openfda[field] = {} + openfda[field][v] = True + return + # TODO(hansnelsen): Looks very similiar to the code in faers/annotate.py, we # should consider refactoring this into a general piece of code for generating @@ -50,63 +54,63 @@ def _add_field(openfda, field, value): # style of other function def AddHarmonizedRowToOpenfda(openfda, row): - _add_field(openfda, 'manufacturer_name', row['manufacturer_name']) - - for rxnorm in row['rxnorm']: - _add_field(openfda, 'rxcui', rxnorm['rxcui']) - - _add_field(openfda, 'spl_set_id', row['spl_set_id']) - _add_field(openfda, - 'is_original_packager', - row['is_original_packager']) - - _add_field(openfda, 'upc', row['upc']) - - if row['unii_indexing'] != []: - unii_list = [] - for unii_row in row['unii_indexing']: - for key, value in unii_row.items(): - if key == 'unii': - unii_list.append(value) - if key == 'va': - for this_item in value: - for va_key, va_value in this_item.items(): - if va_key == 'name': - if va_value.find('[MoA]') != -1: - _add_field(openfda, 'pharm_class_moa', va_value) - if va_value.find('[Chemical/Ingredient]') != -1 or va_value.find('[CS]') != -1: - _add_field(openfda, 'pharm_class_cs', va_value) - if va_value.find('[PE]') != -1: - _add_field(openfda, 'pharm_class_pe', va_value) - if va_value.find('[EPC]') != -1: - _add_field(openfda, 'pharm_class_epc', va_value) - if va_key == 'number': - _add_field(openfda, 'nui', va_value) - _add_field(openfda, 'unii', unii_list) + _add_field(openfda, "manufacturer_name", row["manufacturer_name"]) + + for rxnorm in row["rxnorm"]: + _add_field(openfda, "rxcui", rxnorm["rxcui"]) + + _add_field(openfda, "spl_set_id", row["spl_set_id"]) + _add_field(openfda, "is_original_packager", row["is_original_packager"]) + + _add_field(openfda, "upc", row["upc"]) + + if row["unii_indexing"] != []: + unii_list = [] + for unii_row in row["unii_indexing"]: + for key, value in unii_row.items(): + if key == "unii": + unii_list.append(value) + if key == "va": + for this_item in value: + for va_key, va_value in this_item.items(): + if va_key == "name": + if va_value.find("[MoA]") != -1: + _add_field(openfda, "pharm_class_moa", va_value) + if ( + va_value.find("[Chemical/Ingredient]") != -1 + or va_value.find("[CS]") != -1 + ): + _add_field(openfda, "pharm_class_cs", va_value) + if va_value.find("[PE]") != -1: + _add_field(openfda, "pharm_class_pe", va_value) + if va_value.find("[EPC]") != -1: + _add_field(openfda, "pharm_class_epc", va_value) + if va_key == "number": + _add_field(openfda, "nui", va_value) + _add_field(openfda, "unii", unii_list) def annotate_drug(drug, harmonized_dict): - openfda = {} - spl_id = drug.get('spl_id') - if spl_id != None: - if spl_id in harmonized_dict: - AddHarmonizedRowToOpenfda(openfda, harmonized_dict[spl_id][0]) + openfda = {} + spl_id = drug.get("spl_id") + if spl_id != None: + if spl_id in harmonized_dict: + AddHarmonizedRowToOpenfda(openfda, harmonized_dict[spl_id][0]) - openfda_lists = {} - for field, value in openfda.items(): - openfda_lists[field] = [s for s in value.keys()] - drug['openfda'] = openfda_lists + openfda_lists = {} + for field, value in openfda.items(): + openfda_lists[field] = [s for s in value.keys()] + drug["openfda"] = 
openfda_lists class AnnotateMapper(parallel.Mapper): - def __init__(self, harmonized_file): - self.harmonized_file = harmonized_file - - def map_shard(self, map_input, map_output): - self.harmonized_dict = read_harmonized_file(open(self.harmonized_file)) - parallel.Mapper.map_shard(self, map_input, map_output) + def __init__(self, harmonized_file): + self.harmonized_file = harmonized_file - def map(self, key, drug, out): - annotate_drug(drug, self.harmonized_dict) - out.add(key, drug) + def map_shard(self, map_input, map_output): + self.harmonized_dict = read_harmonized_file(open(self.harmonized_file)) + parallel.Mapper.map_shard(self, map_input, map_output) + def map(self, key, drug, out): + annotate_drug(drug, self.harmonized_dict) + out.add(key, drug) diff --git a/openfda/ndc/pipeline.py b/openfda/ndc/pipeline.py index 3ac2b21..280cee0 100644 --- a/openfda/ndc/pipeline.py +++ b/openfda/ndc/pipeline.py @@ -1,8 +1,8 @@ #!/usr/local/bin/python -''' +""" Pipeline for converting NDC product and package data sets (TXT) to JSON and importing into Elasticsearch. -''' +""" import logging import os import re @@ -19,347 +19,388 @@ from openfda.common import newest_file_timestamp from openfda.ndc.annotate import AnnotateMapper -NDC_DOWNLOAD_PAGE = \ - 'https://www.fda.gov/drugs/drug-approvals-and-databases/national-drug-code-directory' -BASE_DIR = join(config.data_dir(), 'ndc') -RAW_DIR = join(BASE_DIR, 'raw') +NDC_DOWNLOAD_PAGE = "https://www.fda.gov/drugs/drug-approvals-and-databases/national-drug-code-directory" +BASE_DIR = join(config.data_dir(), "ndc") +RAW_DIR = join(BASE_DIR, "raw") -MERGED_PACKAGES = join(BASE_DIR, 'merge/package.csv') -MERGED_PRODUCTS = join(BASE_DIR, 'merge/product.csv') +MERGED_PACKAGES = join(BASE_DIR, "merge/package.csv") +MERGED_PRODUCTS = join(BASE_DIR, "merge/product.csv") -NDC_PACKAGE_DB = join(BASE_DIR, 'json/package.db') -NDC_PRODUCT_DB = join(BASE_DIR, 'json/product.db') -NDC_MERGE_DB = join(BASE_DIR, 'json/merged.db') -NDC_ANNOTATED_DB = join(BASE_DIR, 'json/annotated.db') +NDC_PACKAGE_DB = join(BASE_DIR, "json/package.db") +NDC_PRODUCT_DB = join(BASE_DIR, "json/product.db") +NDC_MERGE_DB = join(BASE_DIR, "json/merged.db") +NDC_ANNOTATED_DB = join(BASE_DIR, "json/annotated.db") class DownloadNDCFiles(luigi.Task): - def requires(self): - return [] + def requires(self): + return [] - def output(self): - return luigi.LocalTarget(RAW_DIR) + def output(self): + return luigi.LocalTarget(RAW_DIR) - def run(self): - finished_ndc_url = None - unfinished_ndc_url = None + def run(self): + finished_ndc_url = None + unfinished_ndc_url = None - soup = BeautifulSoup(urlopen(NDC_DOWNLOAD_PAGE).read(), 'lxml') - for a in soup.find_all(href=re.compile('.*.zip')): - if 'ndc database file' in a.text.lower() and 'text' in a['href']: - finished_ndc_url = urljoin('https://www.fda.gov', a['href']) - if 'ndc unfinished' in a.text.lower() and 'unfinished.zip' in a['href']: - unfinished_ndc_url = urljoin('https://www.fda.gov', a['href']) + soup = BeautifulSoup(urlopen(NDC_DOWNLOAD_PAGE).read(), "lxml") + for a in soup.find_all(href=re.compile(".*.zip")): + if "ndc database file" in a.text.lower() and "text" in a["href"]: + finished_ndc_url = urljoin("https://www.fda.gov", a["href"]) + if "ndc unfinished" in a.text.lower() and "unfinished.zip" in a["href"]: + unfinished_ndc_url = urljoin("https://www.fda.gov", a["href"]) - if not finished_ndc_url: - logging.fatal('NDC finished database file not found!') - if not unfinished_ndc_url: - logging.fatal('NDC unfinished drugs database file not 
found!') + if not finished_ndc_url: + logging.fatal("NDC finished database file not found!") + if not unfinished_ndc_url: + logging.fatal("NDC unfinished drugs database file not found!") - common.download(finished_ndc_url, join(RAW_DIR, 'finished.zip')) - common.download(unfinished_ndc_url, join(RAW_DIR, 'unfinished.zip')) + common.download(finished_ndc_url, join(RAW_DIR, "finished.zip")) + common.download(unfinished_ndc_url, join(RAW_DIR, "unfinished.zip")) class ExtractNDCFiles(luigi.Task): - def requires(self): - return DownloadNDCFiles() + def requires(self): + return DownloadNDCFiles() - def output(self): - return luigi.LocalTarget(join(BASE_DIR, 'extracted')) + def output(self): + return luigi.LocalTarget(join(BASE_DIR, "extracted")) - def run(self): - zip_filename = join(BASE_DIR, 'raw/finished.zip') - output_dir = self.output().path - os.system('mkdir -p %s' % output_dir) - os.system('unzip -o %(zip_filename)s -d %(output_dir)s' % locals()) + def run(self): + zip_filename = join(BASE_DIR, "raw/finished.zip") + output_dir = self.output().path + os.system("mkdir -p %s" % output_dir) + os.system("unzip -o %(zip_filename)s -d %(output_dir)s" % locals()) - zip_filename = join(BASE_DIR, 'raw/unfinished.zip') - os.system('unzip -o %(zip_filename)s -d %(output_dir)s' % locals()) + zip_filename = join(BASE_DIR, "raw/unfinished.zip") + os.system("unzip -o %(zip_filename)s -d %(output_dir)s" % locals()) - # Fix annoying UTF-8 issues in product.txt - self.fix_utf8_issues(join(output_dir, 'product.txt')) - self.fix_utf8_issues(join(output_dir, 'unfinished_product.txt')) + # Fix annoying UTF-8 issues in product.txt + self.fix_utf8_issues(join(output_dir, "product.txt")) + self.fix_utf8_issues(join(output_dir, "unfinished_product.txt")) - def fix_utf8_issues(self, filename): - file = open(filename, 'r', encoding='utf-8', errors='ignore') - data = file.read() - file.close() + def fix_utf8_issues(self, filename): + file = open(filename, "r", encoding="utf-8", errors="ignore") + data = file.read() + file.close() - file = open(filename, 'w', encoding='utf-8') - file.write(common.convert_unicode(data)) - file.flush() - file.close() + file = open(filename, "w", encoding="utf-8") + file.write(common.convert_unicode(data)) + file.flush() + file.close() class MergePackageNDC(luigi.Task): - def requires(self): - return ExtractNDCFiles() - - def output(self): - return luigi.LocalTarget(MERGED_PACKAGES) - - def run(self): - output_dir = dirname(MERGED_PACKAGES) - os.system('mkdir -p %s' % output_dir) - - dtype = {'STARTMARKETINGDATE': np.unicode, 'ENDMARKETINGDATE': np.unicode} - finished = pd.read_csv(join(self.input().path, 'package.txt'), sep='\t', index_col=False, encoding='utf-8', - dtype=dtype) - unfinished = pd.read_csv(join(self.input().path, 'unfinished_package.txt'), sep='\t', index_col=False, - encoding='utf-8', - dtype=dtype) - - finished['FINISHED'] = 'Y' - unfinished['FINISHED'] = 'N' - merged = finished.append(unfinished, ignore_index=True, sort=False) - merged.to_csv(MERGED_PACKAGES, sep='\t', encoding='utf-8', index=False) + def requires(self): + return ExtractNDCFiles() + + def output(self): + return luigi.LocalTarget(MERGED_PACKAGES) + + def run(self): + output_dir = dirname(MERGED_PACKAGES) + os.system("mkdir -p %s" % output_dir) + + dtype = {"STARTMARKETINGDATE": np.unicode, "ENDMARKETINGDATE": np.unicode} + finished = pd.read_csv( + join(self.input().path, "package.txt"), + sep="\t", + index_col=False, + encoding="utf-8", + dtype=dtype, + ) + unfinished = pd.read_csv( + 
join(self.input().path, "unfinished_package.txt"), + sep="\t", + index_col=False, + encoding="utf-8", + dtype=dtype, + ) + + finished["FINISHED"] = "Y" + unfinished["FINISHED"] = "N" + merged = finished.append(unfinished, ignore_index=True, sort=False) + merged.to_csv(MERGED_PACKAGES, sep="\t", encoding="utf-8", index=False) class MergeProductNDC(luigi.Task): - def requires(self): - return ExtractNDCFiles() - - def output(self): - return luigi.LocalTarget(MERGED_PRODUCTS) - - def run(self): - output_dir = dirname(MERGED_PRODUCTS) - os.system('mkdir -p %s' % output_dir) - - dtype = {'STARTMARKETINGDATE': np.unicode, 'ENDMARKETINGDATE': np.unicode, 'APPLICATIONNUMBER': np.unicode - , 'ACTIVE_NUMERATOR_STRENGTH': np.unicode, 'LISTING_RECORD_CERTIFIED_THROUGH': np.unicode - , 'PROPRIETARYNAMESUFFIX': np.unicode} - finished = pd.read_csv(join(self.input().path, 'product.txt'), sep='\t', index_col=False, encoding='utf-8', - dtype=dtype) - unfinished = pd.read_csv(join(self.input().path, 'unfinished_product.txt'), sep='\t', index_col=False, - encoding='utf-8', - dtype=dtype) - - finished['FINISHED'] = 'Y' - unfinished['FINISHED'] = 'N' - merged = finished.append(unfinished, ignore_index=True, sort=False) - merged.to_csv(MERGED_PRODUCTS, sep='\t', encoding='utf-8', index=False) + def requires(self): + return ExtractNDCFiles() + + def output(self): + return luigi.LocalTarget(MERGED_PRODUCTS) + + def run(self): + output_dir = dirname(MERGED_PRODUCTS) + os.system("mkdir -p %s" % output_dir) + + dtype = { + "STARTMARKETINGDATE": np.unicode, + "ENDMARKETINGDATE": np.unicode, + "APPLICATIONNUMBER": np.unicode, + "ACTIVE_NUMERATOR_STRENGTH": np.unicode, + "LISTING_RECORD_CERTIFIED_THROUGH": np.unicode, + "PROPRIETARYNAMESUFFIX": np.unicode, + } + finished = pd.read_csv( + join(self.input().path, "product.txt"), + sep="\t", + index_col=False, + encoding="utf-8", + dtype=dtype, + ) + unfinished = pd.read_csv( + join(self.input().path, "unfinished_product.txt"), + sep="\t", + index_col=False, + encoding="utf-8", + dtype=dtype, + ) + + finished["FINISHED"] = "Y" + unfinished["FINISHED"] = "N" + merged = finished.append(unfinished, ignore_index=True, sort=False) + merged.to_csv(MERGED_PRODUCTS, sep="\t", encoding="utf-8", index=False) class NDCPackage2JSONMapper(parallel.Mapper): - rename_map = { - 'PRODUCTID': 'product_id', - 'PRODUCTNDC': 'product_ndc', - 'NDCPACKAGECODE': 'package_ndc', - 'PACKAGEDESCRIPTION': 'description', - 'STARTMARKETINGDATE': 'marketing_start_date', - 'ENDMARKETINGDATE': 'marketing_end_date', - 'SAMPLE_PACKAGE': 'sample', - 'FINISHED': 'finished' - } - - booleans = ['SAMPLE_PACKAGE', 'FINISHED'] - - def map(self, key, value, output): - def _cleaner(k, v): - ''' Helper function to rename keys and purge any keys that are not in - the map. 
- ''' - if k in self.booleans: - v = v in ['Y', 'y'] if v is not None and v != '' else None - v = v.strip() if isinstance(v, str) else v - if k in self.rename_map and v is not None and v != '': - return (self.rename_map[k], v) - - new_value = common.transform_dict(value, _cleaner) - output.add(key, new_value) + rename_map = { + "PRODUCTID": "product_id", + "PRODUCTNDC": "product_ndc", + "NDCPACKAGECODE": "package_ndc", + "PACKAGEDESCRIPTION": "description", + "STARTMARKETINGDATE": "marketing_start_date", + "ENDMARKETINGDATE": "marketing_end_date", + "SAMPLE_PACKAGE": "sample", + "FINISHED": "finished", + } + + booleans = ["SAMPLE_PACKAGE", "FINISHED"] + + def map(self, key, value, output): + def _cleaner(k, v): + """Helper function to rename keys and purge any keys that are not in + the map. + """ + if k in self.booleans: + v = v in ["Y", "y"] if v is not None and v != "" else None + v = v.strip() if isinstance(v, str) else v + if k in self.rename_map and v is not None and v != "": + return (self.rename_map[k], v) + + new_value = common.transform_dict(value, _cleaner) + output.add(key, new_value) class NDCPackage2JSON(luigi.Task): - def requires(self): - return MergePackageNDC() + def requires(self): + return MergePackageNDC() - def output(self): - return luigi.LocalTarget(NDC_PACKAGE_DB) + def output(self): + return luigi.LocalTarget(NDC_PACKAGE_DB) - def run(self): - parallel.mapreduce( - parallel.Collection.from_glob( - self.input().path, parallel.CSVDictLineInput(delimiter='\t')), - mapper=NDCPackage2JSONMapper(), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, - num_shards=1) + def run(self): + parallel.mapreduce( + parallel.Collection.from_glob( + self.input().path, parallel.CSVDictLineInput(delimiter="\t") + ), + mapper=NDCPackage2JSONMapper(), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + num_shards=1, + ) class NDCProduct2JSONMapper(parallel.Mapper): - rename_map = { - 'PRODUCTID': 'product_id', - 'PRODUCTNDC': 'product_ndc', - 'PRODUCTTYPENAME': 'product_type', - 'PROPRIETARYNAME': 'brand_name', - 'PROPRIETARYNAMESUFFIX': 'brand_name_suffix', - 'NONPROPRIETARYNAME': 'generic_name', - 'DOSAGEFORMNAME': 'dosage_form', - 'ROUTENAME': 'route', - 'STARTMARKETINGDATE': 'marketing_start_date', - 'ENDMARKETINGDATE': 'marketing_end_date', - 'MARKETINGCATEGORYNAME': 'marketing_category', - 'APPLICATIONNUMBER': 'application_number', - 'LABELERNAME': 'labeler_name', - 'SUBSTANCENAME': 'active_ingredients', - 'ACTIVE_NUMERATOR_STRENGTH': 'strength', - 'ACTIVE_INGRED_UNIT': 'unit', - 'PHARM_CLASSES': 'pharm_class', - 'DEASCHEDULE': 'dea_schedule', - 'LISTING_RECORD_CERTIFIED_THROUGH': 'listing_expiration_date', - 'FINISHED': 'finished' - } - - booleans = ['FINISHED'] - - def map(self, key, value, output): - def _cleaner(k, v): - ''' Helper function to rename keys and purge any keys that are not in - the map. 
- ''' - if k in self.booleans: - v = v in ['Y', 'y'] if v is not None and v != '' else None - v = v.strip() if isinstance(v, str) else v - if k in self.rename_map and v is not None and v != '': - return (self.rename_map[k], v) - - json = common.transform_dict(value, _cleaner) - - # SPL ID - json['spl_id'] = json['product_id'].split('_')[1] - - # Brand name parsing - json['brand_name_base'] = json.get('brand_name') - if json.get('brand_name_suffix'): - json['brand_name'] = (json.get('brand_name') if json.get('brand_name') is not None else '') + ' ' + json[ - 'brand_name_suffix'] - - # Route is a multi-value field easily parseable - if json.get('route'): - json['route'] = re.sub(';\s+', ';', json['route']).split(';') - - # Pharm class field is a multi-value field easily parseable - if json.get('pharm_class'): - json['pharm_class'] = re.sub(',\s+', ',', json['pharm_class']).split(',') - - # Turn active ingredients into an array of objects as per the mapping. - if json.get('active_ingredients'): - ingredientList = re.sub(';\s+', ';', json['active_ingredients']).split(';') - json['active_ingredients'] = [] - - strengthList = re.sub(';\s+', ';', json['strength']).split(';') if json.get('strength') else [] - unitList = re.sub(';\s+', ';', json['unit']).split(';') if json.get('unit') else [] - for idx, name in enumerate(ingredientList): - ingredient = {'name': name} - if len(strengthList) > idx: - ingredient['strength'] = strengthList[idx] + (' ' + unitList[idx] if len(unitList) > idx else '') - json['active_ingredients'].append(ingredient) - else: - # Delete to avoid complaints from Elasticsearch. - if json.get('active_ingredients') is not None: - del json['active_ingredients'] - - if json.get('strength') is not None: - del json['strength'] - if json.get('unit') is not None: - del json['unit'] - - output.add(key, json) + rename_map = { + "PRODUCTID": "product_id", + "PRODUCTNDC": "product_ndc", + "PRODUCTTYPENAME": "product_type", + "PROPRIETARYNAME": "brand_name", + "PROPRIETARYNAMESUFFIX": "brand_name_suffix", + "NONPROPRIETARYNAME": "generic_name", + "DOSAGEFORMNAME": "dosage_form", + "ROUTENAME": "route", + "STARTMARKETINGDATE": "marketing_start_date", + "ENDMARKETINGDATE": "marketing_end_date", + "MARKETINGCATEGORYNAME": "marketing_category", + "APPLICATIONNUMBER": "application_number", + "LABELERNAME": "labeler_name", + "SUBSTANCENAME": "active_ingredients", + "ACTIVE_NUMERATOR_STRENGTH": "strength", + "ACTIVE_INGRED_UNIT": "unit", + "PHARM_CLASSES": "pharm_class", + "DEASCHEDULE": "dea_schedule", + "LISTING_RECORD_CERTIFIED_THROUGH": "listing_expiration_date", + "FINISHED": "finished", + } + + booleans = ["FINISHED"] + + def map(self, key, value, output): + def _cleaner(k, v): + """Helper function to rename keys and purge any keys that are not in + the map. 
+ """ + if k in self.booleans: + v = v in ["Y", "y"] if v is not None and v != "" else None + v = v.strip() if isinstance(v, str) else v + if k in self.rename_map and v is not None and v != "": + return (self.rename_map[k], v) + + json = common.transform_dict(value, _cleaner) + + # SPL ID + json["spl_id"] = json["product_id"].split("_")[1] + + # Brand name parsing + json["brand_name_base"] = json.get("brand_name") + if json.get("brand_name_suffix"): + json["brand_name"] = ( + (json.get("brand_name") if json.get("brand_name") is not None else "") + + " " + + json["brand_name_suffix"] + ) + + # Route is a multi-value field easily parseable + if json.get("route"): + json["route"] = re.sub(";\s+", ";", json["route"]).split(";") + + # Pharm class field is a multi-value field easily parseable + if json.get("pharm_class"): + json["pharm_class"] = re.sub(",\s+", ",", json["pharm_class"]).split(",") + + # Turn active ingredients into an array of objects as per the mapping. + if json.get("active_ingredients"): + ingredientList = re.sub(";\s+", ";", json["active_ingredients"]).split(";") + json["active_ingredients"] = [] + + strengthList = ( + re.sub(";\s+", ";", json["strength"]).split(";") + if json.get("strength") + else [] + ) + unitList = ( + re.sub(";\s+", ";", json["unit"]).split(";") if json.get("unit") else [] + ) + for idx, name in enumerate(ingredientList): + ingredient = {"name": name} + if len(strengthList) > idx: + ingredient["strength"] = strengthList[idx] + ( + " " + unitList[idx] if len(unitList) > idx else "" + ) + json["active_ingredients"].append(ingredient) + else: + # Delete to avoid complaints from Elasticsearch. + if json.get("active_ingredients") is not None: + del json["active_ingredients"] + + if json.get("strength") is not None: + del json["strength"] + if json.get("unit") is not None: + del json["unit"] + + output.add(key, json) class NDCProduct2JSON(luigi.Task): - def requires(self): - return MergeProductNDC() + def requires(self): + return MergeProductNDC() - def output(self): - return luigi.LocalTarget(NDC_PRODUCT_DB) + def output(self): + return luigi.LocalTarget(NDC_PRODUCT_DB) - def run(self): - parallel.mapreduce( - parallel.Collection.from_glob( - self.input().path, parallel.CSVDictLineInput(delimiter='\t')), - mapper=NDCProduct2JSONMapper(), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, - num_shards=1) + def run(self): + parallel.mapreduce( + parallel.Collection.from_glob( + self.input().path, parallel.CSVDictLineInput(delimiter="\t") + ), + mapper=NDCProduct2JSONMapper(), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + num_shards=1, + ) class ProductAndPackagingMergingMapper(parallel.Mapper): - def __init__(self, package_db): - parallel.Mapper.__init__(self) - self.package_db = package_db + def __init__(self, package_db): + parallel.Mapper.__init__(self) + self.package_db = package_db - # Build an inverse index: product_id --> a list of packages with that product id. - # Otherwise it is TOO slow. - self.inverse_index = {} - for key, value in self.package_db.items(): - if value.get('product_id'): - product_id = value['product_id'] - if self.inverse_index.get(product_id) is None: - self.inverse_index[product_id] = [] - self.inverse_index[product_id].append(value) + # Build an inverse index: product_id --> a list of packages with that product id. + # Otherwise it is TOO slow. 
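The comment above describes the key optimization in ProductAndPackagingMergingMapper: group the package records by product_id once, so attaching packaging to a product becomes a dictionary lookup rather than a scan of the whole package table. A short sketch of that grouping on invented rows (the mapper's own implementation continues directly below):

from collections import defaultdict

# Invented rows shaped roughly like the values stored in package.db.
package_rows = [
    {"product_id": "p1", "package_ndc": "1111-001-01"},
    {"product_id": "p2", "package_ndc": "2222-002-30"},
    {"product_id": "p2", "package_ndc": "2222-002-50"},
]

inverse_index = defaultdict(list)
for row in package_rows:
    if row.get("product_id"):
        inverse_index[row["product_id"]].append(row)

print(len(inverse_index["p2"]))  # 2 packages found with a single lookup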
+ self.inverse_index = {} + for key, value in self.package_db.items(): + if value.get("product_id"): + product_id = value["product_id"] + if self.inverse_index.get(product_id) is None: + self.inverse_index[product_id] = [] + self.inverse_index[product_id].append(value) - def map(self, key, value, output): - id = value['product_id'] - value['packaging'] = [] + def map(self, key, value, output): + id = value["product_id"] + value["packaging"] = [] - for pkg in self.inverse_index[id] if self.inverse_index.get(id) is not None else []: - del pkg['finished'] - del pkg['product_id'] - del pkg['product_ndc'] - value['packaging'].append(pkg) + for pkg in ( + self.inverse_index[id] if self.inverse_index.get(id) is not None else [] + ): + del pkg["finished"] + del pkg["product_id"] + del pkg["product_ndc"] + value["packaging"].append(pkg) - output.add(key, value) + output.add(key, value) class MergeProductsAndPackaging(luigi.Task): - def requires(self): - return [NDCPackage2JSON(), NDCProduct2JSON()] + def requires(self): + return [NDCPackage2JSON(), NDCProduct2JSON()] - def output(self): - return luigi.LocalTarget(NDC_MERGE_DB) + def output(self): + return luigi.LocalTarget(NDC_MERGE_DB) - def run(self): - # Read the entire packaging DB in memory for speed. - package_db = parallel.ShardedDB.open(self.input()[0].path).as_dict() + def run(self): + # Read the entire packaging DB in memory for speed. + package_db = parallel.ShardedDB.open(self.input()[0].path).as_dict() - parallel.mapreduce( - parallel.Collection.from_sharded(self.input()[1].path), - mapper=ProductAndPackagingMergingMapper(package_db=package_db), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, - num_shards=1) + parallel.mapreduce( + parallel.Collection.from_sharded(self.input()[1].path), + mapper=ProductAndPackagingMergingMapper(package_db=package_db), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + num_shards=1, + ) class AnnotateJSON(luigi.Task): - def requires(self): - return [MergeProductsAndPackaging(), CombineHarmonization()] + def requires(self): + return [MergeProductsAndPackaging(), CombineHarmonization()] - def output(self): - return luigi.LocalTarget(NDC_ANNOTATED_DB) + def output(self): + return luigi.LocalTarget(NDC_ANNOTATED_DB) - def run(self): - input_db = self.input()[0].path - harmonized_file = self.input()[1].path + def run(self): + input_db = self.input()[0].path + harmonized_file = self.input()[1].path - parallel.mapreduce( - parallel.Collection.from_sharded(input_db), - mapper=AnnotateMapper(harmonized_file), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, - num_shards=1) + parallel.mapreduce( + parallel.Collection.from_sharded(input_db), + mapper=AnnotateMapper(harmonized_file), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + num_shards=1, + ) class LoadJSON(index_util.LoadJSONBase): - index_name = 'ndc' - mapping_file = './schemas/ndc_mapping.json' - data_source = AnnotateJSON() - use_checksum = False - optimize_index = True - last_update_date = lambda _: newest_file_timestamp(RAW_DIR) + index_name = "ndc" + mapping_file = "./schemas/ndc_mapping.json" + data_source = AnnotateJSON() + use_checksum = False + optimize_index = True + last_update_date = lambda _: newest_file_timestamp(RAW_DIR) -if __name__ == '__main__': - luigi.run() +if __name__ == "__main__": + luigi.run() diff --git a/openfda/ndc/tests/pipeline_test.py b/openfda/ndc/tests/pipeline_test.py index 0f61217..19f61ac 100644 --- 
a/openfda/ndc/tests/pipeline_test.py +++ b/openfda/ndc/tests/pipeline_test.py @@ -15,413 +15,564 @@ class NDCPipelineTests(unittest.TestCase): - def setUp(self): - self.test_dir = tempfile.mkdtemp() - openfda.ndc.pipeline.BASE_DIR = self.test_dir - openfda.ndc.pipeline.RAW_DIR = join(openfda.ndc.pipeline.BASE_DIR, 'raw') - openfda.ndc.pipeline.MERGED_PACKAGES = join(openfda.ndc.pipeline.BASE_DIR, 'merge/package.csv') - openfda.ndc.pipeline.MERGED_PRODUCTS = join(openfda.ndc.pipeline.BASE_DIR, 'merge/product.csv') - openfda.ndc.pipeline.NDC_PACKAGE_DB = join(openfda.ndc.pipeline.BASE_DIR, 'json/package.db') - openfda.ndc.pipeline.NDC_PRODUCT_DB = join(openfda.ndc.pipeline.BASE_DIR, 'json/product.db') - openfda.ndc.pipeline.NDC_MERGE_DB = join(openfda.ndc.pipeline.BASE_DIR, 'json/merged.db') - - print(('Temp dir is ' + self.test_dir)) - - def tearDown(self): - shutil.rmtree(self.test_dir) - - def test_merge_product_with_packaging(self): - os.mkdir(os.path.join(self.test_dir, "extracted")) - shutil.copyfile(os.path.join(dirname(os.path.abspath(__file__)), "product_trimmed.txt"), - os.path.join(self.test_dir, "extracted/product.txt")) - shutil.copyfile(os.path.join(dirname(os.path.abspath(__file__)), "unfinished_product_trimmed.txt"), - os.path.join(self.test_dir, "extracted/unfinished_product.txt")) - shutil.copyfile(os.path.join(dirname(os.path.abspath(__file__)), "package_trimmed.txt"), - os.path.join(self.test_dir, "extracted/package.txt")) - shutil.copyfile(os.path.join(dirname(os.path.abspath(__file__)), "unfinished_package_trimmed.txt"), - os.path.join(self.test_dir, "extracted/unfinished_package.txt")) - - merge = MergeProductNDC() - merge.run() - - merge = MergePackageNDC() - merge.run() - - toJSON = NDCPackage2JSON() - toJSON.run() - - toJSON = NDCProduct2JSON() - toJSON.run() - - merge = MergeProductsAndPackaging() - merge.run() - - ldb = leveldb.LevelDB(os.path.join(merge.output().path, 'shard-00000-of-00001.db')) - data = list(ldb.RangeIter()) - - # Verify base logic. - pkg = pickle.loads(data[0][1]) - eq_('0002-0800_ab2a5006-1880-4b6c-9640-dc2ec1066449', pkg.get('product_id')) - eq_('ab2a5006-1880-4b6c-9640-dc2ec1066449', pkg.get('spl_id')) - eq_('0002-0800', pkg.get('product_ndc')) - eq_('HUMAN OTC DRUG', pkg.get('product_type')) - eq_('Sterile Diluent', pkg.get('brand_name')) - eq_('Sterile Diluent', pkg.get('brand_name_base')) - eq_(None, pkg.get('brand_name_suffix')) - eq_('diluent', pkg.get('generic_name')) - eq_('INJECTION, SOLUTION', pkg.get('dosage_form')) - eq_(['SUBCUTANEOUS'], pkg.get('route')) - eq_('19870710', pkg.get('marketing_start_date')) - eq_(None, pkg.get('marketing_end_date')) - eq_('NDA', pkg.get('marketing_category')) - eq_('NDA018781', pkg.get('application_number')) - eq_('Eli Lilly and Company', pkg.get('labeler_name')) - eq_([{'strength': '1 mL/mL', 'name': 'WATER'}], pkg.get('active_ingredients')) - eq_(None, pkg.get('strength')) - eq_(None, pkg.get('unit')) - eq_(None, pkg.get('pharm_class')) - eq_(None, pkg.get('dea_schedule')) - eq_('20191231', pkg.get('listing_expiration_date')) - ok_(pkg.get('finished')) - - # Verify packaging got properly attached. 
- eq_(1, len(pkg['packaging'])) - eq_('0002-0800-01', pkg['packaging'][0]['package_ndc']) - eq_('1 VIAL in 1 CARTON (0002-0800-01) > 10 mL in 1 VIAL', pkg['packaging'][0]['description']) - eq_('19870710', pkg['packaging'][0]['marketing_start_date']) - eq_(None, pkg['packaging'][0].get('marketing_end_date')) - ok_(not pkg['packaging'][0]['sample']) - - # Verify multiple packaging per product also work as expected - pkg = pickle.loads(data[1][1]) - eq_('0002-1200_4bd46cbe-cdc1-4329-a8e7-22816bd7fc33', pkg.get('product_id')) - - eq_(2, len(pkg['packaging'])) - eq_('0002-1200-30', pkg['packaging'][0]['package_ndc']) - eq_('1 VIAL, MULTI-DOSE in 1 CAN (0002-1200-30) > 30 mL in 1 VIAL, MULTI-DOSE', pkg['packaging'][0]['description']) - eq_('20120601', pkg['packaging'][0]['marketing_start_date']) - eq_('20190203', pkg['packaging'][0]['marketing_end_date']) - ok_(pkg['packaging'][0]['sample']) - eq_('0002-1200-50', pkg['packaging'][1]['package_ndc']) - eq_('1 VIAL, MULTI-DOSE in 1 CAN (0002-1200-50) > 50 mL in 1 VIAL, MULTI-DOSE', pkg['packaging'][1]['description']) - eq_('20120601', pkg['packaging'][1]['marketing_start_date']) - eq_(None, pkg['packaging'][1].get('marketing_end_date')) - ok_(not pkg['packaging'][1]['sample']) - - # Verify unfinished drugs. - pkg = pickle.loads(data[3][1]) - eq_('0002-0485_6d8a935c-fc7b-44c6-92d1-2337b959d1ce', pkg.get('product_id')) - eq_('6d8a935c-fc7b-44c6-92d1-2337b959d1ce', pkg.get('spl_id')) - eq_('0002-0485', pkg.get('product_ndc')) - eq_('BULK INGREDIENT', pkg.get('product_type')) - eq_(None, pkg.get('brand_name')) - eq_(None, pkg.get('brand_name_base')) - eq_(None, pkg.get('brand_name_suffix')) - eq_('Insulin human', pkg.get('generic_name')) - eq_('CRYSTAL', pkg.get('dosage_form')) - eq_(None, pkg.get('route')) - eq_('20150401', pkg.get('marketing_start_date')) - eq_(None, pkg.get('marketing_end_date')) - eq_('BULK INGREDIENT', pkg.get('marketing_category')) - eq_(None, pkg.get('application_number')) - eq_('Eli Lilly and Company', pkg.get('labeler_name')) - eq_([{'strength': '1 g/g', 'name': 'INSULIN HUMAN'}], pkg.get('active_ingredients')) - eq_(None, pkg.get('strength')) - eq_(None, pkg.get('unit')) - eq_(None, pkg.get('pharm_class')) - eq_(None, pkg.get('dea_schedule')) - eq_('20181231', pkg.get('listing_expiration_date')) - ok_(not pkg.get('finished')) - - eq_(2, len(pkg['packaging'])) - eq_('0002-0485-04', pkg['packaging'][0]['package_ndc']) - eq_('4 BOTTLE in 1 CONTAINER (0002-0485-04) > 2000 g in 1 BOTTLE', pkg['packaging'][0]['description']) - eq_(None, pkg['packaging'][0].get('marketing_start_date')) - eq_(None, pkg['packaging'][0].get('marketing_end_date')) - eq_(None, pkg['packaging'][0].get('sample')) - - eq_('0002-0485-18', pkg['packaging'][1]['package_ndc']) - eq_('1 BOTTLE in 1 CONTAINER (0002-0485-18) > 2000 g in 1 BOTTLE', pkg['packaging'][1]['description']) - eq_(None, pkg['packaging'][1].get('marketing_start_date')) - eq_(None, pkg['packaging'][1].get('marketing_end_date')) - eq_(None, pkg['packaging'][1].get('sample')) - - - - - - def test_product_ndc_to_json(self): - os.mkdir(os.path.join(self.test_dir, "extracted")) - shutil.copyfile(os.path.join(dirname(os.path.abspath(__file__)), "product_trimmed.txt"), - os.path.join(self.test_dir, "extracted/product.txt")) - shutil.copyfile(os.path.join(dirname(os.path.abspath(__file__)), "unfinished_product_trimmed.txt"), - os.path.join(self.test_dir, "extracted/unfinished_product.txt")) - - merge = MergeProductNDC() - merge.run() - mergedPath = os.path.join(self.test_dir, "merge/product.csv") - 
- toJSON = NDCProduct2JSON() - toJSON.run() - - ldb = leveldb.LevelDB(os.path.join(toJSON.output().path, 'shard-00000-of-00001.db')) - data = list(ldb.RangeIter()) - - # Verify base logic. - pkg = pickle.loads(data[0][1]) - eq_('0002-0800_ab2a5006-1880-4b6c-9640-dc2ec1066449', pkg.get('product_id')) - eq_('ab2a5006-1880-4b6c-9640-dc2ec1066449', pkg.get('spl_id')) - eq_('0002-0800', pkg.get('product_ndc')) - eq_('HUMAN OTC DRUG', pkg.get('product_type')) - eq_('Sterile Diluent', pkg.get('brand_name')) - eq_('Sterile Diluent', pkg.get('brand_name_base')) - eq_(None, pkg.get('brand_name_suffix')) - eq_('diluent', pkg.get('generic_name')) - eq_('INJECTION, SOLUTION', pkg.get('dosage_form')) - eq_(['SUBCUTANEOUS'], pkg.get('route')) - eq_('19870710', pkg.get('marketing_start_date')) - eq_(None, pkg.get('marketing_end_date')) - eq_('NDA', pkg.get('marketing_category')) - eq_('NDA018781', pkg.get('application_number')) - eq_('Eli Lilly and Company', pkg.get('labeler_name')) - eq_([{'strength': '1 mL/mL', 'name': 'WATER'}], pkg.get('active_ingredients')) - eq_(None, pkg.get('strength')) - eq_(None, pkg.get('unit')) - eq_(None, pkg.get('pharm_class')) - eq_(None, pkg.get('dea_schedule')) - eq_('20191231', pkg.get('listing_expiration_date')) - ok_(pkg.get('finished')) - - # Verify some corner cases. - pkg = pickle.loads(data[1][1]) - eq_('0002-1200_4bd46cbe-cdc1-4329-a8e7-22816bd7fc33', pkg.get('product_id')) - eq_('Amyvid III', pkg.get('brand_name')) - eq_('Amyvid', pkg.get('brand_name_base')) - eq_('III', pkg.get('brand_name_suffix')) - eq_(['INTRALESIONAL', 'INTRAMUSCULAR', 'INTRASYNOVIAL', 'SOFT TISSUE'], pkg.get('route')) - eq_(['Radioactive Diagnostic Agent [EPC]', 'Positron Emitting Activity [MoA]'], pkg.get('pharm_class')) - eq_('DEA', pkg.get('dea_schedule')) - eq_('20180910', pkg.get('marketing_end_date')) - eq_([{'strength': '70 mg/1', 'name': 'ALENDRONATE SODIUM'}, {'strength': '5600 [iU]/1', 'name': 'CHOLECALCIFEROL'}, - {'strength': '100', 'name': 'FLUORIDE'}, {'name': 'ZINC'}], - pkg.get('active_ingredients')) - - # Verify unfinished drugs. 
- pkg = pickle.loads(data[5][1]) - eq_('0002-1401_70b2e81d-fb7e-48bb-b97d-7f2f26f18c41', pkg.get('product_id')) - eq_('70b2e81d-fb7e-48bb-b97d-7f2f26f18c41', pkg.get('spl_id')) - eq_('0002-1401', pkg.get('product_ndc')) - eq_('DRUG FOR FURTHER PROCESSING', pkg.get('product_type')) - eq_(None, pkg.get('brand_name')) - eq_(None, pkg.get('brand_name_base')) - eq_(None, pkg.get('brand_name_suffix')) - eq_('Dulaglutide', pkg.get('generic_name')) - eq_('INJECTION, SOLUTION', pkg.get('dosage_form')) - eq_(None, pkg.get('route')) - eq_('20140922', pkg.get('marketing_start_date')) - eq_(None, pkg.get('marketing_end_date')) - eq_('DRUG FOR FURTHER PROCESSING', pkg.get('marketing_category')) - eq_(None, pkg.get('application_number')) - eq_('Eli Lilly and Company', pkg.get('labeler_name')) - eq_([{'strength': '.75 mg/.5mL', 'name': 'DULAGLUTIDE'}], pkg.get('active_ingredients')) - eq_(None, pkg.get('strength')) - eq_(None, pkg.get('unit')) - eq_(None, pkg.get('pharm_class')) - eq_(None, pkg.get('dea_schedule')) - eq_('20181231', pkg.get('listing_expiration_date')) - ok_(not pkg.get('finished')) - - def test_package_ndc_to_json(self): - os.mkdir(os.path.join(self.test_dir, "extracted")) - shutil.copyfile(os.path.join(dirname(os.path.abspath(__file__)), "package_trimmed.txt"), - os.path.join(self.test_dir, "extracted/package.txt")) - shutil.copyfile(os.path.join(dirname(os.path.abspath(__file__)), "unfinished_package_trimmed.txt"), - os.path.join(self.test_dir, "extracted/unfinished_package.txt")) - - merge = MergePackageNDC() - merge.run() - mergedPath = os.path.join(self.test_dir, "merge/package.csv") - - toJSON = NDCPackage2JSON() - toJSON.run() - - ldb = leveldb.LevelDB(os.path.join(toJSON.output().path, 'shard-00000-of-00001.db')) - data = list(ldb.RangeIter()) - pkg = pickle.loads(data[0][1]) - eq_('0002-0800_ab2a5006-1880-4b6c-9640-dc2ec1066449', pkg.get('product_id')) - eq_('0002-0800', pkg.get('product_ndc')) - eq_('0002-0800-01', pkg.get('package_ndc')) - eq_('1 VIAL in 1 CARTON (0002-0800-01) > 10 mL in 1 VIAL', pkg.get('description')) - eq_('19870710', pkg.get('marketing_start_date')) - eq_(None, pkg.get('marketing_end_date')) - eq_(False, pkg.get('sample')) - eq_(True, pkg.get('finished')) - - pkg = pickle.loads(data[1][1]) - eq_('20190203', pkg.get('marketing_end_date')) - eq_(True, pkg.get('sample')) - - pkg = pickle.loads(data[3][1]) - eq_('0002-0485_6d8a935c-fc7b-44c6-92d1-2337b959d1ce', pkg.get('product_id')) - eq_('0002-0485', pkg.get('product_ndc')) - eq_('0002-0485-04', pkg.get('package_ndc')) - eq_('4 BOTTLE in 1 CONTAINER (0002-0485-04) > 2000 g in 1 BOTTLE', pkg.get('description')) - eq_(None, pkg.get('marketing_start_date')) - eq_(None, pkg.get('marketing_end_date')) - eq_(None, pkg.get('sample')) - eq_(False, pkg.get('finished')) - - def test_merge_product_ndc(self): - os.mkdir(os.path.join(self.test_dir, "extracted")) - shutil.copyfile(os.path.join(dirname(os.path.abspath(__file__)), "product.txt"), - os.path.join(self.test_dir, "extracted/product.txt")) - shutil.copyfile(os.path.join(dirname(os.path.abspath(__file__)), "unfinished_product.txt"), - os.path.join(self.test_dir, "extracted/unfinished_product.txt")) - - merge = MergeProductNDC() - merge.run() - - mergedPath = os.path.join(self.test_dir, "merge/product.csv") - ok_(os.path.isfile(mergedPath)) - - dtype = {'STARTMARKETINGDATE': np.unicode, 'ENDMARKETINGDATE': np.unicode, 'APPLICATIONNUMBER': np.unicode - , 'ACTIVE_NUMERATOR_STRENGTH': np.unicode, 'LISTING_RECORD_CERTIFIED_THROUGH': np.unicode - , 
'PROPRIETARYNAMESUFFIX': np.unicode} - csv = pd.read_csv(mergedPath, sep='\t', index_col=False, encoding='utf-8', - dtype=dtype) - # Obviously, the merged file has to have the sum of lines in both source files. - ok_(len(csv.index) == 121856 + 21478) - - # Make sure every line from finished product.txt made it correctly into the merged CSV. - finished = pd.read_csv(os.path.join(self.test_dir, "extracted/product.txt"), sep='\t', index_col=False, - encoding='utf-8', - dtype=dtype) - truncated = csv.truncate(after=len(finished.index) - 1) - # Next line ensures all of the finished lines got Y in the FINISHED column indeed. - eq_(truncated['FINISHED'].unique(), ['Y']) - - # Drop the last column, after which the two frames must match precisely. - truncated = truncated.drop(columns=['FINISHED']) - ok_(finished.equals(truncated)) - - # Now, make sure every line from unfinished package.txt made it correctly into the merged CSV. - unfinished = pd.read_csv(os.path.join(self.test_dir, "extracted/unfinished_product.txt"), sep='\t', index_col=False, - encoding='utf-8', - dtype=dtype) - truncated = csv.truncate(before=len(finished.index)).reindex() - # Next line ensures all of the finished lines got N in the FINISHED column indeed. - eq_(truncated['FINISHED'].unique(), ['N']) - - # Drop the last column, after which the two frames must match precisely. - truncated = truncated.drop( - columns=['FINISHED', 'PROPRIETARYNAME', 'PROPRIETARYNAMESUFFIX', 'ROUTENAME', 'APPLICATIONNUMBER', - 'PHARM_CLASSES', 'NDC_EXCLUDE_FLAG']) - eq_(unfinished.to_dict(orient='list'), truncated.to_dict(orient='list')) - - def test_merge_package_ndc(self): - os.mkdir(os.path.join(self.test_dir, "extracted")) - shutil.copyfile(os.path.join(dirname(os.path.abspath(__file__)), "package.txt"), - os.path.join(self.test_dir, "extracted/package.txt")) - shutil.copyfile(os.path.join(dirname(os.path.abspath(__file__)), "unfinished_package.txt"), - os.path.join(self.test_dir, "extracted/unfinished_package.txt")) - - merge = MergePackageNDC() - merge.run() - - mergedPath = os.path.join(self.test_dir, "merge/package.csv") - ok_(os.path.isfile(mergedPath)) - - dtype = {'STARTMARKETINGDATE': np.unicode, 'ENDMARKETINGDATE': np.unicode} - csv = pd.read_csv(mergedPath, sep='\t', index_col=False, encoding='utf-8', - dtype=dtype) - # Obviously, the merged file has to have the sum of lines in both source files. - ok_(len(csv.index) == 234309 + 35321) - - # Make sure every line from finished package.txt made it correctly into the merged CSV. - finished = pd.read_csv(os.path.join(self.test_dir, "extracted/package.txt"), sep='\t', index_col=False, - encoding='utf-8', - dtype=dtype) - truncated = csv.truncate(after=len(finished.index) - 1) - # Next line ensures all of the finished lines got Y in the FINISHED column indeed. - eq_(truncated['FINISHED'].unique(), ['Y']) - - # Drop the last column, after which the two frames must match precisely. - truncated = truncated.drop(columns=['FINISHED']) - ok_(finished.equals(truncated)) - - # Finally, test a random line manually. Just to make sure there isn't a CSV reading bug equally breaking both data sets. 
- ok_(csv.at[22, 'PRODUCTID'] == '0002-3235_48f72160-f6f7-4890-8471-041da7cdd3a7') - ok_(csv.at[22, 'PRODUCTNDC'] == '0002-3235') - ok_(csv.at[22, 'NDCPACKAGECODE'] == '0002-3235-60') - ok_(csv.at[22, 'PACKAGEDESCRIPTION'] == '60 CAPSULE, DELAYED RELEASE in 1 BOTTLE (0002-3235-60) ') - ok_(csv.at[22, 'STARTMARKETINGDATE'] == '20040824') - ok_(math.isnan(csv.at[22, 'ENDMARKETINGDATE'])) - ok_(csv.at[22, 'NDC_EXCLUDE_FLAG'] == 'N') - ok_(csv.at[22, 'SAMPLE_PACKAGE'] == 'N') - - # Now, make sure every line from unfinished package.txt made it correctly into the merged CSV. - unfinished = pd.read_csv(os.path.join(self.test_dir, "extracted/unfinished_package.txt"), sep='\t', index_col=False, - encoding='utf-8', - dtype=dtype) - truncated = csv.truncate(before=len(finished.index)).reindex() - # Next line ensures all of the finished lines got N in the FINISHED column indeed. - eq_(truncated['FINISHED'].unique(), ['N']) - - # Drop the last column, after which the two frames must match precisely. - truncated = truncated.drop( - columns=['FINISHED', 'STARTMARKETINGDATE', 'ENDMARKETINGDATE', 'NDC_EXCLUDE_FLAG', 'SAMPLE_PACKAGE']) - eq_(unfinished.to_dict(orient='list'), truncated.to_dict(orient='list')) - - # Finally, test a random line manually. Just to make sure there isn't a CSV reading bug equally breaking both data sets. - ok_(csv.at[234309, 'PRODUCTID'] == '0002-0485_6d8a935c-fc7b-44c6-92d1-2337b959d1ce') - ok_(csv.at[234309, 'PRODUCTNDC'] == '0002-0485') - ok_(csv.at[234309, 'NDCPACKAGECODE'] == '0002-0485-04') - ok_(csv.at[234309, 'PACKAGEDESCRIPTION'] == '4 BOTTLE in 1 CONTAINER (0002-0485-04) > 2000 g in 1 BOTTLE') - ok_(math.isnan(csv.at[234309, 'STARTMARKETINGDATE'])) - ok_(math.isnan(csv.at[234309, 'ENDMARKETINGDATE'])) - ok_(math.isnan(csv.at[234309, 'NDC_EXCLUDE_FLAG'])) - ok_(math.isnan(csv.at[234309, 'SAMPLE_PACKAGE'])) - - def test_fix_utf8_issues(self): - testfile = os.path.join(self.test_dir, "badutf8.txt") - shutil.copyfile(os.path.join(dirname(os.path.abspath(__file__)), "badutf8.txt"), - testfile) - - extract = ExtractNDCFiles() - extract.fix_utf8_issues(testfile) - - file = open(testfile, 'r') - data = file.read() - file.close() - ok_(data == "This is a \"bad\" UTF-8 file.") - - def test_extract_ndc(self): - os.mkdir(os.path.join(self.test_dir, "raw")) - shutil.copyfile(os.path.join(dirname(os.path.abspath(__file__)), "finished.zip"), - os.path.join(self.test_dir, "raw/finished.zip")) - shutil.copyfile(os.path.join(dirname(os.path.abspath(__file__)), "unfinished.zip"), - os.path.join(self.test_dir, "raw/unfinished.zip")) - - extract = ExtractNDCFiles() - extract.run() - - eq_(os.stat(os.path.join(self.test_dir, "extracted/package.txt")).st_size, 31980050) - eq_(os.stat(os.path.join(self.test_dir, "extracted/product.txt")).st_size, 41165614) - eq_(os.stat(os.path.join(self.test_dir, "extracted/unfinished_package.txt")).st_size, 3853326) - eq_(os.stat(os.path.join(self.test_dir, "extracted/unfinished_product.txt")).st_size, 4493918) - - def test_download_ndc(self): - downloadNDC = DownloadNDCFiles() - downloadNDC.run() - ok_(os.path.isfile(os.path.join(self.test_dir, "raw/finished.zip"))) - ok_(os.path.isfile(os.path.join(self.test_dir, "raw/unfinished.zip"))) - ok_(os.stat(os.path.join(self.test_dir, "raw/finished.zip")).st_size > 0) - ok_(os.stat(os.path.join(self.test_dir, "raw/unfinished.zip")).st_size > 0) + def setUp(self): + self.test_dir = tempfile.mkdtemp() + openfda.ndc.pipeline.BASE_DIR = self.test_dir + openfda.ndc.pipeline.RAW_DIR = 
join(openfda.ndc.pipeline.BASE_DIR, "raw") + openfda.ndc.pipeline.MERGED_PACKAGES = join( + openfda.ndc.pipeline.BASE_DIR, "merge/package.csv" + ) + openfda.ndc.pipeline.MERGED_PRODUCTS = join( + openfda.ndc.pipeline.BASE_DIR, "merge/product.csv" + ) + openfda.ndc.pipeline.NDC_PACKAGE_DB = join( + openfda.ndc.pipeline.BASE_DIR, "json/package.db" + ) + openfda.ndc.pipeline.NDC_PRODUCT_DB = join( + openfda.ndc.pipeline.BASE_DIR, "json/product.db" + ) + openfda.ndc.pipeline.NDC_MERGE_DB = join( + openfda.ndc.pipeline.BASE_DIR, "json/merged.db" + ) + + print(("Temp dir is " + self.test_dir)) + + def tearDown(self): + shutil.rmtree(self.test_dir) + + def test_merge_product_with_packaging(self): + os.mkdir(os.path.join(self.test_dir, "extracted")) + shutil.copyfile( + os.path.join(dirname(os.path.abspath(__file__)), "product_trimmed.txt"), + os.path.join(self.test_dir, "extracted/product.txt"), + ) + shutil.copyfile( + os.path.join( + dirname(os.path.abspath(__file__)), "unfinished_product_trimmed.txt" + ), + os.path.join(self.test_dir, "extracted/unfinished_product.txt"), + ) + shutil.copyfile( + os.path.join(dirname(os.path.abspath(__file__)), "package_trimmed.txt"), + os.path.join(self.test_dir, "extracted/package.txt"), + ) + shutil.copyfile( + os.path.join( + dirname(os.path.abspath(__file__)), "unfinished_package_trimmed.txt" + ), + os.path.join(self.test_dir, "extracted/unfinished_package.txt"), + ) + + merge = MergeProductNDC() + merge.run() + + merge = MergePackageNDC() + merge.run() + + toJSON = NDCPackage2JSON() + toJSON.run() + + toJSON = NDCProduct2JSON() + toJSON.run() + + merge = MergeProductsAndPackaging() + merge.run() + + ldb = leveldb.LevelDB( + os.path.join(merge.output().path, "shard-00000-of-00001.db") + ) + data = list(ldb.RangeIter()) + + # Verify base logic. + pkg = pickle.loads(data[0][1]) + eq_("0002-0800_ab2a5006-1880-4b6c-9640-dc2ec1066449", pkg.get("product_id")) + eq_("ab2a5006-1880-4b6c-9640-dc2ec1066449", pkg.get("spl_id")) + eq_("0002-0800", pkg.get("product_ndc")) + eq_("HUMAN OTC DRUG", pkg.get("product_type")) + eq_("Sterile Diluent", pkg.get("brand_name")) + eq_("Sterile Diluent", pkg.get("brand_name_base")) + eq_(None, pkg.get("brand_name_suffix")) + eq_("diluent", pkg.get("generic_name")) + eq_("INJECTION, SOLUTION", pkg.get("dosage_form")) + eq_(["SUBCUTANEOUS"], pkg.get("route")) + eq_("19870710", pkg.get("marketing_start_date")) + eq_(None, pkg.get("marketing_end_date")) + eq_("NDA", pkg.get("marketing_category")) + eq_("NDA018781", pkg.get("application_number")) + eq_("Eli Lilly and Company", pkg.get("labeler_name")) + eq_([{"strength": "1 mL/mL", "name": "WATER"}], pkg.get("active_ingredients")) + eq_(None, pkg.get("strength")) + eq_(None, pkg.get("unit")) + eq_(None, pkg.get("pharm_class")) + eq_(None, pkg.get("dea_schedule")) + eq_("20191231", pkg.get("listing_expiration_date")) + ok_(pkg.get("finished")) + + # Verify packaging got properly attached. 
+ eq_(1, len(pkg["packaging"])) + eq_("0002-0800-01", pkg["packaging"][0]["package_ndc"]) + eq_( + "1 VIAL in 1 CARTON (0002-0800-01) > 10 mL in 1 VIAL", + pkg["packaging"][0]["description"], + ) + eq_("19870710", pkg["packaging"][0]["marketing_start_date"]) + eq_(None, pkg["packaging"][0].get("marketing_end_date")) + ok_(not pkg["packaging"][0]["sample"]) + + # Verify multiple packaging per product also work as expected + pkg = pickle.loads(data[1][1]) + eq_("0002-1200_4bd46cbe-cdc1-4329-a8e7-22816bd7fc33", pkg.get("product_id")) + + eq_(2, len(pkg["packaging"])) + eq_("0002-1200-30", pkg["packaging"][0]["package_ndc"]) + eq_( + "1 VIAL, MULTI-DOSE in 1 CAN (0002-1200-30) > 30 mL in 1 VIAL, MULTI-DOSE", + pkg["packaging"][0]["description"], + ) + eq_("20120601", pkg["packaging"][0]["marketing_start_date"]) + eq_("20190203", pkg["packaging"][0]["marketing_end_date"]) + ok_(pkg["packaging"][0]["sample"]) + eq_("0002-1200-50", pkg["packaging"][1]["package_ndc"]) + eq_( + "1 VIAL, MULTI-DOSE in 1 CAN (0002-1200-50) > 50 mL in 1 VIAL, MULTI-DOSE", + pkg["packaging"][1]["description"], + ) + eq_("20120601", pkg["packaging"][1]["marketing_start_date"]) + eq_(None, pkg["packaging"][1].get("marketing_end_date")) + ok_(not pkg["packaging"][1]["sample"]) + + # Verify unfinished drugs. + pkg = pickle.loads(data[3][1]) + eq_("0002-0485_6d8a935c-fc7b-44c6-92d1-2337b959d1ce", pkg.get("product_id")) + eq_("6d8a935c-fc7b-44c6-92d1-2337b959d1ce", pkg.get("spl_id")) + eq_("0002-0485", pkg.get("product_ndc")) + eq_("BULK INGREDIENT", pkg.get("product_type")) + eq_(None, pkg.get("brand_name")) + eq_(None, pkg.get("brand_name_base")) + eq_(None, pkg.get("brand_name_suffix")) + eq_("Insulin human", pkg.get("generic_name")) + eq_("CRYSTAL", pkg.get("dosage_form")) + eq_(None, pkg.get("route")) + eq_("20150401", pkg.get("marketing_start_date")) + eq_(None, pkg.get("marketing_end_date")) + eq_("BULK INGREDIENT", pkg.get("marketing_category")) + eq_(None, pkg.get("application_number")) + eq_("Eli Lilly and Company", pkg.get("labeler_name")) + eq_( + [{"strength": "1 g/g", "name": "INSULIN HUMAN"}], + pkg.get("active_ingredients"), + ) + eq_(None, pkg.get("strength")) + eq_(None, pkg.get("unit")) + eq_(None, pkg.get("pharm_class")) + eq_(None, pkg.get("dea_schedule")) + eq_("20181231", pkg.get("listing_expiration_date")) + ok_(not pkg.get("finished")) + + eq_(2, len(pkg["packaging"])) + eq_("0002-0485-04", pkg["packaging"][0]["package_ndc"]) + eq_( + "4 BOTTLE in 1 CONTAINER (0002-0485-04) > 2000 g in 1 BOTTLE", + pkg["packaging"][0]["description"], + ) + eq_(None, pkg["packaging"][0].get("marketing_start_date")) + eq_(None, pkg["packaging"][0].get("marketing_end_date")) + eq_(None, pkg["packaging"][0].get("sample")) + + eq_("0002-0485-18", pkg["packaging"][1]["package_ndc"]) + eq_( + "1 BOTTLE in 1 CONTAINER (0002-0485-18) > 2000 g in 1 BOTTLE", + pkg["packaging"][1]["description"], + ) + eq_(None, pkg["packaging"][1].get("marketing_start_date")) + eq_(None, pkg["packaging"][1].get("marketing_end_date")) + eq_(None, pkg["packaging"][1].get("sample")) + + def test_product_ndc_to_json(self): + os.mkdir(os.path.join(self.test_dir, "extracted")) + shutil.copyfile( + os.path.join(dirname(os.path.abspath(__file__)), "product_trimmed.txt"), + os.path.join(self.test_dir, "extracted/product.txt"), + ) + shutil.copyfile( + os.path.join( + dirname(os.path.abspath(__file__)), "unfinished_product_trimmed.txt" + ), + os.path.join(self.test_dir, "extracted/unfinished_product.txt"), + ) + + merge = MergeProductNDC() + merge.run() 
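As an aside, every test in this module reads the pipeline output the same way once the tasks have run: open the single LevelDB shard the task produced, iterate its key/value pairs, and unpickle each value, exactly as the lines below do. A compact, hypothetical helper capturing that idiom (the shard path is illustrative only):

import pickle
import leveldb  # same binding these tests use

def read_shard(path):
    # Yield every (key, record) pair stored in one mapreduce output shard.
    db = leveldb.LevelDB(path)
    for key, raw in db.RangeIter():
        yield key, pickle.loads(raw)

# Hypothetical usage once a task has written its single shard:
# records = list(read_shard("json/product.db/shard-00000-of-00001.db"))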
+ mergedPath = os.path.join(self.test_dir, "merge/product.csv") + + toJSON = NDCProduct2JSON() + toJSON.run() + + ldb = leveldb.LevelDB( + os.path.join(toJSON.output().path, "shard-00000-of-00001.db") + ) + data = list(ldb.RangeIter()) + + # Verify base logic. + pkg = pickle.loads(data[0][1]) + eq_("0002-0800_ab2a5006-1880-4b6c-9640-dc2ec1066449", pkg.get("product_id")) + eq_("ab2a5006-1880-4b6c-9640-dc2ec1066449", pkg.get("spl_id")) + eq_("0002-0800", pkg.get("product_ndc")) + eq_("HUMAN OTC DRUG", pkg.get("product_type")) + eq_("Sterile Diluent", pkg.get("brand_name")) + eq_("Sterile Diluent", pkg.get("brand_name_base")) + eq_(None, pkg.get("brand_name_suffix")) + eq_("diluent", pkg.get("generic_name")) + eq_("INJECTION, SOLUTION", pkg.get("dosage_form")) + eq_(["SUBCUTANEOUS"], pkg.get("route")) + eq_("19870710", pkg.get("marketing_start_date")) + eq_(None, pkg.get("marketing_end_date")) + eq_("NDA", pkg.get("marketing_category")) + eq_("NDA018781", pkg.get("application_number")) + eq_("Eli Lilly and Company", pkg.get("labeler_name")) + eq_([{"strength": "1 mL/mL", "name": "WATER"}], pkg.get("active_ingredients")) + eq_(None, pkg.get("strength")) + eq_(None, pkg.get("unit")) + eq_(None, pkg.get("pharm_class")) + eq_(None, pkg.get("dea_schedule")) + eq_("20191231", pkg.get("listing_expiration_date")) + ok_(pkg.get("finished")) + + # Verify some corner cases. + pkg = pickle.loads(data[1][1]) + eq_("0002-1200_4bd46cbe-cdc1-4329-a8e7-22816bd7fc33", pkg.get("product_id")) + eq_("Amyvid III", pkg.get("brand_name")) + eq_("Amyvid", pkg.get("brand_name_base")) + eq_("III", pkg.get("brand_name_suffix")) + eq_( + ["INTRALESIONAL", "INTRAMUSCULAR", "INTRASYNOVIAL", "SOFT TISSUE"], + pkg.get("route"), + ) + eq_( + ["Radioactive Diagnostic Agent [EPC]", "Positron Emitting Activity [MoA]"], + pkg.get("pharm_class"), + ) + eq_("DEA", pkg.get("dea_schedule")) + eq_("20180910", pkg.get("marketing_end_date")) + eq_( + [ + {"strength": "70 mg/1", "name": "ALENDRONATE SODIUM"}, + {"strength": "5600 [iU]/1", "name": "CHOLECALCIFEROL"}, + {"strength": "100", "name": "FLUORIDE"}, + {"name": "ZINC"}, + ], + pkg.get("active_ingredients"), + ) + + # Verify unfinished drugs. 
+ pkg = pickle.loads(data[5][1]) + eq_("0002-1401_70b2e81d-fb7e-48bb-b97d-7f2f26f18c41", pkg.get("product_id")) + eq_("70b2e81d-fb7e-48bb-b97d-7f2f26f18c41", pkg.get("spl_id")) + eq_("0002-1401", pkg.get("product_ndc")) + eq_("DRUG FOR FURTHER PROCESSING", pkg.get("product_type")) + eq_(None, pkg.get("brand_name")) + eq_(None, pkg.get("brand_name_base")) + eq_(None, pkg.get("brand_name_suffix")) + eq_("Dulaglutide", pkg.get("generic_name")) + eq_("INJECTION, SOLUTION", pkg.get("dosage_form")) + eq_(None, pkg.get("route")) + eq_("20140922", pkg.get("marketing_start_date")) + eq_(None, pkg.get("marketing_end_date")) + eq_("DRUG FOR FURTHER PROCESSING", pkg.get("marketing_category")) + eq_(None, pkg.get("application_number")) + eq_("Eli Lilly and Company", pkg.get("labeler_name")) + eq_( + [{"strength": ".75 mg/.5mL", "name": "DULAGLUTIDE"}], + pkg.get("active_ingredients"), + ) + eq_(None, pkg.get("strength")) + eq_(None, pkg.get("unit")) + eq_(None, pkg.get("pharm_class")) + eq_(None, pkg.get("dea_schedule")) + eq_("20181231", pkg.get("listing_expiration_date")) + ok_(not pkg.get("finished")) + + def test_package_ndc_to_json(self): + os.mkdir(os.path.join(self.test_dir, "extracted")) + shutil.copyfile( + os.path.join(dirname(os.path.abspath(__file__)), "package_trimmed.txt"), + os.path.join(self.test_dir, "extracted/package.txt"), + ) + shutil.copyfile( + os.path.join( + dirname(os.path.abspath(__file__)), "unfinished_package_trimmed.txt" + ), + os.path.join(self.test_dir, "extracted/unfinished_package.txt"), + ) + + merge = MergePackageNDC() + merge.run() + mergedPath = os.path.join(self.test_dir, "merge/package.csv") + + toJSON = NDCPackage2JSON() + toJSON.run() + + ldb = leveldb.LevelDB( + os.path.join(toJSON.output().path, "shard-00000-of-00001.db") + ) + data = list(ldb.RangeIter()) + pkg = pickle.loads(data[0][1]) + eq_("0002-0800_ab2a5006-1880-4b6c-9640-dc2ec1066449", pkg.get("product_id")) + eq_("0002-0800", pkg.get("product_ndc")) + eq_("0002-0800-01", pkg.get("package_ndc")) + eq_( + "1 VIAL in 1 CARTON (0002-0800-01) > 10 mL in 1 VIAL", + pkg.get("description"), + ) + eq_("19870710", pkg.get("marketing_start_date")) + eq_(None, pkg.get("marketing_end_date")) + eq_(False, pkg.get("sample")) + eq_(True, pkg.get("finished")) + + pkg = pickle.loads(data[1][1]) + eq_("20190203", pkg.get("marketing_end_date")) + eq_(True, pkg.get("sample")) + + pkg = pickle.loads(data[3][1]) + eq_("0002-0485_6d8a935c-fc7b-44c6-92d1-2337b959d1ce", pkg.get("product_id")) + eq_("0002-0485", pkg.get("product_ndc")) + eq_("0002-0485-04", pkg.get("package_ndc")) + eq_( + "4 BOTTLE in 1 CONTAINER (0002-0485-04) > 2000 g in 1 BOTTLE", + pkg.get("description"), + ) + eq_(None, pkg.get("marketing_start_date")) + eq_(None, pkg.get("marketing_end_date")) + eq_(None, pkg.get("sample")) + eq_(False, pkg.get("finished")) + + def test_merge_product_ndc(self): + os.mkdir(os.path.join(self.test_dir, "extracted")) + shutil.copyfile( + os.path.join(dirname(os.path.abspath(__file__)), "product.txt"), + os.path.join(self.test_dir, "extracted/product.txt"), + ) + shutil.copyfile( + os.path.join(dirname(os.path.abspath(__file__)), "unfinished_product.txt"), + os.path.join(self.test_dir, "extracted/unfinished_product.txt"), + ) + + merge = MergeProductNDC() + merge.run() + + mergedPath = os.path.join(self.test_dir, "merge/product.csv") + ok_(os.path.isfile(mergedPath)) + + dtype = { + "STARTMARKETINGDATE": np.unicode, + "ENDMARKETINGDATE": np.unicode, + "APPLICATIONNUMBER": np.unicode, + "ACTIVE_NUMERATOR_STRENGTH": 
np.unicode, + "LISTING_RECORD_CERTIFIED_THROUGH": np.unicode, + "PROPRIETARYNAMESUFFIX": np.unicode, + } + csv = pd.read_csv( + mergedPath, sep="\t", index_col=False, encoding="utf-8", dtype=dtype + ) + # Obviously, the merged file has to have the sum of lines in both source files. + ok_(len(csv.index) == 121856 + 21478) + + # Make sure every line from finished product.txt made it correctly into the merged CSV. + finished = pd.read_csv( + os.path.join(self.test_dir, "extracted/product.txt"), + sep="\t", + index_col=False, + encoding="utf-8", + dtype=dtype, + ) + truncated = csv.truncate(after=len(finished.index) - 1) + # Next line ensures all of the finished lines got Y in the FINISHED column indeed. + eq_(truncated["FINISHED"].unique(), ["Y"]) + + # Drop the last column, after which the two frames must match precisely. + truncated = truncated.drop(columns=["FINISHED"]) + ok_(finished.equals(truncated)) + + # Now, make sure every line from unfinished package.txt made it correctly into the merged CSV. + unfinished = pd.read_csv( + os.path.join(self.test_dir, "extracted/unfinished_product.txt"), + sep="\t", + index_col=False, + encoding="utf-8", + dtype=dtype, + ) + truncated = csv.truncate(before=len(finished.index)).reindex() + # Next line ensures all of the finished lines got N in the FINISHED column indeed. + eq_(truncated["FINISHED"].unique(), ["N"]) + + # Drop the last column, after which the two frames must match precisely. + truncated = truncated.drop( + columns=[ + "FINISHED", + "PROPRIETARYNAME", + "PROPRIETARYNAMESUFFIX", + "ROUTENAME", + "APPLICATIONNUMBER", + "PHARM_CLASSES", + "NDC_EXCLUDE_FLAG", + ] + ) + eq_(unfinished.to_dict(orient="list"), truncated.to_dict(orient="list")) + + def test_merge_package_ndc(self): + os.mkdir(os.path.join(self.test_dir, "extracted")) + shutil.copyfile( + os.path.join(dirname(os.path.abspath(__file__)), "package.txt"), + os.path.join(self.test_dir, "extracted/package.txt"), + ) + shutil.copyfile( + os.path.join(dirname(os.path.abspath(__file__)), "unfinished_package.txt"), + os.path.join(self.test_dir, "extracted/unfinished_package.txt"), + ) + + merge = MergePackageNDC() + merge.run() + + mergedPath = os.path.join(self.test_dir, "merge/package.csv") + ok_(os.path.isfile(mergedPath)) + + dtype = {"STARTMARKETINGDATE": np.unicode, "ENDMARKETINGDATE": np.unicode} + csv = pd.read_csv( + mergedPath, sep="\t", index_col=False, encoding="utf-8", dtype=dtype + ) + # Obviously, the merged file has to have the sum of lines in both source files. + ok_(len(csv.index) == 234309 + 35321) + + # Make sure every line from finished package.txt made it correctly into the merged CSV. + finished = pd.read_csv( + os.path.join(self.test_dir, "extracted/package.txt"), + sep="\t", + index_col=False, + encoding="utf-8", + dtype=dtype, + ) + truncated = csv.truncate(after=len(finished.index) - 1) + # Next line ensures all of the finished lines got Y in the FINISHED column indeed. + eq_(truncated["FINISHED"].unique(), ["Y"]) + + # Drop the last column, after which the two frames must match precisely. + truncated = truncated.drop(columns=["FINISHED"]) + ok_(finished.equals(truncated)) + + # Finally, test a random line manually. Just to make sure there isn't a CSV reading bug equally breaking both data sets. 
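The cell-level spot checks that follow rely on pandas label-based scalar access (DataFrame.at) plus math.isnan to detect cells that were empty in the source TSV. A small hedged sketch of that idiom; cell_matches and the two-row frame are purely illustrative and not part of the test fixtures:

import math

import pandas as pd


def cell_matches(frame, row, column, expected):
    # Empty TSV cells come back from read_csv as float NaN, which never compares
    # equal to anything, so treat NaN as "expected None"; everything else is a
    # plain equality check on the raw cell value.
    value = frame.at[row, column]
    if isinstance(value, float) and math.isnan(value):
        return expected is None
    return value == expected


# Illustrative only: a tiny frame standing in for the merged package CSV.
frame = pd.DataFrame({"NDCPACKAGECODE": ["0002-3235-60", float("nan")]})
assert cell_matches(frame, 0, "NDCPACKAGECODE", "0002-3235-60")
assert cell_matches(frame, 1, "NDCPACKAGECODE", None)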
+ ok_(csv.at[22, "PRODUCTID"] == "0002-3235_48f72160-f6f7-4890-8471-041da7cdd3a7") + ok_(csv.at[22, "PRODUCTNDC"] == "0002-3235") + ok_(csv.at[22, "NDCPACKAGECODE"] == "0002-3235-60") + ok_( + csv.at[22, "PACKAGEDESCRIPTION"] + == "60 CAPSULE, DELAYED RELEASE in 1 BOTTLE (0002-3235-60) " + ) + ok_(csv.at[22, "STARTMARKETINGDATE"] == "20040824") + ok_(math.isnan(csv.at[22, "ENDMARKETINGDATE"])) + ok_(csv.at[22, "NDC_EXCLUDE_FLAG"] == "N") + ok_(csv.at[22, "SAMPLE_PACKAGE"] == "N") + + # Now, make sure every line from unfinished package.txt made it correctly into the merged CSV. + unfinished = pd.read_csv( + os.path.join(self.test_dir, "extracted/unfinished_package.txt"), + sep="\t", + index_col=False, + encoding="utf-8", + dtype=dtype, + ) + truncated = csv.truncate(before=len(finished.index)).reindex() + # Next line ensures all of the finished lines got N in the FINISHED column indeed. + eq_(truncated["FINISHED"].unique(), ["N"]) + + # Drop the last column, after which the two frames must match precisely. + truncated = truncated.drop( + columns=[ + "FINISHED", + "STARTMARKETINGDATE", + "ENDMARKETINGDATE", + "NDC_EXCLUDE_FLAG", + "SAMPLE_PACKAGE", + ] + ) + eq_(unfinished.to_dict(orient="list"), truncated.to_dict(orient="list")) + + # Finally, test a random line manually. Just to make sure there isn't a CSV reading bug equally breaking both data sets. + ok_( + csv.at[234309, "PRODUCTID"] + == "0002-0485_6d8a935c-fc7b-44c6-92d1-2337b959d1ce" + ) + ok_(csv.at[234309, "PRODUCTNDC"] == "0002-0485") + ok_(csv.at[234309, "NDCPACKAGECODE"] == "0002-0485-04") + ok_( + csv.at[234309, "PACKAGEDESCRIPTION"] + == "4 BOTTLE in 1 CONTAINER (0002-0485-04) > 2000 g in 1 BOTTLE" + ) + ok_(math.isnan(csv.at[234309, "STARTMARKETINGDATE"])) + ok_(math.isnan(csv.at[234309, "ENDMARKETINGDATE"])) + ok_(math.isnan(csv.at[234309, "NDC_EXCLUDE_FLAG"])) + ok_(math.isnan(csv.at[234309, "SAMPLE_PACKAGE"])) + + def test_fix_utf8_issues(self): + testfile = os.path.join(self.test_dir, "badutf8.txt") + shutil.copyfile( + os.path.join(dirname(os.path.abspath(__file__)), "badutf8.txt"), testfile + ) + + extract = ExtractNDCFiles() + extract.fix_utf8_issues(testfile) + + file = open(testfile, "r") + data = file.read() + file.close() + ok_(data == 'This is a "bad" UTF-8 file.') + + def test_extract_ndc(self): + os.mkdir(os.path.join(self.test_dir, "raw")) + shutil.copyfile( + os.path.join(dirname(os.path.abspath(__file__)), "finished.zip"), + os.path.join(self.test_dir, "raw/finished.zip"), + ) + shutil.copyfile( + os.path.join(dirname(os.path.abspath(__file__)), "unfinished.zip"), + os.path.join(self.test_dir, "raw/unfinished.zip"), + ) + + extract = ExtractNDCFiles() + extract.run() + + eq_( + os.stat(os.path.join(self.test_dir, "extracted/package.txt")).st_size, + 31980050, + ) + eq_( + os.stat(os.path.join(self.test_dir, "extracted/product.txt")).st_size, + 41165614, + ) + eq_( + os.stat( + os.path.join(self.test_dir, "extracted/unfinished_package.txt") + ).st_size, + 3853326, + ) + eq_( + os.stat( + os.path.join(self.test_dir, "extracted/unfinished_product.txt") + ).st_size, + 4493918, + ) + + def test_download_ndc(self): + downloadNDC = DownloadNDCFiles() + downloadNDC.run() + ok_(os.path.isfile(os.path.join(self.test_dir, "raw/finished.zip"))) + ok_(os.path.isfile(os.path.join(self.test_dir, "raw/unfinished.zip"))) + ok_(os.stat(os.path.join(self.test_dir, "raw/finished.zip")).st_size > 0) + ok_(os.stat(os.path.join(self.test_dir, "raw/unfinished.zip")).st_size > 0) def main(argv): - unittest.main(argv=argv) + 
unittest.main(argv=argv) -if __name__ == '__main__': - unittest.main() +if __name__ == "__main__": + unittest.main() diff --git a/openfda/nsde/pipeline.py b/openfda/nsde/pipeline.py index 07ca0a0..69ffdbd 100644 --- a/openfda/nsde/pipeline.py +++ b/openfda/nsde/pipeline.py @@ -1,8 +1,8 @@ #!/usr/local/bin/python -''' +""" Pipeline for converting CSV nsde data to JSON and importing into Elasticsearch. -''' +""" import glob import os @@ -13,82 +13,86 @@ from openfda import common, config, parallel, index_util from openfda.common import oldest_file_timestamp -NSDE_DOWNLOAD = \ - 'https://download.open.fda.gov/Comprehensive_NDC_SPL_Data_Elements_File.zip' -NSDE_EXTRACT_DB = 'nsde/nsde.db' -NSDE_RAW_DIR = config.data_dir('nsde/raw') +NSDE_DOWNLOAD = ( + "https://download.open.fda.gov/Comprehensive_NDC_SPL_Data_Elements_File.zip" +) +NSDE_EXTRACT_DB = "nsde/nsde.db" +NSDE_RAW_DIR = config.data_dir("nsde/raw") + class DownloadNSDE(luigi.Task): - def output(self): - return luigi.LocalTarget(join(NSDE_RAW_DIR, 'nsde.csv')) + def output(self): + return luigi.LocalTarget(join(NSDE_RAW_DIR, "nsde.csv")) - def run(self): - output_dir = dirname(self.output().path) - zip_filename = join(output_dir, 'nsde.zip') - common.download(NSDE_DOWNLOAD, zip_filename) - os.system('unzip -o %(zip_filename)s -d %(output_dir)s' % locals()) - os.rename(glob.glob(join(output_dir, '*.csv'))[0], self.output().path) + def run(self): + output_dir = dirname(self.output().path) + zip_filename = join(output_dir, "nsde.zip") + common.download(NSDE_DOWNLOAD, zip_filename) + os.system("unzip -o %(zip_filename)s -d %(output_dir)s" % locals()) + os.rename(glob.glob(join(output_dir, "*.csv"))[0], self.output().path) class NSDE2JSONMapper(parallel.Mapper): - rename_map = { - "Item Code": "package_ndc", - "NDC11": "package_ndc11", - "Marketing Category": "marketing_category", - "Marketing Start Date": "marketing_start_date", - "Marketing End Date": "marketing_end_date", - "Billing Unit": "billing_unit", - "Proprietary Name": "proprietary_name", - "Dosage Form": "dosage_form", - "Application Number or Citation": "application_number_or_citation", - "Product Type": "product_type", - "Inactivation Date": "inactivation_date", - "Reactivation Date": "reactivation_date" - } - - def map(self, key, value, output): - def _cleaner(k, v): - ''' Helper function to rename keys and purge any keys that are not in - the map. - ''' - - if k in self.rename_map and v is not None and v != '': - if "Date" in k: - return (self.rename_map[k], str(int(v))) - if "Proprietary Name" in k: - return (self.rename_map[k], str(v).title()) - else: - return (self.rename_map[k], v) - - new_value = common.transform_dict(value, _cleaner) - output.add(key, new_value) + rename_map = { + "Item Code": "package_ndc", + "NDC11": "package_ndc11", + "Marketing Category": "marketing_category", + "Marketing Start Date": "marketing_start_date", + "Marketing End Date": "marketing_end_date", + "Billing Unit": "billing_unit", + "Proprietary Name": "proprietary_name", + "Dosage Form": "dosage_form", + "Application Number or Citation": "application_number_or_citation", + "Product Type": "product_type", + "Inactivation Date": "inactivation_date", + "Reactivation Date": "reactivation_date", + } + + def map(self, key, value, output): + def _cleaner(k, v): + """Helper function to rename keys and purge any keys that are not in + the map. 
+ """ + + if k in self.rename_map and v is not None and v != "": + if "Date" in k: + return (self.rename_map[k], str(int(v))) + if "Proprietary Name" in k: + return (self.rename_map[k], str(v).title()) + else: + return (self.rename_map[k], v) + + new_value = common.transform_dict(value, _cleaner) + output.add(key, new_value) class NSDE2JSON(luigi.Task): - def requires(self): - return DownloadNSDE() + def requires(self): + return DownloadNSDE() - def output(self): - return luigi.LocalTarget(config.data_dir(NSDE_EXTRACT_DB)) + def output(self): + return luigi.LocalTarget(config.data_dir(NSDE_EXTRACT_DB)) - def run(self): - parallel.mapreduce( - parallel.Collection.from_glob( - self.input().path, parallel.CSVDictLineInput()), - mapper=NSDE2JSONMapper(), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path) + def run(self): + parallel.mapreduce( + parallel.Collection.from_glob( + self.input().path, parallel.CSVDictLineInput() + ), + mapper=NSDE2JSONMapper(), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + ) class LoadJSON(index_util.LoadJSONBase): - index_name = 'othernsde' - mapping_file = './schemas/othernsde_mapping.json' - data_source = NSDE2JSON() - use_checksum = False - optimize_index = True - last_update_date = lambda _: oldest_file_timestamp(NSDE_RAW_DIR) + index_name = "othernsde" + mapping_file = "./schemas/othernsde_mapping.json" + data_source = NSDE2JSON() + use_checksum = False + optimize_index = True + last_update_date = lambda _: oldest_file_timestamp(NSDE_RAW_DIR) -if __name__ == '__main__': - luigi.run() +if __name__ == "__main__": + luigi.run() diff --git a/openfda/parallel/__init__.py b/openfda/parallel/__init__.py index 0cd212e..6436c7e 100644 --- a/openfda/parallel/__init__.py +++ b/openfda/parallel/__init__.py @@ -1,7 +1,27 @@ from .mapreduce import Collection, mapreduce, MRException from .mapper import Mapper, IdentityMapper -from .reducer import Reducer, IdentityReducer, ListReducer, NullReducer, SumReducer, PivotReducer, pivot_values +from .reducer import ( + Reducer, + IdentityReducer, + ListReducer, + NullReducer, + SumReducer, + PivotReducer, + pivot_values, +) from .outputs import MROutput, LevelDBOutput, JSONLineOutput, JSONOutput, NullOutput -from .inputs import MRInput, CSVDictLineInput, CSVLineInput, LevelDBInput, LineInput, CSVSplitLineInput, JSONInput, JSONListInput, JSONLineInput, FilenameInput, XMLDictInput +from .inputs import ( + MRInput, + CSVDictLineInput, + CSVLineInput, + LevelDBInput, + LineInput, + CSVSplitLineInput, + JSONInput, + JSONListInput, + JSONLineInput, + FilenameInput, + XMLDictInput, +) from .sharded_db import ShardedDB from .luigi_support import MRTask diff --git a/openfda/parallel/inputs.py b/openfda/parallel/inputs.py index aa663d0..6a50b35 100644 --- a/openfda/parallel/inputs.py +++ b/openfda/parallel/inputs.py @@ -1,7 +1,8 @@ -''' A set of input classes used by the mapreduce() function to parse, split and +""" A set of input classes used by the mapreduce() function to parse, split and stream data to Mapper. The input types are built out organically as needed by the different data source on the openFDA project: XML, CSV, and JSON. -''' +""" + import csv import io import logging @@ -12,257 +13,291 @@ import simplejson as json import xmltodict -logger = logging.getLogger('mapreduce') +logger = logging.getLogger("mapreduce") + class FileSplit(object): - ''' - A split represents a subset (potentially all) of an input file. 
+ """ + A split represents a subset (potentially all) of an input file. + + Splits allow for processing files that can be read by multiple processes + simultaneously (e.g. lines from a text input). - Splits allow for processing files that can be read by multiple processes - simultaneously (e.g. lines from a text input). + The start_pos and end_pos are interpreted by the input format (for example + they could be line numbers for a line oriented format, or start/stop keys + for a key-value format). + """ - The start_pos and end_pos are interpreted by the input format (for example - they could be line numbers for a line oriented format, or start/stop keys - for a key-value format). - ''' - def __init__(self, filename, mr_input, start_pos, end_pos): - self.filename = filename - self.mr_input = mr_input - self.start_pos = start_pos - self.end_pos = end_pos + def __init__(self, filename, mr_input, start_pos, end_pos): + self.filename = filename + self.mr_input = mr_input + self.start_pos = start_pos + self.end_pos = end_pos - def __repr__(self): - return 'Split(%s -- %s:%s)' % (self.filename, self.start_pos, self.end_pos) + def __repr__(self): + return "Split(%s -- %s:%s)" % (self.filename, self.start_pos, self.end_pos) class MRInput(object): - class Reader(object): - def __init__(self, split, **kw): - self.filename = split.filename - self.start_pos = split.start_pos - self.end_pos = split.end_pos - self.split = split + class Reader(object): + def __init__(self, split, **kw): + self.filename = split.filename + self.start_pos = split.start_pos + self.end_pos = split.end_pos + self.split = split - for k, v in kw.items(): - setattr(self, k, v) + for k, v in kw.items(): + setattr(self, k, v) - def __repr__(self): - return 'Reader(%s)' % self.filename + def __repr__(self): + return "Reader(%s)" % self.filename + + def __iter__(self): + for count, (k, v) in enumerate(self.entries()): + yield k, v - def __iter__(self): - for count, (k, v) in enumerate(self.entries()): - yield k, v + def compute_splits(self, filename, desired_splits): + """ + The default behavior for splitting files is to have one split per file. + """ + return [FileSplit(filename, self, 0, -1)] - def compute_splits(self, filename, desired_splits): - ''' - The default behavior for splitting files is to have one split per file. - ''' - return [FileSplit(filename, self, 0, -1)] + def create_reader(self, split): + return self.__class__.Reader(split) - def create_reader(self, split): - return self.__class__.Reader(split) class XMLDictInput(MRInput): - ''' Simple class for set XML records and streaming dictionaries. - This input reads all of the records into memory; care should be taken - when using it with large inputs. - ''' - def __init__(self, depth=1): - MRInput.__init__(self) - self.depth = depth + """Simple class for set XML records and streaming dictionaries. + This input reads all of the records into memory; care should be taken + when using it with large inputs. 
+ """ - def create_reader(self, split): - return XMLDictInput.Reader(split, depth=self.depth) + def __init__(self, depth=1): + MRInput.__init__(self) + self.depth = depth - class Reader(MRInput.Reader): - def entries(self): - _item = [] - def _handler(_, ord_dict): - _item.append(json.loads(json.dumps(ord_dict))) - return True + def create_reader(self, split): + return XMLDictInput.Reader(split, depth=self.depth) - xml_file = open(self.filename).read() - xmltodict.parse(xml_file, item_depth=self.depth, item_callback=_handler) + class Reader(MRInput.Reader): + def entries(self): + _item = [] - for idx, line in enumerate(_item): - yield str(idx), line + def _handler(_, ord_dict): + _item.append(json.loads(json.dumps(ord_dict))) + return True + + xml_file = open(self.filename).read() + xmltodict.parse(xml_file, item_depth=self.depth, item_callback=_handler) + + for idx, line in enumerate(_item): + yield str(idx), line class LineInput(MRInput): - def compute_splits(self, filename, desired_splits): - ''' - The default behavior for splitting files is to have one split per file. - ''' - file_len = os.path.getsize(filename) - split_size = max(1, int(file_len / desired_splits)) - splits = [] - last_pos = 0 - - with open(filename, 'rb') as f: - while f.tell() < file_len: - f.seek(split_size, 1) - f.readline() - splits.append( - FileSplit(filename, self, last_pos, f.tell())) - last_pos = f.tell() - - return splits - - class Reader(MRInput.Reader): - def entries(self): - f = open(self.filename) - f.seek(self.start_pos) - while f.tell() < self.end_pos: - line = f.readline() - if not line: - return - - yield str(f.tell()), line.rstrip() + def compute_splits(self, filename, desired_splits): + """ + The default behavior for splitting files is to have one split per file. + """ + file_len = os.path.getsize(filename) + split_size = max(1, int(file_len / desired_splits)) + splits = [] + last_pos = 0 + + with open(filename, "rb") as f: + while f.tell() < file_len: + f.seek(split_size, 1) + f.readline() + splits.append(FileSplit(filename, self, last_pos, f.tell())) + last_pos = f.tell() + + return splits + + class Reader(MRInput.Reader): + def entries(self): + f = open(self.filename) + f.seek(self.start_pos) + while f.tell() < self.end_pos: + line = f.readline() + if not line: + return + + yield str(f.tell()), line.rstrip() class CSVSplitLineInput(MRInput): - def __init__(self, delimiter, quoting, fixed_splits=None): - MRInput.__init__(self) - self.delimiter = delimiter - self.quoting = quoting - self.fixed_splits = fixed_splits - - def create_reader(self, split): - return CSVSplitLineInput.Reader(split, delimiter=self.delimiter, quoting=self.quoting) - - def compute_splits(self, filename, desired_splits): - ''' - The default behavior for splitting files is to have one split per file. 
- ''' - file_len = os.path.getsize(filename) - split_size = max(1, int(file_len / (desired_splits if self.fixed_splits is None else self.fixed_splits))) - splits = [] - last_pos = 0 - - logger.info('Calculating splits for %s each sized %s' % (filename, split_size)) - with open(filename, 'rb') as f: - while f.tell() < file_len: - f.seek(split_size, 1) - f.readline() - splits.append( - FileSplit(filename, self, last_pos, f.tell() if f.tell() < file_len else file_len)) - last_pos = f.tell() - - logger.info('Got %s splits for %s:' % (len(splits), filename)) - logger.info(splits) - return splits - - class Reader(MRInput.Reader): - def entries(self): - f = open(self.filename) - f.seek(self.start_pos) - while f.tell() < self.end_pos: - line = f.readline() - if not line: - f.close() - return - yield str(f.tell()), csv.reader(io.StringIO(line), delimiter=self.delimiter, quoting=self.quoting).__next__() + def __init__(self, delimiter, quoting, fixed_splits=None): + MRInput.__init__(self) + self.delimiter = delimiter + self.quoting = quoting + self.fixed_splits = fixed_splits + + def create_reader(self, split): + return CSVSplitLineInput.Reader( + split, delimiter=self.delimiter, quoting=self.quoting + ) + + def compute_splits(self, filename, desired_splits): + """ + The default behavior for splitting files is to have one split per file. + """ + file_len = os.path.getsize(filename) + split_size = max( + 1, + int( + file_len + / (desired_splits if self.fixed_splits is None else self.fixed_splits) + ), + ) + splits = [] + last_pos = 0 + + logger.info("Calculating splits for %s each sized %s" % (filename, split_size)) + with open(filename, "rb") as f: + while f.tell() < file_len: + f.seek(split_size, 1) + f.readline() + splits.append( + FileSplit( + filename, + self, + last_pos, + f.tell() if f.tell() < file_len else file_len, + ) + ) + last_pos = f.tell() + + logger.info("Got %s splits for %s:" % (len(splits), filename)) + logger.info(splits) + return splits + + class Reader(MRInput.Reader): + def entries(self): + f = open(self.filename) + f.seek(self.start_pos) + while f.tell() < self.end_pos: + line = f.readline() + if not line: + f.close() + return + yield str(f.tell()), csv.reader( + io.StringIO(line), delimiter=self.delimiter, quoting=self.quoting + ).__next__() class FilenameInput(MRInput): - class Reader(MRInput.Reader): - def entries(self): - yield (self.filename, '') + class Reader(MRInput.Reader): + def entries(self): + yield (self.filename, "") class JSONInput(MRInput): - class Reader(MRInput.Reader): - def entries(self): - with open(self.filename, 'r') as input_file: - data = json.load(input_file) - for k, v in iter(data.items()): - yield k, v + class Reader(MRInput.Reader): + def entries(self): + with open(self.filename, "r") as input_file: + data = json.load(input_file) + for k, v in iter(data.items()): + yield k, v + class JSONLineInput(MRInput): - class Reader(MRInput.Reader): - def entries(self): - for idx, line in enumerate(open(self.filename, encoding='utf-8', errors='ignore')): - yield str(idx), json.loads(line) + class Reader(MRInput.Reader): + def entries(self): + for idx, line in enumerate( + open(self.filename, encoding="utf-8", errors="ignore") + ): + yield str(idx), json.loads(line) + class JSONListInput(MRInput): - class Reader(MRInput.Reader): - def entries(self): - with open(self.filename, 'r') as input_file: - data = json.load(input_file) - for item in data: - for k, v in iter(item.items()): - yield k, v + class Reader(MRInput.Reader): + def entries(self): + with 
open(self.filename, "r") as input_file: + data = json.load(input_file) + for item in data: + for k, v in iter(item.items()): + yield k, v + # TODO(hansnelsen): convert all the input parameters to a inputdict_args for # readability and simplicity class CSVDictLineInput(MRInput): - def __init__(self, - delimiter=',', - quoting=csv.QUOTE_MINIMAL, - fieldnames=None, - escapechar=None, - strip_str=None): - MRInput.__init__(self) - self.delimiter = delimiter - self.quoting = quoting - self.fieldnames = fieldnames - self.escapechar = escapechar - self.strip_str = strip_str - - def create_reader(self, split): - return CSVDictLineInput.Reader(split, - delimiter=self.delimiter, - quoting=self.quoting, - fieldnames=self.fieldnames, - escapechar=self.escapechar, - strip_str=self.strip_str) - - class Reader(MRInput.Reader): - def entries(self): - fh = open(self.filename, 'rU', encoding='utf-8', errors='ignore') - if self.strip_str: - # Some source files have null bytes that need to be stripped first - fh = (s.replace(self.strip_str, '') for s in fh) - - dict_reader = csv.DictReader(fh, - delimiter=self.delimiter, - fieldnames=self.fieldnames, - quoting=self.quoting, - escapechar=self.escapechar) - - for idx, line in enumerate(dict_reader): - yield str(idx), line + def __init__( + self, + delimiter=",", + quoting=csv.QUOTE_MINIMAL, + fieldnames=None, + escapechar=None, + strip_str=None, + ): + MRInput.__init__(self) + self.delimiter = delimiter + self.quoting = quoting + self.fieldnames = fieldnames + self.escapechar = escapechar + self.strip_str = strip_str + + def create_reader(self, split): + return CSVDictLineInput.Reader( + split, + delimiter=self.delimiter, + quoting=self.quoting, + fieldnames=self.fieldnames, + escapechar=self.escapechar, + strip_str=self.strip_str, + ) + + class Reader(MRInput.Reader): + def entries(self): + fh = open(self.filename, "rU", encoding="utf-8", errors="ignore") + if self.strip_str: + # Some source files have null bytes that need to be stripped first + fh = (s.replace(self.strip_str, "") for s in fh) + + dict_reader = csv.DictReader( + fh, + delimiter=self.delimiter, + fieldnames=self.fieldnames, + quoting=self.quoting, + escapechar=self.escapechar, + ) + + for idx, line in enumerate(dict_reader): + yield str(idx), line class CSVLineInput(MRInput): - ''' A plain csv reader for when there are no headers in the source files. - CSVDictLineInput should be used in cases where there are headers to - convert directly into a dictionary. - ''' - def __init__(self, delimiter, quoting): - MRInput.__init__(self) - self.delimiter = delimiter - self.quoting = quoting + """A plain csv reader for when there are no headers in the source files. + CSVDictLineInput should be used in cases where there are headers to + convert directly into a dictionary. 
+ """ + + def __init__(self, delimiter, quoting): + MRInput.__init__(self) + self.delimiter = delimiter + self.quoting = quoting - def create_reader(self, split): - return CSVLineInput.Reader(split, delimiter=self.delimiter, quoting=self.quoting) + def create_reader(self, split): + return CSVLineInput.Reader( + split, delimiter=self.delimiter, quoting=self.quoting + ) - class Reader(MRInput.Reader): - def entries(self): - fh = open(self.filename) - reader = csv.reader(fh, delimiter=self.delimiter, quoting=self.quoting) + class Reader(MRInput.Reader): + def entries(self): + fh = open(self.filename) + reader = csv.reader(fh, delimiter=self.delimiter, quoting=self.quoting) - for idx, line in enumerate(reader): - yield str(idx), line + for idx, line in enumerate(reader): + yield str(idx), line class LevelDBInput(MRInput): - class Reader(MRInput.Reader): - def __init__(self, split): - MRInput.Reader.__init__(self, split) - self.db = leveldb.LevelDB(self.filename) - - def entries(self): - for key, value in self.db.RangeIter(): - yield key.decode("utf-8"), pickle.loads(value) + class Reader(MRInput.Reader): + def __init__(self, split): + MRInput.Reader.__init__(self, split) + self.db = leveldb.LevelDB(self.filename) + + def entries(self): + for key, value in self.db.RangeIter(): + yield key.decode("utf-8"), pickle.loads(value) diff --git a/openfda/parallel/luigi_support.py b/openfda/parallel/luigi_support.py index 253a4ce..72f410f 100644 --- a/openfda/parallel/luigi_support.py +++ b/openfda/parallel/luigi_support.py @@ -4,35 +4,37 @@ from .reducer import Reducer from .outputs import LevelDBOutput + class MRTask(luigi.Task, Mapper, Reducer): - def map(self, key, value, output): - output.add(key, value) - - def reduce(self, key, values, output): - for value in values: - output.put(key, value) - - def mapreduce_inputs(self): - inputs = self.input() - if not isinstance(inputs, list): inputs = [inputs] - return Collection.from_list([input.path for input in inputs]) - - def num_shards(self): - return None - - def map_workers(self): - return None - - def output_format(self): - return LevelDBOutput() - - def run(self): - mapreduce( - self.mapreduce_inputs(), - mapper=self, - reducer=self, - output_prefix=self.output().path, - output_format=self.output_format(), - num_shards=self.num_shards(), - map_workers=self.map_workers() - ) + def map(self, key, value, output): + output.add(key, value) + + def reduce(self, key, values, output): + for value in values: + output.put(key, value) + + def mapreduce_inputs(self): + inputs = self.input() + if not isinstance(inputs, list): + inputs = [inputs] + return Collection.from_list([input.path for input in inputs]) + + def num_shards(self): + return None + + def map_workers(self): + return None + + def output_format(self): + return LevelDBOutput() + + def run(self): + mapreduce( + self.mapreduce_inputs(), + mapper=self, + reducer=self, + output_prefix=self.output().path, + output_format=self.output_format(), + num_shards=self.num_shards(), + map_workers=self.map_workers(), + ) diff --git a/openfda/parallel/mapper.py b/openfda/parallel/mapper.py index f2f2492..8ff10d1 100644 --- a/openfda/parallel/mapper.py +++ b/openfda/parallel/mapper.py @@ -1,23 +1,25 @@ import logging -logger = logging.getLogger('mapreduce') +logger = logging.getLogger("mapreduce") + class Mapper(object): - def map(self, key, value, output): - raise NotImplementedError + def map(self, key, value, output): + raise NotImplementedError + + def map_shard(self, map_input, map_output): + self.filename = 
map_input.filename + self.map_input = map_input + self.map_output = map_output - def map_shard(self, map_input, map_output): - self.filename = map_input.filename - self.map_input = map_input - self.map_output = map_output + # logger.info('Starting mapper: input=%s', map_input) + mapper = self.map + for idx, (key, value) in enumerate(map_input): + # if idx % 1000 == 0: + # logger.info('Mapping records=%d input=%s key=%s', idx, map_input, key) + mapper(key, value, map_output) - # logger.info('Starting mapper: input=%s', map_input) - mapper = self.map - for idx, (key, value) in enumerate(map_input): - # if idx % 1000 == 0: - # logger.info('Mapping records=%d input=%s key=%s', idx, map_input, key) - mapper(key, value, map_output) class IdentityMapper(Mapper): - def map(self, key, value, output): - output.add(key, value) + def map(self, key, value, output): + output.add(key, value) diff --git a/openfda/parallel/mapreduce.py b/openfda/parallel/mapreduce.py index 246fc0a..f2b2c22 100644 --- a/openfda/parallel/mapreduce.py +++ b/openfda/parallel/mapreduce.py @@ -1,11 +1,11 @@ #!/usr/bin/python -''' +""" Parallel processing utilities. The main function of interest is `map_reduce`, which provides a local version of mapreduce. -''' +""" import collections import glob @@ -27,251 +27,292 @@ from .outputs import LevelDBOutput from .reducer import Reducer -logger = logging.getLogger('mapreduce') +logger = logging.getLogger("mapreduce") + class WrappedException(Exception): - def __init__(self): - self.exception_message = traceback.format_exc() + def __init__(self): + self.exception_message = traceback.format_exc() class MRException(Exception): - def __init__(self, msg, child_message=None): - Exception.__init__(self, msg) - self._child_message = child_message + def __init__(self, msg, child_message=None): + Exception.__init__(self, msg) + self._child_message = child_message - def __str__(self): - return '%s. Child exception was:\n-- %s' % ( - self.message, self._child_message.replace('\n', '\n-- ')) + def __str__(self): + return "%s. Child exception was:\n-- %s" % ( + self.message, + self._child_message.replace("\n", "\n-- "), + ) class _MapperOutput(object): - ''' - A _MapperOutput object is passed to the user mapper. + """ + A _MapperOutput object is passed to the user mapper. + + When the map output is added, it is pushed onto a reducer queue based on + the hash of the map key. + """ - When the map output is added, it is pushed onto a reducer queue based on - the hash of the map key. - ''' - def __init__(self, shuffle_queues): - self.shuffle_queues = shuffle_queues + def __init__(self, shuffle_queues): + self.shuffle_queues = shuffle_queues - def add(self, k, v): - queue = self.shuffle_queues[zlib.adler32(k.encode()) % len(self.shuffle_queues)] - queue.put((k, pickle.dumps(v, -1))) + def add(self, k, v): + queue = self.shuffle_queues[zlib.adler32(k.encode()) % len(self.shuffle_queues)] + queue.put((k, pickle.dumps(v, -1))) class Collection(object): - ''' - Represents an input to mapreduce: a set of files and an input reader - that knows how to process them. 
- ''' - def __init__(self, filenames_or_glob, mr_input): - if not isinstance(filenames_or_glob, list): - filenames = glob.glob(filenames_or_glob) - else: - filenames = filenames_or_glob - - assert isinstance(mr_input, MRInput), mr_input - self.mr_input = mr_input - self.splits = [] - for f in filenames: - assert os.path.exists(f), 'Missing input: %s' % f - self.splits.extend( - mr_input.compute_splits(f, desired_splits=multiprocessing.cpu_count()) - ) - - def __repr__(self): - return 'Collection(%d items)' % len(self.splits) - - @staticmethod - def from_list(list, mr_input=FilenameInput()): - return Collection(list, mr_input) - - @staticmethod - def from_glob(glob, mr_input=FilenameInput()): - return Collection(glob, mr_input) - - @staticmethod - def from_sharded(prefix, mr_input=LevelDBInput()): - return Collection(prefix + '/*-of-*', mr_input) - - @staticmethod - def from_sharded_list(shard_list, mr_input=LevelDBInput()): - files = [] - for db in shard_list: - files += glob.glob(db + '/*-of-*') - return Collection(files, mr_input) - - def __iter__(self): - for split in self.splits: - yield split + """ + Represents an input to mapreduce: a set of files and an input reader + that knows how to process them. + """ + + def __init__(self, filenames_or_glob, mr_input): + if not isinstance(filenames_or_glob, list): + filenames = glob.glob(filenames_or_glob) + else: + filenames = filenames_or_glob + + assert isinstance(mr_input, MRInput), mr_input + self.mr_input = mr_input + self.splits = [] + for f in filenames: + assert os.path.exists(f), "Missing input: %s" % f + self.splits.extend( + mr_input.compute_splits(f, desired_splits=multiprocessing.cpu_count()) + ) + + def __repr__(self): + return "Collection(%d items)" % len(self.splits) + + @staticmethod + def from_list(list, mr_input=FilenameInput()): + return Collection(list, mr_input) + + @staticmethod + def from_glob(glob, mr_input=FilenameInput()): + return Collection(glob, mr_input) + + @staticmethod + def from_sharded(prefix, mr_input=LevelDBInput()): + return Collection(prefix + "/*-of-*", mr_input) + + @staticmethod + def from_sharded_list(shard_list, mr_input=LevelDBInput()): + files = [] + for db in shard_list: + files += glob.glob(db + "/*-of-*") + return Collection(files, mr_input) + + def __iter__(self): + for split in self.splits: + yield split class Counters(object): - def __init__(self, master): - self.master = master - self._local_counts = collections.defaultdict(int) + def __init__(self, master): + self.master = master + self._local_counts = collections.defaultdict(int) - def increment(self, key): - self._local_counts[key] += 1 + def increment(self, key): + self._local_counts[key] += 1 - def add(self, key, value): - self._local_counts[key] += value + def add(self, key, value): + self._local_counts[key] += value def _run_mapper(mapper, shuffle_queues, split): - with watchdog.Watchdog(): - try: - setproctitle('python -- Mapper.%s' % split) - map_output = _MapperOutput(shuffle_queues) - map_input = split.mr_input.create_reader(split) - return mapper.map_shard(map_input, map_output) - except: - print('Error running mapper (%s, %s)' % (mapper, split), file=sys.stderr) - return WrappedException() + with watchdog.Watchdog(): + try: + setproctitle("python -- Mapper.%s" % split) + map_output = _MapperOutput(shuffle_queues) + map_input = split.mr_input.create_reader(split) + return mapper.map_shard(map_input, map_output) + except: + print("Error running mapper (%s, %s)" % (mapper, split), file=sys.stderr) + return WrappedException() def 
_run_reducer(reducer, queue, tmp_prefix, output_format, output_tmp, i, num_shards): - with watchdog.Watchdog(): - try: - setproctitle('python -- Reducer.%s-of-%s' % (i, num_shards)) - reducer.initialize(queue, tmp_prefix, output_format, output_tmp, i, num_shards) - reducer.shuffle() - return reducer.reduce_finished() - except: - print('Error running reducer (%s)' % reducer, file=sys.stderr) - traceback.print_exc() - return WrappedException() + with watchdog.Watchdog(): + try: + setproctitle("python -- Reducer.%s-of-%s" % (i, num_shards)) + reducer.initialize( + queue, tmp_prefix, output_format, output_tmp, i, num_shards + ) + reducer.shuffle() + return reducer.reduce_finished() + except: + print("Error running reducer (%s)" % reducer, file=sys.stderr) + traceback.print_exc() + return WrappedException() def mapreduce( - inputs, - mapper, - reducer, - output_prefix, - output_format=LevelDBOutput(), - num_shards=None, - map_workers=None): - - if num_shards is None: - num_shards = output_format.recommended_shards() - - if map_workers is None: - map_workers = multiprocessing.cpu_count() - - # Managers "manage" the queues used by the mapper and reducers to communicate - # A single manager ends up being a processing bottleneck; hence we shard the - # queues across a number of managers. - managers = [multiprocessing.Manager() for i in range(int(max(map_workers, num_shards)))] - shuffle_queues = [managers[i % len(managers)].Queue(1000) for i in range(int(num_shards))] - - mapper_pool = multiprocessing.Pool(processes=map_workers) - reducer_pool = multiprocessing.Pool(processes=num_shards) - - assert isinstance(mapper, Mapper) - assert isinstance(reducer, Reducer) - - if not isinstance(inputs, list): - inputs = [inputs] - - logger.info('Starting MapReduce: %s -> %s@%s, M: %s, R: %s', - inputs, output_prefix, num_shards, mapper.__class__.__name__, reducer.__class__.__name__) - - # output data is initially generated in a temporary output directory. - # it is moved to it's final location when all reducers finish successfully. - tmp_prefix = output_prefix + '-mapreduce-tmp-%s' % arrow.utcnow().format('YYYY-MM-DD-hh-mm') - output_tmp = output_prefix + '-mapreduce-output-%s' % arrow.utcnow().format('YYYY-MM-DD-hh-mm') - output_final = output_prefix - - try: - os.system('mkdir -p %s' % os.path.dirname(tmp_prefix)) - - mapper_tasks = [] - for input_collection in inputs: - assert isinstance(input_collection, Collection) - for split in input_collection: - mapper_tasks.append( - mapper_pool.apply_async( - _run_mapper, - args=(mapper, shuffle_queues, split))) - - reducer_tasks = [] - for i in range(num_shards): - reducer_tasks.append(reducer_pool.apply_async( - _run_reducer, - args=(reducer, shuffle_queues[i], tmp_prefix, output_format, output_tmp, i, num_shards))) - - map_results = {} - last_log_time = time.time() - while len(map_results) < len(mapper_tasks): - for i, task in enumerate(mapper_tasks): - if i in map_results or not task.ready(): - continue - - result = task.get() - map_results[i] = result - if isinstance(result, WrappedException): - print("Exception mapper tasks here??") - print(result.exception_message) - - logger.error('Caught exception during mapper execution.') - logger.error('%s', result.exception_message) - raise MRException('Mapper %d failed.' 
% i, - child_message=result.exception_message) - # else: - # logger.info('Finished mapper: %d', i) - time.sleep(0.1) - - if time.time() - last_log_time > 5: + inputs, + mapper, + reducer, + output_prefix, + output_format=LevelDBOutput(), + num_shards=None, + map_workers=None, +): + + if num_shards is None: + num_shards = output_format.recommended_shards() + + if map_workers is None: + map_workers = multiprocessing.cpu_count() + + # Managers "manage" the queues used by the mapper and reducers to communicate + # A single manager ends up being a processing bottleneck; hence we shard the + # queues across a number of managers. + managers = [ + multiprocessing.Manager() for i in range(int(max(map_workers, num_shards))) + ] + shuffle_queues = [ + managers[i % len(managers)].Queue(1000) for i in range(int(num_shards)) + ] + + mapper_pool = multiprocessing.Pool(processes=map_workers) + reducer_pool = multiprocessing.Pool(processes=num_shards) + + assert isinstance(mapper, Mapper) + assert isinstance(reducer, Reducer) + + if not isinstance(inputs, list): + inputs = [inputs] + + logger.info( + "Starting MapReduce: %s -> %s@%s, M: %s, R: %s", + inputs, + output_prefix, + num_shards, + mapper.__class__.__name__, + reducer.__class__.__name__, + ) + + # output data is initially generated in a temporary output directory. + # it is moved to it's final location when all reducers finish successfully. + tmp_prefix = output_prefix + "-mapreduce-tmp-%s" % arrow.utcnow().format( + "YYYY-MM-DD-hh-mm" + ) + output_tmp = output_prefix + "-mapreduce-output-%s" % arrow.utcnow().format( + "YYYY-MM-DD-hh-mm" + ) + output_final = output_prefix + + try: + os.system("mkdir -p %s" % os.path.dirname(tmp_prefix)) + + mapper_tasks = [] + for input_collection in inputs: + assert isinstance(input_collection, Collection) + for split in input_collection: + mapper_tasks.append( + mapper_pool.apply_async( + _run_mapper, args=(mapper, shuffle_queues, split) + ) + ) + + reducer_tasks = [] + for i in range(num_shards): + reducer_tasks.append( + reducer_pool.apply_async( + _run_reducer, + args=( + reducer, + shuffle_queues[i], + tmp_prefix, + output_format, + output_tmp, + i, + num_shards, + ), + ) + ) + + map_results = {} last_log_time = time.time() - # print 'Waiting for mappers: %d/%d' % (len(map_results), len(mapper_tasks)), '\r', - - # logger.info('All mappers finished.') - - # flush shuffle queues - for q in shuffle_queues: - q.put(None) - - reduce_outputs = [] - for i, task in enumerate(reducer_tasks): - # print 'Waiting for reducers... %d/%d' % (i, len(reducer_tasks)), '\r', - result = task.get() - if isinstance(result, WrappedException): - logger.info('Caught exception during reducer execution.') - logger.info('%s', result.exception_message) - raise MRException('Reducer %d failed.' % i, - child_message=result.exception_message) - - reduce_outputs.append(result) - - # logger.info('All reducers finished.') - output_format.finalize(output_tmp, output_final) - return reduce_outputs - except: - logger.error( - 'MapReduce run failed. mapper=%s reducer=%s input=%s output=%s', - mapper, reducer, inputs, output_prefix, exc_info=1) - os.system('rm -rf "%s"' % output_final) - raise - finally: - # Ensure all the processes and queues we've allocated are cleaned up. 
- # - # This cleanup step shouldn't strictly be necessary: the finalizers for - # pools and managers should destroy everything when the objects are - # garbage collected; for some reason this isn't happening: it's - # possible there is a reference loop somewhere that is preventing - # collection. - logger.info('Closing mapper pool.') - mapper_pool.terminate() - - logger.info('Closing reducer pool.') - reducer_pool.terminate() - - # Trying to close the queues fails, as they are actually proxy objects. - # Quitting the managers should result in queues being destroyed as well. - # [q.close() for q in shuffle_queues] - - logger.debug('Shutting down managers.') - [m.shutdown() for m in managers] - - os.system('rm -rf "%s"' % output_tmp) - os.system('rm -rf "%s"' % tmp_prefix) + while len(map_results) < len(mapper_tasks): + for i, task in enumerate(mapper_tasks): + if i in map_results or not task.ready(): + continue + + result = task.get() + map_results[i] = result + if isinstance(result, WrappedException): + print("Exception mapper tasks here??") + print(result.exception_message) + + logger.error("Caught exception during mapper execution.") + logger.error("%s", result.exception_message) + raise MRException( + "Mapper %d failed." % i, child_message=result.exception_message + ) + # else: + # logger.info('Finished mapper: %d', i) + time.sleep(0.1) + + if time.time() - last_log_time > 5: + last_log_time = time.time() + # print 'Waiting for mappers: %d/%d' % (len(map_results), len(mapper_tasks)), '\r', + + # logger.info('All mappers finished.') + + # flush shuffle queues + for q in shuffle_queues: + q.put(None) + + reduce_outputs = [] + for i, task in enumerate(reducer_tasks): + # print 'Waiting for reducers... %d/%d' % (i, len(reducer_tasks)), '\r', + result = task.get() + if isinstance(result, WrappedException): + logger.info("Caught exception during reducer execution.") + logger.info("%s", result.exception_message) + raise MRException( + "Reducer %d failed." % i, child_message=result.exception_message + ) + + reduce_outputs.append(result) + + # logger.info('All reducers finished.') + output_format.finalize(output_tmp, output_final) + return reduce_outputs + except: + logger.error( + "MapReduce run failed. mapper=%s reducer=%s input=%s output=%s", + mapper, + reducer, + inputs, + output_prefix, + exc_info=1, + ) + os.system('rm -rf "%s"' % output_final) + raise + finally: + # Ensure all the processes and queues we've allocated are cleaned up. + # + # This cleanup step shouldn't strictly be necessary: the finalizers for + # pools and managers should destroy everything when the objects are + # garbage collected; for some reason this isn't happening: it's + # possible there is a reference loop somewhere that is preventing + # collection. + logger.info("Closing mapper pool.") + mapper_pool.terminate() + + logger.info("Closing reducer pool.") + reducer_pool.terminate() + + # Trying to close the queues fails, as they are actually proxy objects. + # Quitting the managers should result in queues being destroyed as well. 
+ # [q.close() for q in shuffle_queues] + + logger.debug("Shutting down managers.") + [m.shutdown() for m in managers] + + os.system('rm -rf "%s"' % output_tmp) + os.system('rm -rf "%s"' % tmp_prefix) diff --git a/openfda/parallel/outputs.py b/openfda/parallel/outputs.py index 453ad55..c9e98e7 100644 --- a/openfda/parallel/outputs.py +++ b/openfda/parallel/outputs.py @@ -10,129 +10,147 @@ from openfda.common import convert_unicode -logger = logging.getLogger('mapreduce') +logger = logging.getLogger("mapreduce") -class MROutput(object): - suffix = 'db' - - class Writer(object): - def __init__(self, filename, **kw): - self.filename = filename - - def put(self, key, value): - assert False, "Don't use this class directly: use an output like LevelDBOutput" - - def flush(self): - pass - - def __init__(self, **kw): - self.writer_args = kw - - def create_writer(self, prefix, shard_idx, num_shards): - assert prefix, 'No output prefix specified for output' - assert shard_idx < num_shards, 'Invalid shard index (%d > %d)' % (shard_idx, num_shards) - os.system('mkdir -p "%s"' % prefix) - return self.__class__.Writer( - prefix + '/shard-%05d-of-%05d.%s' % (shard_idx, num_shards, self.suffix), - **self.writer_args) - def recommended_shards(self): - return multiprocessing.cpu_count() - - def finalize(self, tmp_dir, final_dir): - logger.info('Moving results from %s -> %s', tmp_dir, final_dir) - if os.path.exists(final_dir): - logger.warn('Replace existing output directory.') - subprocess.check_output('rm -rf "%s"' % final_dir, shell=True) - subprocess.check_output('mv "%s" "%s"' % (tmp_dir, final_dir), shell=True) +class MROutput(object): + suffix = "db" + + class Writer(object): + def __init__(self, filename, **kw): + self.filename = filename + + def put(self, key, value): + assert ( + False + ), "Don't use this class directly: use an output like LevelDBOutput" + + def flush(self): + pass + + def __init__(self, **kw): + self.writer_args = kw + + def create_writer(self, prefix, shard_idx, num_shards): + assert prefix, "No output prefix specified for output" + assert shard_idx < num_shards, "Invalid shard index (%d > %d)" % ( + shard_idx, + num_shards, + ) + os.system('mkdir -p "%s"' % prefix) + return self.__class__.Writer( + prefix + "/shard-%05d-of-%05d.%s" % (shard_idx, num_shards, self.suffix), + **self.writer_args + ) + + def recommended_shards(self): + return multiprocessing.cpu_count() + + def finalize(self, tmp_dir, final_dir): + logger.info("Moving results from %s -> %s", tmp_dir, final_dir) + if os.path.exists(final_dir): + logger.warn("Replace existing output directory.") + subprocess.check_output('rm -rf "%s"' % final_dir, shell=True) + subprocess.check_output('mv "%s" "%s"' % (tmp_dir, final_dir), shell=True) class LevelDBOutput(MROutput): - class Writer(MROutput.Writer): - def __init__(self, filename): - self.db = leveldb.LevelDB(filename) - self._last_key = None + class Writer(MROutput.Writer): + def __init__(self, filename): + self.db = leveldb.LevelDB(filename) + self._last_key = None - def put(self, key, value): - assert isinstance(key, str) - assert key != self._last_key, ( - 'Duplicate keys (%s) passed to LevelDBOutput.' - 'This output does not support multiple keys!' % key - ) - self.db.Put(key.encode(), pickle.dumps(value, -1)) + def put(self, key, value): + assert isinstance(key, str) + assert key != self._last_key, ( + "Duplicate keys (%s) passed to LevelDBOutput." + "This output does not support multiple keys!" 
% key + ) + self.db.Put(key.encode(), pickle.dumps(value, -1)) class JSONOutput(MROutput): - suffix = 'json' + suffix = "json" - class Writer(MROutput.Writer): - def __init__(self, filename, **kw): - self.db = {} - self.json_args = kw - self.filename = filename + class Writer(MROutput.Writer): + def __init__(self, filename, **kw): + self.db = {} + self.json_args = kw + self.filename = filename - def put(self, key, value): - assert isinstance(key, str) - self.db[key] = value + def put(self, key, value): + assert isinstance(key, str) + self.db[key] = value - def flush(self): - with open(self.filename, 'w') as out_file: - json.dump(self.db, out_file, **self.json_args) + def flush(self): + with open(self.filename, "w") as out_file: + json.dump(self.db, out_file, **self.json_args) - def create_writer(self, prefix, shard_idx, num_shards): - assert num_shards == 1, 'JSONOutput only works with a single output shard!' - return MROutput.create_writer(self, prefix, shard_idx, num_shards) + def create_writer(self, prefix, shard_idx, num_shards): + assert num_shards == 1, "JSONOutput only works with a single output shard!" + return MROutput.create_writer(self, prefix, shard_idx, num_shards) - def recommended_shards(self): - return 1 + def recommended_shards(self): + return 1 - def finalize(self, tmp_dir, final_dir): - ''' - Move the output JSON file to the final location. + def finalize(self, tmp_dir, final_dir): + """ + Move the output JSON file to the final location. - There should only be one file -- this will fail if the user specified multiple shards! - ''' - import glob - files = glob.glob('%s/*.json' % tmp_dir) - assert len(files) == 1, 'JSONOutput expected one temporary file, got: %s' % files - logger.info('Moving temporary file: %s to final destination: %s', files[0], final_dir) - subprocess.check_output('mv "%s" "%s"' % (files[0], final_dir), shell=True) + There should only be one file -- this will fail if the user specified multiple shards! + """ + import glob + files = glob.glob("%s/*.json" % tmp_dir) + assert len(files) == 1, ( + "JSONOutput expected one temporary file, got: %s" % files + ) + logger.info( + "Moving temporary file: %s to final destination: %s", files[0], final_dir + ) + subprocess.check_output('mv "%s" "%s"' % (files[0], final_dir), shell=True) class JSONLineOutput(MROutput): - ''' - Writes values as JSON, with one value per line. + """ + Writes values as JSON, with one value per line. + + The result is a single file. + """ - The result is a single file. 
- ''' - suffix = 'jsonline' + suffix = "jsonline" - def finalize(self, tmp_dir, final_dir): - os.system('ls "%s"' % tmp_dir) - subprocess.check_output('cat %s/*.jsonline > "%s"' % (tmp_dir, final_dir), shell=True) + def finalize(self, tmp_dir, final_dir): + os.system('ls "%s"' % tmp_dir) + subprocess.check_output( + 'cat %s/*.jsonline > "%s"' % (tmp_dir, final_dir), shell=True + ) - class Writer(MROutput.Writer): - def __init__(self, filename): - MROutput.Writer.__init__(self, filename) - # convert to io.open() to support unicode writes - self.output_file = io.open(filename, 'w') + class Writer(MROutput.Writer): + def __init__(self, filename): + MROutput.Writer.__init__(self, filename) + # convert to io.open() to support unicode writes + self.output_file = io.open(filename, "w") - def put(self, key, value): - json_str = json.dumps(convert_unicode(value), ensure_ascii=False, encoding='utf-8') - self.output_file.write(str(json_str + '\n')) + def put(self, key, value): + json_str = json.dumps( + convert_unicode(value), ensure_ascii=False, encoding="utf-8" + ) + self.output_file.write(str(json_str + "\n")) + + def flush(self): + logger.info("Flushing: %s", self.filename) + self.output_file.close() - def flush(self): - logger.info('Flushing: %s', self.filename) - self.output_file.close() class NullOutput(MROutput): - ''' - Ignores all outputs and produces no output files. - ''' - def finalize(self, tmp_dir, final_dir): - os.system('rm -rf "%s"' % tmp_dir) - - class Writer(MROutput.Writer): - def put(self, key, value): - pass + """ + Ignores all outputs and produces no output files. + """ + + def finalize(self, tmp_dir, final_dir): + os.system('rm -rf "%s"' % tmp_dir) + + class Writer(MROutput.Writer): + def put(self, key, value): + pass diff --git a/openfda/parallel/reducer.py b/openfda/parallel/reducer.py index 3bba7ed..f107b05 100644 --- a/openfda/parallel/reducer.py +++ b/openfda/parallel/reducer.py @@ -6,118 +6,133 @@ import leveldb -logger = logging.getLogger('mapreduce') +logger = logging.getLogger("mapreduce") + def group_by_key(iterator): - '''Group identical keys together. - - Given a sorted iterator of (key, value) pairs, returns an iterator of - (key1, values), (key2, values). - ''' - last_key = None - values = [] - for key, value in iterator: - value = loads(value) - key = key.decode() - user_key, _ = key.rsplit('.', 1) - if user_key != last_key: - if last_key is not None: + """Group identical keys together. + + Given a sorted iterator of (key, value) pairs, returns an iterator of + (key1, values), (key2, values). 
+ """ + last_key = None + values = [] + for key, value in iterator: + value = loads(value) + key = key.decode() + user_key, _ = key.rsplit(".", 1) + if user_key != last_key: + if last_key is not None: + yield last_key, values + last_key = user_key + values = [value] + else: + values.append(value) + + if last_key is not None: yield last_key, values - last_key = user_key - values = [value] - else: - values.append(value) - - if last_key is not None: - yield last_key, values class Reducer(object): - def initialize(self, input_queue, tmp_prefix, output_class, output_prefix, shard_idx, num_shards): - self.tmp_prefix = tmp_prefix - self.output_prefix = output_prefix - self.input_queue = input_queue - self.output_class = output_class - self.shard_idx = shard_idx - self.num_shards = num_shards - - def reduce_shard(self, input_db, output_db): - for idx, (key, values) in enumerate(group_by_key(input_db.RangeIter())): - # if idx % 1000 == 0: - # logger.info('Reducing records=%d key=%s shard=%d', idx, key, self.shard_idx) - self.reduce(key, values, output_db) - - def shuffle(self): - os.system('mkdir -p "%s"' % self.tmp_prefix) - shuffle_dir = tempfile.mkdtemp( - prefix='shard-%05d-of-%05d' % (self.shard_idx, self.num_shards), - dir=self.tmp_prefix) - shuffle_db = leveldb.LevelDB(shuffle_dir) - - idx = 0 - while 1: - next_entry = self.input_queue.get() - if next_entry is None: - break - key, value_str = next_entry - shuffle_db.Put((key + ('.%s' % idx)).encode(), value_str) - idx += 1 - - # if idx % 1000 == 0: - # logger.info('Shuffling records=%d key=%s shard=%d', idx, key, self.shard_idx) - - output_db = self.output_class.create_writer(self.output_prefix, self.shard_idx, self.num_shards) - # logger.debug('Reducer: %s', output_db) - self.reduce_shard(shuffle_db, output_db) - output_db.flush() - del output_db - del shuffle_db - os.system('rm -rf "%s"' % shuffle_dir) - - def reduce(self, key, values, output): - raise NotImplementedError - - def reduce_finished(self): - '''Called after all values have been reduced. - - The result of this call is returned to the caller of `mapreduce`. 
- ''' - pass - + def initialize( + self, + input_queue, + tmp_prefix, + output_class, + output_prefix, + shard_idx, + num_shards, + ): + self.tmp_prefix = tmp_prefix + self.output_prefix = output_prefix + self.input_queue = input_queue + self.output_class = output_class + self.shard_idx = shard_idx + self.num_shards = num_shards + + def reduce_shard(self, input_db, output_db): + for idx, (key, values) in enumerate(group_by_key(input_db.RangeIter())): + # if idx % 1000 == 0: + # logger.info('Reducing records=%d key=%s shard=%d', idx, key, self.shard_idx) + self.reduce(key, values, output_db) + + def shuffle(self): + os.system('mkdir -p "%s"' % self.tmp_prefix) + shuffle_dir = tempfile.mkdtemp( + prefix="shard-%05d-of-%05d" % (self.shard_idx, self.num_shards), + dir=self.tmp_prefix, + ) + shuffle_db = leveldb.LevelDB(shuffle_dir) + + idx = 0 + while 1: + next_entry = self.input_queue.get() + if next_entry is None: + break + key, value_str = next_entry + shuffle_db.Put((key + (".%s" % idx)).encode(), value_str) + idx += 1 + + # if idx % 1000 == 0: + # logger.info('Shuffling records=%d key=%s shard=%d', idx, key, self.shard_idx) + + output_db = self.output_class.create_writer( + self.output_prefix, self.shard_idx, self.num_shards + ) + # logger.debug('Reducer: %s', output_db) + self.reduce_shard(shuffle_db, output_db) + output_db.flush() + del output_db + del shuffle_db + os.system('rm -rf "%s"' % shuffle_dir) + + def reduce(self, key, values, output): + raise NotImplementedError + + def reduce_finished(self): + """Called after all values have been reduced. + + The result of this call is returned to the caller of `mapreduce`. + """ + pass class IdentityReducer(Reducer): - def reduce(self, key, values, output): - for value in values: - output.put(key, value) + def reduce(self, key, values, output): + for value in values: + output.put(key, value) class SumReducer(Reducer): - def reduce(self, key, values, output): - output.put(key, sum([float(v) for v in values])) + def reduce(self, key, values, output): + output.put(key, sum([float(v) for v in values])) + class ListReducer(Reducer): - def reduce(self, key, values, output): - output.put(key, list(values)) + def reduce(self, key, values, output): + output.put(key, list(values)) + class NullReducer(Reducer): - def reduce(self, key, values, output): - return + def reduce(self, key, values, output): + return + def pivot_values(value_list): - ''' Takes a list of (name, value) tuples, and `pivots` them, returning - a dictionary from name -> [values]. - - This is frequently used when joining a number of inputs together, - where each input is tagged with a table name. - ''' - intermediate = collections.defaultdict(list) - for row in value_list: - table_name, val = row - intermediate[table_name].append(val) - return intermediate + """Takes a list of (name, value) tuples, and `pivots` them, returning + a dictionary from name -> [values]. + + This is frequently used when joining a number of inputs together, + where each input is tagged with a table name. 
+ """ + intermediate = collections.defaultdict(list) + for row in value_list: + table_name, val = row + intermediate[table_name].append(val) + return intermediate + class PivotReducer(Reducer): - def reduce(self, key, values, output): - val = pivot_values(values) - output.put(key, val) + def reduce(self, key, values, output): + val = pivot_values(values) + output.put(key, val) diff --git a/openfda/parallel/sharded_db.py b/openfda/parallel/sharded_db.py index c425b98..a2bb941 100644 --- a/openfda/parallel/sharded_db.py +++ b/openfda/parallel/sharded_db.py @@ -6,56 +6,60 @@ import os import leveldb + class ShardedDB(object): - ''' - Manages a number of leveldb "shards" (partitions). - - LevelDB does not support concurrent writers, so we create a separate output - shard for each reducer. `ShardedDB` provides a unified interface to - multiple shards. - ''' - def __init__(self, filebase, num_shards, create_if_missing): - self.filebase = filebase - self.num_shards = num_shards - self._shards = [] - os.system('mkdir -p "%s"' % filebase) - for i in range(num_shards): - shard_file = '%s/shard-%05d-of-%05d.db' % (filebase, i, num_shards) - self._shards.append(leveldb.LevelDB(shard_file, create_if_missing=create_if_missing)) - - logging.info('Opened DB with %s files', num_shards) - - @staticmethod - def create(filebase, num_shards): - 'Create a new ShardedDB with the given number of output shards.' - return ShardedDB(filebase, num_shards, True) - - @staticmethod - def open(filebase): - 'Open an existing ShardedDB.' - files = glob.glob('%s/*.db' % filebase) - return ShardedDB(filebase, len(files), False) - - def _shard_for(self, key): - return self._shards[zlib.adler32(key) % self.num_shards] - - def put(self, key, value): - self._shard_for(key).Put(key, pickle.dumps(value, -1)) - - def get(self, key): - return pickle.loads(self._shard_for(key).Get(key)) - - def range_iter(self, start_key, end_key): - iters = [db.RangeIter(start_key, end_key) for db in self._shards] - for i in iters: - for key, value in i: - yield key.decode(), pickle.loads(value) - - def __iter__(self): - for shard in self._shards: - for key, value in shard.RangeIter(): - yield key, pickle.loads(value) - - def as_dict(self): - 'Returns the content of this database as an in-memory Python dictionary' - return dict(self.range_iter(None, None)) + """ + Manages a number of leveldb "shards" (partitions). + + LevelDB does not support concurrent writers, so we create a separate output + shard for each reducer. `ShardedDB` provides a unified interface to + multiple shards. + """ + + def __init__(self, filebase, num_shards, create_if_missing): + self.filebase = filebase + self.num_shards = num_shards + self._shards = [] + os.system('mkdir -p "%s"' % filebase) + for i in range(num_shards): + shard_file = "%s/shard-%05d-of-%05d.db" % (filebase, i, num_shards) + self._shards.append( + leveldb.LevelDB(shard_file, create_if_missing=create_if_missing) + ) + + logging.info("Opened DB with %s files", num_shards) + + @staticmethod + def create(filebase, num_shards): + "Create a new ShardedDB with the given number of output shards." + return ShardedDB(filebase, num_shards, True) + + @staticmethod + def open(filebase): + "Open an existing ShardedDB." 
+ files = glob.glob("%s/*.db" % filebase) + return ShardedDB(filebase, len(files), False) + + def _shard_for(self, key): + return self._shards[zlib.adler32(key) % self.num_shards] + + def put(self, key, value): + self._shard_for(key).Put(key, pickle.dumps(value, -1)) + + def get(self, key): + return pickle.loads(self._shard_for(key).Get(key)) + + def range_iter(self, start_key, end_key): + iters = [db.RangeIter(start_key, end_key) for db in self._shards] + for i in iters: + for key, value in i: + yield key.decode(), pickle.loads(value) + + def __iter__(self): + for shard in self._shards: + for key, value in shard.RangeIter(): + yield key, pickle.loads(value) + + def as_dict(self): + "Returns the content of this database as an in-memory Python dictionary" + return dict(self.range_iter(None, None)) diff --git a/openfda/parallel/watchdog.py b/openfda/parallel/watchdog.py index 5b31162..41faa85 100644 --- a/openfda/parallel/watchdog.py +++ b/openfda/parallel/watchdog.py @@ -4,42 +4,43 @@ import threading import traceback -OUTPUT_DIR = './watchdog/' +OUTPUT_DIR = "./watchdog/" + class Watchdog(threading.Thread): - def __init__(self, *args, **kw): - threading.Thread.__init__(self, *args, **kw) - self.daemon = True - self._queue = queue.Queue(maxsize=1) - self._log_frequency = kw.get('log_frequency', 30) - self._output_dir = kw.get('output_dir', OUTPUT_DIR) - - def _output_file(self): - return os.path.join(self._output_dir, '%s.dump' % os.getpid()) - - def run(self): - while True: - try: - self._queue.get(timeout=self._log_frequency) - return - except queue.Empty: - pass - - os.system('mkdir -p "%s"' % self._output_dir) - with open(self._output_file(), 'w') as out_f: - for threadid, frame in sys._current_frames().items(): - stack = ''.join(traceback.format_stack(frame)) - print('Thread: %s' % threadid, file=out_f) - print(stack, file=out_f) - print('\n\n', file=out_f) - - def __enter__(self): - self.start() - return self - - def __exit__(self, type, value, traceback): - # Sentinel value: wake up the watchdog and have it exit - self._queue.put(None) - self.join() - if os.path.exists(self._output_file()): - os.unlink(self._output_file()) + def __init__(self, *args, **kw): + threading.Thread.__init__(self, *args, **kw) + self.daemon = True + self._queue = queue.Queue(maxsize=1) + self._log_frequency = kw.get("log_frequency", 30) + self._output_dir = kw.get("output_dir", OUTPUT_DIR) + + def _output_file(self): + return os.path.join(self._output_dir, "%s.dump" % os.getpid()) + + def run(self): + while True: + try: + self._queue.get(timeout=self._log_frequency) + return + except queue.Empty: + pass + + os.system('mkdir -p "%s"' % self._output_dir) + with open(self._output_file(), "w") as out_f: + for threadid, frame in sys._current_frames().items(): + stack = "".join(traceback.format_stack(frame)) + print("Thread: %s" % threadid, file=out_f) + print(stack, file=out_f) + print("\n\n", file=out_f) + + def __enter__(self): + self.start() + return self + + def __exit__(self, type, value, traceback): + # Sentinel value: wake up the watchdog and have it exit + self._queue.put(None) + self.join() + if os.path.exists(self._output_file()): + os.unlink(self._output_file()) diff --git a/openfda/registration/pipeline.py b/openfda/registration/pipeline.py index 8eeae03..d7073b1 100755 --- a/openfda/registration/pipeline.py +++ b/openfda/registration/pipeline.py @@ -1,6 +1,6 @@ #!/usr/bin/python -''' Device pipeline for downloading, joining and loading registration and +""" Device pipeline for downloading, joining 
and loading registration and listings into elasticsearch. The input data for registrations is normalized, so we need to do @@ -42,7 +42,7 @@ 'intermediate_establishment_listing', 'intermediate_registration') ON 'reg_key' -''' +""" import csv import glob @@ -60,579 +60,634 @@ from openfda import common, config, index_util, parallel from openfda import download_util from openfda.common import newest_file_timestamp -from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA, - DeviceAnnotateMapper) +from openfda.device_harmonization.pipeline import ( + Harmonized2OpenFDA, + DeviceAnnotateMapper, +) RUN_DIR = dirname(dirname(os.path.abspath(__file__))) -BASE_DIR = config.data_dir('registration') -RAW_DIR = config.data_dir('registration/raw') -common.shell_cmd('mkdir -p %s', BASE_DIR) +BASE_DIR = config.data_dir("registration") +RAW_DIR = config.data_dir("registration/raw") +common.shell_cmd("mkdir -p %s", BASE_DIR) # A directory for holding files that track Task state -META_DIR = config.data_dir('registration/meta') -common.shell_cmd('mkdir -p %s', META_DIR) +META_DIR = config.data_dir("registration/meta") +common.shell_cmd("mkdir -p %s", META_DIR) -DEVICE_REG_PAGE = ('https://www.fda.gov/medical-devices/' - 'device-registration-and-listing/' - 'establishment-registration-and-medical-device-listing-files-download') +DEVICE_REG_PAGE = ( + "https://www.fda.gov/medical-devices/" + "device-registration-and-listing/" + "establishment-registration-and-medical-device-listing-files-download" +) -REG_LISTING_URL = 'https://download.open.fda.gov/reglist/registration_listing.txt' -REG_LISTING_LOCAL_DIR = config.data_dir('registration/reg_listing') +REG_LISTING_URL = "https://download.open.fda.gov/reglist/registration_listing.txt" +REG_LISTING_LOCAL_DIR = config.data_dir("registration/reg_listing") REMAPPED_FILES = { - 'registration_listing.txt': 'remapped_registration_listing.txt', - } + "registration_listing.txt": "remapped_registration_listing.txt", +} # TODO(hansnelsen): analyze and add the following files to the pipeline, schema, # es mapping, documentation. # For now, we will just exclude them. EXCLUDED_FILES = [ - 'Manu_ID_by_Imp.txt', - 'Non_Reg_Imp_ID_by_Manu.txt', - 'Reg_Imp_ID_by_Manu.txt' + "Manu_ID_by_Imp.txt", + "Non_Reg_Imp_ID_by_Manu.txt", + "Reg_Imp_ID_by_Manu.txt", ] + # TODO(hansnelsen): copied from spl/pipeline.py, consolidate to a common place. # This version has been slightly altered, so we will need to # do this refactor once all of the S3 requirements are in. 
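The chain of joins sketched in the module docstring above is implemented with a tag-and-pivot pattern: `JoinMapper` (defined further down in this file) emits each row under a join key tagged with its source table, and the reducers regroup the tagged rows with `parallel.pivot_values` before merging them. A rough standalone sketch of that pattern, using plain dicts, simplified re-implementations of the helpers, and made-up sample rows (none of this is part of the patch itself):

    import collections

    def construct_join_key(row, join_keys):
        # Mirrors the helper defined below: join the selected values with ':'.
        return ":".join(v for k, v in row.items() if k in join_keys and v is not None)

    def pivot_values(tagged_rows):
        # Mirrors parallel.pivot_values: (table, row) pairs -> {table: [rows]}.
        pivoted = collections.defaultdict(list)
        for table, row in tagged_rows:
            pivoted[table].append(row)
        return pivoted

    registration = {"reg_key": "42", "state_id": "CA"}      # illustrative rows
    us_agent = {"reg_key": "42", "name": "Example Agent"}

    join_key = construct_join_key(registration, ["reg_key"])   # -> "42"
    tagged = [("registration", registration), ("us_agent", us_agent)]
    print(join_key, dict(pivot_values(tagged)))
    # 42 {'registration': [{...}], 'us_agent': [{...}]}

Each join task below follows this shape, differing only in which tables it tags and which key it joins on.
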
class DownloadRegListing(luigi.Task): - url = REG_LISTING_URL - local_dir = REG_LISTING_LOCAL_DIR + url = REG_LISTING_URL + local_dir = REG_LISTING_LOCAL_DIR + + def output(self): + return luigi.LocalTarget(self.local_dir) - def output(self): - return luigi.LocalTarget(self.local_dir) + def run(self): + common.download( + self.url, os.path.join(self.local_dir, "registration_listing.txt") + ) - def run(self): - common.download(self.url, os.path.join(self.local_dir, 'registration_listing.txt')) def remap_supplemental_files(original, supplemental, output_file): - orig = pandas.read_csv(original, sep='|') - supp = pandas.read_csv(supplemental, sep='|') - orig.columns = list(map(str.lower, orig.columns)) - supp.columns = list(map(str.lower, supp.columns)) + orig = pandas.read_csv(original, sep="|") + supp = pandas.read_csv(supplemental, sep="|") + orig.columns = list(map(str.lower, orig.columns)) + supp.columns = list(map(str.lower, supp.columns)) - combined = pandas.merge(orig, supp, how='left') + combined = pandas.merge(orig, supp, how="left") - combined['premarket_submission_number'] = \ - combined['premarket_submission_number'].astype('str') + combined["premarket_submission_number"] = combined[ + "premarket_submission_number" + ].astype("str") - submission = combined['premarket_submission_number'] + submission = combined["premarket_submission_number"] - combined['pma_number'] = submission.map(common.get_p_number) - combined['k_number'] = submission.map(common.get_k_number) + combined["pma_number"] = submission.map(common.get_p_number) + combined["k_number"] = submission.map(common.get_k_number) - combined.drop('premarket_submission_number', axis=1, inplace=True) + combined.drop("premarket_submission_number", axis=1, inplace=True) - # to_csv() will prepend an extra delimiter to the CSV header row unless you - # specifiy `index=False` - combined.to_csv(output_file, sep='|', index=False) + # to_csv() will prepend an extra delimiter to the CSV header row unless you + # specifiy `index=False` + combined.to_csv(output_file, sep="|", index=False) + + return - return def construct_join_key(data, join_keys): - ''' A helper function to construct a join key from dictionary values. 
- ''' - return ':'.join([v for k, v in data.items() if k in join_keys and v != None]) + """A helper function to construct a join key from dictionary values.""" + return ":".join([v for k, v in data.items() if k in join_keys and v != None]) + class JoinMapper(parallel.Mapper): - def __init__(self, tables, join_keys): - parallel.Mapper.__init__(self) - self.tables = tables - self.join_keys = join_keys - - def map(self, key, value, output): - if not isinstance(value, list): value = [value] - table = key.split(':')[0] - if table in self.tables: - for row in value: - jk = construct_join_key(row, self.join_keys) - if jk: - output.add(jk, (table, row)) + def __init__(self, tables, join_keys): + parallel.Mapper.__init__(self) + self.tables = tables + self.join_keys = join_keys + + def map(self, key, value, output): + if not isinstance(value, list): + value = [value] + table = key.split(":")[0] + if table in self.tables: + for row in value: + jk = construct_join_key(row, self.join_keys) + if jk: + output.add(jk, (table, row)) + class DownloadDeviceRegistrationAndListings(luigi.Task): - def requires(self): - return [] - - def output(self): - return luigi.LocalTarget(RAW_DIR) - - def run(self): - zip_urls = [] - soup = BeautifulSoup(urlopen(DEVICE_REG_PAGE).read()) - for a in soup.find_all(href=re.compile('.*.zip')): - zip_urls.append(a['href']) - if not zip_urls: - logging.info('No Registration Zip Files Found At %s' % DEVICE_REG_PAGE) - for zip_url in zip_urls: - filename = zip_url.split('/')[-1] - common.download(zip_url, join(self.output().path, filename)) + def requires(self): + return [] + + def output(self): + return luigi.LocalTarget(RAW_DIR) + + def run(self): + zip_urls = [] + soup = BeautifulSoup(urlopen(DEVICE_REG_PAGE).read()) + for a in soup.find_all(href=re.compile(".*.zip")): + zip_urls.append(a["href"]) + if not zip_urls: + logging.info("No Registration Zip Files Found At %s" % DEVICE_REG_PAGE) + for zip_url in zip_urls: + filename = zip_url.split("/")[-1] + common.download(zip_url, join(self.output().path, filename)) + class ExtractAndCleanDownloadsReg(luigi.Task): - ''' Unzip each of the download files and remove all the non-UTF8 characters. - Unzip -p streams the data directly to iconv which then writes to disk. - ''' - # These files have floats, e.g. 123.0 instead of 123, on the join keys, which - # causes problems downstream. - problem_files = [ - 'registration_listing.txt', - 'remapped_registration_listing.txt', - 'Listing_Proprietary_Name.txt' - ] - - def requires(self): - return [DownloadDeviceRegistrationAndListings(), DownloadRegListing()] - - def output(self): - return luigi.LocalTarget(config.data_dir('registration/extracted')) - - def run(self): - output_dir = self.output().path - common.shell_cmd('mkdir -p %s', output_dir) - input_dir = self.input()[0].path - supplemental_dir = self.input()[1].path - download_util.extract_and_clean(input_dir, 'ISO-8859-1', 'UTF-8', 'txt') - - # One of the files needs to be remapped from one column (submission_number) - # to two columns (pma_number and k_number) depending on the prefix. - file_name = 'registration_listing.txt' - output_file = join(output_dir, 'remapped_' + file_name) - remap_supplemental_files(join(output_dir, file_name), - join(supplemental_dir, file_name), - output_file) - - # There are a handful of files with floats for keys - # This step can be removed once it is fixed on the source system. 
- for fix_file in self.problem_files: - with open(join(output_dir, fix_file), 'r') as needs_fixing: - lines = needs_fixing.readlines() - with open(join(output_dir, fix_file), 'w') as gets_fixing: - for line in lines: - gets_fixing.write(re.sub(r'\.0', '', line)) + """Unzip each of the download files and remove all the non-UTF8 characters. + Unzip -p streams the data directly to iconv which then writes to disk. + """ + + # These files have floats, e.g. 123.0 instead of 123, on the join keys, which + # causes problems downstream. + problem_files = [ + "registration_listing.txt", + "remapped_registration_listing.txt", + "Listing_Proprietary_Name.txt", + ] + + def requires(self): + return [DownloadDeviceRegistrationAndListings(), DownloadRegListing()] + + def output(self): + return luigi.LocalTarget(config.data_dir("registration/extracted")) + + def run(self): + output_dir = self.output().path + common.shell_cmd("mkdir -p %s", output_dir) + input_dir = self.input()[0].path + supplemental_dir = self.input()[1].path + download_util.extract_and_clean(input_dir, "ISO-8859-1", "UTF-8", "txt") + + # One of the files needs to be remapped from one column (submission_number) + # to two columns (pma_number and k_number) depending on the prefix. + file_name = "registration_listing.txt" + output_file = join(output_dir, "remapped_" + file_name) + remap_supplemental_files( + join(output_dir, file_name), join(supplemental_dir, file_name), output_file + ) + + # There are a handful of files with floats for keys + # This step can be removed once it is fixed on the source system. + for fix_file in self.problem_files: + with open(join(output_dir, fix_file), "r") as needs_fixing: + lines = needs_fixing.readlines() + with open(join(output_dir, fix_file), "w") as gets_fixing: + for line in lines: + gets_fixing.write(re.sub(r"\.0", "", line)) + class TXT2JSONMapper(parallel.Mapper): - def map_shard(self, map_input, map_output): - self.filename = map_input.filename - return parallel.Mapper.map_shard(self, map_input, map_output) - - def map(self, key, value, output): - def transform_dates(coll): - DATE_KEYS = ['created_date'] - def _replace_date(k, v): - if k is None: return - k = k.lower() - if v is None: return (k, v) - if k in DATE_KEYS: return (k, arrow.get(v, 'MM/DD/YYYY') - .format('YYYY-MM-DD')) - if isinstance(v, list): return (k, v) - return (k, v.strip()) - return common.transform_dict(coll, _replace_date) - - new_value = transform_dates(value) - table_name = basename(self.filename).lower().replace('.txt', '') - new_key = table_name + ':' + key - output.add(new_key, new_value) + def map_shard(self, map_input, map_output): + self.filename = map_input.filename + return parallel.Mapper.map_shard(self, map_input, map_output) + + def map(self, key, value, output): + def transform_dates(coll): + DATE_KEYS = ["created_date"] + + def _replace_date(k, v): + if k is None: + return + k = k.lower() + if v is None: + return (k, v) + if k in DATE_KEYS: + return (k, arrow.get(v, "MM/DD/YYYY").format("YYYY-MM-DD")) + if isinstance(v, list): + return (k, v) + return (k, v.strip()) + + return common.transform_dict(coll, _replace_date) + + new_value = transform_dates(value) + table_name = basename(self.filename).lower().replace(".txt", "") + new_key = table_name + ":" + key + output.add(new_key, new_value) + class TXT2JSON(luigi.Task): - def requires(self): - return ExtractAndCleanDownloadsReg() - - def output(self): - return luigi.LocalTarget(config.data_dir('registration/json.db')) - - def run(self): - input_dir = 
self.input().path - output_dir = self.output().path - common.shell_cmd('mkdir -p %s', dirname(output_dir)) - - NEEDS_HEADERS = { - 'estabtypes.txt': ['establishment_type_id', 'description'] - } - - inputs = [] - for input_file in glob.glob(input_dir + '/*.txt'): - if basename(input_file) in REMAPPED_FILES: - continue - if basename(input_file) in EXCLUDED_FILES: - continue - header_key = basename(input_file) - fieldnames = NEEDS_HEADERS.get(header_key, None) - inputs.append( - parallel.Collection.from_glob( - input_file, parallel.CSVDictLineInput(delimiter='|', - fieldnames=fieldnames, - quoting=csv.QUOTE_NONE, - escapechar='\\'))) - - parallel.mapreduce( - inputs=inputs, - mapper=TXT2JSONMapper(), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path) + def requires(self): + return ExtractAndCleanDownloadsReg() + + def output(self): + return luigi.LocalTarget(config.data_dir("registration/json.db")) + + def run(self): + input_dir = self.input().path + output_dir = self.output().path + common.shell_cmd("mkdir -p %s", dirname(output_dir)) + + NEEDS_HEADERS = {"estabtypes.txt": ["establishment_type_id", "description"]} + + inputs = [] + for input_file in glob.glob(input_dir + "/*.txt"): + if basename(input_file) in REMAPPED_FILES: + continue + if basename(input_file) in EXCLUDED_FILES: + continue + header_key = basename(input_file) + fieldnames = NEEDS_HEADERS.get(header_key, None) + inputs.append( + parallel.Collection.from_glob( + input_file, + parallel.CSVDictLineInput( + delimiter="|", + fieldnames=fieldnames, + quoting=csv.QUOTE_NONE, + escapechar="\\", + ), + ) + ) + + parallel.mapreduce( + inputs=inputs, + mapper=TXT2JSONMapper(), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + ) + class OwnerOperatorJoinReducer(parallel.Reducer): - def _join(self, values): - intermediate = parallel.pivot_values(values) - result = [] - for row in intermediate['owner_operator']: - final = dict(row) - final['official_correspondent'] = {} - final['contact_address'] = {} - # Should only be one address, but the intermediate interface is a list - # so we will just grab the first item from the list. - contact_address_data = intermediate.get('contact_addresses', None) - if contact_address_data: - final['contact_address'] = contact_address_data[0] - - left_key = final['reg_key'] - - for data in intermediate.get('official_correspondent', []): - right_key = data['reg_key'] - if right_key == left_key: - final['official_correspondent'] = data - result.append(final) - - return result - - def reduce(self, key, values, output): - new_key = 'intermediate_owner_operator:' + key - val = self._join(values) - if val: output.put(new_key, val) + def _join(self, values): + intermediate = parallel.pivot_values(values) + result = [] + for row in intermediate["owner_operator"]: + final = dict(row) + final["official_correspondent"] = {} + final["contact_address"] = {} + # Should only be one address, but the intermediate interface is a list + # so we will just grab the first item from the list. 
+ contact_address_data = intermediate.get("contact_addresses", None) + if contact_address_data: + final["contact_address"] = contact_address_data[0] + + left_key = final["reg_key"] + + for data in intermediate.get("official_correspondent", []): + right_key = data["reg_key"] + if right_key == left_key: + final["official_correspondent"] = data + result.append(final) + + return result + + def reduce(self, key, values, output): + new_key = "intermediate_owner_operator:" + key + val = self._join(values) + if val: + output.put(new_key, val) + class JoinOwnerOperator(luigi.Task): - def requires(self): - return TXT2JSON() - - def output(self): - return luigi.LocalTarget(config.data_dir('registration/owner_operator.db')) - - def run(self): - tables = ['owner_operator', 'contact_addresses', 'official_correspondent'] - join_keys = ['contact_id'] - parallel.mapreduce( - parallel.Collection.from_sharded(self.input().path), - mapper=JoinMapper(tables=tables, join_keys=join_keys), - reducer=OwnerOperatorJoinReducer(), - output_prefix=self.output().path, - num_shards=10) + def requires(self): + return TXT2JSON() + + def output(self): + return luigi.LocalTarget(config.data_dir("registration/owner_operator.db")) + + def run(self): + tables = ["owner_operator", "contact_addresses", "official_correspondent"] + join_keys = ["contact_id"] + parallel.mapreduce( + parallel.Collection.from_sharded(self.input().path), + mapper=JoinMapper(tables=tables, join_keys=join_keys), + reducer=OwnerOperatorJoinReducer(), + output_prefix=self.output().path, + num_shards=10, + ) + class RegistrationJoinReducer(parallel.Reducer): - def _join(self, values): - address_keys = [ - 'address_line_1', - 'address_line_2', - 'city', - 'state_id', - 'zip_code', - 'postal_code', - 'iso_country_code' - ] + def _join(self, values): + address_keys = [ + "address_line_1", + "address_line_2", + "city", + "state_id", + "zip_code", + "postal_code", + "iso_country_code", + ] + + intermediate = parallel.pivot_values(values) + + # The US Agent Address is in the registration dataset, we need to pluck it + # out and merge it with each us agent record. + us_agent_address = {} + for row in intermediate.get("registration", []): + _type = row.get("address_type_id", None) + if _type == "U": + us_agent_address = {k: v for k, v in row.items() if k in address_keys} + + # There are 0 or 1 US Agents assigned to a facility + us_agent = {} + agent_data = intermediate.get("us_agent", []) + if agent_data: + us_agent = dict( + list(agent_data[0].items()) + list(us_agent_address.items()) + ) + + # There is 0 or 1 owner operators + owner_operator = {} + owner_operator_data = intermediate.get("intermediate_owner_operator", []) + if owner_operator_data: + owner_operator = owner_operator_data[0] + + result = [] + for row in intermediate.get("registration", []): + _type = row.get("address_type_id", None) + # We only want `Facility` records, i.e. skip all the us agent addresses + if _type == "F": + final = dict(row) + final["us_agent"] = us_agent + final["owner_operator"] = owner_operator + result.append(final) + + return result + + def reduce(self, key, values, output): + new_key = "intermediate_registration:" + key + val = self._join(values) + if val: + output.put(new_key, val) - intermediate = parallel.pivot_values(values) - - # The US Agent Address is in the registration dataset, we need to pluck it - # out and merge it with each us agent record. 
- us_agent_address = {} - for row in intermediate.get('registration', []): - _type = row.get('address_type_id', None) - if _type == 'U': - us_agent_address = {k:v for k, v in row.items() if k in address_keys} - - # There are 0 or 1 US Agents assigned to a facility - us_agent = {} - agent_data = intermediate.get('us_agent', []) - if agent_data: - us_agent = dict(list(agent_data[0].items()) + list(us_agent_address.items())) - - # There is 0 or 1 owner operators - owner_operator = {} - owner_operator_data = intermediate.get('intermediate_owner_operator', []) - if owner_operator_data: - owner_operator = owner_operator_data[0] - - result = [] - for row in intermediate.get('registration', []): - _type = row.get('address_type_id', None) - # We only want `Facility` records, i.e. skip all the us agent addresses - if _type == 'F': - final = dict(row) - final['us_agent'] = us_agent - final['owner_operator'] = owner_operator - result.append(final) - - return result - - def reduce(self, key, values, output): - new_key = 'intermediate_registration:' + key - val = self._join(values) - if val: output.put(new_key, val) class JoinRegistration(luigi.Task): - def requires(self): - return [TXT2JSON(), JoinOwnerOperator()] + def requires(self): + return [TXT2JSON(), JoinOwnerOperator()] - def output(self): - return luigi.LocalTarget(config.data_dir('registration/registration.db')) + def output(self): + return luigi.LocalTarget(config.data_dir("registration/registration.db")) - def run(self): - tables = ['intermediate_owner_operator', 'registration', 'us_agent'] - join_keys = ['reg_key'] - db_list = [s.path for s in self.input()] + def run(self): + tables = ["intermediate_owner_operator", "registration", "us_agent"] + join_keys = ["reg_key"] + db_list = [s.path for s in self.input()] + + parallel.mapreduce( + parallel.Collection.from_sharded_list(db_list), + mapper=JoinMapper(tables=tables, join_keys=join_keys), + reducer=RegistrationJoinReducer(), + output_prefix=self.output().path, + num_shards=10, + ) - parallel.mapreduce( - parallel.Collection.from_sharded_list(db_list), - mapper=JoinMapper(tables=tables, join_keys=join_keys), - reducer=RegistrationJoinReducer(), - output_prefix=self.output().path, - num_shards=10) class JoinEstablishmentTypesReducer(parallel.Reducer): - def _join(self, values): - intermediate = parallel.pivot_values(values) - result = [] - # There should be only one estblishment type streamed - # However, establishment type with ID of 4 no longer exists in estabtypes.txt, but there - # were at least two listing_estabtypes records still referring to this non-existent value. - # Likely a data issue on the FDA side, which we need to handle here too. - est_type = intermediate.get('estabtypes', [{'description': ''}])[0] - for data in intermediate.get('listing_estabtypes', []): - final = dict(list(data.items()) + list(est_type.items())) - result.append(final) - return result - - def reduce(self, key, values, output): - key_prefix = 'intermediate_establishment_listing:' + key - val = self._join(values) - if val: - for i, row in enumerate(val): - new_key = key_prefix + ':' + str(i) - output.put(new_key, row) + def _join(self, values): + intermediate = parallel.pivot_values(values) + result = [] + # There should be only one estblishment type streamed + # However, establishment type with ID of 4 no longer exists in estabtypes.txt, but there + # were at least two listing_estabtypes records still referring to this non-existent value. 
+ # Likely a data issue on the FDA side, which we need to handle here too. + est_type = intermediate.get("estabtypes", [{"description": ""}])[0] + for data in intermediate.get("listing_estabtypes", []): + final = dict(list(data.items()) + list(est_type.items())) + result.append(final) + return result + + def reduce(self, key, values, output): + key_prefix = "intermediate_establishment_listing:" + key + val = self._join(values) + if val: + for i, row in enumerate(val): + new_key = key_prefix + ":" + str(i) + output.put(new_key, row) + class JoinEstablishmentTypes(luigi.Task): - def requires(self): - return TXT2JSON() - - def output(self): - return luigi.LocalTarget(config.data_dir('registration/establishment_listing.db')) - - def run(self): - tables = ['listing_estabtypes', 'estabtypes'] - join_keys = ['establishment_type_id'] - parallel.mapreduce( - parallel.Collection.from_sharded(self.input().path), - mapper=JoinMapper(tables=tables, join_keys=join_keys), - reducer=JoinEstablishmentTypesReducer(), - output_prefix=self.output().path, - num_shards=10) + def requires(self): + return TXT2JSON() + + def output(self): + return luigi.LocalTarget( + config.data_dir("registration/establishment_listing.db") + ) + + def run(self): + tables = ["listing_estabtypes", "estabtypes"] + join_keys = ["establishment_type_id"] + parallel.mapreduce( + parallel.Collection.from_sharded(self.input().path), + mapper=JoinMapper(tables=tables, join_keys=join_keys), + reducer=JoinEstablishmentTypesReducer(), + output_prefix=self.output().path, + num_shards=10, + ) + class ListingJoinReducer(parallel.Reducer): - def _join(self, values): - intermediate = parallel.pivot_values(values) - result = [] + def _join(self, values): + intermediate = parallel.pivot_values(values) + result = [] - for row in intermediate.get('remapped_registration_listing', []): - final = dict(row) - final['products'] = intermediate.get('listing_pcd', []) - final['proprietary_name'] = intermediate.get('listing_proprietary_name', []) - result.append(final) + for row in intermediate.get("remapped_registration_listing", []): + final = dict(row) + final["products"] = intermediate.get("listing_pcd", []) + final["proprietary_name"] = intermediate.get("listing_proprietary_name", []) + result.append(final) - return result + return result + + def reduce(self, key, values, output): + new_key = "intermediate_registration_listing:" + key + val = self._join(values) + output.put(new_key, val) - def reduce(self, key, values, output): - new_key = 'intermediate_registration_listing:' + key - val = self._join(values) - output.put(new_key, val) class JoinListings(luigi.Task): - def requires(self): - return TXT2JSON() - - def output(self): - return luigi.LocalTarget(config.data_dir('registration/registration_listing.db')) - - def run(self): - tables = [ - 'remapped_registration_listing', - 'listing_pcd', - 'listing_proprietary_name'] - join_keys = ['key_val'] - - parallel.mapreduce( - parallel.Collection.from_sharded(self.input().path), - mapper=JoinMapper(tables=tables, join_keys=join_keys), - reducer=ListingJoinReducer(), - output_prefix=self.output().path, - num_shards=10) + def requires(self): + return TXT2JSON() + + def output(self): + return luigi.LocalTarget( + config.data_dir("registration/registration_listing.db") + ) + + def run(self): + tables = [ + "remapped_registration_listing", + "listing_pcd", + "listing_proprietary_name", + ] + join_keys = ["key_val"] + + parallel.mapreduce( + parallel.Collection.from_sharded(self.input().path), + 
mapper=JoinMapper(tables=tables, join_keys=join_keys), + reducer=ListingJoinReducer(), + output_prefix=self.output().path, + num_shards=10, + ) + class JoinAllReducer(parallel.Reducer): - def _join(self, values): - intermediate = parallel.pivot_values(values) - result = [] - - for data in intermediate.get('intermediate_registration_listing', []): - final = dict(data) - final['establishment_type'] = [] - final['registration'] = [] - final['proprietary_name'] = [] - - # De-dup the proprietary names - for prop_name in data.get('proprietary_name', []): - name = prop_name.get('proprietary_name', None) - if name and name not in final['proprietary_name']: - final['proprietary_name'].append(name) - - est_join = ['registration_listing_id'] - reg_join = ['reg_key'] - - est_left_key = construct_join_key(data, est_join) - reg_left_key = construct_join_key(final, reg_join) - - # Grab just the descriptions of the establishment type - for row in intermediate.get('intermediate_establishment_listing', []): - est_right_key = construct_join_key(row, est_join) - if est_left_key == est_right_key: - final['establishment_type'].append(row['description']) - - # There is only one registered facility - registrant = {} - facility = intermediate.get('intermediate_registration', []) - if facility: - registrant = facility[0] - final['registration'] = registrant - - result.append(final) - - return result - - def reduce(self, key, values, output): - # There are lot of keys that we do not need once all the joining has been - # done, so we can now transform the output and remove the join keys and - # changes some of the names to be in line with naming conventions. - - IGNORE = ['key_val', - 'address_id', - 'address_type_id', - 'contact_id', - 'reg_key', - 'listing_prop_id', - 'listing_prop_name_id', - 'registration_listing_id', - 'establishment' - ] + def _join(self, values): + intermediate = parallel.pivot_values(values) + result = [] + + for data in intermediate.get("intermediate_registration_listing", []): + final = dict(data) + final["establishment_type"] = [] + final["registration"] = [] + final["proprietary_name"] = [] + + # De-dup the proprietary names + for prop_name in data.get("proprietary_name", []): + name = prop_name.get("proprietary_name", None) + if name and name not in final["proprietary_name"]: + final["proprietary_name"].append(name) + + est_join = ["registration_listing_id"] + reg_join = ["reg_key"] + + est_left_key = construct_join_key(data, est_join) + reg_left_key = construct_join_key(final, reg_join) + + # Grab just the descriptions of the establishment type + for row in intermediate.get("intermediate_establishment_listing", []): + est_right_key = construct_join_key(row, est_join) + if est_left_key == est_right_key: + final["establishment_type"].append(row["description"]) + + # There is only one registered facility + registrant = {} + facility = intermediate.get("intermediate_registration", []) + if facility: + registrant = facility[0] + final["registration"] = registrant + + result.append(final) + + return result + + def reduce(self, key, values, output): + # There are lot of keys that we do not need once all the joining has been + # done, so we can now transform the output and remove the join keys and + # changes some of the names to be in line with naming conventions. 
+ + IGNORE = [ + "key_val", + "address_id", + "address_type_id", + "contact_id", + "reg_key", + "listing_prop_id", + "listing_prop_name_id", + "registration_listing_id", + "establishment", + ] + + RENAME_MAP = { + "address_line1": "address_1", + "address_line2": "address_2", + "reg_status_id": "status_code", + "state_id": "state_code", + } + + def _prune(k, v): + """A helper function used for removing and renaming dictionary keys.""" + if k in IGNORE: + return None + if k in RENAME_MAP: + k = RENAME_MAP[k] + return (k, v) + + key_prefix = "final_result:" + key + val = self._join(values) + if val: + for i, row in enumerate(val): + new_key = key_prefix + ":" + str(i) + new_value = common.transform_dict(row, _prune) + output.put(new_key, new_value) - RENAME_MAP = { - 'address_line1': 'address_1', - 'address_line2': 'address_2', - 'reg_status_id': 'status_code', - 'state_id': 'state_code' - } - - def _prune(k, v): - ''' A helper function used for removing and renaming dictionary keys. - ''' - if k in IGNORE: return None - if k in RENAME_MAP: - k = RENAME_MAP[k] - return (k, v) - - key_prefix = 'final_result:' + key - val = self._join(values) - if val: - for i, row in enumerate(val): - new_key = key_prefix + ':' + str(i) - new_value = common.transform_dict(row, _prune) - output.put(new_key, new_value) class JoinAll(luigi.Task): - def requires(self): - return [JoinListings(), JoinEstablishmentTypes(), JoinRegistration()] + def requires(self): + return [JoinListings(), JoinEstablishmentTypes(), JoinRegistration()] + + def output(self): + return luigi.LocalTarget(config.data_dir("registration/final.db")) + + def run(self): + tables = [ + "intermediate_registration_listing", + "intermediate_establishment_listing", + "intermediate_registration", + ] + join_keys = ["reg_key"] + db_list = [s.path for s in self.input()] + + parallel.mapreduce( + parallel.Collection.from_sharded_list(db_list), + mapper=JoinMapper(tables=tables, join_keys=join_keys), + reducer=JoinAllReducer(), + output_prefix=self.output().path, + num_shards=10, + ) - def output(self): - return luigi.LocalTarget(config.data_dir('registration/final.db')) - - def run(self): - tables = [ - 'intermediate_registration_listing', - 'intermediate_establishment_listing', - 'intermediate_registration' - ] - join_keys = ['reg_key'] - db_list = [s.path for s in self.input()] - - parallel.mapreduce( - parallel.Collection.from_sharded_list(db_list), - mapper=JoinMapper(tables=tables, join_keys=join_keys), - reducer=JoinAllReducer(), - output_prefix=self.output().path, - num_shards=10) class RegistrationAnnotateDevice(DeviceAnnotateMapper): - ''' The registration document has a unique placement requirement, so we need - to override the `harmonize()` method to place the `openfda` section on - each product in the `products` list within the document. 
- ''' - def harmonize(self, data): - result = dict(data) - k_number = data.get('k_number', None) - pma_number = data.get('pma_number', None) - - product_list = [] - for row in result.get('products', []): - d = dict(row) - harmonized = self.filter(row, k_num=k_number, p_num=pma_number) - if harmonized: - d['openfda'] = self.flatten(harmonized) - else: - d['openfda'] = {} - product_list.append(d) - result['products'] = product_list - return result - - def filter(self, data, k_num=None, p_num=None): - product_code = data['product_code'] - harmonized = self.harmonized_db.get(product_code, None) - if harmonized: - if self.table in harmonized: - del harmonized[self.table] - if k_num: - k_numbers = list(harmonized['510k']) - harmonized['510k'] = [d for d in k_numbers if d['k_number'] == k_num] - else: - harmonized['510k'] = [] - - if p_num: - pma_numbers = list(harmonized['device_pma']) - harmonized['device_pma'] = \ - [d for d in pma_numbers if d['pma_number'] == p_num] - else: - harmonized['device_pma'] = [] - - return harmonized - return None + """The registration document has a unique placement requirement, so we need + to override the `harmonize()` method to place the `openfda` section on + each product in the `products` list within the document. + """ + + def harmonize(self, data): + result = dict(data) + k_number = data.get("k_number", None) + pma_number = data.get("pma_number", None) + + product_list = [] + for row in result.get("products", []): + d = dict(row) + harmonized = self.filter(row, k_num=k_number, p_num=pma_number) + if harmonized: + d["openfda"] = self.flatten(harmonized) + else: + d["openfda"] = {} + product_list.append(d) + result["products"] = product_list + return result + + def filter(self, data, k_num=None, p_num=None): + product_code = data["product_code"] + harmonized = self.harmonized_db.get(product_code, None) + if harmonized: + if self.table in harmonized: + del harmonized[self.table] + if k_num: + k_numbers = list(harmonized["510k"]) + harmonized["510k"] = [d for d in k_numbers if d["k_number"] == k_num] + else: + harmonized["510k"] = [] + + if p_num: + pma_numbers = list(harmonized["device_pma"]) + harmonized["device_pma"] = [ + d for d in pma_numbers if d["pma_number"] == p_num + ] + else: + harmonized["device_pma"] = [] + + return harmonized + return None + class AnnotateDevice(luigi.Task): - def requires(self): - return [Harmonized2OpenFDA(), JoinAll()] + def requires(self): + return [Harmonized2OpenFDA(), JoinAll()] - def output(self): - return luigi.LocalTarget(config.data_dir('registration/annotate.db')) + def output(self): + return luigi.LocalTarget(config.data_dir("registration/annotate.db")) - def run(self): - harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict() + def run(self): + harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict() - parallel.mapreduce( - parallel.Collection.from_sharded(self.input()[1].path), - mapper=RegistrationAnnotateDevice(harmonized_db=harmonized_db), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, - num_shards=10) + parallel.mapreduce( + parallel.Collection.from_sharded(self.input()[1].path), + mapper=RegistrationAnnotateDevice(harmonized_db=harmonized_db), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + num_shards=10, + ) class LoadJSON(index_util.LoadJSONBase): - index_name = 'devicereglist' - mapping_file = './schemas/registration_mapping.json' - data_source = AnnotateDevice() - use_checksum = False - optimize_index = True - 
last_update_date = lambda _: newest_file_timestamp(RAW_DIR) + index_name = "devicereglist" + mapping_file = "./schemas/registration_mapping.json" + data_source = AnnotateDevice() + use_checksum = False + optimize_index = True + last_update_date = lambda _: newest_file_timestamp(RAW_DIR) -if __name__ == '__main__': - luigi.run() +if __name__ == "__main__": + luigi.run() diff --git a/openfda/res/annotate.py b/openfda/res/annotate.py index c1c1dac..fd1dc06 100644 --- a/openfda/res/annotate.py +++ b/openfda/res/annotate.py @@ -5,189 +5,199 @@ import arrow from openfda import parallel + def read_json_file(json_file): - for line in json_file: - yield json.loads(line) + for line in json_file: + yield json.loads(line) + -#TODO(hansnelsen): add to common library, is used in all annotation processing +# TODO(hansnelsen): add to common library, is used in all annotation processing def read_harmonized_file(harmonized_file): - ''' - Create a dictionary that is keyed by UPC and NDC. - ''' - upc_or_ndc_to_harmonized = {} - for row in read_json_file(harmonized_file): - if row['upc'] != []: - row['upc'] = [] - - for upc in row['upc']: - if upc in upc_or_ndc_to_harmonized: - upc_or_ndc_to_harmonized[upc].append(row) - else: - upc_or_ndc_to_harmonized[upc] = [row] - - harmonized_file.seek(0) - for row in read_json_file(harmonized_file): - for product_ndc in row['spl_product_ndc']: - if product_ndc in upc_or_ndc_to_harmonized: - upc_or_ndc_to_harmonized[product_ndc].append(row) - else: - upc_or_ndc_to_harmonized[product_ndc] = [row] - - harmonized_file.seek(0) - for row in read_json_file(harmonized_file): - for package_ndc in row['package_ndc']: - if package_ndc in upc_or_ndc_to_harmonized: - upc_or_ndc_to_harmonized[package_ndc].append(row) - else: - upc_or_ndc_to_harmonized[package_ndc] = [row] - return upc_or_ndc_to_harmonized + """ + Create a dictionary that is keyed by UPC and NDC. + """ + upc_or_ndc_to_harmonized = {} + for row in read_json_file(harmonized_file): + if row["upc"] != []: + row["upc"] = [] + + for upc in row["upc"]: + if upc in upc_or_ndc_to_harmonized: + upc_or_ndc_to_harmonized[upc].append(row) + else: + upc_or_ndc_to_harmonized[upc] = [row] + + harmonized_file.seek(0) + for row in read_json_file(harmonized_file): + for product_ndc in row["spl_product_ndc"]: + if product_ndc in upc_or_ndc_to_harmonized: + upc_or_ndc_to_harmonized[product_ndc].append(row) + else: + upc_or_ndc_to_harmonized[product_ndc] = [row] + + harmonized_file.seek(0) + for row in read_json_file(harmonized_file): + for package_ndc in row["package_ndc"]: + if package_ndc in upc_or_ndc_to_harmonized: + upc_or_ndc_to_harmonized[package_ndc].append(row) + else: + upc_or_ndc_to_harmonized[package_ndc] = [row] + return upc_or_ndc_to_harmonized + # TODO(hansnelsen): Add to a common library, since it is used by faers and res # annotate.py. _add_field() is identical in both files. 
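The `_add_field` helper that follows (shared verbatim with faers/annotate.py, per the TODO above) dedupes by storing each value as a dictionary key, and `AnnotateRecall` later flattens those inner dicts back into plain lists. A minimal standalone sketch of that accumulate-then-flatten pattern, with the helper re-implemented and sample values made up purely for illustration:

    def add_field(openfda, field, value):
        # Re-implementation of _add_field for illustration: store each
        # non-empty value as a dict key so repeated values collapse.
        values = value if isinstance(value, list) else [value]
        for v in values:
            if not v:
                continue
            openfda.setdefault(field, {})[v] = True

    openfda = {}
    add_field(openfda, "brand_name", "ASPIRIN")
    add_field(openfda, "brand_name", "ASPIRIN")              # duplicate collapses
    add_field(openfda, "product_ndc", ["0002-1234", None])   # None is skipped

    # AnnotateRecall-style flattening back to plain lists:
    openfda_lists = {field: list(vals.keys()) for field, vals in openfda.items()}
    print(openfda_lists)  # {'brand_name': ['ASPIRIN'], 'product_ndc': ['0002-1234']}
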
def _add_field(openfda, field, value): - if type(value) != type([]): - value = [value] - for v in value: - if not v: - continue - if field not in openfda: - openfda[field] = {} - openfda[field][v] = True - return + if type(value) != type([]): + value = [value] + for v in value: + if not v: + continue + if field not in openfda: + openfda[field] = {} + openfda[field][v] = True + return + def _get_ndc_type(value): - ''' Identifying the NDC type is based on string length WITHOUT '-', we need - to strip all non-numeric characters before we look at the length - ''' - ndc_type = '' - numbers_only = re.sub(r'[^\d+]', '', value) - if len(numbers_only) == 8: - ndc_type = 'product_ndc' - if len(numbers_only) in [10, 11]: - ndc_type = 'package_ndc' - else: - ndc_type = None - return ndc_type + """Identifying the NDC type is based on string length WITHOUT '-', we need + to strip all non-numeric characters before we look at the length + """ + ndc_type = "" + numbers_only = re.sub(r"[^\d+]", "", value) + if len(numbers_only) == 8: + ndc_type = "product_ndc" + if len(numbers_only) in [10, 11]: + ndc_type = "package_ndc" + else: + ndc_type = None + return ndc_type + def _insert_or_update(recall, code_type, code_value): - if code_type == 'ndc': - if 'product_ndc' in recall['openfda']: - combined_ndcs = recall['openfda']['product_ndc'] + \ - recall['openfda']['package_ndc'] - if code_value not in combined_ndcs: - ndc_type = _get_ndc_type(code_value) - if ndc_type in ['product_ndc', 'package_ndc']: - recall['openfda'][ndc_type].append(code_value) - - if code_type == 'upc': - if len(recall['openfda']) > 0: - if 'upc' in recall['openfda']: - if code_value not in recall['openfda']['upc']: - recall['openfda']['upc'].append(code_value) - else: - recall['openfda']['upc'] = [code_value] + if code_type == "ndc": + if "product_ndc" in recall["openfda"]: + combined_ndcs = ( + recall["openfda"]["product_ndc"] + recall["openfda"]["package_ndc"] + ) + if code_value not in combined_ndcs: + ndc_type = _get_ndc_type(code_value) + if ndc_type in ["product_ndc", "package_ndc"]: + recall["openfda"][ndc_type].append(code_value) + + if code_type == "upc": + if len(recall["openfda"]) > 0: + if "upc" in recall["openfda"]: + if code_value not in recall["openfda"]["upc"]: + recall["openfda"]["upc"].append(code_value) + else: + recall["openfda"]["upc"] = [code_value] + # TODO(hansnelsen): Looks very similiar to the code in faers/annotate.py, we # should consider refactoring this into a general piece of code for generating # a de-duped list for each key in the openfda dict. 
Note: this openfda record # has a few more keys than its faers counterpart def AddHarmonizedRowToOpenfda(openfda, row): - _add_field(openfda, 'application_number', row['application_number']) - _add_field(openfda, 'brand_name', row['brand_name'].rstrip().upper()) - _add_field(openfda, 'generic_name', row['generic_name'].upper()) - _add_field(openfda, 'manufacturer_name', row['manufacturer_name']) - _add_field(openfda, 'product_ndc', row['product_ndc']) - _add_field(openfda, 'product_ndc', row['spl_product_ndc']) - _add_field(openfda, 'product_type', row['product_type']) - - for route in row['route'].split(';'): - route = route.strip() - _add_field(openfda, 'route', route) - - for substance in row['substance_name'].split(';'): - substance = substance.strip() - _add_field(openfda, 'substance_name', substance) - - for rxnorm in row['rxnorm']: - _add_field(openfda, 'rxcui', rxnorm['rxcui']) - - _add_field(openfda, 'spl_id', row['id']) - _add_field(openfda, 'spl_set_id', row['spl_set_id']) - _add_field(openfda, 'package_ndc', row['package_ndc']) - _add_field(openfda, - 'original_packager_product_ndc', - row['original_packager_product_ndc']) - _add_field(openfda, - 'is_original_packager', - row['is_original_packager']) - - _add_field(openfda, 'upc', row['upc']) - - if row['unii_indexing'] != []: - unii_list = [] - for unii_row in row['unii_indexing']: - for key, value in unii_row.items(): - if key == 'unii': - unii_list.append(value) - if key == 'va': - for this_item in value: - for va_key, va_value in this_item.items(): - if va_key == 'name': - if va_value.find('[MoA]') != -1: - _add_field(openfda, 'pharm_class_moa', va_value) - if va_value.find('[Chemical/Ingredient]') != -1 or va_value.find('[CS]') != -1: - _add_field(openfda, 'pharm_class_cs', va_value) - if va_value.find('[PE]') != -1: - _add_field(openfda, 'pharm_class_pe', va_value) - if va_value.find('[EPC]') != -1: - _add_field(openfda, 'pharm_class_epc', va_value) - if va_key == 'number': - _add_field(openfda, 'nui', va_value) - _add_field(openfda, 'unii', unii_list) + _add_field(openfda, "application_number", row["application_number"]) + _add_field(openfda, "brand_name", row["brand_name"].rstrip().upper()) + _add_field(openfda, "generic_name", row["generic_name"].upper()) + _add_field(openfda, "manufacturer_name", row["manufacturer_name"]) + _add_field(openfda, "product_ndc", row["product_ndc"]) + _add_field(openfda, "product_ndc", row["spl_product_ndc"]) + _add_field(openfda, "product_type", row["product_type"]) + + for route in row["route"].split(";"): + route = route.strip() + _add_field(openfda, "route", route) + + for substance in row["substance_name"].split(";"): + substance = substance.strip() + _add_field(openfda, "substance_name", substance) + + for rxnorm in row["rxnorm"]: + _add_field(openfda, "rxcui", rxnorm["rxcui"]) + + _add_field(openfda, "spl_id", row["id"]) + _add_field(openfda, "spl_set_id", row["spl_set_id"]) + _add_field(openfda, "package_ndc", row["package_ndc"]) + _add_field( + openfda, "original_packager_product_ndc", row["original_packager_product_ndc"] + ) + _add_field(openfda, "is_original_packager", row["is_original_packager"]) + + _add_field(openfda, "upc", row["upc"]) + + if row["unii_indexing"] != []: + unii_list = [] + for unii_row in row["unii_indexing"]: + for key, value in unii_row.items(): + if key == "unii": + unii_list.append(value) + if key == "va": + for this_item in value: + for va_key, va_value in this_item.items(): + if va_key == "name": + if va_value.find("[MoA]") != -1: + _add_field(openfda, 
"pharm_class_moa", va_value) + if ( + va_value.find("[Chemical/Ingredient]") != -1 + or va_value.find("[CS]") != -1 + ): + _add_field(openfda, "pharm_class_cs", va_value) + if va_value.find("[PE]") != -1: + _add_field(openfda, "pharm_class_pe", va_value) + if va_value.find("[EPC]") != -1: + _add_field(openfda, "pharm_class_epc", va_value) + if va_key == "number": + _add_field(openfda, "nui", va_value) + _add_field(openfda, "unii", unii_list) + def AnnotateRecall(recall, harmonized_dict): - openfda = {} - if recall['product-type'] == 'Drugs': - for ndc in recall['ndc']: - if ndc in harmonized_dict: - AddHarmonizedRowToOpenfda(openfda, harmonized_dict[ndc][0]) - for upc in recall['upc']: - if upc in harmonized_dict: - AddHarmonizedRowToOpenfda(openfda, harmonized_dict[upc][0]) + openfda = {} + if recall["product-type"] == "Drugs": + for ndc in recall["ndc"]: + if ndc in harmonized_dict: + AddHarmonizedRowToOpenfda(openfda, harmonized_dict[ndc][0]) + for upc in recall["upc"]: + if upc in harmonized_dict: + AddHarmonizedRowToOpenfda(openfda, harmonized_dict[upc][0]) - openfda_lists = {} - for field, value in openfda.items(): - openfda_lists[field] = [s for s in value.keys()] + openfda_lists = {} + for field, value in openfda.items(): + openfda_lists[field] = [s for s in value.keys()] - recall['openfda'] = openfda_lists + recall["openfda"] = openfda_lists def AnnotateEvent(recall, harmonized_dict): - '''Doing cleanup work here so that the json to be loaded in to ES is date - friendly and the naming conventions line up with the API standard - ''' - AnnotateRecall(recall, harmonized_dict) - recall['@timestamp'] = arrow.get(recall['report-date']).format('YYYY-MM-DD') - - for key, value in list(recall.items()): - if '-' in key: - new_name = key.replace('-', '_') - recall[new_name] = value - recall.pop(key, None) - if key in ['upc', 'ndc']: - if value: - for this_value in value: - _insert_or_update(recall, key, this_value) - recall.pop(key, None) + """Doing cleanup work here so that the json to be loaded in to ES is date + friendly and the naming conventions line up with the API standard + """ + AnnotateRecall(recall, harmonized_dict) + recall["@timestamp"] = arrow.get(recall["report-date"]).format("YYYY-MM-DD") + + for key, value in list(recall.items()): + if "-" in key: + new_name = key.replace("-", "_") + recall[new_name] = value + recall.pop(key, None) + if key in ["upc", "ndc"]: + if value: + for this_value in value: + _insert_or_update(recall, key, this_value) + recall.pop(key, None) + class AnnotateMapper(parallel.Mapper): - def __init__(self, harmonized_file): - self.harmonized_file = harmonized_file + def __init__(self, harmonized_file): + self.harmonized_file = harmonized_file - def map_shard(self, map_input, map_output): - self.harmonized_dict = read_harmonized_file(open(self.harmonized_file)) - parallel.Mapper.map_shard(self, map_input, map_output) + def map_shard(self, map_input, map_output): + self.harmonized_dict = read_harmonized_file(open(self.harmonized_file)) + parallel.Mapper.map_shard(self, map_input, map_output) - def map(self, key, recall, out): - AnnotateEvent(recall, self.harmonized_dict) - out.add(key, recall) + def map(self, key, recall, out): + AnnotateEvent(recall, self.harmonized_dict) + out.add(key, recall) diff --git a/openfda/res/ean.py b/openfda/res/ean.py index 516fb1e..85b01a9 100644 --- a/openfda/res/ean.py +++ b/openfda/res/ean.py @@ -20,38 +20,46 @@ middle 0 digits removed. 
""" + from functools import reduce __version__ = "$Revision: 0.4$" -codeLength = { "EAN13": 13, - "EAN8": 8, - "UPCA": 12, - "UPCE": 8 } +codeLength = {"EAN13": 13, "EAN8": 8, "UPCA": 12, "UPCE": 8} + -def __len_check(chk, _type='EAN13'): - return (len(chk) == codeLength[_type]) or \ - (len(chk) == codeLength[_type]-1) +def __len_check(chk, _type="EAN13"): + return (len(chk) == codeLength[_type]) or (len(chk) == codeLength[_type] - 1) def __sum_digits(chk, start=0, end=1, step=2, mult=1): - return reduce(lambda x, y: int(x)+int(y), list(chk[start:end:step])) * mult + return reduce(lambda x, y: int(x) + int(y), list(chk[start:end:step])) * mult -def ean_check_digit(chk, code='EAN13'): + +def ean_check_digit(chk, code="EAN13"): """Returns the checksum digit of an EAN-13/8 code""" if chk.isdigit() and __len_check(chk): - if code == 'EAN13': - m0=1 - m1=3 - elif code == 'EAN8': - m0=3 - m1=1 + if code == "EAN13": + m0 = 1 + m1 = 3 + elif code == "EAN8": + m0 = 3 + m1 = 1 else: return None - _len = codeLength[code]-1 - t = 10 - (( __sum_digits(chk, start=0, end=_len, mult=m0) + \ - __sum_digits(chk, start=1, end=_len, mult=m1) ) %10 ) %10 + _len = codeLength[code] - 1 + t = ( + 10 + - ( + ( + __sum_digits(chk, start=0, end=_len, mult=m0) + + __sum_digits(chk, start=1, end=_len, mult=m1) + ) + % 10 + ) + % 10 + ) if t == 10: return 0 @@ -60,14 +68,16 @@ def ean_check_digit(chk, code='EAN13'): return None + def ean_13_valid(chk): """Verify if code is valid EAN13 barcode. Returns True|False""" - return chk.isdigit() and __len_check(chk) and \ - (int(chk[-1]) == ean_check_digit(chk)) + return chk.isdigit() and __len_check(chk) and (int(chk[-1]) == ean_check_digit(chk)) + def ean_8_check_digit(chk): """Returns the checksum digit of an EAN8 code""" - return ean_check_digit(chk, code='EAN8') + return ean_check_digit(chk, code="EAN8") + def ean_8_valid(chk): """Verify if code is valid EAN8 barcode. 
Returns True|False""" @@ -75,70 +85,76 @@ def ean_8_valid(chk): return int(chk[-1]) == ean_8_check_digit(chk) return False + # UPC codes below + def upca_check_digit(chk): if chk is not None: - return ean_check_digit('0'+chk) + return ean_check_digit("0" + chk) return None + def upca_valid(chk): if chk is not None: - return ean_13_valid('0'+chk) + return ean_13_valid("0" + chk) return False + def upca2e(chk): t = None - if chk.isdigit() and __len_check(chk, 'UPCA'): - if '012'.find(chk[3]) >= 0 and chk[4:8] == '0000': - t=chk[:3]+chk[8:11]+chk[3] - elif chk[4:9] == '00000': - t=chk[:4]+chk[9:11]+'3' - elif chk[5:10] == '00000': - t = chk[:5]+chk[10]+'4' - elif '5678'.find(chk[10]) >= 0 and chk[6:10] == '0000': - t=chk[:6]+chk[10] + if chk.isdigit() and __len_check(chk, "UPCA"): + if "012".find(chk[3]) >= 0 and chk[4:8] == "0000": + t = chk[:3] + chk[8:11] + chk[3] + elif chk[4:9] == "00000": + t = chk[:4] + chk[9:11] + "3" + elif chk[5:10] == "00000": + t = chk[:5] + chk[10] + "4" + elif "5678".find(chk[10]) >= 0 and chk[6:10] == "0000": + t = chk[:6] + chk[10] else: - t=None + t = None # Check digit if t is not None: if upca_valid(chk): - t=t+chk[-1] - elif len(chk) == codeLength["UPCA"]-1: - t=t+str(upca_check_digit(chk)) + t = t + chk[-1] + elif len(chk) == codeLength["UPCA"] - 1: + t = t + str(upca_check_digit(chk)) else: - t=None + t = None return t + def upce2a(chk): - t=None - if chk.isdigit() and __len_check(chk, 'UPCE'): - if '012'.find(chk[6]) >= 0: - t=chk[:3]+chk[6]+'0000'+chk[3:6] - elif chk[6] == '3': - t=chk[:4]+'00000'+chk[4:6] - elif chk[6] == '4': - t=chk[:5]+'00000'+chk[5] - elif '5678'.find(chk[6]) >= 0: - t=chk[:6]+'0000'+chk[6] + t = None + if chk.isdigit() and __len_check(chk, "UPCE"): + if "012".find(chk[6]) >= 0: + t = chk[:3] + chk[6] + "0000" + chk[3:6] + elif chk[6] == "3": + t = chk[:4] + "00000" + chk[4:6] + elif chk[6] == "4": + t = chk[:5] + "00000" + chk[5] + elif "5678".find(chk[6]) >= 0: + t = chk[:6] + "0000" + chk[6] else: - t=None + t = None if t is not None: if len(chk) == codeLength["UPCE"] - 1: - t=t+str(upca_check_digit(t)) - elif len(chk) == codeLength['UPCE'] and \ - int(chk[-1]) == upca_check_digit(t): - t=t+chk[-1] + t = t + str(upca_check_digit(t)) + elif len(chk) == codeLength["UPCE"] and int(chk[-1]) == upca_check_digit(t): + t = t + chk[-1] else: - t=None + t = None return t + def upce_valid(chk): return len(chk) == codeLength["UPCE"] and upca_valid(upce2a(chk)) + def upce_check_digit(chk): if chk is not None: return upca_check_digit(upce2a(chk)) - return None \ No newline at end of file + return None diff --git a/openfda/res/extract.py b/openfda/res/extract.py index 9fdf68d..708cf24 100755 --- a/openfda/res/extract.py +++ b/openfda/res/extract.py @@ -9,9 +9,11 @@ # http://en.wikipedia.org/wiki/National_Drug_Code # # Supports NDC10 and NDC11 -NDC_RE = (r'\d{10}|\d{11}|\d{4}-\d{4}-\d{2}|\d{5}-\d{3}-\d{2}|' - r'\d{5}-\d{4}-\d{2}|' - r'\d{5}-\d{4}-\d{1}') +NDC_RE = ( + r"\d{10}|\d{11}|\d{4}-\d{4}-\d{2}|\d{5}-\d{3}-\d{2}|" + r"\d{5}-\d{4}-\d{2}|" + r"\d{5}-\d{4}-\d{1}" +) # http://en.wikipedia.org/wiki/Universal_Product_Code # @@ -23,59 +25,67 @@ # # Note: We can be very flexible in parsing UPCs as it's easy to verify # if they are valid. -UPC_RE = r'(?:\d[\- ]?){12}' +UPC_RE = r"(?:\d[\- ]?){12}" + # TODO(mattmo): Load in a whitelist of valid NDCs from the NDC dataset or # from SPL. def is_valid_ndc(ndc): - return True + return True + # TODO(mattmo): Use whitelist to map NDCs without dashes to NDCs with dashes. 
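# A minimal usage sketch of NDC_RE and the UPC helpers above, assuming this
# module is importable as openfda.res.extract alongside openfda.res.ean; the
# sample text is hypothetical, the codes come from the test data in this repo.
import re
from openfda.res import ean, extract

text = "Recalled lot, NDC 0024-0590-10, UPC 361958010115"
print(re.findall(extract.NDC_RE, text))          # ['0024-0590-10']
print(ean.upca_check_digit("361958010115"))      # 5 -- prepend '0', apply the EAN-13 rule
print(ean.upca_valid("361958010115"))            # True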
def clean_ndc(ndc): - return ndc + return ndc + def extract_ndc(text): - raw_ndcs = re.findall(NDC_RE, text) - ndcs = {} - for raw_ndc in raw_ndcs: - ndc = clean_ndc(raw_ndc) - if is_valid_ndc(ndc): - ndcs[ndc] = True - return list(ndcs.keys()) + raw_ndcs = re.findall(NDC_RE, text) + ndcs = {} + for raw_ndc in raw_ndcs: + ndc = clean_ndc(raw_ndc) + if is_valid_ndc(ndc): + ndcs[ndc] = True + return list(ndcs.keys()) + def extract_ndc_from_recall(recall): - desc = recall['product-description'] - if 'code-info' in recall: - desc += ' ' + recall['code-info'] - if recall.get('more-code-info', ''): - desc += ' ' + recall['more-code-info'] - return extract_ndc(desc) + desc = recall["product-description"] + if "code-info" in recall: + desc += " " + recall["code-info"] + if recall.get("more-code-info", ""): + desc += " " + recall["more-code-info"] + return extract_ndc(desc) + def is_valid_upc(upc): - return ean.upca_valid(upc) + return ean.upca_valid(upc) + def clean_upc(upc): - # Remove spaces - upc = upc.replace(' ', '') + # Remove spaces + upc = upc.replace(" ", "") - # Remove dashes - upc = upc.replace('-', '') + # Remove dashes + upc = upc.replace("-", "") + + return upc - return upc def extract_upc(text): - raw_upcs = re.findall(UPC_RE, text) - upcs = {} - for raw_upc in raw_upcs: - upc = clean_upc(raw_upc) - if is_valid_upc(upc): - upcs[upc] = True - return list(upcs.keys()) + raw_upcs = re.findall(UPC_RE, text) + upcs = {} + for raw_upc in raw_upcs: + upc = clean_upc(raw_upc) + if is_valid_upc(upc): + upcs[upc] = True + return list(upcs.keys()) + def extract_upc_from_recall(recall): - desc = recall['product-description'] - if 'code-info' in recall: - desc += ' ' + recall['code-info'] - if recall.get('more-code-info', ''): - desc += ' ' + recall['more-code-info'] - return extract_upc(desc) + desc = recall["product-description"] + if "code-info" in recall: + desc += " " + recall["code-info"] + if recall.get("more-code-info", ""): + desc += " " + recall["more-code-info"] + return extract_upc(desc) diff --git a/openfda/res/pipeline.py b/openfda/res/pipeline.py index cfc00c4..da1145d 100755 --- a/openfda/res/pipeline.py +++ b/openfda/res/pipeline.py @@ -1,13 +1,13 @@ #!/usr/local/bin/python -''' +""" Pipeline for converting CSV RES data in JSON and importing into Elasticsearch. The data is available since 20 June 2012. The data actually goes back much longer than that, but it is in an unparseable form or at least would require a handful of parsers. This date represents the centers switch to a structured data format. -''' +""" import csv import glob import hashlib @@ -33,253 +33,275 @@ RUN_DIR = dirname(dirname(os.path.abspath(__file__))) DATE_KEYS = [ - 'recall-initiation-date', - 'center-classification-date', - 'termination-date', - 'report-date' + "recall-initiation-date", + "center-classification-date", + "termination-date", + "report-date", ] # Fields to be used to derive a Document ID for Elasticsearch -ID_FIELDS = [ - 'product-type', - 'recall-number' -] +ID_FIELDS = ["product-type", "recall-number"] # Enforcement data changed from XML to CSV the week of 2016-03-10 # Keeping names in line with the former XML style so that all of the pipeline # logic can remain in place. All `-` elements get converted to `_` in the end. 
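# Minimal sketch of the dash-to-underscore rename referred to above; it mirrors
# the key cleanup done in AnnotateEvent (openfda/adae/annotate.py). The sample
# record is hypothetical.
record = {"recall-number": "D-0001-2016", "report-date": "20160310"}
renamed = {k.replace("-", "_"): v for k, v in record.items()}
# renamed == {"recall_number": "D-0001-2016", "report_date": "20160310"}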
RENAME_MAP = { - 'Product Type': 'product-type', - 'Event ID': 'event-id', - 'Status': 'status', - 'Recalling Firm': 'recalling-firm', - 'Address1': 'address-1', - 'Address2': 'address-2', - 'City': 'city', - 'State/Province': 'state', - 'Postal Code': 'postal-code', - 'Country': 'country', - 'Voluntary/Mandated': 'voluntary-mandated', - 'Initial Firm Notification of Consignee or Public': 'initial-firm-notification', - 'Distribution Pattern': 'distribution-pattern', - 'Recall Number': 'recall-number', - 'Classification': 'classification', - 'Product Description': 'product-description', - 'Product Quantity': 'product-quantity', - 'Reason for Recall': 'reason-for-recall', - 'Recall Initiation Date': 'recall-initiation-date', - 'Center Classification Date': 'center-classification-date', - 'Termination Date': 'termination-date', - 'Report Date': 'report-date', - 'Code Info': 'code-info', - 'More Code Info': 'more-code-info' + "Product Type": "product-type", + "Event ID": "event-id", + "Status": "status", + "Recalling Firm": "recalling-firm", + "Address1": "address-1", + "Address2": "address-2", + "City": "city", + "State/Province": "state", + "Postal Code": "postal-code", + "Country": "country", + "Voluntary/Mandated": "voluntary-mandated", + "Initial Firm Notification of Consignee or Public": "initial-firm-notification", + "Distribution Pattern": "distribution-pattern", + "Recall Number": "recall-number", + "Classification": "classification", + "Product Description": "product-description", + "Product Quantity": "product-quantity", + "Reason for Recall": "reason-for-recall", + "Recall Initiation Date": "recall-initiation-date", + "Center Classification Date": "center-classification-date", + "Termination Date": "termination-date", + "Report Date": "report-date", + "Code Info": "code-info", + "More Code Info": "more-code-info", } -DOWNLOAD_URL = ('https://www.accessdata.fda.gov/scripts/ires/index.cfm?action=' - 'export.getWeeklyExportCSVResult&' - 'monthVal=--M--&dayVal=--D--&yearVal=--Y--') +DOWNLOAD_URL = ( + "https://www.accessdata.fda.gov/scripts/ires/index.cfm?action=" + "export.getWeeklyExportCSVResult&" + "monthVal=--M--&dayVal=--D--&yearVal=--Y--" +) + class DownloadCSVReports(luigi.Task): - batch = luigi.Parameter() + batch = luigi.Parameter() - def requires(self): - return [] + def requires(self): + return [] - def output(self): - file_name = 'res/batches/%s/enforcement.csv' % arrow.get(self.batch).strftime('%Y%m%d') - return luigi.LocalTarget(config.data_dir(file_name)) + def output(self): + file_name = "res/batches/%s/enforcement.csv" % arrow.get(self.batch).strftime( + "%Y%m%d" + ) + return luigi.LocalTarget(config.data_dir(file_name)) - def run(self): - output_dir = dirname(self.output().path) - common.shell_cmd('mkdir -p %s', output_dir) - dt = arrow.get(self.batch) - url = DOWNLOAD_URL.replace('--Y--', str(dt.year)) - url = url.replace('--M--', str(dt.month)) - url = url.replace('--D--', str(dt.day)) - download_to_file_with_retry(url, self.output().path) + def run(self): + output_dir = dirname(self.output().path) + common.shell_cmd("mkdir -p %s", output_dir) + dt = arrow.get(self.batch) + url = DOWNLOAD_URL.replace("--Y--", str(dt.year)) + url = url.replace("--M--", str(dt.month)) + url = url.replace("--D--", str(dt.day)) + download_to_file_with_retry(url, self.output().path) class CleanCSV(luigi.Task): - batch = luigi.Parameter() - - def requires(self): - return DownloadCSVReports(self.batch) - - def output(self): - file_name = basename(self.input().path) - dir_name = 
dirname(self.input().path) - file_name = 'clean-' + file_name - return luigi.LocalTarget(join(dir_name, file_name)) - - def run(self): - cmd = 'iconv -f %s -t %s -c %s > %s' % \ - ('ISO-8859-1//TRANSLIT', 'UTF-8', self.input().path, self.output().path) - common.shell_cmd_quiet(cmd) - - # CSV exported by FDA iRes is often malformed because it can contain multiple columns - # with the same name: More Code Info. Most likely iRes does this when the code information is - # deemed too large to fit into a single column; but in any case the columns should have been named - # distinctly, e.g. "More Code Info 01", "More Code Info 02" etc. - # We handle this case here with Pandas and give the columns distinct names. - df = pd.read_csv(self.output().path, index_col=False, encoding='utf-8', - dtype=str) - code_info_columns = list(filter(lambda col: col.startswith('More Code Info'), list(df.columns))) - if len(code_info_columns) > 1: - df['Code Info All'] = df[code_info_columns].apply( - lambda row: ' '.join(list(filter(lambda v: not pd.isna(v), list(row.values)))).strip(), axis=1) - df.drop(code_info_columns, axis=1, inplace=True) - df.rename(columns={"Code Info All": "More Code Info"}, inplace=True) - df.to_csv(self.output().path, encoding='utf-8', index=False, quoting=csv.QUOTE_ALL) + batch = luigi.Parameter() + + def requires(self): + return DownloadCSVReports(self.batch) + + def output(self): + file_name = basename(self.input().path) + dir_name = dirname(self.input().path) + file_name = "clean-" + file_name + return luigi.LocalTarget(join(dir_name, file_name)) + + def run(self): + cmd = "iconv -f %s -t %s -c %s > %s" % ( + "ISO-8859-1//TRANSLIT", + "UTF-8", + self.input().path, + self.output().path, + ) + common.shell_cmd_quiet(cmd) + + # CSV exported by FDA iRes is often malformed because it can contain multiple columns + # with the same name: More Code Info. Most likely iRes does this when the code information is + # deemed too large to fit into a single column; but in any case the columns should have been named + # distinctly, e.g. "More Code Info 01", "More Code Info 02" etc. + # We handle this case here with Pandas and give the columns distinct names. + df = pd.read_csv( + self.output().path, index_col=False, encoding="utf-8", dtype=str + ) + code_info_columns = list( + filter(lambda col: col.startswith("More Code Info"), list(df.columns)) + ) + if len(code_info_columns) > 1: + df["Code Info All"] = df[code_info_columns].apply( + lambda row: " ".join( + list(filter(lambda v: not pd.isna(v), list(row.values))) + ).strip(), + axis=1, + ) + df.drop(code_info_columns, axis=1, inplace=True) + df.rename(columns={"Code Info All": "More Code Info"}, inplace=True) + df.to_csv( + self.output().path, encoding="utf-8", index=False, quoting=csv.QUOTE_ALL + ) class CSV2JSONMapper(parallel.Mapper): - ''' Mapper for the CSV2JSON map-reduction. There is some special logic in here - to generate a hash() from the top level key/value pairs for the id in - Elasticsearch. - Also, upc and ndc are extracted and added to reports that are of drug - type. 
- ''' - def _generate_doc_id(self, res_report): - ''' Hash function used to create unique IDs for the reports - ''' - key_fields = {} - for field in ID_FIELDS: - key_fields[field] = res_report.get(field) - json_str = json.dumps(key_fields, sort_keys=True) - hasher = hashlib.sha256() - hasher.update(json_str.encode('utf-8')) - return hasher.hexdigest() - - def map(self, key, value, output): - def cleaner(k, v): - if k not in RENAME_MAP: - return None - - k = RENAME_MAP[k] - - if k in DATE_KEYS: - if not v: - return None - v = arrow.get(v, 'MM/DD/YYYY').format('YYYYMMDD') - - if isinstance(v, str): - v = v.strip() - - if v is None: - v = '' - - return (k, v) - - val = common.transform_dict(value, cleaner) - - # These keys must exist in the JSON records for the annotation logic to work - logic_keys = [ - 'code-info', - 'report-date', - 'product-description' - ] - - if val.get('product-type') == 'Drugs': - val['upc'] = extract.extract_upc_from_recall(val) - val['ndc'] = extract.extract_ndc_from_recall(val) - - # There is not a decent ID for the report, so we need to make one - doc_id = self._generate_doc_id(val) - val['@id'] = doc_id - val['@version'] = 1 - - # Only write out vals that have required keys and a meaningful date - if set(logic_keys).issubset(val) and val['report-date'] is not None: - output.add(doc_id, val) - else: - logging.warning('Document is missing required fields. %s', - json.dumps(val, indent=2, sort_keys=True)) + """Mapper for the CSV2JSON map-reduction. There is some special logic in here + to generate a hash() from the top level key/value pairs for the id in + Elasticsearch. + Also, upc and ndc are extracted and added to reports that are of drug + type. + """ + + def _generate_doc_id(self, res_report): + """Hash function used to create unique IDs for the reports""" + key_fields = {} + for field in ID_FIELDS: + key_fields[field] = res_report.get(field) + json_str = json.dumps(key_fields, sort_keys=True) + hasher = hashlib.sha256() + hasher.update(json_str.encode("utf-8")) + return hasher.hexdigest() + + def map(self, key, value, output): + def cleaner(k, v): + if k not in RENAME_MAP: + return None + + k = RENAME_MAP[k] + + if k in DATE_KEYS: + if not v: + return None + v = arrow.get(v, "MM/DD/YYYY").format("YYYYMMDD") + + if isinstance(v, str): + v = v.strip() + + if v is None: + v = "" + + return (k, v) + + val = common.transform_dict(value, cleaner) + + # These keys must exist in the JSON records for the annotation logic to work + logic_keys = ["code-info", "report-date", "product-description"] + + if val.get("product-type") == "Drugs": + val["upc"] = extract.extract_upc_from_recall(val) + val["ndc"] = extract.extract_ndc_from_recall(val) + + # There is not a decent ID for the report, so we need to make one + doc_id = self._generate_doc_id(val) + val["@id"] = doc_id + val["@version"] = 1 + + # Only write out vals that have required keys and a meaningful date + if set(logic_keys).issubset(val) and val["report-date"] is not None: + output.add(doc_id, val) + else: + logging.warning( + "Document is missing required fields. 
%s", + json.dumps(val, indent=2, sort_keys=True), + ) class CSV2JSON(luigi.Task): - batch = luigi.Parameter() - - def requires(self): - return CleanCSV(self.batch) - - def output(self): - dir_name = dirname(self.input().path) - return luigi.LocalTarget(join(dir_name, 'json.db')) - - def run(self): - input_dir = dirname(self.input().path) - for csv_filename in glob.glob('%(input_dir)s/clean-*.csv' % locals()): - parallel.mapreduce( - parallel.Collection.from_glob( - csv_filename, parallel.CSVDictLineInput()), - mapper=CSV2JSONMapper(), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, - num_shards=1, - map_workers=8) + batch = luigi.Parameter() + + def requires(self): + return CleanCSV(self.batch) + + def output(self): + dir_name = dirname(self.input().path) + return luigi.LocalTarget(join(dir_name, "json.db")) + + def run(self): + input_dir = dirname(self.input().path) + for csv_filename in glob.glob("%(input_dir)s/clean-*.csv" % locals()): + parallel.mapreduce( + parallel.Collection.from_glob( + csv_filename, parallel.CSVDictLineInput() + ), + mapper=CSV2JSONMapper(), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + num_shards=1, + map_workers=8, + ) class AnnotateJSON(luigi.Task): - batch = luigi.Parameter() + batch = luigi.Parameter() - def requires(self): - return [CSV2JSON(self.batch), CombineHarmonization()] + def requires(self): + return [CSV2JSON(self.batch), CombineHarmonization()] - def output(self): - output_dir = self.input()[0].path.replace('json.db', 'annotated.db') - return luigi.LocalTarget(output_dir) + def output(self): + output_dir = self.input()[0].path.replace("json.db", "annotated.db") + return luigi.LocalTarget(output_dir) - def run(self): - input_db = self.input()[0].path - harmonized_file = self.input()[1].path + def run(self): + input_db = self.input()[0].path + harmonized_file = self.input()[1].path - parallel.mapreduce( - parallel.Collection.from_sharded(input_db), - mapper=annotate.AnnotateMapper(harmonized_file), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, - num_shards=1, - map_workers=1) + parallel.mapreduce( + parallel.Collection.from_sharded(input_db), + mapper=annotate.AnnotateMapper(harmonized_file), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + num_shards=1, + map_workers=1, + ) class LoadJSON(index_util.LoadJSONBase): - batch = luigi.Parameter() - last_update_date = luigi.Parameter() - index_name = 'recall' - mapping_file = './schemas/res_mapping.json' - use_checksum = True - optimize_index = False - docid_key = '@id' + batch = luigi.Parameter() + last_update_date = luigi.Parameter() + index_name = "recall" + mapping_file = "./schemas/res_mapping.json" + use_checksum = True + optimize_index = False + docid_key = "@id" - def _data(self): - return AnnotateJSON(self.batch) + def _data(self): + return AnnotateJSON(self.batch) class RunWeeklyProcess(AlwaysRunTask): - ''' Generates a date object that is passed through the pipeline tasks in order - to generate and load JSON documents for the weekly enforcement reports. 
- ''' - def requires(self): - run_date = arrow.utcnow() - dow = run_date.isoweekday() - - # Always run for the most recent wednesday (iso 3) that has passed - # Reminder: iso day of weeks are Monday (1) through Sunday (7) - end_delta = 3 - dow if dow >= 3 else (-1 * (4 + dow)) - end_date = run_date.shift(days=end_delta) - - # arrow.Arrow.range() likes the dates in particular datetime format - start = arrow.get(2012, 0o6, 20) - end = arrow.get(end_date.year, end_date.month, end_date.day) - previous_task = NoopTask() - - for batch in arrow.Arrow.range('week', start, end): - task = LoadJSON(batch=batch.format(), last_update_date=end.format('YYYY-MM-DD'), previous_task=previous_task) - previous_task = task - yield task - - def _run(self): - index_util.optimize_index('recall', wait_for_merge=0) - -if __name__ == '__main__': - luigi.run() + """Generates a date object that is passed through the pipeline tasks in order + to generate and load JSON documents for the weekly enforcement reports. + """ + + def requires(self): + run_date = arrow.utcnow() + dow = run_date.isoweekday() + + # Always run for the most recent wednesday (iso 3) that has passed + # Reminder: iso day of weeks are Monday (1) through Sunday (7) + end_delta = 3 - dow if dow >= 3 else (-1 * (4 + dow)) + end_date = run_date.shift(days=end_delta) + + # arrow.Arrow.range() likes the dates in particular datetime format + start = arrow.get(2012, 0o6, 20) + end = arrow.get(end_date.year, end_date.month, end_date.day) + previous_task = NoopTask() + + for batch in arrow.Arrow.range("week", start, end): + task = LoadJSON( + batch=batch.format(), + last_update_date=end.format("YYYY-MM-DD"), + previous_task=previous_task, + ) + previous_task = task + yield task + + def _run(self): + index_util.optimize_index("recall", wait_for_merge=0) + + +if __name__ == "__main__": + luigi.run() diff --git a/openfda/res/tests/extract_test.py b/openfda/res/tests/extract_test.py index 9c9b22b..dbcf0be 100644 --- a/openfda/res/tests/extract_test.py +++ b/openfda/res/tests/extract_test.py @@ -9,92 +9,88 @@ class ExtractUnitTest(unittest.TestCase): - 'RES Extract Unit Test' + "RES Extract Unit Test" - def setUp(self): - pass + def setUp(self): + pass - def test_extract_ndc_4_4_2(self): - expected_ndc = '0024-0590-10' - xml = open_data_file('ndc-4-4-2.xml') - xml_dict = xmltodict.parse(xml) - extracted_ndcs = extract.extract_ndc_from_recall(xml_dict['recall-number']) - self.assertEqual(expected_ndc, extracted_ndcs[0], - 'ndc-4-4-2.xml') + def test_extract_ndc_4_4_2(self): + expected_ndc = "0024-0590-10" + xml = open_data_file("ndc-4-4-2.xml") + xml_dict = xmltodict.parse(xml) + extracted_ndcs = extract.extract_ndc_from_recall(xml_dict["recall-number"]) + self.assertEqual(expected_ndc, extracted_ndcs[0], "ndc-4-4-2.xml") - def test_extract_ndc_5_3_2(self): - expected_ndc = '10631-106-08' - xml = open_data_file('ndc-5-3-2.xml') - xml_dict = xmltodict.parse(xml) - extracted_ndcs = extract.extract_ndc_from_recall(xml_dict['recall-number']) - self.assertEqual(expected_ndc, extracted_ndcs[0], - 'ndc-5-3-2.xml') + def test_extract_ndc_5_3_2(self): + expected_ndc = "10631-106-08" + xml = open_data_file("ndc-5-3-2.xml") + xml_dict = xmltodict.parse(xml) + extracted_ndcs = extract.extract_ndc_from_recall(xml_dict["recall-number"]) + self.assertEqual(expected_ndc, extracted_ndcs[0], "ndc-5-3-2.xml") - def test_extract_ndc_5_4_2(self): - expected_ndc = '00591-0369-01' - xml = open_data_file('ndc-5-4-2.xml') - xml_dict = xmltodict.parse(xml) - extracted_ndcs = 
extract.extract_ndc_from_recall(xml_dict['recall-number']) - self.assertEqual(expected_ndc, extracted_ndcs[0], - 'ndc-5-4-2.xml') + def test_extract_ndc_5_4_2(self): + expected_ndc = "00591-0369-01" + xml = open_data_file("ndc-5-4-2.xml") + xml_dict = xmltodict.parse(xml) + extracted_ndcs = extract.extract_ndc_from_recall(xml_dict["recall-number"]) + self.assertEqual(expected_ndc, extracted_ndcs[0], "ndc-5-4-2.xml") - def test_extract_upc_with_spaces(self): - expected_upcs = [ '361958010115' ] - xml = open_data_file('upc-with-spaces.xml') - xml_dict = xmltodict.parse(xml) - extracted_upcs = extract.extract_upc_from_recall(xml_dict['recall-number']) - self.assertListEqual(expected_upcs, extracted_upcs, - 'upc-with-spaces.xml') + def test_extract_upc_with_spaces(self): + expected_upcs = ["361958010115"] + xml = open_data_file("upc-with-spaces.xml") + xml_dict = xmltodict.parse(xml) + extracted_upcs = extract.extract_upc_from_recall(xml_dict["recall-number"]) + self.assertListEqual(expected_upcs, extracted_upcs, "upc-with-spaces.xml") - def test_extract_upc_with_dashes(self): - # ensure the best by date "041913 12265 1" is not included - expected_upcs = [ '698997806158' ] - xml = open_data_file('upc-with-dashes.xml') - xml_dict = xmltodict.parse(xml) - extracted_upcs = extract.extract_upc_from_recall(xml_dict['recall-number']) - self.assertListEqual(expected_upcs, extracted_upcs, - 'upc-with-dashes.xml') + def test_extract_upc_with_dashes(self): + # ensure the best by date "041913 12265 1" is not included + expected_upcs = ["698997806158"] + xml = open_data_file("upc-with-dashes.xml") + xml_dict = xmltodict.parse(xml) + extracted_upcs = extract.extract_upc_from_recall(xml_dict["recall-number"]) + self.assertListEqual(expected_upcs, extracted_upcs, "upc-with-dashes.xml") - def test_extract_upc_with_spaces_and_dashes(self): - expected_upcs = [ '306037039397' ] - xml = open_data_file('upc-with-spaces-and-dashes.xml') - xml_dict = xmltodict.parse(xml) - extracted_upcs = extract.extract_upc_from_recall(xml_dict['recall-number']) - self.assertListEqual(expected_upcs, extracted_upcs, - 'upc-with-spaces-and-dashes.xml') + def test_extract_upc_with_spaces_and_dashes(self): + expected_upcs = ["306037039397"] + xml = open_data_file("upc-with-spaces-and-dashes.xml") + xml_dict = xmltodict.parse(xml) + extracted_upcs = extract.extract_upc_from_recall(xml_dict["recall-number"]) + self.assertListEqual( + expected_upcs, extracted_upcs, "upc-with-spaces-and-dashes.xml" + ) - def test_extract_upc_more_than_one_labeler(self): - # Note 300436122 is not included (invalid length) even though the record - # indicates it's a UPC. - expected_upcs = [ - '300676122304', - '032251004292', - '400910131502', - '300670675103', - '300670668150', - '300670675301', - '300670675400', - '300670675004', - '300670675127', - '300676255736', - '300670674106', - '300670674304', - '681131739276', - '681131739283', - '050428068595', - '050428046784', - '050428065747', - '050428579350', - '050428046791', - '050428065822', - '050428160282' - ] + def test_extract_upc_more_than_one_labeler(self): + # Note 300436122 is not included (invalid length) even though the record + # indicates it's a UPC. 
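# Why 300436122 is dropped, in brief: upca_valid() prepends a leading zero and
# then requires a full 12-digit UPC-A, so a 9-digit string fails the length
# check before any check-digit math runs. A minimal sketch using codes from the
# expectations in this test:
from openfda.res import ean
assert ean.upca_valid("300436122") is False      # 9 digits: wrong length
assert ean.upca_valid("300676122304") is True    # valid 12-digit UPC-A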
+ expected_upcs = [ + "300676122304", + "032251004292", + "400910131502", + "300670675103", + "300670668150", + "300670675301", + "300670675400", + "300670675004", + "300670675127", + "300676255736", + "300670674106", + "300670674304", + "681131739276", + "681131739283", + "050428068595", + "050428046784", + "050428065747", + "050428579350", + "050428046791", + "050428065822", + "050428160282", + ] - xml = open_data_file('upc-more-than-one-labeler.xml') - xml_dict = xmltodict.parse(xml) - extracted_upcs = extract.extract_upc_from_recall(xml_dict['recall-number']) - self.assertListEqual(sorted(expected_upcs), sorted(extracted_upcs) ) + xml = open_data_file("upc-more-than-one-labeler.xml") + xml_dict = xmltodict.parse(xml) + extracted_upcs = extract.extract_upc_from_recall(xml_dict["recall-number"]) + self.assertListEqual(sorted(expected_upcs), sorted(extracted_upcs)) -if __name__ == '__main__': - unittest.main() +if __name__ == "__main__": + unittest.main() diff --git a/openfda/spl/annotate.py b/openfda/spl/annotate.py index 60337cf..3d92546 100644 --- a/openfda/spl/annotate.py +++ b/openfda/spl/annotate.py @@ -5,41 +5,45 @@ from openfda import common from openfda.spl.fix_date import validate_date + def read_json_file(json_file): - ''' - Reads an ElasticSearch style multi-json file: - Each line contains a single JSON record. - Yields one object for each line - ''' - for line in json_file: - yield json.loads(line) + """ + Reads an ElasticSearch style multi-json file: + Each line contains a single JSON record. + Yields one object for each line + """ + for line in json_file: + yield json.loads(line) + # TODO(hansnelsen): add to common library, is used in all annotation processing def read_harmonized_file(harmonized_file): - ''' - Create a dictionary that is keyed by spl_set_id. - ''' - spl_set_id_to_harmonized = {} - for row in read_json_file(harmonized_file): - set_id = row['spl_set_id'] - if set_id in spl_set_id_to_harmonized: - spl_set_id_to_harmonized[set_id].append(row) - else: - spl_set_id_to_harmonized[set_id] = [row] - return spl_set_id_to_harmonized + """ + Create a dictionary that is keyed by spl_set_id. + """ + spl_set_id_to_harmonized = {} + for row in read_json_file(harmonized_file): + set_id = row["spl_set_id"] + if set_id in spl_set_id_to_harmonized: + spl_set_id_to_harmonized[set_id].append(row) + else: + spl_set_id_to_harmonized[set_id] = [row] + return spl_set_id_to_harmonized + # TODO(hansnelsen): Add to a common library, since it is used by faers and res # annotate.py. _add_field() is identical in all files. 
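# Minimal sketch of what _add_field (defined just below) builds up: per-field
# dicts whose keys behave as a de-duplicated set, later flattened into lists by
# the annotators. Values here are hypothetical.
openfda = {}
_add_field(openfda, "route", "ORAL")
_add_field(openfda, "route", "ORAL")   # duplicate collapses onto the same key
_add_field(openfda, "route", "")       # falsy values are skipped
# openfda == {"route": {"ORAL": True}}
openfda_lists = {field: list(values.keys()) for field, values in openfda.items()}
# openfda_lists == {"route": ["ORAL"]}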
def _add_field(openfda, field, value): - if type(value) != type([]): - value = [value] - for v in value: - if not v: - continue - if field not in openfda: - openfda[field] = {} - openfda[field][v] = True - return + if type(value) != type([]): + value = [value] + for v in value: + if not v: + continue + if field not in openfda: + openfda[field] = {} + openfda[field][v] = True + return + # TODO(hansnelsen): Looks very similiar to the code in faers/annotate.py, we # should consider refactoring this into a general piece of code for generating @@ -49,93 +53,93 @@ def _add_field(openfda, field, value): # add_harmonized_row_to_openfda so that it follows naming # style of other function def AddHarmonizedRowToOpenfda(openfda, row): - _add_field(openfda, 'application_number', row['application_number']) - - # Using the most precise possible name for brand_name - if row['brand_name_suffix']: - brand_name = row['brand_name'] + ' ' + row['brand_name_suffix'] - else: - brand_name = row['brand_name'] - - _add_field(openfda, 'brand_name', brand_name) - _add_field(openfda, 'generic_name', row['generic_name'].upper()) - _add_field(openfda, 'manufacturer_name', row['manufacturer_name']) - _add_field(openfda, 'product_ndc', row['product_ndc']) - _add_field(openfda, 'product_ndc', row['spl_product_ndc']) - _add_field(openfda, 'product_type', row['product_type']) - - for route in row['route'].split(';'): - route = route.strip() - _add_field(openfda, 'route', route) - - for substance in row['substance_name'].split(';'): - substance = substance.strip() - _add_field(openfda, 'substance_name', substance) - - for rxnorm in row['rxnorm']: - _add_field(openfda, 'rxcui', rxnorm['rxcui']) - - _add_field(openfda, 'spl_id', row['id']) - _add_field(openfda, 'spl_set_id', row['spl_set_id']) - _add_field(openfda, 'package_ndc', row['package_ndc']) - _add_field(openfda, - 'original_packager_product_ndc', - row['original_packager_product_ndc']) - _add_field(openfda, - 'is_original_packager', - row['is_original_packager']) - - _add_field(openfda, 'upc', row['upc']) - - if row['unii_indexing'] != []: - unii_list = [] - for unii_row in row['unii_indexing']: - for key, value in unii_row.items(): - if key == 'unii': - unii_list.append(value) - if key == 'va': - for this_item in value: - for va_key, va_value in this_item.items(): - if va_key == 'name': - if va_value.find('[MoA]') != -1: - _add_field(openfda, 'pharm_class_moa', va_value) - if va_value.find('[Chemical/Ingredient]') != -1 or va_value.find('[CS]') != -1: - _add_field(openfda, 'pharm_class_cs', va_value) - if va_value.find('[PE]') != -1: - _add_field(openfda, 'pharm_class_pe', va_value) - if va_value.find('[EPC]') != -1: - _add_field(openfda, 'pharm_class_epc', va_value) - if va_key == 'number': - _add_field(openfda, 'nui', va_value) - _add_field(openfda, 'unii', unii_list) + _add_field(openfda, "application_number", row["application_number"]) + + # Using the most precise possible name for brand_name + if row["brand_name_suffix"]: + brand_name = row["brand_name"] + " " + row["brand_name_suffix"] + else: + brand_name = row["brand_name"] + + _add_field(openfda, "brand_name", brand_name) + _add_field(openfda, "generic_name", row["generic_name"].upper()) + _add_field(openfda, "manufacturer_name", row["manufacturer_name"]) + _add_field(openfda, "product_ndc", row["product_ndc"]) + _add_field(openfda, "product_ndc", row["spl_product_ndc"]) + _add_field(openfda, "product_type", row["product_type"]) + + for route in row["route"].split(";"): + route = route.strip() + 
_add_field(openfda, "route", route) + + for substance in row["substance_name"].split(";"): + substance = substance.strip() + _add_field(openfda, "substance_name", substance) + + for rxnorm in row["rxnorm"]: + _add_field(openfda, "rxcui", rxnorm["rxcui"]) + + _add_field(openfda, "spl_id", row["id"]) + _add_field(openfda, "spl_set_id", row["spl_set_id"]) + _add_field(openfda, "package_ndc", row["package_ndc"]) + _add_field( + openfda, "original_packager_product_ndc", row["original_packager_product_ndc"] + ) + _add_field(openfda, "is_original_packager", row["is_original_packager"]) + + _add_field(openfda, "upc", row["upc"]) + + if row["unii_indexing"] != []: + unii_list = [] + for unii_row in row["unii_indexing"]: + for key, value in unii_row.items(): + if key == "unii": + unii_list.append(value) + if key == "va": + for this_item in value: + for va_key, va_value in this_item.items(): + if va_key == "name": + if va_value.find("[MoA]") != -1: + _add_field(openfda, "pharm_class_moa", va_value) + if ( + va_value.find("[Chemical/Ingredient]") != -1 + or va_value.find("[CS]") != -1 + ): + _add_field(openfda, "pharm_class_cs", va_value) + if va_value.find("[PE]") != -1: + _add_field(openfda, "pharm_class_pe", va_value) + if va_value.find("[EPC]") != -1: + _add_field(openfda, "pharm_class_epc", va_value) + if va_key == "number": + _add_field(openfda, "nui", va_value) + _add_field(openfda, "unii", unii_list) # TODO(hansnelsen): change name of AnnotateLabel to annotate_label def AnnotateLabel(label, harmonized_dict): - openfda = {} - spl_set_id = label.get('set_id') - date = label.get('effective_time') - label['@timestamp'] = validate_date(common.extract_date(date)) + openfda = {} + spl_set_id = label.get("set_id") + date = label.get("effective_time") + label["@timestamp"] = validate_date(common.extract_date(date)) - if spl_set_id != None: - if spl_set_id in harmonized_dict: - AddHarmonizedRowToOpenfda(openfda, harmonized_dict[spl_set_id][0]) + if spl_set_id != None: + if spl_set_id in harmonized_dict: + AddHarmonizedRowToOpenfda(openfda, harmonized_dict[spl_set_id][0]) - openfda_lists = {} - for field, value in openfda.items(): - openfda_lists[field] = [s for s in value.keys()] - label['openfda'] = openfda_lists + openfda_lists = {} + for field, value in openfda.items(): + openfda_lists[field] = [s for s in value.keys()] + label["openfda"] = openfda_lists class AnnotateMapper(parallel.Mapper): - def __init__(self, harmonized_file): - self.harmonized_file = harmonized_file - - def map_shard(self, map_input, map_output): - self.harmonized_dict = read_harmonized_file(open(self.harmonized_file)) - parallel.Mapper.map_shard(self, map_input, map_output) + def __init__(self, harmonized_file): + self.harmonized_file = harmonized_file - def map(self, key, label, out): - AnnotateLabel(label, self.harmonized_dict) - out.add(key, label) + def map_shard(self, map_input, map_output): + self.harmonized_dict = read_harmonized_file(open(self.harmonized_file)) + parallel.Mapper.map_shard(self, map_input, map_output) + def map(self, key, label, out): + AnnotateLabel(label, self.harmonized_dict) + out.add(key, label) diff --git a/openfda/spl/extract.py b/openfda/spl/extract.py index 94c5a0a..4b9dbf4 100644 --- a/openfda/spl/extract.py +++ b/openfda/spl/extract.py @@ -11,82 +11,95 @@ import re # http://www.fda.gov/ForIndustry/DataStandards/StructuredProductLabeling/ucm162061.htm -DUNS_OID = '1.3.6.1.4.1.519.1' -NDC_OID = '2.16.840.1.113883.6.69' -UNII_OID = '2.16.840.1.113883.4.9' -UNII_OTHER_OID = 
'2.16.840.1.113883.3.26.1.5' +DUNS_OID = "1.3.6.1.4.1.519.1" +NDC_OID = "2.16.840.1.113883.6.69" +UNII_OID = "2.16.840.1.113883.4.9" +UNII_OTHER_OID = "2.16.840.1.113883.3.26.1.5" + def remove_namespace(xml_file): - """SPL files have namespaces that don't play nice with python's libxml. - For now we just remove namespaces to work around this.""" - # TODO(mattmo): Figure out a better solution to this. - # http://stackoverflow.com/questions/5572247/how-to-find-xml-elements- - # via-xpath-in-python-in-a-namespace-agnostic-way - lines = xml_file.read() - lines = re.sub(r']+>', '', lines) - lines = re.sub(r'\w+:type="[^"]+"', '', lines) - return BytesIO(lines.encode()) + """SPL files have namespaces that don't play nice with python's libxml. + For now we just remove namespaces to work around this.""" + # TODO(mattmo): Figure out a better solution to this. + # http://stackoverflow.com/questions/5572247/how-to-find-xml-elements- + # via-xpath-in-python-in-a-namespace-agnostic-way + lines = xml_file.read() + lines = re.sub(r"]+>", "", lines) + lines = re.sub(r'\w+:type="[^"]+"', "", lines) + return BytesIO(lines.encode()) + def parse_xml(xml_file): - p = etree.XMLParser(huge_tree=True) - tree = etree.parse(remove_namespace(open(xml_file)), parser=p) - return tree + p = etree.XMLParser(huge_tree=True) + tree = etree.parse(remove_namespace(open(xml_file)), parser=p) + return tree + def first_match_or_empty_string(matches): - if len(matches) > 0: - return matches[0] - else: - return '' + if len(matches) > 0: + return matches[0] + else: + return "" + def extract_title(tree): - return ' '.join(tree.getroot().xpath('/document/title/descendant::text()')) + return " ".join(tree.getroot().xpath("/document/title/descendant::text()")) + def extract_id(tree): - return first_match_or_empty_string( - tree.getroot().xpath('/document/id/@root')) + return first_match_or_empty_string(tree.getroot().xpath("/document/id/@root")) + def extract_set_id(tree): - return first_match_or_empty_string( - tree.getroot().xpath('/document/setId/@root')) + return first_match_or_empty_string(tree.getroot().xpath("/document/setId/@root")) + def extract_effective_time(tree): - raw_date = first_match_or_empty_string( - tree.getroot().xpath('/document/effectiveTime/@value')) - if not raw_date: - return '' - year, month, day = raw_date[:4], raw_date[4:6], raw_date[6:] - return year + "-" + month + "-" + day + raw_date = first_match_or_empty_string( + tree.getroot().xpath("/document/effectiveTime/@value") + ) + if not raw_date: + return "" + year, month, day = raw_date[:4], raw_date[4:6], raw_date[6:] + return year + "-" + month + "-" + day + def extract_version_number(tree): - return first_match_or_empty_string( - tree.getroot().xpath('/document/versionNumber/@value')) + return first_match_or_empty_string( + tree.getroot().xpath("/document/versionNumber/@value") + ) + def extract_display_name(tree): - return first_match_or_empty_string( - tree.getroot().xpath('/document/code/@displayName')) + return first_match_or_empty_string( + tree.getroot().xpath("/document/code/@displayName") + ) + def extract_duns(tree): - duns_xpath = '//id[@root="%s"]/@extension' % DUNS_OID - return first_match_or_empty_string(tree.getroot().xpath(duns_xpath)) + duns_xpath = '//id[@root="%s"]/@extension' % DUNS_OID + return first_match_or_empty_string(tree.getroot().xpath(duns_xpath)) + def is_original_packager(tree): - "Returns true if this SPL's manufacturer is the original packager." 
- return len(extract_original_packager_product_ndcs(tree)) == 0 + "Returns true if this SPL's manufacturer is the original packager." + return len(extract_original_packager_product_ndcs(tree)) == 0 + def extract_product_ndcs(tree): - "Extracts the partial (labelercode-productcode) NDCs." - ndc_xpath = '//manufacturedProduct/code[@codeSystem="%s"]/@code' % NDC_OID - return tree.getroot().xpath(ndc_xpath) + "Extracts the partial (labelercode-productcode) NDCs." + ndc_xpath = '//manufacturedProduct/code[@codeSystem="%s"]/@code' % NDC_OID + return tree.getroot().xpath(ndc_xpath) + def extract_original_packager_product_ndcs(tree): - """Extracts the partial (labelercode-productcode) NDCs for the original - packager. Only populated if this SPL is for a repackager.""" - ndc_xpath = \ - '//definingMaterialKind/code[@codeSystem="%s"]/@code' % NDC_OID - return tree.getroot().xpath(ndc_xpath) + """Extracts the partial (labelercode-productcode) NDCs for the original + packager. Only populated if this SPL is for a repackager.""" + ndc_xpath = '//definingMaterialKind/code[@codeSystem="%s"]/@code' % NDC_OID + return tree.getroot().xpath(ndc_xpath) + def extract_package_ndcs(tree): - """Extracts the full labelercode-productcode-packagecode NDCs""" - ndc_xpath = \ - '//containerPackagedProduct/code[@codeSystem="%s"]/@code' % NDC_OID - return tree.getroot().xpath(ndc_xpath) + """Extracts the full labelercode-productcode-packagecode NDCs""" + ndc_xpath = '//containerPackagedProduct/code[@codeSystem="%s"]/@code' % NDC_OID + return tree.getroot().xpath(ndc_xpath) diff --git a/openfda/spl/fix_date.py b/openfda/spl/fix_date.py index 89c5369..fe02c84 100644 --- a/openfda/spl/fix_date.py +++ b/openfda/spl/fix_date.py @@ -1,42 +1,44 @@ #!/usr/bin/python -''' +""" Fixes invalid dates found in spl JSONs. -''' +""" import arrow -DATE_FORMAT = 'YYYY-MM-DD' +DATE_FORMAT = "YYYY-MM-DD" + # Restores the date string after parsing. def fixDate(y, m, d): - return (y + '-' + m + '-' + d) + return y + "-" + m + "-" + d + # Accepts a date string in format YYYY-MM-DD and fixes invalid dates. 
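# A small illustration of validate_date (defined just below); inputs are
# hypothetical. Components are zero-padded first, and per the branches below an
# out-of-range day is walked back until arrow accepts the date.
from openfda.spl.fix_date import validate_date

print(validate_date("2015-3-7"))    # '2015-03-07'  (padding only)
print(validate_date("2015-02-30"))  # '2015-02-28'  (day walked back to a real date)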
def validate_date(date): - year, month, day = date.split('-', 2) - year, month, day = year[:4].zfill(4), month[:2].zfill(2), day[:2].zfill(2) - - try: - date_obj = arrow.get(fixDate(year, month, day), DATE_FORMAT) - except Exception as e: - if str(e) == 'day is out of range for month': - if int(day) < 1: - day = '01' - elif int(day) > 31: - day = '31' - elif int(day) > 28: - day = str((int(day) - 1)) - elif str(e) == 'month must be in 1..12': - if int(month) < 1: - month = '01' - if int(month) > 12: - month = '12' - else: - raise e - try: - date_obj = arrow.get(fixDate(year, month, day), DATE_FORMAT) - except: - date_obj = validate_date(fixDate(year, month, day)) - finally: - return date_obj.format(DATE_FORMAT) + year, month, day = date.split("-", 2) + year, month, day = year[:4].zfill(4), month[:2].zfill(2), day[:2].zfill(2) + + try: + date_obj = arrow.get(fixDate(year, month, day), DATE_FORMAT) + except Exception as e: + if str(e) == "day is out of range for month": + if int(day) < 1: + day = "01" + elif int(day) > 31: + day = "31" + elif int(day) > 28: + day = str((int(day) - 1)) + elif str(e) == "month must be in 1..12": + if int(month) < 1: + month = "01" + if int(month) > 12: + month = "12" + else: + raise e + try: + date_obj = arrow.get(fixDate(year, month, day), DATE_FORMAT) + except: + date_obj = validate_date(fixDate(year, month, day)) + finally: + return date_obj.format(DATE_FORMAT) diff --git a/openfda/spl/pipeline.py b/openfda/spl/pipeline.py index b794349..d47e233 100644 --- a/openfda/spl/pipeline.py +++ b/openfda/spl/pipeline.py @@ -1,7 +1,7 @@ #!/usr/bin/python -''' A pipeline for loading SPL data into ElasticSearch -''' +""" A pipeline for loading SPL data into ElasticSearch +""" import gzip import glob import logging @@ -29,277 +29,319 @@ import urllib.request RUN_DIR = dirname(dirname(os.path.abspath(__file__))) -SPL_JS = join(RUN_DIR, 'spl/spl_to_json.js') -LOINC = join(RUN_DIR, 'spl/data/sections.csv') +SPL_JS = join(RUN_DIR, "spl/spl_to_json.js") +LOINC = join(RUN_DIR, "spl/data/sections.csv") -SPL_S3_BUCKET = 's3://openfda-data-spl/' -SPL_STAGING_S3_BUCKET = 's3://openfda-data-spl-staging/' -SPL_S3_LOCAL_DIR = config.data_dir('spl/s3_sync') -SPL_INDEX_DIR = config.data_dir('spl/index.db') -SPL_JSON_DIR = config.data_dir('spl/json.db') -SPL_ANNOTATED_DIR = config.data_dir('spl/annotated.db') +SPL_S3_BUCKET = "s3://openfda-data-spl/" +SPL_STAGING_S3_BUCKET = "s3://openfda-data-spl-staging/" +SPL_S3_LOCAL_DIR = config.data_dir("spl/s3_sync") +SPL_INDEX_DIR = config.data_dir("spl/index.db") +SPL_JSON_DIR = config.data_dir("spl/json.db") +SPL_ANNOTATED_DIR = config.data_dir("spl/annotated.db") -DAILY_MED_DIR = config.data_dir('spl/dailymed') -DAILY_MED_DOWNLOADS_DIR = config.data_dir('spl/dailymed/raw') -DAILY_MED_EXTRACT_DIR = config.data_dir('spl/dailymed/extract') -DAILY_MED_FLATTEN_DIR = config.data_dir('spl/dailymed/flatten') -DAILY_MED_DOWNLOADS_PAGE = 'https://dailymed.nlm.nih.gov/dailymed/spl-resources-all-drug-labels.cfm' +DAILY_MED_DIR = config.data_dir("spl/dailymed") +DAILY_MED_DOWNLOADS_DIR = config.data_dir("spl/dailymed/raw") +DAILY_MED_EXTRACT_DIR = config.data_dir("spl/dailymed/extract") +DAILY_MED_FLATTEN_DIR = config.data_dir("spl/dailymed/flatten") +DAILY_MED_DOWNLOADS_PAGE = ( + "https://dailymed.nlm.nih.gov/dailymed/spl-resources-all-drug-labels.cfm" +) -common.shell_cmd_quiet('mkdir -p %s', SPL_S3_LOCAL_DIR) -common.shell_cmd_quiet('mkdir -p %s', DAILY_MED_DIR) +common.shell_cmd_quiet("mkdir -p %s", SPL_S3_LOCAL_DIR) +common.shell_cmd_quiet("mkdir 
-p %s", DAILY_MED_DIR) -class DownloadDailyMedSPL(luigi.Task): - local_dir = DAILY_MED_DOWNLOADS_DIR - - def output(self): - return luigi.LocalTarget(self.local_dir) - def run(self): - common.shell_cmd_quiet('mkdir -p %s', self.local_dir) - soup = BeautifulSoup(urllib.request.urlopen(DAILY_MED_DOWNLOADS_PAGE).read(), 'lxml') - for a in soup.find_all(href=re.compile('.*.zip')): - if '_human_' in a.text: - try: - common.download(a['href'], join(self.local_dir, a['href'].split('/')[-1])) - except ProcessException as e: - logging.error("Could not download a DailyMed SPL archive: {0}: {1}".format(a['href'], e)) +class DownloadDailyMedSPL(luigi.Task): + local_dir = DAILY_MED_DOWNLOADS_DIR + + def output(self): + return luigi.LocalTarget(self.local_dir) + + def run(self): + common.shell_cmd_quiet("mkdir -p %s", self.local_dir) + soup = BeautifulSoup( + urllib.request.urlopen(DAILY_MED_DOWNLOADS_PAGE).read(), "lxml" + ) + for a in soup.find_all(href=re.compile(".*.zip")): + if "_human_" in a.text: + try: + common.download( + a["href"], join(self.local_dir, a["href"].split("/")[-1]) + ) + except ProcessException as e: + logging.error( + "Could not download a DailyMed SPL archive: {0}: {1}".format( + a["href"], e + ) + ) class ExtractDailyMedSPL(luigi.Task): - local_dir = DAILY_MED_EXTRACT_DIR + local_dir = DAILY_MED_EXTRACT_DIR - def requires(self): - return DownloadDailyMedSPL() + def requires(self): + return DownloadDailyMedSPL() - def output(self): - return luigi.LocalTarget(self.local_dir) + def output(self): + return luigi.LocalTarget(self.local_dir) - def run(self): - src_dir = self.input().path - os.system('mkdir -p "%s"' % self.output().path) - pattern = join(src_dir, '*.zip') - zip_files = glob.glob(pattern) + def run(self): + src_dir = self.input().path + os.system('mkdir -p "%s"' % self.output().path) + pattern = join(src_dir, "*.zip") + zip_files = glob.glob(pattern) - if len(zip_files) == 0: - logging.warning( - 'Expected to find one or more daily med SPL files') + if len(zip_files) == 0: + logging.warning("Expected to find one or more daily med SPL files") - extract_dir = self.output().path - for zip_file in zip_files: - common.shell_cmd_quiet('unzip -oq -d %(extract_dir)s %(zip_file)s' % locals()) + extract_dir = self.output().path + for zip_file in zip_files: + common.shell_cmd_quiet( + "unzip -oq -d %(extract_dir)s %(zip_file)s" % locals() + ) -class FlattenDailyMedSPL(parallel.MRTask): - local_dir = DAILY_MED_FLATTEN_DIR - def map(self, zip_file, value, output): - cmd = 'zipinfo -1 %(zip_file)s' % locals() - xml_file_name = None - zip_contents = common.shell_cmd_quiet(cmd) - xml_match = re.search('^([0-9a-f-]{36})\.xml$', zip_contents.decode(), re.I | re.M) - if (xml_match): - xml_file_name = xml_match.group() - spl_dir_name = os.path.join(self.output().path, xml_match.group(1)) - os.system('mkdir -p "%s"' % spl_dir_name) - common.shell_cmd_quiet('unzip -oq %(zip_file)s -d %(spl_dir_name)s' % locals()) - output.add(xml_file_name, zip_file) +class FlattenDailyMedSPL(parallel.MRTask): + local_dir = DAILY_MED_FLATTEN_DIR + def map(self, zip_file, value, output): + cmd = "zipinfo -1 %(zip_file)s" % locals() + xml_file_name = None + zip_contents = common.shell_cmd_quiet(cmd) + xml_match = re.search( + "^([0-9a-f-]{36})\.xml$", zip_contents.decode(), re.I | re.M + ) + if xml_match: + xml_file_name = xml_match.group() + spl_dir_name = os.path.join(self.output().path, xml_match.group(1)) + os.system('mkdir -p "%s"' % spl_dir_name) + common.shell_cmd_quiet( + "unzip -oq %(zip_file)s -d 
%(spl_dir_name)s" % locals() + ) + output.add(xml_file_name, zip_file) - def requires(self): - return ExtractDailyMedSPL() + def requires(self): + return ExtractDailyMedSPL() - def output(self): - return luigi.LocalTarget(self.local_dir) + def output(self): + return luigi.LocalTarget(self.local_dir) - def mapreduce_inputs(self): - return parallel.Collection.from_glob(os.path.join(self.input().path, '*/*.zip')) + def mapreduce_inputs(self): + return parallel.Collection.from_glob(os.path.join(self.input().path, "*/*.zip")) - def output_format(self): - return NullOutput() + def output_format(self): + return NullOutput() class AddInDailyMedSPL(luigi.Task): - local_dir = DAILY_MED_DIR + local_dir = DAILY_MED_DIR - def requires(self): - return FlattenDailyMedSPL() + def requires(self): + return FlattenDailyMedSPL() - def flag_file(self): - return os.path.join(self.local_dir, '.last_sync_time') + def flag_file(self): + return os.path.join(self.local_dir, ".last_sync_time") - def complete(self): - # Only run daily med once per day. - if config.disable_downloads(): - return True + def complete(self): + # Only run daily med once per day. + if config.disable_downloads(): + return True - return os.path.exists(self.flag_file()) and ( - arrow.get(os.path.getmtime(self.flag_file())) > arrow.now().floor('day')) + return os.path.exists(self.flag_file()) and ( + arrow.get(os.path.getmtime(self.flag_file())) > arrow.now().floor("day") + ) - def run(self): - src = self.input().path - dest = SPL_S3_LOCAL_DIR - os.system('cp -nr %(src)s/. %(dest)s/' % locals()) + def run(self): + src = self.input().path + dest = SPL_S3_LOCAL_DIR + os.system("cp -nr %(src)s/. %(dest)s/" % locals()) - with open(self.flag_file(), 'w') as out_f: - out_f.write('') + with open(self.flag_file(), "w") as out_f: + out_f.write("") class SyncS3SPL(luigi.Task): - # bucket = SPL_S3_BUCKET - staging_bucket = SPL_STAGING_S3_BUCKET - local_dir = SPL_S3_LOCAL_DIR - - def flag_file(self): - return os.path.join(self.local_dir, '.last_sync_time') - - def complete(self): - # Only run S3 sync once per day. - if config.disable_downloads(): - return True - - return os.path.exists(self.flag_file()) and ( - arrow.get(os.path.getmtime(self.flag_file())) > arrow.now().floor('day')) - - def run(self): - # Copying from the staging bucket to the archival bucket is now being done via S3 replication. - ''' - common.quiet_cmd(['aws', - '--cli-read-timeout=3600', - '--profile=' + config.aws_profile(), - 's3', 'sync', - self.staging_bucket, - self.bucket]) - ''' - common.quiet_cmd(['aws', - '--cli-read-timeout=3600', - '--profile=' + config.aws_profile(), - 's3', 'sync', + # bucket = SPL_S3_BUCKET + staging_bucket = SPL_STAGING_S3_BUCKET + local_dir = SPL_S3_LOCAL_DIR + + def flag_file(self): + return os.path.join(self.local_dir, ".last_sync_time") + + def complete(self): + # Only run S3 sync once per day. + if config.disable_downloads(): + return True + + return os.path.exists(self.flag_file()) and ( + arrow.get(os.path.getmtime(self.flag_file())) > arrow.now().floor("day") + ) + + def run(self): + # Copying from the staging bucket to the archival bucket is now being done via S3 replication. 
+ """ + common.quiet_cmd(['aws', + '--cli-read-timeout=3600', + '--profile=' + config.aws_profile(), + 's3', 'sync', + self.staging_bucket, + self.bucket]) + """ + common.quiet_cmd( + [ + "aws", + "--cli-read-timeout=3600", + "--profile=" + config.aws_profile(), + "s3", + "sync", self.staging_bucket, - self.local_dir]) + self.local_dir, + ] + ) - with open(self.flag_file(), 'w') as out_f: - out_f.write('') + with open(self.flag_file(), "w") as out_f: + out_f.write("") class DetermineSPLToIndex(parallel.MRTask): - NS = {'ns': 'urn:hl7-org:v3'} - - def requires(self): - return SyncS3SPL() #, AddInDailyMedSPL()] - - def output(self): - return luigi.LocalTarget(SPL_INDEX_DIR) - - def mapreduce_inputs(self): - return parallel.Collection.from_glob(join(SPL_S3_LOCAL_DIR, '*/*.xml')) - - def map(self, xml_file, value, output): - if os.path.getsize(xml_file) > 0: - - # Oddly enough, some SPL XML files arrive from FDA gzipped, which requires us to take an additional - # uncompressing step. - filetype = common.shell_cmd_quiet('file %(xml_file)s' % locals()) - if "gzip compressed data" in filetype.decode() or "DOS/MBR boot sector" in filetype.decode(): - # logging.warning("SPL XML is gzipped: " + xml_file) - gz_file = xml_file + '.gz' - os.rename(xml_file, gz_file) - with gzip.open(gz_file, 'rb') as f_in, open(xml_file, 'wb') as f_out: - shutil.copyfileobj(f_in, f_out) - - p = etree.XMLParser(huge_tree=True) - try: - tree = etree.parse(open(xml_file), parser=p) - code = next(iter( - tree.xpath("//ns:document/ns:code[@codeSystem='2.16.840.1.113883.6.1']/@displayName", namespaces=self.NS)), - '') - if code.lower().find('human') != -1: - spl_id = tree.xpath('//ns:document/ns:id/@root', namespaces=self.NS)[0].lower() - spl_set_id = tree.xpath('//ns:document/ns:setId/@root', namespaces=self.NS)[0].lower() - version = tree.xpath('//ns:document/ns:versionNumber/@value', namespaces=self.NS)[0] - output.add(spl_set_id, {'spl_id': spl_id, 'version': version}) - elif len(code) == 0: - logging.warning("Not a drug label SPL file: " + xml_file) - except XMLSyntaxError as e: - logging.warning("Invalid SPL file: " + xml_file) - logging.warning(e) - except: - logging.error("Error processing SPL file: " + xml_file) - traceback.print_exc() - raise - else: - logging.warning("Zero length SPL file: " + xml_file) - - def reduce(self, key, values, output): - values.sort(key=lambda spl: int(spl['version'])) - output.put(key, values[-1]) - - def map_workers(self): - return multiprocessing.cpu_count() * 2 + NS = {"ns": "urn:hl7-org:v3"} + + def requires(self): + return SyncS3SPL() # , AddInDailyMedSPL()] + + def output(self): + return luigi.LocalTarget(SPL_INDEX_DIR) + + def mapreduce_inputs(self): + return parallel.Collection.from_glob(join(SPL_S3_LOCAL_DIR, "*/*.xml")) + + def map(self, xml_file, value, output): + if os.path.getsize(xml_file) > 0: + + # Oddly enough, some SPL XML files arrive from FDA gzipped, which requires us to take an additional + # uncompressing step. 
+ filetype = common.shell_cmd_quiet("file %(xml_file)s" % locals()) + if ( + "gzip compressed data" in filetype.decode() + or "DOS/MBR boot sector" in filetype.decode() + ): + # logging.warning("SPL XML is gzipped: " + xml_file) + gz_file = xml_file + ".gz" + os.rename(xml_file, gz_file) + with gzip.open(gz_file, "rb") as f_in, open(xml_file, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + + p = etree.XMLParser(huge_tree=True) + try: + tree = etree.parse(open(xml_file), parser=p) + code = next( + iter( + tree.xpath( + "//ns:document/ns:code[@codeSystem='2.16.840.1.113883.6.1']/@displayName", + namespaces=self.NS, + ) + ), + "", + ) + if code.lower().find("human") != -1: + spl_id = tree.xpath( + "//ns:document/ns:id/@root", namespaces=self.NS + )[0].lower() + spl_set_id = tree.xpath( + "//ns:document/ns:setId/@root", namespaces=self.NS + )[0].lower() + version = tree.xpath( + "//ns:document/ns:versionNumber/@value", namespaces=self.NS + )[0] + output.add(spl_set_id, {"spl_id": spl_id, "version": version}) + elif len(code) == 0: + logging.warning("Not a drug label SPL file: " + xml_file) + except XMLSyntaxError as e: + logging.warning("Invalid SPL file: " + xml_file) + logging.warning(e) + except: + logging.error("Error processing SPL file: " + xml_file) + traceback.print_exc() + raise + else: + logging.warning("Zero length SPL file: " + xml_file) + + def reduce(self, key, values, output): + values.sort(key=lambda spl: int(spl["version"])) + output.put(key, values[-1]) + + def map_workers(self): + return multiprocessing.cpu_count() * 2 + class SPL2JSON(parallel.MRTask): - spl_path = SPL_S3_LOCAL_DIR - - def map(self, spl_set_id, value, output): - value = value['spl_id'].strip() - xml_file = join(self.spl_path, value, value + '.xml') - if not os.path.exists(xml_file): - # logging.error('File does not exist, skipping %s', xml_file) - return - spl_js = SPL_JS - loinc = LOINC - cmd = 'node %(spl_js)s %(xml_file)s %(loinc)s' % locals() - json_str = '' - try: - json_str = common.shell_cmd_quiet(cmd) - json_obj = json.loads(json_str) - if not json_obj.get('set_id'): - raise RuntimeError('SPL file has no set_id: %s', xml_file) - else: - output.add(xml_file, json_obj) - except: - logging.error('Unable to convert SPL XML to JSON: %s', xml_file) - logging.error('cmd: %s', cmd) - logging.error('json: %s', json_str) - logging.error(sys.exc_info()[0]) - raise - - def requires(self): - return DetermineSPLToIndex() - - def output(self): - return luigi.LocalTarget(SPL_JSON_DIR) - - def mapreduce_inputs(self): - return parallel.Collection.from_sharded(self.input().path) + spl_path = SPL_S3_LOCAL_DIR + + def map(self, spl_set_id, value, output): + value = value["spl_id"].strip() + xml_file = join(self.spl_path, value, value + ".xml") + if not os.path.exists(xml_file): + # logging.error('File does not exist, skipping %s', xml_file) + return + spl_js = SPL_JS + loinc = LOINC + cmd = "node %(spl_js)s %(xml_file)s %(loinc)s" % locals() + json_str = "" + try: + json_str = common.shell_cmd_quiet(cmd) + json_obj = json.loads(json_str) + if not json_obj.get("set_id"): + raise RuntimeError("SPL file has no set_id: %s", xml_file) + else: + output.add(xml_file, json_obj) + except: + logging.error("Unable to convert SPL XML to JSON: %s", xml_file) + logging.error("cmd: %s", cmd) + logging.error("json: %s", json_str) + logging.error(sys.exc_info()[0]) + raise + + def requires(self): + return DetermineSPLToIndex() + + def output(self): + return luigi.LocalTarget(SPL_JSON_DIR) + + def mapreduce_inputs(self): + return 
parallel.Collection.from_sharded(self.input().path) class AnnotateJSON(luigi.Task): - def requires(self): - return [SPL2JSON(), CombineHarmonization()] + def requires(self): + return [SPL2JSON(), CombineHarmonization()] + + def output(self): + return luigi.LocalTarget(SPL_ANNOTATED_DIR) - def output(self): - return luigi.LocalTarget(SPL_ANNOTATED_DIR) + def run(self): + input_db = self.input()[0].path + harmonized_file = self.input()[1].path - def run(self): - input_db = self.input()[0].path - harmonized_file = self.input()[1].path + parallel.mapreduce( + parallel.Collection.from_sharded(input_db), + mapper=annotate.AnnotateMapper(harmonized_file), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + ) - parallel.mapreduce( - parallel.Collection.from_sharded(input_db), - mapper=annotate.AnnotateMapper(harmonized_file), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path) class LoadJSON(index_util.LoadJSONBase): - index_name = 'druglabel' - mapping_file = './schemas/spl_mapping.json' - use_checksum = True - docid_key = 'set_id' - optimize_index = True + index_name = "druglabel" + mapping_file = "./schemas/spl_mapping.json" + use_checksum = True + docid_key = "set_id" + optimize_index = True + + def _data(self): + return AnnotateJSON() - def _data(self): - return AnnotateJSON() -if __name__ == '__main__': - luigi.run() +if __name__ == "__main__": + luigi.run() diff --git a/openfda/spl/process_barcodes.py b/openfda/spl/process_barcodes.py index 3a92938..1fbe6ee 100755 --- a/openfda/spl/process_barcodes.py +++ b/openfda/spl/process_barcodes.py @@ -11,87 +11,93 @@ from openfda.common import strip_unicode XML_ESCAPE_CHARS = { - '"': '"', - '\'': ''', - '<': '<', - '>': '>', - '&': '&' + '"': """, + "'": "'", + "<": "<", + ">": ">", + "&": "&", } -BARCODE_TYPES = ['EAN-13'] +BARCODE_TYPES = ["EAN-13"] + def XML2JSON(input_file): - rows = [] - - def handle_barcode(_, barcode): - if isinstance(barcode, dict): - row_dict = {} - try: - all_symbols = [] - symbols = barcode['source']['index']['symbol'] - if isinstance(symbols, list): - all_symbols.extend(symbols) - else: - all_symbols.append(symbols) - - symbol = next((x for x in all_symbols if x['@type'] in BARCODE_TYPES), None) - - if symbol: - href = barcode['source']['@href'] - barcode_type = symbol['@type'] - quality = symbol['@quality'] - data = symbol['data'] - - row_dict['id'] = basename(href) - row_dict['barcode_type'] = barcode_type - row_dict['quality'] = quality - row_dict['upc'] = data - rows.append(row_dict) - - except KeyError: - pass - return True - - def escape_xml(raw_xml): - # zbar doesn't do xml escaping, so we have to do it here before we hand the - # xml off to a parser - lines = [] - #need to handle non-escaped barcodes that have no source - #if the current line is a barcodes and the previous one was too - #then prefix the line with a closing barcodes - previous_line_is_barcodes = False - for line in raw_xml.split('\n'): - if '", '') - href = href.split('/')[:-1] - href = '/'.join(href) - line = "" - - line = line.strip() - if line: - lines.append(line) - - if len(lines) > 0 and '/barcodes' not in lines[-1]: - lines.append('') - - return '\n' + '\n'.join(lines) + '\n' - - out_file = input_file.replace('.xml', '.json') - out = open(out_file, 'w') - escaped_xml = StringIO(escape_xml(io.open(input_file, encoding="utf-8").read())) - - try: - xmltodict.parse(strip_unicode(escaped_xml.getvalue(), True), item_depth=2, item_callback=handle_barcode) - except xml.parsers.expat.ExpatError as ee: - 
logging.info('Error parsing barcode XML file %s', input_file) - logging.debug(ee) - - for row in rows: - out.write(json.dumps(row) + '\n') - + for line in raw_xml.split("\n"): + if "", "") + href = href.split("/")[:-1] + href = "/".join(href) + line = "" + + line = line.strip() + if line: + lines.append(line) + + if len(lines) > 0 and "/barcodes" not in lines[-1]: + lines.append("") + + return "\n" + "\n".join(lines) + "\n" + + out_file = input_file.replace(".xml", ".json") + out = open(out_file, "w") + escaped_xml = StringIO(escape_xml(io.open(input_file, encoding="utf-8").read())) + + try: + xmltodict.parse( + strip_unicode(escaped_xml.getvalue(), True), + item_depth=2, + item_callback=handle_barcode, + ) + except xml.parsers.expat.ExpatError as ee: + logging.info("Error parsing barcode XML file %s", input_file) + logging.debug(ee) + + for row in rows: + out.write(json.dumps(row) + "\n") diff --git a/openfda/spl/tests/date_fix_test.py b/openfda/spl/tests/date_fix_test.py index 09e0999..a39f5fc 100644 --- a/openfda/spl/tests/date_fix_test.py +++ b/openfda/spl/tests/date_fix_test.py @@ -3,29 +3,29 @@ from arrow.parser import ParserError import openfda.spl.fix_date as fix_date + # Validates that combinations of invalid dates are corrected. def test_invalid_dates(): - eq_(fix_date.validate_date('2011-04-00'), '2011-04-01') - eq_(fix_date.validate_date('2011-05-32'), '2011-05-31') - eq_(fix_date.validate_date('2011-04-31'), '2011-04-30') - eq_(fix_date.validate_date('2011-04-32'), '2011-04-30') - eq_(fix_date.validate_date('2011-02-30'), '2011-02-28') - eq_(fix_date.validate_date('2016-02-30'), '2016-02-29') - eq_(fix_date.validate_date('2011-02-330'), '2011-02-28') - eq_(fix_date.validate_date('2011-00-30'), '2011-01-30') - eq_(fix_date.validate_date('2011-22-30'), '2011-12-30') - eq_(fix_date.validate_date('2011-223-30'), '2011-12-30') - eq_(fix_date.validate_date('10-12-30'), '0010-12-30') - eq_(fix_date.validate_date('1-0-0'), '0001-01-01') - eq_(fix_date.validate_date('2019-12-30'), '2019-12-30') - eq_(fix_date.validate_date('2019-13-0'), '2019-12-01') + eq_(fix_date.validate_date("2011-04-00"), "2011-04-01") + eq_(fix_date.validate_date("2011-05-32"), "2011-05-31") + eq_(fix_date.validate_date("2011-04-31"), "2011-04-30") + eq_(fix_date.validate_date("2011-04-32"), "2011-04-30") + eq_(fix_date.validate_date("2011-02-30"), "2011-02-28") + eq_(fix_date.validate_date("2016-02-30"), "2016-02-29") + eq_(fix_date.validate_date("2011-02-330"), "2011-02-28") + eq_(fix_date.validate_date("2011-00-30"), "2011-01-30") + eq_(fix_date.validate_date("2011-22-30"), "2011-12-30") + eq_(fix_date.validate_date("2011-223-30"), "2011-12-30") + eq_(fix_date.validate_date("10-12-30"), "0010-12-30") + eq_(fix_date.validate_date("1-0-0"), "0001-01-01") + eq_(fix_date.validate_date("2019-12-30"), "2019-12-30") + eq_(fix_date.validate_date("2019-13-0"), "2019-12-01") + # Verifies that alpha character raise an exception. 
def test_alpha_character(): - try: - fix_date.validate_date('10-12-a') - fail('ParserError expected') - except ParserError: - pass - - + try: + fix_date.validate_date("10-12-a") + fail("ParserError expected") + except ParserError: + pass diff --git a/openfda/spl/tests/extract_test.py b/openfda/spl/tests/extract_test.py index 09d64e4..fa41804 100644 --- a/openfda/spl/tests/extract_test.py +++ b/openfda/spl/tests/extract_test.py @@ -8,189 +8,196 @@ class SPLExtractHumiraUnitTest(unittest.TestCase): - 'SPL Humira Extract Unit Test' - - def setUp(self): - self.tree = extract.parse_xml(data_filename('humira.xml')) - - def test_extract_title(self): - expected_title = \ - ('These highlights do not include all the information needed to use ' - 'HUMIRA safely and effectively. See full prescribing information for ' - 'HUMIRA. HUMIRA (adalimumab) injection, for subcutaneous use Initial ' - 'U.S. Approval: 2002') - extracted_title = extract.extract_title(self.tree) - self.assertEqual(expected_title, extracted_title, - extracted_title) - - def test_extract_id(self): - expected_id = '2c9fb32d-4b1b-b5da-4bdf-6b06908ba8b3' - extracted_id = extract.extract_id(self.tree) - self.assertEqual(expected_id, extracted_id, - extracted_id) - - def test_extract_setid(self): - expected_setid = '608d4f0d-b19f-46d3-749a-7159aa5f933d' - extracted_setid = extract.extract_set_id(self.tree) - self.assertEqual(expected_setid, extracted_setid, - extracted_setid) - - def test_extract_effective_time(self): - expected_effective_time = '2013-09-30' - extracted_effective_time = extract.extract_effective_time(self.tree) - self.assertEqual(expected_effective_time, extracted_effective_time, - extracted_effective_time) - - def test_extract_version_number(self): - expected_version_number = '1560' - extracted_version_number = extract.extract_version_number(self.tree) - self.assertEqual(expected_version_number, extracted_version_number, - extracted_version_number) - - def test_extract_display_name(self): - expected_display_name = 'HUMAN PRESCRIPTION DRUG LABEL' - extracted_display_name = extract.extract_display_name(self.tree) - self.assertEqual(expected_display_name, extracted_display_name, - extracted_display_name) - - def test_extract_duns(self): - expected_duns = '078458370' - extracted_duns = extract.extract_duns(self.tree) - self.assertEqual(expected_duns, extracted_duns, - extracted_duns) - - def test_extract_product_ndcs(self): - expected_ndcs = ['0074-3799', '0074-9374', '0074-4339', '0074-3797'] - extracted_ndcs = extract.extract_product_ndcs(self.tree) - self.assertListEqual(expected_ndcs, extracted_ndcs, - extracted_ndcs) - - def test_is_original_packager(self): - self.assertEqual(True, extract.is_original_packager(self.tree), - True) - - def test_extract_original_packager_product_ndcs(self): - expected_ndcs = [] - extracted_ndcs = extract.extract_original_packager_product_ndcs(self.tree) - self.assertListEqual(expected_ndcs, extracted_ndcs, - extracted_ndcs) - - def test_extract_package_ndcs(self): - expected_ndcs = ['0074-3799-02', '0074-3799-71', '0074-9374-02', - '0074-9374-71', '0074-4339-02', '0074-4339-06', - '0074-4339-07', '0074-4339-71', '0074-4339-73', - '0074-3797-01'] - extracted_ndcs = extract.extract_package_ndcs(self.tree) - self.assertListEqual(expected_ndcs, extracted_ndcs, - extracted_ndcs) + "SPL Humira Extract Unit Test" + + def setUp(self): + self.tree = extract.parse_xml(data_filename("humira.xml")) + + def test_extract_title(self): + expected_title = ( + "These highlights do not include all the 
information needed to use " + "HUMIRA safely and effectively. See full prescribing information for " + "HUMIRA. HUMIRA (adalimumab) injection, for subcutaneous use Initial " + "U.S. Approval: 2002" + ) + extracted_title = extract.extract_title(self.tree) + self.assertEqual(expected_title, extracted_title, extracted_title) + + def test_extract_id(self): + expected_id = "2c9fb32d-4b1b-b5da-4bdf-6b06908ba8b3" + extracted_id = extract.extract_id(self.tree) + self.assertEqual(expected_id, extracted_id, extracted_id) + + def test_extract_setid(self): + expected_setid = "608d4f0d-b19f-46d3-749a-7159aa5f933d" + extracted_setid = extract.extract_set_id(self.tree) + self.assertEqual(expected_setid, extracted_setid, extracted_setid) + + def test_extract_effective_time(self): + expected_effective_time = "2013-09-30" + extracted_effective_time = extract.extract_effective_time(self.tree) + self.assertEqual( + expected_effective_time, extracted_effective_time, extracted_effective_time + ) + + def test_extract_version_number(self): + expected_version_number = "1560" + extracted_version_number = extract.extract_version_number(self.tree) + self.assertEqual( + expected_version_number, extracted_version_number, extracted_version_number + ) + + def test_extract_display_name(self): + expected_display_name = "HUMAN PRESCRIPTION DRUG LABEL" + extracted_display_name = extract.extract_display_name(self.tree) + self.assertEqual( + expected_display_name, extracted_display_name, extracted_display_name + ) + + def test_extract_duns(self): + expected_duns = "078458370" + extracted_duns = extract.extract_duns(self.tree) + self.assertEqual(expected_duns, extracted_duns, extracted_duns) + + def test_extract_product_ndcs(self): + expected_ndcs = ["0074-3799", "0074-9374", "0074-4339", "0074-3797"] + extracted_ndcs = extract.extract_product_ndcs(self.tree) + self.assertListEqual(expected_ndcs, extracted_ndcs, extracted_ndcs) + + def test_is_original_packager(self): + self.assertEqual(True, extract.is_original_packager(self.tree), True) + + def test_extract_original_packager_product_ndcs(self): + expected_ndcs = [] + extracted_ndcs = extract.extract_original_packager_product_ndcs(self.tree) + self.assertListEqual(expected_ndcs, extracted_ndcs, extracted_ndcs) + + def test_extract_package_ndcs(self): + expected_ndcs = [ + "0074-3799-02", + "0074-3799-71", + "0074-9374-02", + "0074-9374-71", + "0074-4339-02", + "0074-4339-06", + "0074-4339-07", + "0074-4339-71", + "0074-4339-73", + "0074-3797-01", + ] + extracted_ndcs = extract.extract_package_ndcs(self.tree) + self.assertListEqual(expected_ndcs, extracted_ndcs, extracted_ndcs) + class SPLExtractLipitorOrigUnitTest(unittest.TestCase): - 'SPL Lipitor Original Packager Extract Unit Test' - - def setUp(self): - self.tree = extract.parse_xml(data_filename('lipitor-orig.xml')) - - def test_extract_product_ndcs(self): - expected_ndcs = ['0071-0155', '0071-0156', '0071-0157', '0071-0158'] - extracted_ndcs = extract.extract_product_ndcs(self.tree) - self.assertListEqual(expected_ndcs, extracted_ndcs, - extracted_ndcs) - - def test_is_original_packager(self): - self.assertEqual(True, extract.is_original_packager(self.tree), - True) - - def test_extract_original_packager_product_ndcs(self): - expected_ndcs = [] - extracted_ndcs = extract.extract_original_packager_product_ndcs(self.tree) - self.assertListEqual(expected_ndcs, extracted_ndcs, - extracted_ndcs) - - def test_extract_package_ndcs(self): - expected_ndcs = ['0071-0155-23', '0071-0155-34', '0071-0155-40', - '0071-0155-10', 
'0071-0155-97', '0071-0156-23', - '0071-0156-94', '0071-0156-40', '0071-0156-10', - '0071-0156-96', '0071-0157-23', '0071-0157-73', - '0071-0157-88', '0071-0157-40', '0071-0157-97', - '0071-0158-23', '0071-0158-73', '0071-0158-88', - '0071-0158-92'] - extracted_ndcs = extract.extract_package_ndcs(self.tree) - self.assertListEqual(expected_ndcs, extracted_ndcs, - extracted_ndcs) + "SPL Lipitor Original Packager Extract Unit Test" + + def setUp(self): + self.tree = extract.parse_xml(data_filename("lipitor-orig.xml")) + + def test_extract_product_ndcs(self): + expected_ndcs = ["0071-0155", "0071-0156", "0071-0157", "0071-0158"] + extracted_ndcs = extract.extract_product_ndcs(self.tree) + self.assertListEqual(expected_ndcs, extracted_ndcs, extracted_ndcs) + + def test_is_original_packager(self): + self.assertEqual(True, extract.is_original_packager(self.tree), True) + + def test_extract_original_packager_product_ndcs(self): + expected_ndcs = [] + extracted_ndcs = extract.extract_original_packager_product_ndcs(self.tree) + self.assertListEqual(expected_ndcs, extracted_ndcs, extracted_ndcs) + + def test_extract_package_ndcs(self): + expected_ndcs = [ + "0071-0155-23", + "0071-0155-34", + "0071-0155-40", + "0071-0155-10", + "0071-0155-97", + "0071-0156-23", + "0071-0156-94", + "0071-0156-40", + "0071-0156-10", + "0071-0156-96", + "0071-0157-23", + "0071-0157-73", + "0071-0157-88", + "0071-0157-40", + "0071-0157-97", + "0071-0158-23", + "0071-0158-73", + "0071-0158-88", + "0071-0158-92", + ] + extracted_ndcs = extract.extract_package_ndcs(self.tree) + self.assertListEqual(expected_ndcs, extracted_ndcs, extracted_ndcs) class SPLExtractLipitorRepackUnitTest(unittest.TestCase): - 'SPL Lipitor Repackager Extract Unit Test' + "SPL Lipitor Repackager Extract Unit Test" + + def setUp(self): + self.tree = extract.parse_xml(data_filename("lipitor-repack.xml")) - def setUp(self): - self.tree = extract.parse_xml(data_filename('lipitor-repack.xml')) + def test_extract_product_ndcs(self): + expected_ndcs = ["55289-800"] + extracted_ndcs = extract.extract_product_ndcs(self.tree) + self.assertListEqual(expected_ndcs, extracted_ndcs, extracted_ndcs) - def test_extract_product_ndcs(self): - expected_ndcs = ['55289-800'] - extracted_ndcs = extract.extract_product_ndcs(self.tree) - self.assertListEqual(expected_ndcs, extracted_ndcs, - extracted_ndcs) + def test_is_original_packager(self): + self.assertEqual(False, extract.is_original_packager(self.tree), False) - def test_is_original_packager(self): - self.assertEqual(False, extract.is_original_packager(self.tree), - False) + def test_extract_original_packager_product_ndcs(self): + expected_ndcs = ["0071-0156"] + extracted_ndcs = extract.extract_original_packager_product_ndcs(self.tree) + self.assertListEqual(expected_ndcs, extracted_ndcs, extracted_ndcs) - def test_extract_original_packager_product_ndcs(self): - expected_ndcs = ['0071-0156'] - extracted_ndcs = extract.extract_original_packager_product_ndcs(self.tree) - self.assertListEqual(expected_ndcs, extracted_ndcs, - extracted_ndcs) + def test_extract_package_ndcs(self): + expected_ndcs = ["55289-800-30"] + extracted_ndcs = extract.extract_package_ndcs(self.tree) + self.assertListEqual(expected_ndcs, extracted_ndcs, extracted_ndcs) - def test_extract_package_ndcs(self): - expected_ndcs = ['55289-800-30'] - extracted_ndcs = extract.extract_package_ndcs(self.tree) - self.assertListEqual(expected_ndcs, extracted_ndcs, - extracted_ndcs) class SPLExtractCoughUnitTest(unittest.TestCase): - 'SPL Lipitor Repackager 
Extract Unit Test' + "SPL Lipitor Repackager Extract Unit Test" + + def setUp(self): + self.tree = extract.parse_xml(data_filename("cough.xml")) - def setUp(self): - self.tree = extract.parse_xml(data_filename('cough.xml')) + def test_extract_product_ndcs(self): + expected_ndcs = ["0067-6344"] + extracted_ndcs = extract.extract_product_ndcs(self.tree) + self.assertListEqual(expected_ndcs, extracted_ndcs, extracted_ndcs) - def test_extract_product_ndcs(self): - expected_ndcs = ['0067-6344'] - extracted_ndcs = extract.extract_product_ndcs(self.tree) - self.assertListEqual(expected_ndcs, extracted_ndcs, - extracted_ndcs) + def test_is_original_packager(self): + self.assertEqual(True, extract.is_original_packager(self.tree), True) - def test_is_original_packager(self): - self.assertEqual(True, extract.is_original_packager(self.tree), - True) + def test_extract_original_packager_product_ndcs(self): + expected_ndcs = [] + extracted_ndcs = extract.extract_original_packager_product_ndcs(self.tree) + self.assertListEqual(expected_ndcs, extracted_ndcs, extracted_ndcs) - def test_extract_original_packager_product_ndcs(self): - expected_ndcs = [] - extracted_ndcs = extract.extract_original_packager_product_ndcs(self.tree) - self.assertListEqual(expected_ndcs, extracted_ndcs, - extracted_ndcs) + def test_extract_package_ndcs(self): + expected_ndcs = ["0067-6344-04", "0067-6344-08"] + extracted_ndcs = extract.extract_package_ndcs(self.tree) + self.assertListEqual(expected_ndcs, extracted_ndcs, extracted_ndcs) - def test_extract_package_ndcs(self): - expected_ndcs = ['0067-6344-04', '0067-6344-08'] - extracted_ndcs = extract.extract_package_ndcs(self.tree) - self.assertListEqual(expected_ndcs, extracted_ndcs, - extracted_ndcs) class SPLExtractNoTitleUnitTest(unittest.TestCase): - 'SPL With No Title Extract Unit Test' + "SPL With No Title Extract Unit Test" - def setUp(self): - self.tree = extract.parse_xml(data_filename('no-title.xml')) + def setUp(self): + self.tree = extract.parse_xml(data_filename("no-title.xml")) - def test_is_original_packager(self): - self.assertEqual(False, extract.is_original_packager(self.tree), - False) + def test_is_original_packager(self): + self.assertEqual(False, extract.is_original_packager(self.tree), False) - def test_extract_title(self): - expected_title = '' - extracted_title = extract.extract_title(self.tree) - self.assertEqual(expected_title, extracted_title, - extracted_title) + def test_extract_title(self): + expected_title = "" + extracted_title = extract.extract_title(self.tree) + self.assertEqual(expected_title, extracted_title, extracted_title) -if __name__ == '__main__': - unittest.main() +if __name__ == "__main__": + unittest.main() diff --git a/openfda/spl/tests/process_barcodes_test.py b/openfda/spl/tests/process_barcodes_test.py index 4268534..b098333 100644 --- a/openfda/spl/tests/process_barcodes_test.py +++ b/openfda/spl/tests/process_barcodes_test.py @@ -5,50 +5,73 @@ from openfda.test_common import data_filename from openfda.spl.process_barcodes import XML2JSON + class ProcessBarCodesTest(unittest.TestCase): - def setUp(self): - self.test_dir = tempfile.mkdtemp() - - def tearDown(self): - shutil.rmtree(self.test_dir) - - - def verify_parsing(self, f, actual, msg): - src_file = data_filename('barcodes/' + f) - raw_input_file = path.join(self.test_dir, f) - shutil.copy(src=src_file, dst=raw_input_file) - XML2JSON(raw_input_file) - raw_output_file = path.join(self.test_dir, f.replace('.xml', '.json')) - with open(raw_output_file, 'r') as fjson: - result 
= fjson.read() - if result == '': - self.assertEquals(actual, result, msg) - else: - self.assertDictEqual(actual, json.loads(result), msg) - - def test_escape_xml(self): - - self.verify_parsing('1_otc-bars.xml', - {"quality": "247", "id": "b8cd1d5a-9d3b-465e-9e5b-2f67af8dbeb2", "upc": "0361570045106", - "barcode_type": "EAN-13"}, - "1_otc-bars.xml failed, handle UNICODE characters properly") - self.verify_parsing('2_otc-bars.xml',"", - "2_otc-bars.xml failed, must handle unclosed tag properly") - self.verify_parsing('3_otc-bars.xml',"", - "3_otc-bars.xml failed, must handle empty source") - self.verify_parsing('4_otc-bars.xml', - {"quality": "187", "id": "fd5ccac9-a54a-4284-80c8-a5f035c5d9e6", "upc": "0098443601700", - "barcode_type": "EAN-13"}, - "4_otc-bars.xml failed, must handle valid ") - self.verify_parsing('5_otc-bars.xml',"", - "5_otc-bars.xml failed, must ignore types other than EAN-13") - self.verify_parsing('6_otc-bars.xml', "", - "6_otc-bars.xml failed, must handle properly blank xmls") + def setUp(self): + self.test_dir = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.test_dir) + + def verify_parsing(self, f, actual, msg): + src_file = data_filename("barcodes/" + f) + raw_input_file = path.join(self.test_dir, f) + shutil.copy(src=src_file, dst=raw_input_file) + XML2JSON(raw_input_file) + raw_output_file = path.join(self.test_dir, f.replace(".xml", ".json")) + with open(raw_output_file, "r") as fjson: + result = fjson.read() + if result == "": + self.assertEquals(actual, result, msg) + else: + self.assertDictEqual(actual, json.loads(result), msg) + + def test_escape_xml(self): + + self.verify_parsing( + "1_otc-bars.xml", + { + "quality": "247", + "id": "b8cd1d5a-9d3b-465e-9e5b-2f67af8dbeb2", + "upc": "0361570045106", + "barcode_type": "EAN-13", + }, + "1_otc-bars.xml failed, handle UNICODE characters properly", + ) + self.verify_parsing( + "2_otc-bars.xml", + "", + "2_otc-bars.xml failed, must handle unclosed tag properly", + ) + self.verify_parsing( + "3_otc-bars.xml", "", "3_otc-bars.xml failed, must handle empty source" + ) + self.verify_parsing( + "4_otc-bars.xml", + { + "quality": "187", + "id": "fd5ccac9-a54a-4284-80c8-a5f035c5d9e6", + "upc": "0098443601700", + "barcode_type": "EAN-13", + }, + "4_otc-bars.xml failed, must handle valid ", + ) + self.verify_parsing( + "5_otc-bars.xml", + "", + "5_otc-bars.xml failed, must ignore types other than EAN-13", + ) + self.verify_parsing( + "6_otc-bars.xml", + "", + "6_otc-bars.xml failed, must handle properly blank xmls", + ) + def main(argv): - unittest.main(argv=argv) + unittest.main(argv=argv) -if __name__ == '__main__': - unittest.main() +if __name__ == "__main__": + unittest.main() diff --git a/openfda/substance_data/pipeline.py b/openfda/substance_data/pipeline.py index 61c628c..58cbcf2 100644 --- a/openfda/substance_data/pipeline.py +++ b/openfda/substance_data/pipeline.py @@ -1,8 +1,8 @@ #!/usr/local/bin/python -''' +""" Pipeline for converting Substance data to JSON and importing into Elasticsearch. 
-''' +""" import logging import os @@ -17,730 +17,732 @@ from openfda.common import first_file_timestamp BASE_DIR = config.data_dir() -RAW_DIR = config.data_dir('substancedata/raw') -GINAS_ROOT_URL = 'https://gsrs.ncats.nih.gov/' -SUBSTANCE_DATA_DOWNLOAD_PAGE_URL = urljoin(GINAS_ROOT_URL, 'release/release.html') -SUBSTANCE_DATA_EXTRACT_DB = 'substancedata/substancedata.db' +RAW_DIR = config.data_dir("substancedata/raw") +GINAS_ROOT_URL = "https://gsrs.ncats.nih.gov/" +SUBSTANCE_DATA_DOWNLOAD_PAGE_URL = urljoin(GINAS_ROOT_URL, "release/release.html") +SUBSTANCE_DATA_EXTRACT_DB = "substancedata/substancedata.db" + class DownloadSubstanceData(luigi.Task): - def requires(self): - return [] + def requires(self): + return [] + + def output(self): + return luigi.LocalTarget(os.path.join(RAW_DIR, "dump-public.gsrs")) - def output(self): - return luigi.LocalTarget(os.path.join(RAW_DIR, 'dump-public.gsrs')) + def run(self): + fileURL = None + soup = BeautifulSoup(urlopen(SUBSTANCE_DATA_DOWNLOAD_PAGE_URL).read(), "lxml") + for a in soup.find_all(href=re.compile(".*.gsrs")): + if "Full Public Data Dump" in a.text: + fileURL = urljoin(GINAS_ROOT_URL, a["href"]) - def run(self): - fileURL = None - soup = BeautifulSoup(urlopen(SUBSTANCE_DATA_DOWNLOAD_PAGE_URL).read(), 'lxml') - for a in soup.find_all(href=re.compile('.*.gsrs')): - if 'Full Public Data Dump' in a.text: - fileURL = urljoin(GINAS_ROOT_URL, a['href']) + common.download(fileURL, self.output().path) - common.download(fileURL, self.output().path) class ExtractZip(luigi.Task): - def requires(self): - return DownloadSubstanceData() + def requires(self): + return DownloadSubstanceData() - def output(self): - return luigi.LocalTarget(config.data_dir('substancedata/raw/dump-public.json')) + def output(self): + return luigi.LocalTarget(config.data_dir("substancedata/raw/dump-public.json")) - def run(self): - logging.info('Extracting: %s', (self.input().path)) + def run(self): + logging.info("Extracting: %s", (self.input().path)) - extract_dir = dirname(self.input().path) - gsrs_file_name = os.path.basename(self.input().path) - gz_filename = os.path.splitext(gsrs_file_name)[0] + ".gz" - gsrs_file = join(extract_dir, gsrs_file_name) + extract_dir = dirname(self.input().path) + gsrs_file_name = os.path.basename(self.input().path) + gz_filename = os.path.splitext(gsrs_file_name)[0] + ".gz" + gsrs_file = join(extract_dir, gsrs_file_name) + + gz_file = join(extract_dir, gz_filename) + os.rename(gsrs_file, gz_file) + common.shell_cmd_quiet("gunzip " + gz_file) + os.rename(os.path.splitext(gz_file)[0], os.path.splitext(gz_file)[0] + ".json") - gz_file = join(extract_dir, gz_filename) - os.rename(gsrs_file, gz_file) - common.shell_cmd_quiet('gunzip ' + gz_file) - os.rename(os.path.splitext(gz_file)[0], os.path.splitext(gz_file)[0] + ".json") class SubstanceData2JSONMapper(parallel.Mapper): - rename_map_5th_level_fields = { - "modifications/physicalModifications/parameters/amount/lowLimit": "low_limit", - "modifications/physicalModifications/parameters/amount/highLimit": "high_limit", - "modifications/physicalModifications/parameters/amount/nonNumericValue": "non_numeric_value", - "modifications/physicalModifications/parameters/amount/references": "references", - "modifications/physicalModifications/parameters/amount/access": "remove", - "modifications/physicalModifications/parameters/amount/created": "remove", - "modifications/physicalModifications/parameters/amount/createdBy": "remove", - "modifications/physicalModifications/parameters/amount/deprecated": 
"remove", - "modifications/physicalModifications/parameters/amount/lastEdited": "remove", - "modifications/physicalModifications/parameters/amount/lastEditedBy": "remove" - } - - rename_map_4th_level_fields = { - "mixture/components/substance/approvalID": "unii", - "mixture/components/substance/linkingID": "linking_id", - "mixture/components/substance/references": "references", - "mixture/components/substance/refPname": "ref_pname", - "mixture/components/substance/substanceClass": "substance_class", - "mixture/components/substance/access": "remove", - "mixture/components/substance/created": "remove", - "mixture/components/substance/createdBy": "remove", - "mixture/components/substance/deprecated": "remove", - "mixture/components/substance/lastEdited": "remove", - "mixture/components/substance/lastEditedBy": "remove", - "modifications/agentModifications/agentSubstance/approvalID": "unii", - "modifications/agentModifications/agentSubstance/linkingID": "linking_id", - "modifications/agentModifications/agentSubstance/references": "references", - "modifications/agentModifications/agentSubstance/refPname": "ref_pname", - "modifications/agentModifications/agentSubstance/substanceClass": "substance_class", - "modifications/agentModifications/agentSubstance/access": "remove", - "modifications/agentModifications/agentSubstance/created": "remove", - "modifications/agentModifications/agentSubstance/createdBy": "remove", - "modifications/agentModifications/agentSubstance/deprecated": "remove", - "modifications/agentModifications/agentSubstance/lastEdited": "remove", - "modifications/agentModifications/agentSubstance/lastEditedBy": "remove", - "modifications/agentModifications/amount/highLimit": "high_limit", - "modifications/agentModifications/amount/lowLimit": "low_limit", - "modifications/agentModifications/amount/nonNumericValue": "non_numeric_value", - "modifications/agentModifications/amount/references": "references", - "modifications/agentModifications/amount/access": "remove", - "modifications/agentModifications/amount/created": "remove", - "modifications/agentModifications/amount/createdBy": "remove", - "modifications/agentModifications/amount/deprecated": "remove", - "modifications/agentModifications/amount/lastEdited": "remove", - "modifications/agentModifications/amount/lastEditedBy": "remove", - "modifications/physicalModifications/parameters/parameterName": "parameter_name", - "modifications/physicalModifications/parameters/references": "references", - "modifications/physicalModifications/parameters/access": "remove", - "modifications/physicalModifications/parameters/created": "remove", - "modifications/physicalModifications/parameters/createdBy": "remove", - "modifications/physicalModifications/parameters/deprecated": "remove", - "modifications/physicalModifications/parameters/lastEdited": "remove", - "modifications/physicalModifications/parameters/lastEditedBy": "remove", - "modifications/structuralModifications/extentAmount/highLimit": "high_limit", - "modifications/structuralModifications/extentAmount/lowLimit": "low_limit", - "modifications/structuralModifications/extentAmount/nonNumericValue": "non_numeric_value", - "modifications/structuralModifications/extentAmount/refPname": "ref_pname", - "modifications/structuralModifications/extentAmount/access": "remove", - "modifications/structuralModifications/extentAmount/created": "remove", - "modifications/structuralModifications/extentAmount/createdBy": "remove", - "modifications/structuralModifications/extentAmount/deprecated": 
"remove", - "modifications/structuralModifications/extentAmount/lastEdited": "remove", - "modifications/structuralModifications/extentAmount/lastEditedBy": "remove", - "modifications/structuralModifications/molecularFragment/approvalID": "unii", - "modifications/structuralModifications/molecularFragment/linkingID": "linking_id", - "modifications/structuralModifications/molecularFragment/refPname": "ref_pname", - "modifications/structuralModifications/molecularFragment/references": "references", - "modifications/structuralModifications/molecularFragment/substanceClass": "substance_class", - "modifications/structuralModifications/molecularFragment/access": "remove", - "modifications/structuralModifications/molecularFragment/created": "remove", - "modifications/structuralModifications/molecularFragment/createdBy": "remove", - "modifications/structuralModifications/molecularFragment/deprecated": "remove", - "modifications/structuralModifications/molecularFragment/lastEdited": "remove", - "modifications/structuralModifications/molecularFragment/lastEditedBy": "remove", - "modifications/structuralModifications/sites/residueIndex": "residue_index", - "modifications/structuralModifications/sites/subunitIndex": "subunit_index", - "nucleicAcid/linkages/sites/residueIndex": "residue_index", - "nucleicAcid/linkages/sites/subunitIndex": "subunit_index", - "nucleicAcid/sugars/sites/residueIndex": "residue_index", - "nucleicAcid/sugars/sites/subunitIndex": "subunit_index", - "polymer/classification/parentSubstance/approvalID": "unii", - "polymer/classification/parentSubstance/linkingID": "linking_id", - "polymer/classification/parentSubstance/refPname": "ref_pname", - "polymer/classification/parentSubstance/substanceClass": "substance_class", - "polymer/classification/parentSubstance/access": "remove", - "polymer/classification/parentSubstance/created": "remove", - "polymer/classification/parentSubstance/createdBy": "remove", - "polymer/classification/parentSubstance/deprecated": "remove", - "polymer/classification/parentSubstance/lastEdited": "remove", - "polymer/classification/parentSubstance/lastEditedBy": "remove", - "polymer/monomers/amount/highLimit": "high_limit", - "polymer/monomers/amount/lowLimit": "low_limit", - "polymer/monomers/amount/nonNumericValue": "non_numeric_value", - "polymer/monomers/amount/references": "references", - "polymer/monomers/amount/access": "remove", - "polymer/monomers/amount/created": "remove", - "polymer/monomers/amount/createdBy": "remove", - "polymer/monomers/amount/deprecated": "remove", - "polymer/monomers/amount/lastEdited": "remove", - "polymer/monomers/amount/lastEditedBy": "remove", - "polymer/monomers/monomerSubstance/linkingID": "linking_id", - "polymer/monomers/monomerSubstance/references": "references", - "polymer/monomers/monomerSubstance/refPname": "ref_pname", - "polymer/monomers/monomerSubstance/substanceClass": "substance_class", - "polymer/monomers/monomerSubstance/approvalID": "unii", - "polymer/monomers/monomerSubstance/access": "remove", - "polymer/monomers/monomerSubstance/created": "remove", - "polymer/monomers/monomerSubstance/createdBy": "remove", - "polymer/monomers/monomerSubstance/deprecated": "remove", - "polymer/monomers/monomerSubstance/lastEdited": "remove", - "polymer/monomers/monomerSubstance/lastEditedBy": "remove", - "polymer/structural_units/amount/highLimit": "high_limit", - "polymer/structural_units/amount/lowLimit": "low_limit", - "polymer/structural_units/amount/nonNumericValue": "non_numeric_value", - 
"polymer/structural_units/amount/access": "remove", - "polymer/structural_units/amount/created": "remove", - "polymer/structural_units/amount/createdBy": "remove", - "polymer/structural_units/amount/deprecated": "remove", - "polymer/structural_units/amount/lastEdited": "remove", - "polymer/structural_units/amount/lastEditedBy": "remove", - "properties/parameters/value/nonNumericValue": "non_numeric_value", - "properties/parameters/value/access": "remove", - "properties/parameters/value/created": "remove", - "properties/parameters/value/createdBy": "remove", - "properties/parameters/value/deprecated": "remove", - "properties/parameters/value/lastEdited": "remove", - "properties/parameters/value/lastEditedBy": "remove", - "protein/disulfideLinks/sites/residueIndex": "residue_index", - "protein/disulfideLinks/sites/subunitIndex": "subunit_index", - "protein/glycosylation/CGlycosylationSites/residueIndex": "residue_index", - "protein/glycosylation/CGlycosylationSites/subunitIndex": "subunit_index", - "protein/glycosylation/NGlycosylationSites/residueIndex": "residue_index", - "protein/glycosylation/NGlycosylationSites/subunitIndex": "subunit_index", - "protein/glycosylation/OGlycosylationSites/residueIndex": "residue_index", - "protein/glycosylation/OGlycosylationSites/subunitIndex": "subunit_index", - "protein/otherLinks/sites/residueIndex": "residue_index", - "protein/otherLinks/sites/subunitIndex": "subunit_index", - } - - rename_map_3rd_level_fields = { - "mixture/components/access": "remove", - "mixture/components/created": "remove", - "mixture/components/createdBy": "remove", - "mixture/components/deprecated": "remove", - "mixture/components/lastEdited": "remove", - "mixture/components/lastEditedBy": "remove", - "mixture/parentSubstance/approvalID": "unii", - "mixture/parentSubstance/linkingID": "linking_id", - "mixture/parentSubstance/refPname": "ref_pname", - "mixture/parentSubstance/substanceClass": "substance_class", - "mixture/parentSubstance/access": "remove", - "mixture/parentSubstance/created": "remove", - "mixture/parentSubstance/createdBy": "remove", - "mixture/parentSubstance/deprecated": "remove", - "mixture/parentSubstance/lastEdited": "remove", - "mixture/parentSubstance/lastEditedBy": "remove", - "modifications/agentModifications/agentModificationProcess": "agent_modification_process", - "modifications/agentModifications/agentModificationRole": "agent_modification_role", - "modifications/agentModifications/agentModificationType": "agent_modification_type", - "modifications/agentModifications/agentSubstance": "agent_substance", - "modifications/agentModifications/modificationGroup": "modification_group", - "modifications/agentModifications/references": "references", - "modifications/agentModifications/access": "remove", - "modifications/agentModifications/created": "remove", - "modifications/agentModifications/createdBy": "remove", - "modifications/agentModifications/deprecated": "remove", - "modifications/agentModifications/lastEdited": "remove", - "modifications/agentModifications/lastEditedBy": "remove", - "modifications/physicalModifications/modification_group": "modification_group", - "modifications/physicalModifications/physicalModificationRole": "physical_modification_role", - "modifications/physicalModifications/references": "references", - "modifications/physicalModifications/access": "remove", - "modifications/physicalModifications/created": "remove", - "modifications/physicalModifications/createdBy": "remove", - "modifications/physicalModifications/deprecated": 
"remove", - "modifications/physicalModifications/lastEdited": "remove", - "modifications/physicalModifications/lastEditedBy": "remove", - "modifications/structuralModifications/extentAmount": "extent_amount", - "modifications/structuralModifications/locationType": "location_type", - "modifications/structuralModifications/modificationGroup": "modification_group", - "modifications/structuralModifications/molecularFragment": "molecular_fragment", - "modifications/structuralModifications/references": "references", - "modifications/structuralModifications/residueModified": "residue_modified", - "modifications/structuralModifications/structuralModificationType": "structural_modification_type", - "modifications/structuralModifications/access": "remove", - "modifications/structuralModifications/created": "remove", - "modifications/structuralModifications/createdBy": "remove", - "modifications/structuralModifications/deprecated": "remove", - "modifications/structuralModifications/lastEdited": "remove", - "modifications/structuralModifications/lastEditedBy": "remove", - "moieties/countAmount/lowLimit": "low_limit", - "moieties/countAmount/highLimit": "high_limit", - "moieties/countAmount/nonNumericValue": "non_numeric_value", - "moieties/countAmount/references": "references", - "moieties/countAmount/access": "remove", - "moieties/countAmount/created": "remove", - "moieties/countAmount/createdBy": "remove", - "moieties/countAmount/deprecated": "remove", - "moieties/countAmount/lastEdited": "remove", - "moieties/countAmount/lastEditedBy": "remove", - "moieties/references/countAmount": "count_amount", - "names/nameOrgs/deprecatedDate": "deprecated_date", - "names/nameOrgs/nameOrg": "name_org", - "names/nameOrgs/references": "references", - "names/nameOrgs/access": "remove", - "names/nameOrgs/created": "remove", - "names/nameOrgs/createdBy": "remove", - "names/nameOrgs/deprecated": "remove", - "names/nameOrgs/lastEdited": "remove", - "names/nameOrgs/lastEditedBy": "remove", - "nucleicAcid/linkages/access": "remove", - "nucleicAcid/linkages/created": "remove", - "nucleicAcid/linkages/createdBy": "remove", - "nucleicAcid/linkages/deprecated": "remove", - "nucleicAcid/linkages/lastEdited": "remove", - "nucleicAcid/linkages/lastEditedBy": "remove", - "nucleicAcid/linkages/sitesShorthand": "remove", - "nucleicAcid/subunits/references": "references", - "nucleicAcid/subunits/subunitIndex": "subunit_index", - "nucleicAcid/subunits/access": "remove", - "nucleicAcid/subunits/created": "remove", - "nucleicAcid/subunits/createdBy": "remove", - "nucleicAcid/subunits/deprecated": "remove", - "nucleicAcid/subunits/lastEdited": "remove", - "nucleicAcid/subunits/lastEditedBy": "remove", - "nucleicAcid/subunits/length": "remove", - "nucleicAcid/sugars/references": "references", - "nucleicAcid/sugars/access": "remove", - "nucleicAcid/sugars/created": "remove", - "nucleicAcid/sugars/createdBy": "remove", - "nucleicAcid/sugars/deprecated": "remove", - "nucleicAcid/sugars/lastEdited": "remove", - "nucleicAcid/sugars/lastEditedBy": "remove", - "nucleicAcid/sugars/sitesShorthand": "remove", - "polymer/classification/parentSubstance": "parent_substance", - "polymer/classification/polymerClass": "polymer_class", - "polymer/classification/polymerGeometry": "polymer_geometry", - "polymer/classification/polymerSubclass": "polymer_subclass", - "polymer/classification/references": "references", - "polymer/classification/sourceType": "source_type", - "polymer/classification/access": "remove", - "polymer/classification/created": 
"remove", - "polymer/classification/createdBy": "remove", - "polymer/classification/deprecated": "remove", - "polymer/classification/lastEdited": "remove", - "polymer/classification/lastEditedBy": "remove", - "polymer/displayStructure/definedStereo": "defined_stereo", - "polymer/displayStructure/ezCenters": "ez_centers", - "polymer/displayStructure/mwt": "molecular_weight", - "polymer/displayStructure/opticalActivity": "optical_activity", - "polymer/displayStructure/references": "references", - "polymer/displayStructure/stereoCenters": "stereo_centers", - "polymer/displayStructure/access": "remove", - "polymer/displayStructure/created": "remove", - "polymer/displayStructure/createdBy": "remove", - "polymer/displayStructure/deprecated": "remove", - "polymer/displayStructure/digest": "remove", - "polymer/displayStructure/formula": "remove", - "polymer/displayStructure/lastEdited": "remove", - "polymer/displayStructure/lastEditedBy": "remove", - "polymer/displayStructure/self": "remove", - "polymer/displayStructure/smiles": "remove", - "polymer/idealizedStructure/definedStereo": "defined_stereo", - "polymer/idealizedStructure/ezCenters": "ez_centers", - "polymer/idealizedStructure/mwt": "molecular_weight", - "polymer/idealizedStructure/opticalActivity": "optical_activity", - "polymer/idealizedStructure/stereoCenters": "stereo_centers", - "polymer/idealizedStructure/access": "remove", - "polymer/idealizedStructure/created": "remove", - "polymer/idealizedStructure/createdBy": "remove", - "polymer/idealizedStructure/deprecated": "remove", - "polymer/idealizedStructure/digest": "remove", - "polymer/idealizedStructure/formula": "remove", - "polymer/idealizedStructure/lastEdited": "remove", - "polymer/idealizedStructure/lastEditedBy": "remove", - "polymer/idealizedStructure/self": "remove", - "polymer/idealizedStructure/smiles": "remove", - "polymer/monomers/monomerSubstance": "monomer_substance", - "polymer/monomers/references": "references", - "polymer/monomers/access": "remove", - "polymer/monomers/created": "remove", - "polymer/monomers/createdBy": "remove", - "polymer/monomers/deprecated": "remove", - "polymer/monomers/lastEdited": "remove", - "polymer/monomers/lastEditedBy": "remove", - "polymer/structuralUnits/attachmentCount": "attachment_count", - "polymer/structuralUnits/attachmentMap": "attachment_map", - "polymer/structuralUnits/access": "remove", - "polymer/structuralUnits/created": "remove", - "polymer/structuralUnits/createdBy": "remove", - "polymer/structuralUnits/deprecated": "remove", - "polymer/structuralUnits/lastEdited": "remove", - "polymer/structuralUnits/lastEditedBy": "remove", - "properties/parameters/access": "remove", - "properties/parameters/created": "remove", - "properties/parameters/createdBy": "remove", - "properties/parameters/deprecated": "remove", - "properties/parameters/lastEdited": "remove", - "properties/parameters/lastEditedBy": "remove", - "properties/value/highLimit": "high_limit", - "properties/value/lowLimit": "low_limit", - "properties/value/nonNumericValue": "non_numeric_value", - "properties/value/references": "references", - "properties/value/access": "remove", - "properties/value/created": "remove", - "properties/value/createdBy": "remove", - "properties/value/deprecated": "remove", - "properties/value/lastEdited": "remove", - "properties/value/lastEditedBy": "remove", - "protein/disulfideLinks/sitesShorthand": "remove", - "protein/glycosylation/glycosylationType": "glycosylation_type", - "protein/glycosylation/CGlycosylationSites": 
"c_glycosylation_sites", - "protein/glycosylation/NGlycosylationSites": "n_glycosylation_sites", - "protein/glycosylation/OGlycosylationSites": "o_glycosylation_sites", - "protein/glycosylation/references": "references", - "protein/glycosylation/access": "remove", - "protein/glycosylation/created": "remove", - "protein/glycosylation/createdBy": "remove", - "protein/glycosylation/deprecated": "remove", - "protein/glycosylation/lastEdited": "remove", - "protein/glycosylation/lastEditedBy": "remove", - "protein/otherLinks/linkageType": "linkage_type", - "protein/otherLinks/references": "references", - "protein/otherLinks/access": "remove", - "protein/otherLinks/created": "remove", - "protein/otherLinks/createdBy": "remove", - "protein/otherLinks/deprecated": "remove", - "protein/otherLinks/lastEdited": "remove", - "protein/otherLinks/lastEditedBy": "remove", - "protein/subunits/references": "references", - "protein/subunits/subunitIndex": "subunit_index", - "protein/subunits/access": "remove", - "protein/subunits/created": "remove", - "protein/subunits/createdBy": "remove", - "protein/subunits/deprecated": "remove", - "protein/subunits/lastEdited": "remove", - "protein/subunits/lastEditedBy": "remove", - "protein/subunits/length": "remove", - "relationships/amount/highLimit": "high_limit", - "relationships/amount/lowLimit": "low_limit", - "relationships/amount/nonNumericValue": "non_numeric_value", - "relationships/amount/references": "references", - "relationships/amount/access": "remove", - "relationships/amount/created": "remove", - "relationships/amount/createdBy": "remove", - "relationships/amount/deprecated": "remove", - "relationships/amount/lastEdited": "remove", - "relationships/amount/lastEditedBy": "remove", - "relationships/relatedSubstance/approvalID": "unii", - "relationships/relatedSubstance/linkingID": "linking_id", - "relationships/relatedSubstance/references": "references", - "relationships/relatedSubstance/refPname": "ref_pname", - "relationships/relatedSubstance/substanceClass": "substance_class", - "relationships/relatedSubstance/access": "remove", - "relationships/relatedSubstance/created": "remove", - "relationships/relatedSubstance/createdBy": "remove", - "relationships/relatedSubstance/deprecated": "remove", - "relationships/relatedSubstance/lastEdited": "remove", - "relationships/relatedSubstance/lastEditedBy": "remove", - "relationships/mediatorSubstance/approvalID": "unii", - "relationships/mediatorSubstance/linkingID": "linking_id", - "relationships/mediatorSubstance/references": "references", - "relationships/mediatorSubstance/refPname": "ref_pname", - "relationships/mediatorSubstance/substanceClass": "substance_class", - "relationships/mediatorSubstance/access": "remove", - "relationships/mediatorSubstance/created": "remove", - "relationships/mediatorSubstance/createdBy": "remove", - "relationships/mediatorSubstance/deprecated": "remove", - "relationships/mediatorSubstance/lastEdited": "remove", - "relationships/mediatorSubstance/lastEditedBy": "remove", - "structurallyDiverse/hybridSpeciesMaternalOrganism/approvalID": "unii", - "structurallyDiverse/hybridSpeciesMaternalOrganism/linkingID": "linking_id", - "structurallyDiverse/hybridSpeciesMaternalOrganism/refPname": "ref_pname", - "structurallyDiverse/hybridSpeciesMaternalOrganism/substanceClass": "substance_class", - "structurallyDiverse/hybridSpeciesMaternalOrganism/access": "remove", - "structurallyDiverse/hybridSpeciesMaternalOrganism/created": "remove", - 
"structurallyDiverse/hybridSpeciesMaternalOrganism/createdBy": "remove", - "structurallyDiverse/hybridSpeciesMaternalOrganism/deprecated": "remove", - "structurallyDiverse/hybridSpeciesMaternalOrganism/lastEdited": "remove", - "structurallyDiverse/hybridSpeciesMaternalOrganism/lastEditedBy": "remove", - "structurallyDiverse/hybridSpeciesPaternalOrganism/approvalID": "unii", - "structurallyDiverse/hybridSpeciesPaternalOrganism/linkingID": "linking_id", - "structurallyDiverse/hybridSpeciesPaternalOrganism/refPname": "ref_pname", - "structurallyDiverse/hybridSpeciesPaternalOrganism/substanceClass": "substance_class", - "structurallyDiverse/hybridSpeciesPaternalOrganism/access": "remove", - "structurallyDiverse/hybridSpeciesPaternalOrganism/created": "remove", - "structurallyDiverse/hybridSpeciesPaternalOrganism/createdBy": "remove", - "structurallyDiverse/hybridSpeciesPaternalOrganism/deprecated": "remove", - "structurallyDiverse/hybridSpeciesPaternalOrganism/lastEdited": "remove", - "structurallyDiverse/hybridSpeciesPaternalOrganism/lastEditedBy": "remove", - "structurallyDiverse/parentSubstance/approvalID": "unii", - "structurallyDiverse/parentSubstance/linkingID": "linking_id", - "structurallyDiverse/parentSubstance/references": "references", - "structurallyDiverse/parentSubstance/refPname": "ref_pname", - "structurallyDiverse/parentSubstance/substanceClass": "substance_class", - "structurallyDiverse/parentSubstance/access": "remove", - "structurallyDiverse/parentSubstance/created": "remove", - "structurallyDiverse/parentSubstance/createdBy": "remove", - "structurallyDiverse/parentSubstance/deprecated": "remove", - "structurallyDiverse/parentSubstance/lastEdited": "remove", - "structurallyDiverse/parentSubstance/lastEditedBy": "remove", - } - - rename_map_2nd_level_fields = { - "codes/codeSystem": "code_system", - "codes/references": "references", - "codes/access": "remove", - "codes/created": "remove", - "codes/createdBy": "remove", - "codes/deprecated": "remove", - "codes/lastEdited": "remove", - "codes/lastEditedBy": "remove", - "mixture/parentSubstance": "parent_substance", - "mixture/access": "remove", - "mixture/created": "remove", - "mixture/createdBy": "remove", - "mixture/deprecated": "remove", - "mixture/lastEdited": "remove", - "mixture/lastEditedBy": "remove", - "modifications/agentModifications": "agent_modifications", - "modifications/physicalModifications": "physical_modifications", - "modifications/references": "references", - "modifications/structuralModifications": "structural_modifications", - "modifications/access": "remove", - "modifications/created": "remove", - "modifications/createdBy": "remove", - "modifications/deprecated": "remove", - "modifications/lastEdited": "remove", - "modifications/lastEditedBy": "remove", - "moieties/countAmount": "count_amount", - "moieties/definedStereo": "defined_stereo", - "moieties/ezCenters": "ez_centers", - "moieties/mwt": "molecular_weight", - "moieties/opticalActivity": "optical_activity", - "moieties/references": "references", - "moieties/stereoCenters": "stereo_centers", - "moieties/stereoComments": "stereo_comments", - "moieties/access": "remove", - "moieties/created": "remove", - "moieties/createdBy": "remove", - "moieties/deprecated": "remove", - "moieties/digest": "remove", - "moieties/hash": "remove", - "moieties/lastEdited": "remove", - "moieties/lastEditedBy": "remove", - "moieties/self": "remove", - "names/displayName": "display_name", - "names/domains": "domains", - "names/nameJurisdiction": "name_jurisdiction", - 
"names/nameOrgs": "name_orgs", - "names/access": "remove", - "names/created": "remove", - "names/createdBy": "remove", - "names/deprecated": "remove", - "names/lastEdited": "remove", - "names/lastEditedBy": "remove", - "notes/access": "remove", - "notes/created": "remove", - "notes/createdBy": "remove", - "notes/deprecated": "remove", - "notes/lastEdited": "remove", - "notes/lastEditedBy": "remove", - "nucleicAcid/nucleicAcidSubType": "nucleic_acid_sub_type", - "nucleicAcid/nucleicAcidType": "nucleic_acid_type", - "nucleicAcid/sequenceOrigin": "sequence_origin", - "nucleicAcid/sequenceType": "sequence_type", - "nucleicAcid/access": "remove", - "nucleicAcid/created": "remove", - "nucleicAcid/createdBy": "remove", - "nucleicAcid/deprecated": "remove", - "nucleicAcid/lastEdited": "remove", - "nucleicAcid/lastEditedBy": "remove", - "polymer/displayStructure": "display_structure", - "polymer/idealizedStructure": "idealized_structure", - "polymer/structuralUnits": "structural_units", - "polymer/access": "remove", - "polymer/created": "remove", - "polymer/createdBy": "remove", - "polymer/deprecated": "remove", - "polymer/lastEdited": "remove", - "polymer/lastEditedBy": "remove", - "properties/parameters": "parameters", - "properties/propertyType": "property_type", - "properties/references": "references", - "properties/access": "remove", - "properties/created": "remove", - "properties/createdBy": "remove", - "properties/deprecated": "remove", - "properties/lastEdited": "remove", - "properties/lastEditedBy": "remove", - "protein/disulfideLinks": "disulfide_links", - "protein/otherLinks": "other_links", - "protein/proteinSubType": "protein_sub_type", - "protein/proteinType": "protein_type", - "protein/sequenceOrigin": "sequence_origin", - "protein/sequenceType": "sequence_type", - "protein/access": "remove", - "protein/created": "remove", - "protein/createdBy": "remove", - "protein/deprecated": "remove", - "protein/lastEdited": "remove", - "protein/lastEditedBy": "remove", - "references/nameJurisdiction": "name_jurisdiction", - "references/nameOrgs": "name_orgs", - "references/displayName": "display_name", - "references/docType": "doc_type", - "references/publicDomain": "public_domain", - "references/documentDate": "document_date", - "references/stereoComments": "stereo_comments", - "references/tags": "tags", - "references/access": "remove", - "references/created": "remove", - "references/createdBy": "remove", - "references/deprecated": "remove", - "references/lastEdited": "remove", - "references/lastEditedBy": "remove", - "references/uploadedFile": "remove", - "relationships/interactionType": "interaction_type", - "relationships/mediatorSubstance": "mediator_substance", - "relationships/references": "references", - "relationships/relatedSubstance": "related_substance", - "relationships/access": "remove", - "relationships/created": "remove", - "relationships/createdBy": "remove", - "relationships/deprecated": "remove", - "relationships/lastEdited": "remove", - "relationships/lastEditedBy": "remove", - "relationships/originatorUuid": "remove", - "structurallyDiverse/developmentalStage": "developmental_stage", - "structurallyDiverse/fractionMaterialType": "fraction_material_type", - "structurallyDiverse/fractionName": "fraction_name", - "structurallyDiverse/hybridSpeciesMaternalOrganism": "hybrid_species_maternal_organism", - "structurallyDiverse/hybridSpeciesPaternalOrganism": "hybrid_species_paternal_organism", - "structurallyDiverse/infraSpecificName": "infra_specific_name", - 
"structurallyDiverse/infraSpecificType": "infra_specific_type", - "structurallyDiverse/organismAuthor": "organism_author", - "structurallyDiverse/organismFamily": "organism_family", - "structurallyDiverse/organismGenus": "organism_genus", - "structurallyDiverse/organismSpecies": "organism_species", - "structurallyDiverse/parentSubstance": "parent_substance", - "structurallyDiverse/part": "part", - "structurallyDiverse/partLocation": "part_location", - "structurallyDiverse/references": "references", - "structurallyDiverse/sourceMaterialClass": "source_material_class", - "structurallyDiverse/sourceMaterialState": "source_material_state", - "structurallyDiverse/sourceMaterialType": "source_material_type", - "structurallyDiverse/access": "remove", - "structurallyDiverse/created": "remove", - "structurallyDiverse/createdBy": "remove", - "structurallyDiverse/deprecated": "remove", - "structurallyDiverse/lastEdited": "remove", - "structurallyDiverse/lastEditedBy": "remove", - "structure/definedStereo": "defined_stereo", - "structure/ezCenters": "ez_centers", - "structure/mwt": "molecular_weight", - "structure/opticalActivity": "optical_activity", - "structure/references": "references", - "structure/stereoCenters": "stereo_centers", - "structure/stereoComments": "stereo_comments", - "structure/access": "remove", - "structure/created": "remove", - "structure/createdBy": "remove", - "structure/deprecated": "remove", - "structure/digest": "remove", - "structure/hash": "remove", - "structure/lastEdited": "remove", - "structure/lastEditedBy": "remove", - "structure/self": "remove", - } - - - rename_map_1st_level_fields = { - "approvalID": "unii", - "codes": "codes", - "definitionLevel": "definition_level", - "definitionType": "definition_type", - "names": "names", - "notes": "notes", - "nucleicAcid": "nucleic_acid", - "properties": "properties", - "references": "references", - "relationships": "relationships", - "structurallyDiverse": "structurally_diverse", - "substanceClass": "substance_class", - "tags": "tags", - "access": "remove", - "approvedBy": "remove", - "changeReason": "remove", - "created": "remove", - "createdBy": "remove", - "deprecated": "remove", - "lastEdited": "remove", - "lastEditedBy": "remove", - "status": "remove" - } - - - def map(self, key, value, output): - def change_key(obj, entry, rename_map): - child_entry_path = entry - parent_entry_obj = obj - - # If entry has nested path find the immediate parent - if entry.find("/") != -1: - parent_entry_path = entry.rsplit("/",1)[0] - child_entry_path = entry.rsplit("/",1)[1] - parent_entry_obj = iterate_dictionary(obj, parent_entry_path) - - if parent_entry_obj != None: - # Check for list - if type(parent_entry_obj) == list: - for o in parent_entry_obj: - if iterate_dictionary(o, child_entry_path) is not None: - # Pop fields with empty array value - if rename_map[entry] == "remove": - o.pop(child_entry_path) - elif o.get(child_entry_path) == []: - o.pop(child_entry_path) - else: - # Rename the key - o[rename_map[entry]] = o.pop(child_entry_path) - elif child_entry_path in o: - o.pop(child_entry_path) - else: - if parent_entry_obj.get(child_entry_path) is not None: - # Pop fields with empty array value - if rename_map[entry] == "remove": - parent_entry_obj.pop(child_entry_path) - elif parent_entry_obj.get(child_entry_path) == []: - parent_entry_obj.pop(child_entry_path) - else: - # Rename the key - parent_entry_obj[rename_map[entry]] = parent_entry_obj.pop(child_entry_path) - - - # Change key names from the leaf nodes back up to the top 
nodes - for entry in self.rename_map_5th_level_fields: - change_key(value, entry, self.rename_map_5th_level_fields) - - for entry in self.rename_map_4th_level_fields: - change_key(value, entry, self.rename_map_4th_level_fields) - - for entry in self.rename_map_3rd_level_fields: - change_key(value, entry, self.rename_map_3rd_level_fields) - - for entry in self.rename_map_2nd_level_fields: - change_key(value, entry, self.rename_map_2nd_level_fields) - - for entry in self.rename_map_1st_level_fields: - change_key(value, entry, self.rename_map_1st_level_fields) - - - output.add(key, value) + rename_map_5th_level_fields = { + "modifications/physicalModifications/parameters/amount/lowLimit": "low_limit", + "modifications/physicalModifications/parameters/amount/highLimit": "high_limit", + "modifications/physicalModifications/parameters/amount/nonNumericValue": "non_numeric_value", + "modifications/physicalModifications/parameters/amount/references": "references", + "modifications/physicalModifications/parameters/amount/access": "remove", + "modifications/physicalModifications/parameters/amount/created": "remove", + "modifications/physicalModifications/parameters/amount/createdBy": "remove", + "modifications/physicalModifications/parameters/amount/deprecated": "remove", + "modifications/physicalModifications/parameters/amount/lastEdited": "remove", + "modifications/physicalModifications/parameters/amount/lastEditedBy": "remove", + } + + rename_map_4th_level_fields = { + "mixture/components/substance/approvalID": "unii", + "mixture/components/substance/linkingID": "linking_id", + "mixture/components/substance/references": "references", + "mixture/components/substance/refPname": "ref_pname", + "mixture/components/substance/substanceClass": "substance_class", + "mixture/components/substance/access": "remove", + "mixture/components/substance/created": "remove", + "mixture/components/substance/createdBy": "remove", + "mixture/components/substance/deprecated": "remove", + "mixture/components/substance/lastEdited": "remove", + "mixture/components/substance/lastEditedBy": "remove", + "modifications/agentModifications/agentSubstance/approvalID": "unii", + "modifications/agentModifications/agentSubstance/linkingID": "linking_id", + "modifications/agentModifications/agentSubstance/references": "references", + "modifications/agentModifications/agentSubstance/refPname": "ref_pname", + "modifications/agentModifications/agentSubstance/substanceClass": "substance_class", + "modifications/agentModifications/agentSubstance/access": "remove", + "modifications/agentModifications/agentSubstance/created": "remove", + "modifications/agentModifications/agentSubstance/createdBy": "remove", + "modifications/agentModifications/agentSubstance/deprecated": "remove", + "modifications/agentModifications/agentSubstance/lastEdited": "remove", + "modifications/agentModifications/agentSubstance/lastEditedBy": "remove", + "modifications/agentModifications/amount/highLimit": "high_limit", + "modifications/agentModifications/amount/lowLimit": "low_limit", + "modifications/agentModifications/amount/nonNumericValue": "non_numeric_value", + "modifications/agentModifications/amount/references": "references", + "modifications/agentModifications/amount/access": "remove", + "modifications/agentModifications/amount/created": "remove", + "modifications/agentModifications/amount/createdBy": "remove", + "modifications/agentModifications/amount/deprecated": "remove", + "modifications/agentModifications/amount/lastEdited": "remove", + 
"modifications/agentModifications/amount/lastEditedBy": "remove", + "modifications/physicalModifications/parameters/parameterName": "parameter_name", + "modifications/physicalModifications/parameters/references": "references", + "modifications/physicalModifications/parameters/access": "remove", + "modifications/physicalModifications/parameters/created": "remove", + "modifications/physicalModifications/parameters/createdBy": "remove", + "modifications/physicalModifications/parameters/deprecated": "remove", + "modifications/physicalModifications/parameters/lastEdited": "remove", + "modifications/physicalModifications/parameters/lastEditedBy": "remove", + "modifications/structuralModifications/extentAmount/highLimit": "high_limit", + "modifications/structuralModifications/extentAmount/lowLimit": "low_limit", + "modifications/structuralModifications/extentAmount/nonNumericValue": "non_numeric_value", + "modifications/structuralModifications/extentAmount/refPname": "ref_pname", + "modifications/structuralModifications/extentAmount/access": "remove", + "modifications/structuralModifications/extentAmount/created": "remove", + "modifications/structuralModifications/extentAmount/createdBy": "remove", + "modifications/structuralModifications/extentAmount/deprecated": "remove", + "modifications/structuralModifications/extentAmount/lastEdited": "remove", + "modifications/structuralModifications/extentAmount/lastEditedBy": "remove", + "modifications/structuralModifications/molecularFragment/approvalID": "unii", + "modifications/structuralModifications/molecularFragment/linkingID": "linking_id", + "modifications/structuralModifications/molecularFragment/refPname": "ref_pname", + "modifications/structuralModifications/molecularFragment/references": "references", + "modifications/structuralModifications/molecularFragment/substanceClass": "substance_class", + "modifications/structuralModifications/molecularFragment/access": "remove", + "modifications/structuralModifications/molecularFragment/created": "remove", + "modifications/structuralModifications/molecularFragment/createdBy": "remove", + "modifications/structuralModifications/molecularFragment/deprecated": "remove", + "modifications/structuralModifications/molecularFragment/lastEdited": "remove", + "modifications/structuralModifications/molecularFragment/lastEditedBy": "remove", + "modifications/structuralModifications/sites/residueIndex": "residue_index", + "modifications/structuralModifications/sites/subunitIndex": "subunit_index", + "nucleicAcid/linkages/sites/residueIndex": "residue_index", + "nucleicAcid/linkages/sites/subunitIndex": "subunit_index", + "nucleicAcid/sugars/sites/residueIndex": "residue_index", + "nucleicAcid/sugars/sites/subunitIndex": "subunit_index", + "polymer/classification/parentSubstance/approvalID": "unii", + "polymer/classification/parentSubstance/linkingID": "linking_id", + "polymer/classification/parentSubstance/refPname": "ref_pname", + "polymer/classification/parentSubstance/substanceClass": "substance_class", + "polymer/classification/parentSubstance/access": "remove", + "polymer/classification/parentSubstance/created": "remove", + "polymer/classification/parentSubstance/createdBy": "remove", + "polymer/classification/parentSubstance/deprecated": "remove", + "polymer/classification/parentSubstance/lastEdited": "remove", + "polymer/classification/parentSubstance/lastEditedBy": "remove", + "polymer/monomers/amount/highLimit": "high_limit", + "polymer/monomers/amount/lowLimit": "low_limit", + 
"polymer/monomers/amount/nonNumericValue": "non_numeric_value", + "polymer/monomers/amount/references": "references", + "polymer/monomers/amount/access": "remove", + "polymer/monomers/amount/created": "remove", + "polymer/monomers/amount/createdBy": "remove", + "polymer/monomers/amount/deprecated": "remove", + "polymer/monomers/amount/lastEdited": "remove", + "polymer/monomers/amount/lastEditedBy": "remove", + "polymer/monomers/monomerSubstance/linkingID": "linking_id", + "polymer/monomers/monomerSubstance/references": "references", + "polymer/monomers/monomerSubstance/refPname": "ref_pname", + "polymer/monomers/monomerSubstance/substanceClass": "substance_class", + "polymer/monomers/monomerSubstance/approvalID": "unii", + "polymer/monomers/monomerSubstance/access": "remove", + "polymer/monomers/monomerSubstance/created": "remove", + "polymer/monomers/monomerSubstance/createdBy": "remove", + "polymer/monomers/monomerSubstance/deprecated": "remove", + "polymer/monomers/monomerSubstance/lastEdited": "remove", + "polymer/monomers/monomerSubstance/lastEditedBy": "remove", + "polymer/structural_units/amount/highLimit": "high_limit", + "polymer/structural_units/amount/lowLimit": "low_limit", + "polymer/structural_units/amount/nonNumericValue": "non_numeric_value", + "polymer/structural_units/amount/access": "remove", + "polymer/structural_units/amount/created": "remove", + "polymer/structural_units/amount/createdBy": "remove", + "polymer/structural_units/amount/deprecated": "remove", + "polymer/structural_units/amount/lastEdited": "remove", + "polymer/structural_units/amount/lastEditedBy": "remove", + "properties/parameters/value/nonNumericValue": "non_numeric_value", + "properties/parameters/value/access": "remove", + "properties/parameters/value/created": "remove", + "properties/parameters/value/createdBy": "remove", + "properties/parameters/value/deprecated": "remove", + "properties/parameters/value/lastEdited": "remove", + "properties/parameters/value/lastEditedBy": "remove", + "protein/disulfideLinks/sites/residueIndex": "residue_index", + "protein/disulfideLinks/sites/subunitIndex": "subunit_index", + "protein/glycosylation/CGlycosylationSites/residueIndex": "residue_index", + "protein/glycosylation/CGlycosylationSites/subunitIndex": "subunit_index", + "protein/glycosylation/NGlycosylationSites/residueIndex": "residue_index", + "protein/glycosylation/NGlycosylationSites/subunitIndex": "subunit_index", + "protein/glycosylation/OGlycosylationSites/residueIndex": "residue_index", + "protein/glycosylation/OGlycosylationSites/subunitIndex": "subunit_index", + "protein/otherLinks/sites/residueIndex": "residue_index", + "protein/otherLinks/sites/subunitIndex": "subunit_index", + } + + rename_map_3rd_level_fields = { + "mixture/components/access": "remove", + "mixture/components/created": "remove", + "mixture/components/createdBy": "remove", + "mixture/components/deprecated": "remove", + "mixture/components/lastEdited": "remove", + "mixture/components/lastEditedBy": "remove", + "mixture/parentSubstance/approvalID": "unii", + "mixture/parentSubstance/linkingID": "linking_id", + "mixture/parentSubstance/refPname": "ref_pname", + "mixture/parentSubstance/substanceClass": "substance_class", + "mixture/parentSubstance/access": "remove", + "mixture/parentSubstance/created": "remove", + "mixture/parentSubstance/createdBy": "remove", + "mixture/parentSubstance/deprecated": "remove", + "mixture/parentSubstance/lastEdited": "remove", + "mixture/parentSubstance/lastEditedBy": "remove", + 
"modifications/agentModifications/agentModificationProcess": "agent_modification_process", + "modifications/agentModifications/agentModificationRole": "agent_modification_role", + "modifications/agentModifications/agentModificationType": "agent_modification_type", + "modifications/agentModifications/agentSubstance": "agent_substance", + "modifications/agentModifications/modificationGroup": "modification_group", + "modifications/agentModifications/references": "references", + "modifications/agentModifications/access": "remove", + "modifications/agentModifications/created": "remove", + "modifications/agentModifications/createdBy": "remove", + "modifications/agentModifications/deprecated": "remove", + "modifications/agentModifications/lastEdited": "remove", + "modifications/agentModifications/lastEditedBy": "remove", + "modifications/physicalModifications/modification_group": "modification_group", + "modifications/physicalModifications/physicalModificationRole": "physical_modification_role", + "modifications/physicalModifications/references": "references", + "modifications/physicalModifications/access": "remove", + "modifications/physicalModifications/created": "remove", + "modifications/physicalModifications/createdBy": "remove", + "modifications/physicalModifications/deprecated": "remove", + "modifications/physicalModifications/lastEdited": "remove", + "modifications/physicalModifications/lastEditedBy": "remove", + "modifications/structuralModifications/extentAmount": "extent_amount", + "modifications/structuralModifications/locationType": "location_type", + "modifications/structuralModifications/modificationGroup": "modification_group", + "modifications/structuralModifications/molecularFragment": "molecular_fragment", + "modifications/structuralModifications/references": "references", + "modifications/structuralModifications/residueModified": "residue_modified", + "modifications/structuralModifications/structuralModificationType": "structural_modification_type", + "modifications/structuralModifications/access": "remove", + "modifications/structuralModifications/created": "remove", + "modifications/structuralModifications/createdBy": "remove", + "modifications/structuralModifications/deprecated": "remove", + "modifications/structuralModifications/lastEdited": "remove", + "modifications/structuralModifications/lastEditedBy": "remove", + "moieties/countAmount/lowLimit": "low_limit", + "moieties/countAmount/highLimit": "high_limit", + "moieties/countAmount/nonNumericValue": "non_numeric_value", + "moieties/countAmount/references": "references", + "moieties/countAmount/access": "remove", + "moieties/countAmount/created": "remove", + "moieties/countAmount/createdBy": "remove", + "moieties/countAmount/deprecated": "remove", + "moieties/countAmount/lastEdited": "remove", + "moieties/countAmount/lastEditedBy": "remove", + "moieties/references/countAmount": "count_amount", + "names/nameOrgs/deprecatedDate": "deprecated_date", + "names/nameOrgs/nameOrg": "name_org", + "names/nameOrgs/references": "references", + "names/nameOrgs/access": "remove", + "names/nameOrgs/created": "remove", + "names/nameOrgs/createdBy": "remove", + "names/nameOrgs/deprecated": "remove", + "names/nameOrgs/lastEdited": "remove", + "names/nameOrgs/lastEditedBy": "remove", + "nucleicAcid/linkages/access": "remove", + "nucleicAcid/linkages/created": "remove", + "nucleicAcid/linkages/createdBy": "remove", + "nucleicAcid/linkages/deprecated": "remove", + "nucleicAcid/linkages/lastEdited": "remove", + 
"nucleicAcid/linkages/lastEditedBy": "remove", + "nucleicAcid/linkages/sitesShorthand": "remove", + "nucleicAcid/subunits/references": "references", + "nucleicAcid/subunits/subunitIndex": "subunit_index", + "nucleicAcid/subunits/access": "remove", + "nucleicAcid/subunits/created": "remove", + "nucleicAcid/subunits/createdBy": "remove", + "nucleicAcid/subunits/deprecated": "remove", + "nucleicAcid/subunits/lastEdited": "remove", + "nucleicAcid/subunits/lastEditedBy": "remove", + "nucleicAcid/subunits/length": "remove", + "nucleicAcid/sugars/references": "references", + "nucleicAcid/sugars/access": "remove", + "nucleicAcid/sugars/created": "remove", + "nucleicAcid/sugars/createdBy": "remove", + "nucleicAcid/sugars/deprecated": "remove", + "nucleicAcid/sugars/lastEdited": "remove", + "nucleicAcid/sugars/lastEditedBy": "remove", + "nucleicAcid/sugars/sitesShorthand": "remove", + "polymer/classification/parentSubstance": "parent_substance", + "polymer/classification/polymerClass": "polymer_class", + "polymer/classification/polymerGeometry": "polymer_geometry", + "polymer/classification/polymerSubclass": "polymer_subclass", + "polymer/classification/references": "references", + "polymer/classification/sourceType": "source_type", + "polymer/classification/access": "remove", + "polymer/classification/created": "remove", + "polymer/classification/createdBy": "remove", + "polymer/classification/deprecated": "remove", + "polymer/classification/lastEdited": "remove", + "polymer/classification/lastEditedBy": "remove", + "polymer/displayStructure/definedStereo": "defined_stereo", + "polymer/displayStructure/ezCenters": "ez_centers", + "polymer/displayStructure/mwt": "molecular_weight", + "polymer/displayStructure/opticalActivity": "optical_activity", + "polymer/displayStructure/references": "references", + "polymer/displayStructure/stereoCenters": "stereo_centers", + "polymer/displayStructure/access": "remove", + "polymer/displayStructure/created": "remove", + "polymer/displayStructure/createdBy": "remove", + "polymer/displayStructure/deprecated": "remove", + "polymer/displayStructure/digest": "remove", + "polymer/displayStructure/formula": "remove", + "polymer/displayStructure/lastEdited": "remove", + "polymer/displayStructure/lastEditedBy": "remove", + "polymer/displayStructure/self": "remove", + "polymer/displayStructure/smiles": "remove", + "polymer/idealizedStructure/definedStereo": "defined_stereo", + "polymer/idealizedStructure/ezCenters": "ez_centers", + "polymer/idealizedStructure/mwt": "molecular_weight", + "polymer/idealizedStructure/opticalActivity": "optical_activity", + "polymer/idealizedStructure/stereoCenters": "stereo_centers", + "polymer/idealizedStructure/access": "remove", + "polymer/idealizedStructure/created": "remove", + "polymer/idealizedStructure/createdBy": "remove", + "polymer/idealizedStructure/deprecated": "remove", + "polymer/idealizedStructure/digest": "remove", + "polymer/idealizedStructure/formula": "remove", + "polymer/idealizedStructure/lastEdited": "remove", + "polymer/idealizedStructure/lastEditedBy": "remove", + "polymer/idealizedStructure/self": "remove", + "polymer/idealizedStructure/smiles": "remove", + "polymer/monomers/monomerSubstance": "monomer_substance", + "polymer/monomers/references": "references", + "polymer/monomers/access": "remove", + "polymer/monomers/created": "remove", + "polymer/monomers/createdBy": "remove", + "polymer/monomers/deprecated": "remove", + "polymer/monomers/lastEdited": "remove", + "polymer/monomers/lastEditedBy": "remove", + 
"polymer/structuralUnits/attachmentCount": "attachment_count", + "polymer/structuralUnits/attachmentMap": "attachment_map", + "polymer/structuralUnits/access": "remove", + "polymer/structuralUnits/created": "remove", + "polymer/structuralUnits/createdBy": "remove", + "polymer/structuralUnits/deprecated": "remove", + "polymer/structuralUnits/lastEdited": "remove", + "polymer/structuralUnits/lastEditedBy": "remove", + "properties/parameters/access": "remove", + "properties/parameters/created": "remove", + "properties/parameters/createdBy": "remove", + "properties/parameters/deprecated": "remove", + "properties/parameters/lastEdited": "remove", + "properties/parameters/lastEditedBy": "remove", + "properties/value/highLimit": "high_limit", + "properties/value/lowLimit": "low_limit", + "properties/value/nonNumericValue": "non_numeric_value", + "properties/value/references": "references", + "properties/value/access": "remove", + "properties/value/created": "remove", + "properties/value/createdBy": "remove", + "properties/value/deprecated": "remove", + "properties/value/lastEdited": "remove", + "properties/value/lastEditedBy": "remove", + "protein/disulfideLinks/sitesShorthand": "remove", + "protein/glycosylation/glycosylationType": "glycosylation_type", + "protein/glycosylation/CGlycosylationSites": "c_glycosylation_sites", + "protein/glycosylation/NGlycosylationSites": "n_glycosylation_sites", + "protein/glycosylation/OGlycosylationSites": "o_glycosylation_sites", + "protein/glycosylation/references": "references", + "protein/glycosylation/access": "remove", + "protein/glycosylation/created": "remove", + "protein/glycosylation/createdBy": "remove", + "protein/glycosylation/deprecated": "remove", + "protein/glycosylation/lastEdited": "remove", + "protein/glycosylation/lastEditedBy": "remove", + "protein/otherLinks/linkageType": "linkage_type", + "protein/otherLinks/references": "references", + "protein/otherLinks/access": "remove", + "protein/otherLinks/created": "remove", + "protein/otherLinks/createdBy": "remove", + "protein/otherLinks/deprecated": "remove", + "protein/otherLinks/lastEdited": "remove", + "protein/otherLinks/lastEditedBy": "remove", + "protein/subunits/references": "references", + "protein/subunits/subunitIndex": "subunit_index", + "protein/subunits/access": "remove", + "protein/subunits/created": "remove", + "protein/subunits/createdBy": "remove", + "protein/subunits/deprecated": "remove", + "protein/subunits/lastEdited": "remove", + "protein/subunits/lastEditedBy": "remove", + "protein/subunits/length": "remove", + "relationships/amount/highLimit": "high_limit", + "relationships/amount/lowLimit": "low_limit", + "relationships/amount/nonNumericValue": "non_numeric_value", + "relationships/amount/references": "references", + "relationships/amount/access": "remove", + "relationships/amount/created": "remove", + "relationships/amount/createdBy": "remove", + "relationships/amount/deprecated": "remove", + "relationships/amount/lastEdited": "remove", + "relationships/amount/lastEditedBy": "remove", + "relationships/relatedSubstance/approvalID": "unii", + "relationships/relatedSubstance/linkingID": "linking_id", + "relationships/relatedSubstance/references": "references", + "relationships/relatedSubstance/refPname": "ref_pname", + "relationships/relatedSubstance/substanceClass": "substance_class", + "relationships/relatedSubstance/access": "remove", + "relationships/relatedSubstance/created": "remove", + "relationships/relatedSubstance/createdBy": "remove", + 
"relationships/relatedSubstance/deprecated": "remove", + "relationships/relatedSubstance/lastEdited": "remove", + "relationships/relatedSubstance/lastEditedBy": "remove", + "relationships/mediatorSubstance/approvalID": "unii", + "relationships/mediatorSubstance/linkingID": "linking_id", + "relationships/mediatorSubstance/references": "references", + "relationships/mediatorSubstance/refPname": "ref_pname", + "relationships/mediatorSubstance/substanceClass": "substance_class", + "relationships/mediatorSubstance/access": "remove", + "relationships/mediatorSubstance/created": "remove", + "relationships/mediatorSubstance/createdBy": "remove", + "relationships/mediatorSubstance/deprecated": "remove", + "relationships/mediatorSubstance/lastEdited": "remove", + "relationships/mediatorSubstance/lastEditedBy": "remove", + "structurallyDiverse/hybridSpeciesMaternalOrganism/approvalID": "unii", + "structurallyDiverse/hybridSpeciesMaternalOrganism/linkingID": "linking_id", + "structurallyDiverse/hybridSpeciesMaternalOrganism/refPname": "ref_pname", + "structurallyDiverse/hybridSpeciesMaternalOrganism/substanceClass": "substance_class", + "structurallyDiverse/hybridSpeciesMaternalOrganism/access": "remove", + "structurallyDiverse/hybridSpeciesMaternalOrganism/created": "remove", + "structurallyDiverse/hybridSpeciesMaternalOrganism/createdBy": "remove", + "structurallyDiverse/hybridSpeciesMaternalOrganism/deprecated": "remove", + "structurallyDiverse/hybridSpeciesMaternalOrganism/lastEdited": "remove", + "structurallyDiverse/hybridSpeciesMaternalOrganism/lastEditedBy": "remove", + "structurallyDiverse/hybridSpeciesPaternalOrganism/approvalID": "unii", + "structurallyDiverse/hybridSpeciesPaternalOrganism/linkingID": "linking_id", + "structurallyDiverse/hybridSpeciesPaternalOrganism/refPname": "ref_pname", + "structurallyDiverse/hybridSpeciesPaternalOrganism/substanceClass": "substance_class", + "structurallyDiverse/hybridSpeciesPaternalOrganism/access": "remove", + "structurallyDiverse/hybridSpeciesPaternalOrganism/created": "remove", + "structurallyDiverse/hybridSpeciesPaternalOrganism/createdBy": "remove", + "structurallyDiverse/hybridSpeciesPaternalOrganism/deprecated": "remove", + "structurallyDiverse/hybridSpeciesPaternalOrganism/lastEdited": "remove", + "structurallyDiverse/hybridSpeciesPaternalOrganism/lastEditedBy": "remove", + "structurallyDiverse/parentSubstance/approvalID": "unii", + "structurallyDiverse/parentSubstance/linkingID": "linking_id", + "structurallyDiverse/parentSubstance/references": "references", + "structurallyDiverse/parentSubstance/refPname": "ref_pname", + "structurallyDiverse/parentSubstance/substanceClass": "substance_class", + "structurallyDiverse/parentSubstance/access": "remove", + "structurallyDiverse/parentSubstance/created": "remove", + "structurallyDiverse/parentSubstance/createdBy": "remove", + "structurallyDiverse/parentSubstance/deprecated": "remove", + "structurallyDiverse/parentSubstance/lastEdited": "remove", + "structurallyDiverse/parentSubstance/lastEditedBy": "remove", + } + + rename_map_2nd_level_fields = { + "codes/codeSystem": "code_system", + "codes/references": "references", + "codes/access": "remove", + "codes/created": "remove", + "codes/createdBy": "remove", + "codes/deprecated": "remove", + "codes/lastEdited": "remove", + "codes/lastEditedBy": "remove", + "mixture/parentSubstance": "parent_substance", + "mixture/access": "remove", + "mixture/created": "remove", + "mixture/createdBy": "remove", + "mixture/deprecated": "remove", + "mixture/lastEdited": 
"remove", + "mixture/lastEditedBy": "remove", + "modifications/agentModifications": "agent_modifications", + "modifications/physicalModifications": "physical_modifications", + "modifications/references": "references", + "modifications/structuralModifications": "structural_modifications", + "modifications/access": "remove", + "modifications/created": "remove", + "modifications/createdBy": "remove", + "modifications/deprecated": "remove", + "modifications/lastEdited": "remove", + "modifications/lastEditedBy": "remove", + "moieties/countAmount": "count_amount", + "moieties/definedStereo": "defined_stereo", + "moieties/ezCenters": "ez_centers", + "moieties/mwt": "molecular_weight", + "moieties/opticalActivity": "optical_activity", + "moieties/references": "references", + "moieties/stereoCenters": "stereo_centers", + "moieties/stereoComments": "stereo_comments", + "moieties/access": "remove", + "moieties/created": "remove", + "moieties/createdBy": "remove", + "moieties/deprecated": "remove", + "moieties/digest": "remove", + "moieties/hash": "remove", + "moieties/lastEdited": "remove", + "moieties/lastEditedBy": "remove", + "moieties/self": "remove", + "names/displayName": "display_name", + "names/domains": "domains", + "names/nameJurisdiction": "name_jurisdiction", + "names/nameOrgs": "name_orgs", + "names/access": "remove", + "names/created": "remove", + "names/createdBy": "remove", + "names/deprecated": "remove", + "names/lastEdited": "remove", + "names/lastEditedBy": "remove", + "notes/access": "remove", + "notes/created": "remove", + "notes/createdBy": "remove", + "notes/deprecated": "remove", + "notes/lastEdited": "remove", + "notes/lastEditedBy": "remove", + "nucleicAcid/nucleicAcidSubType": "nucleic_acid_sub_type", + "nucleicAcid/nucleicAcidType": "nucleic_acid_type", + "nucleicAcid/sequenceOrigin": "sequence_origin", + "nucleicAcid/sequenceType": "sequence_type", + "nucleicAcid/access": "remove", + "nucleicAcid/created": "remove", + "nucleicAcid/createdBy": "remove", + "nucleicAcid/deprecated": "remove", + "nucleicAcid/lastEdited": "remove", + "nucleicAcid/lastEditedBy": "remove", + "polymer/displayStructure": "display_structure", + "polymer/idealizedStructure": "idealized_structure", + "polymer/structuralUnits": "structural_units", + "polymer/access": "remove", + "polymer/created": "remove", + "polymer/createdBy": "remove", + "polymer/deprecated": "remove", + "polymer/lastEdited": "remove", + "polymer/lastEditedBy": "remove", + "properties/parameters": "parameters", + "properties/propertyType": "property_type", + "properties/references": "references", + "properties/access": "remove", + "properties/created": "remove", + "properties/createdBy": "remove", + "properties/deprecated": "remove", + "properties/lastEdited": "remove", + "properties/lastEditedBy": "remove", + "protein/disulfideLinks": "disulfide_links", + "protein/otherLinks": "other_links", + "protein/proteinSubType": "protein_sub_type", + "protein/proteinType": "protein_type", + "protein/sequenceOrigin": "sequence_origin", + "protein/sequenceType": "sequence_type", + "protein/access": "remove", + "protein/created": "remove", + "protein/createdBy": "remove", + "protein/deprecated": "remove", + "protein/lastEdited": "remove", + "protein/lastEditedBy": "remove", + "references/nameJurisdiction": "name_jurisdiction", + "references/nameOrgs": "name_orgs", + "references/displayName": "display_name", + "references/docType": "doc_type", + "references/publicDomain": "public_domain", + "references/documentDate": "document_date", + 
"references/stereoComments": "stereo_comments", + "references/tags": "tags", + "references/access": "remove", + "references/created": "remove", + "references/createdBy": "remove", + "references/deprecated": "remove", + "references/lastEdited": "remove", + "references/lastEditedBy": "remove", + "references/uploadedFile": "remove", + "relationships/interactionType": "interaction_type", + "relationships/mediatorSubstance": "mediator_substance", + "relationships/references": "references", + "relationships/relatedSubstance": "related_substance", + "relationships/access": "remove", + "relationships/created": "remove", + "relationships/createdBy": "remove", + "relationships/deprecated": "remove", + "relationships/lastEdited": "remove", + "relationships/lastEditedBy": "remove", + "relationships/originatorUuid": "remove", + "structurallyDiverse/developmentalStage": "developmental_stage", + "structurallyDiverse/fractionMaterialType": "fraction_material_type", + "structurallyDiverse/fractionName": "fraction_name", + "structurallyDiverse/hybridSpeciesMaternalOrganism": "hybrid_species_maternal_organism", + "structurallyDiverse/hybridSpeciesPaternalOrganism": "hybrid_species_paternal_organism", + "structurallyDiverse/infraSpecificName": "infra_specific_name", + "structurallyDiverse/infraSpecificType": "infra_specific_type", + "structurallyDiverse/organismAuthor": "organism_author", + "structurallyDiverse/organismFamily": "organism_family", + "structurallyDiverse/organismGenus": "organism_genus", + "structurallyDiverse/organismSpecies": "organism_species", + "structurallyDiverse/parentSubstance": "parent_substance", + "structurallyDiverse/part": "part", + "structurallyDiverse/partLocation": "part_location", + "structurallyDiverse/references": "references", + "structurallyDiverse/sourceMaterialClass": "source_material_class", + "structurallyDiverse/sourceMaterialState": "source_material_state", + "structurallyDiverse/sourceMaterialType": "source_material_type", + "structurallyDiverse/access": "remove", + "structurallyDiverse/created": "remove", + "structurallyDiverse/createdBy": "remove", + "structurallyDiverse/deprecated": "remove", + "structurallyDiverse/lastEdited": "remove", + "structurallyDiverse/lastEditedBy": "remove", + "structure/definedStereo": "defined_stereo", + "structure/ezCenters": "ez_centers", + "structure/mwt": "molecular_weight", + "structure/opticalActivity": "optical_activity", + "structure/references": "references", + "structure/stereoCenters": "stereo_centers", + "structure/stereoComments": "stereo_comments", + "structure/access": "remove", + "structure/created": "remove", + "structure/createdBy": "remove", + "structure/deprecated": "remove", + "structure/digest": "remove", + "structure/hash": "remove", + "structure/lastEdited": "remove", + "structure/lastEditedBy": "remove", + "structure/self": "remove", + } + + rename_map_1st_level_fields = { + "approvalID": "unii", + "codes": "codes", + "definitionLevel": "definition_level", + "definitionType": "definition_type", + "names": "names", + "notes": "notes", + "nucleicAcid": "nucleic_acid", + "properties": "properties", + "references": "references", + "relationships": "relationships", + "structurallyDiverse": "structurally_diverse", + "substanceClass": "substance_class", + "tags": "tags", + "access": "remove", + "approvedBy": "remove", + "changeReason": "remove", + "created": "remove", + "createdBy": "remove", + "deprecated": "remove", + "lastEdited": "remove", + "lastEditedBy": "remove", + "status": "remove", + } + + def map(self, 
key, value, output): + def change_key(obj, entry, rename_map): + child_entry_path = entry + parent_entry_obj = obj + + # If entry has nested path find the immediate parent + if entry.find("/") != -1: + parent_entry_path = entry.rsplit("/", 1)[0] + child_entry_path = entry.rsplit("/", 1)[1] + parent_entry_obj = iterate_dictionary(obj, parent_entry_path) + + if parent_entry_obj != None: + # Check for list + if type(parent_entry_obj) == list: + for o in parent_entry_obj: + if iterate_dictionary(o, child_entry_path) is not None: + # Pop fields with empty array value + if rename_map[entry] == "remove": + o.pop(child_entry_path) + elif o.get(child_entry_path) == []: + o.pop(child_entry_path) + else: + # Rename the key + o[rename_map[entry]] = o.pop(child_entry_path) + elif child_entry_path in o: + o.pop(child_entry_path) + else: + if parent_entry_obj.get(child_entry_path) is not None: + # Pop fields with empty array value + if rename_map[entry] == "remove": + parent_entry_obj.pop(child_entry_path) + elif parent_entry_obj.get(child_entry_path) == []: + parent_entry_obj.pop(child_entry_path) + else: + # Rename the key + parent_entry_obj[rename_map[entry]] = parent_entry_obj.pop( + child_entry_path + ) + + # Change key names from the leaf nodes back up to the top nodes + for entry in self.rename_map_5th_level_fields: + change_key(value, entry, self.rename_map_5th_level_fields) + + for entry in self.rename_map_4th_level_fields: + change_key(value, entry, self.rename_map_4th_level_fields) + + for entry in self.rename_map_3rd_level_fields: + change_key(value, entry, self.rename_map_3rd_level_fields) + + for entry in self.rename_map_2nd_level_fields: + change_key(value, entry, self.rename_map_2nd_level_fields) + + for entry in self.rename_map_1st_level_fields: + change_key(value, entry, self.rename_map_1st_level_fields) + + output.add(key, value) + class SubstanceData2JSON(luigi.Task): - def requires(self): - return ExtractZip() + def requires(self): + return ExtractZip() - def output(self): - return luigi.LocalTarget(config.data_dir(SUBSTANCE_DATA_EXTRACT_DB)) + def output(self): + return luigi.LocalTarget(config.data_dir(SUBSTANCE_DATA_EXTRACT_DB)) - def run(self): - parallel.mapreduce( - parallel.Collection.from_glob( - self.input().path, parallel.JSONLineInput()), - mapper=SubstanceData2JSONMapper(), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path) + def run(self): + parallel.mapreduce( + parallel.Collection.from_glob(self.input().path, parallel.JSONLineInput()), + mapper=SubstanceData2JSONMapper(), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + ) class LoadJSON(index_util.LoadJSONBase): - index_name = 'substancedata' - mapping_file = './schemas/substancedata_mapping.json' - data_source = SubstanceData2JSON() - use_checksum = False - optimize_index = True - last_update_date = lambda _: first_file_timestamp(RAW_DIR) + index_name = "substancedata" + mapping_file = "./schemas/substancedata_mapping.json" + data_source = SubstanceData2JSON() + use_checksum = False + optimize_index = True + last_update_date = lambda _: first_file_timestamp(RAW_DIR) -if __name__ == '__main__': - luigi.run() +if __name__ == "__main__": + luigi.run() diff --git a/openfda/tasks.py b/openfda/tasks.py index 74ff386..4155d3e 100644 --- a/openfda/tasks.py +++ b/openfda/tasks.py @@ -5,74 +5,82 @@ import luigi from luigi.task import flatten + class DependencyTriggeredTask(luigi.Task): - ''' - A task that re-runs if any of its dependencies are newer than its output, or - if any 
dependency is incomplete (and therefore will _become_ newer than it). - ''' - def complete(self): - if not self.output().exists(): - return False + """ + A task that re-runs if any of its dependencies are newer than its output, or + if any dependency is incomplete (and therefore will _become_ newer than it). + """ + + def complete(self): + if not self.output().exists(): + return False - deps = flatten(self.requires()) - logging.debug('Deps: %s', [d.complete() for d in deps]) - for dep in deps: - if not dep.complete(): - logging.info('Dependencies are incomplete, rerunning task.') - return False + deps = flatten(self.requires()) + logging.debug("Deps: %s", [d.complete() for d in deps]) + for dep in deps: + if not dep.complete(): + logging.info("Dependencies are incomplete, rerunning task.") + return False - inputs = flatten(self.input()) - output_ts = os.path.getmtime(self.output().path) - input_ts = max([os.path.getmtime(in_file.path) for in_file in inputs]) + inputs = flatten(self.input()) + output_ts = os.path.getmtime(self.output().path) + input_ts = max([os.path.getmtime(in_file.path) for in_file in inputs]) - if input_ts > output_ts: - logging.info('Dependencies are newer, rerunning task.') - return False + if input_ts > output_ts: + logging.info("Dependencies are newer, rerunning task.") + return False - return True + return True class AlwaysRunTask(luigi.Task): - ''' - A task that should always run once. + """ + A task that should always run once. + + This is generally used for tasks that are idempotent, and for which the + "done" state is difficult to determine: e.g. initializing an index, or + downloading files. - This is generally used for tasks that are idempotent, and for which the - "done" state is difficult to determine: e.g. initializing an index, or - downloading files. + N.B. Luigi creates *new instances* of tasks when performing completeness + checks. This means we can't store our completeness as a local boolean, + and instead have to use a file on disk as a flag. - N.B. Luigi creates *new instances* of tasks when performing completeness - checks. This means we can't store our completeness as a local boolean, - and instead have to use a file on disk as a flag. + To allow subclasses to use `output` normally, we don't use an `output` file + here and instead manage completion manually. + """ - To allow subclasses to use `output` normally, we don't use an `output` file - here and instead manage completion manually. 
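Because Luigi re-instantiates tasks when it checks completeness, this base class tracks completion with a per-invocation marker ("nonce") file on disk rather than a Luigi output target, and concrete tasks supply their work in a `_run` hook. A minimal sketch of a downstream task built on that pattern might look as follows; the `RefreshSupportFiles` name and the command it runs are hypothetical, not part of the repository:

import luigi

from openfda import common, config
from openfda.tasks import AlwaysRunTask


class RefreshSupportFiles(AlwaysRunTask):
    # Hypothetical task: idempotent work that should run once per invocation.
    def _run(self):
        # AlwaysRunTask.run() calls _run() and then writes the nonce marker,
        # so complete() stays False until this body has finished.
        common.shell_cmd_quiet("mkdir -p %s" % config.data_dir("support_files"))


if __name__ == "__main__":
    luigi.run()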
- ''' - nonce_timestamp = luigi.Parameter(default=arrow.utcnow().format('YYYY-MM-DD-HH-mm-ss')) + nonce_timestamp = luigi.Parameter( + default=arrow.utcnow().format("YYYY-MM-DD-HH-mm-ss") + ) - def __init__(self, *args, **kw): - luigi.Task.__init__(self, *args, **kw) + def __init__(self, *args, **kw): + luigi.Task.__init__(self, *args, **kw) - def requires(self): - return [] + def requires(self): + return [] - def _nonce_file(self): - import hashlib - digest = hashlib.sha1(repr(self).encode('utf-8')).hexdigest() - return '/tmp/always-run-task/%s-%s' % (self.nonce_timestamp, digest) + def _nonce_file(self): + import hashlib - def complete(self): - return os.path.exists(self._nonce_file()) + digest = hashlib.sha1(repr(self).encode("utf-8")).hexdigest() + return "/tmp/always-run-task/%s-%s" % (self.nonce_timestamp, digest) - def run(self): - self._run() + def complete(self): + return os.path.exists(self._nonce_file()) + + def run(self): + self._run() + + os.system('mkdir -p "/tmp/always-run-task"') + with open(self._nonce_file(), "w") as nonce_file: + nonce_file.write("finished.") - os.system('mkdir -p "/tmp/always-run-task"') - with open(self._nonce_file(), 'w') as nonce_file: - nonce_file.write('finished.') class NoopTask(luigi.Task): - ''' - A task that does absolutely nothing. - ''' - def complete(self): - return True + """ + A task that does absolutely nothing. + """ + + def complete(self): + return True diff --git a/openfda/test_common.py b/openfda/test_common.py index f224562..8312356 100644 --- a/openfda/test_common.py +++ b/openfda/test_common.py @@ -1,16 +1,19 @@ #!/usr/bin/python -'''Helper functions for testing.''' +"""Helper functions for testing.""" import inspect import os + def data_filename(fname, caller_frame=1): - # get the parent frame - _, parent_file, _, _, _, _ = inspect.getouterframes( - inspect.currentframe())[caller_frame] - base_dir = os.path.abspath(os.path.dirname(parent_file)) - return os.path.join(base_dir, 'data', fname) + # get the parent frame + _, parent_file, _, _, _, _ = inspect.getouterframes(inspect.currentframe())[ + caller_frame + ] + base_dir = os.path.abspath(os.path.dirname(parent_file)) + return os.path.join(base_dir, "data", fname) + def open_data_file(fname): - return open(data_filename(fname, caller_frame=2), mode='rb') + return open(data_filename(fname, caller_frame=2), mode="rb") diff --git a/openfda/tests/api_test.py b/openfda/tests/api_test.py index bc570b3..6babf96 100644 --- a/openfda/tests/api_test.py +++ b/openfda/tests/api_test.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -''' +""" Quick API tests to run against a new index build. These test certain "quasi-invariants" about the index data: e.g. we should have @@ -8,7 +8,7 @@ test of the full system. Before pushing out a new index, the `refresh_test_responses` script should also be run to compare the index changes against a wider test set. -''' +""" import os import pprint @@ -16,64 +16,78 @@ from openfda.tests.api_test_helpers import * -ENDPOINT = os.getenv('OPENFDA_ENDPOINT_URL', 'http://localhost:8000') +ENDPOINT = os.getenv("OPENFDA_ENDPOINT_URL", "http://localhost:8000") @not_circle def test_humira_class(): - 'Checks for a bug related to incorrect splitting of the pharm_class field.' - data = _fetch('/drug/event.json?count=pharm_class_epc_exact&search=patient.drug.medicinalproduct:humira') - assert data['results'][0]['term'] == 'Tumor Necrosis Factor Blocker [EPC]' + "Checks for a bug related to incorrect splitting of the pharm_class field." 
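Each of these checks follows the shape the module docstring describes: query the index behind `OPENFDA_ENDPOINT_URL` and assert a property that should hold across routine rebuilds even as exact counts drift. A minimal self-contained version of that pattern, using `requests` directly instead of the local `_fetch` helper (the invariant chosen here is illustrative only):

import os

import requests

ENDPOINT = os.getenv("OPENFDA_ENDPOINT_URL", "http://localhost:8000")


def test_count_buckets_are_well_formed():
    # Illustrative quasi-invariant: a count query should return non-empty
    # buckets, each carrying a term and a positive count.
    query = "/drug/event.json?count=patient.reaction.reactionmeddrapt.exact"
    data = requests.get(ENDPOINT + query).json()
    assert data["results"], "count query returned no buckets"
    for bucket in data["results"]:
        assert bucket["term"]
        assert bucket["count"] > 0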
+ data = _fetch( + "/drug/event.json?count=pharm_class_epc_exact&search=patient.drug.medicinalproduct:humira" + ) + assert data["results"][0]["term"] == "Tumor Necrosis Factor Blocker [EPC]" @not_circle def test_reactionmeddrapt_exact(): - 'Test for correct tokenization of the exact field.' - data = _fetch('/drug/event.json?count=patient.reaction.reactionmeddrapt.exact') - assert data['results'][0]['term'] == 'DRUG INEFFECTIVE' + "Test for correct tokenization of the exact field." + data = _fetch("/drug/event.json?count=patient.reaction.reactionmeddrapt.exact") + assert data["results"][0]["term"] == "DRUG INEFFECTIVE" @not_circle def test_faers_generic_name(): - 'Sanity test generic drug counts. Counts may change, but ordering should in general be consistent.' - data = _fetch('/drug/event.json?count=patient.drug.openfda.generic_name.exact') - pprint.pprint(data['results']) - counts = extract_counts(data) - assert counts[0].term == 'ETANERCEPT' - assert counts[1].term == 'ADALIMUMAB' - assert counts[2].term == 'ASPIRIN' - assert counts[3].term == 'INTERFERON BETA-1A' + "Sanity test generic drug counts. Counts may change, but ordering should in general be consistent." + data = _fetch("/drug/event.json?count=patient.drug.openfda.generic_name.exact") + pprint.pprint(data["results"]) + counts = extract_counts(data) + assert counts[0].term == "ETANERCEPT" + assert counts[1].term == "ADALIMUMAB" + assert counts[2].term == "ASPIRIN" + assert counts[3].term == "INTERFERON BETA-1A" @not_circle def test_openfda_join_humira(): - 'Verifies that we joined correctly for a specific report.' - data = _fetch('/drug/event.json?search=reportduplicate.duplicatenumb.exact:GB-ABBVIE-14P-167-1209507-00') - results = data['results'][0] - pprint.pprint(results) - drugs = results['patient']['drug'] - assert len(drugs) == 2 - assert drugs[0]['openfda']['spl_id'] - assert drugs[1]['openfda']['spl_id'] + "Verifies that we joined correctly for a specific report." + data = _fetch( + "/drug/event.json?search=reportduplicate.duplicatenumb.exact:GB-ABBVIE-14P-167-1209507-00" + ) + results = data["results"][0] + pprint.pprint(results) + drugs = results["patient"]["drug"] + assert len(drugs) == 2 + assert drugs[0]["openfda"]["spl_id"] + assert drugs[1]["openfda"]["spl_id"] @not_circle def test_pagination(): - # Check that the header exists when skip & limit are not provided. - ndc_data = fetch_headers(ENDPOINT + '/drug/ndc.json?search=marketing_start_date:"20060707"') - data_link = ndc_data['Link'] - assert data_link == '<' + ENDPOINT + '/drug/ndc.json?search=marketing_start_date%3A%2220060707%22&skip=1&limit=1>; rel="next"' - - # Check that the header doesn't exist on the last entry. - while data_link: - ndc_data = fetch_headers(data_link[1:-1]) - if 'Link' in ndc_data: - data_link = ndc_data['Link'] - else: - data_link = None - - assert data_link == None - - # Check that the skip & limit parameters are included when provided. - event_data = fetch_headers(ENDPOINT + '/drug/event.json?skip=100&limit=100') - assert event_data['Link'] == '<' + ENDPOINT + '/drug/event.json?skip=200&limit=100>; rel="next"' + # Check that the header exists when skip & limit are not provided. + ndc_data = fetch_headers( + ENDPOINT + '/drug/ndc.json?search=marketing_start_date:"20060707"' + ) + data_link = ndc_data["Link"] + assert ( + data_link + == "<" + + ENDPOINT + + '/drug/ndc.json?search=marketing_start_date%3A%2220060707%22&skip=1&limit=1>; rel="next"' + ) + + # Check that the header doesn't exist on the last entry. 
+ while data_link: + ndc_data = fetch_headers(data_link[1:-1]) + if "Link" in ndc_data: + data_link = ndc_data["Link"] + else: + data_link = None + + assert data_link == None + + # Check that the skip & limit parameters are included when provided. + event_data = fetch_headers(ENDPOINT + "/drug/event.json?skip=100&limit=100") + assert ( + event_data["Link"] + == "<" + ENDPOINT + '/drug/event.json?skip=200&limit=100>; rel="next"' + ) diff --git a/openfda/tests/api_test_helpers.py b/openfda/tests/api_test_helpers.py index 1acf216..e3389d0 100644 --- a/openfda/tests/api_test_helpers.py +++ b/openfda/tests/api_test_helpers.py @@ -4,160 +4,166 @@ import requests -ENDPOINT = os.getenv('OPENFDA_ENDPOINT_URL', 'http://localhost:8000') -Count = collections.namedtuple('Count', ['term', 'count']) +ENDPOINT = os.getenv("OPENFDA_ENDPOINT_URL", "http://localhost:8000") +Count = collections.namedtuple("Count", ["term", "count"]) class CountList(list): - def __repr__(self): - return '\n'.join([repr(c) for c in self]) + def __repr__(self): + return "\n".join([repr(c) for c in self]) def extract_counts(results): - counts = CountList() - for r in results: - if 'term' in r: - counts.append(Count(r['term'], r['count'])) - else: - counts.append(Count(r['time'], r['count'])) - return counts + counts = CountList() + for r in results: + if "term" in r: + counts.append(Count(r["term"], r["count"])) + else: + counts.append(Count(r["time"], r["count"])) + return counts def assert_total(query, minimum): - meta, results = fetch(query) - assert_greater_equal(meta['results']['total'], minimum, - 'Query %s had fewer results than expected. %s < %s' % ( - query, meta['results']['total'], minimum - )) + meta, results = fetch(query) + assert_greater_equal( + meta["results"]["total"], + minimum, + "Query %s had fewer results than expected. %s < %s" + % (query, meta["results"]["total"], minimum), + ) def assert_total_exact(query, total): - meta, results = fetch(query) - eq_(meta['results']['total'], total, - 'Query %s had different number of results than expected. %s != %s' % ( - query, meta['results']['total'], total - )) + meta, results = fetch(query) + eq_( + meta["results"]["total"], + total, + "Query %s had different number of results than expected. 
%s != %s" + % (query, meta["results"]["total"], total), + ) def assert_count_top(query, expected_terms, N=10): - '''Verify that all of the terms in `terms` are found in the top-N count results.''' - meta, counts = fetch_counts(query) - count_terms = set([count.term for count in counts[:N]]) - for term in expected_terms: - assert term in count_terms, 'Query %s missing expected term %s; terms=%s' % ( - query, term, '\n'.join(list(count_terms)) - ) + """Verify that all of the terms in `terms` are found in the top-N count results.""" + meta, counts = fetch_counts(query) + count_terms = set([count.term for count in counts[:N]]) + for term in expected_terms: + assert term in count_terms, "Query %s missing expected term %s; terms=%s" % ( + query, + term, + "\n".join(list(count_terms)), + ) def assert_count_contains(query, expected_terms): - meta, counts = fetch_counts(query) - count_terms = set([count.term for count in counts]) + meta, counts = fetch_counts(query) + count_terms = set([count.term for count in counts]) - for term in expected_terms: - assert term in count_terms, 'Query %s missing expected term %s; terms=%s' % ( - query, term, '\n'.join(count_terms) - ) + for term in expected_terms: + assert term in count_terms, "Query %s missing expected term %s; terms=%s" % ( + query, + term, + "\n".join(count_terms), + ) def assert_unii(query, expected_result): - meta, results = fetch(query) - unii_list = [] - if 'openfda' in list(results[0].keys()): - for result in results[0]['openfda']['unii']: - unii_list.append(result) - elif 'patient' in list(results[0].keys()): - for result in results[0]['patient']['drug'][0]['openfda']['unii']: - unii_list.append(result) - eq_(sorted(unii_list), sorted(expected_result)) + meta, results = fetch(query) + unii_list = [] + if "openfda" in list(results[0].keys()): + for result in results[0]["openfda"]["unii"]: + unii_list.append(result) + elif "patient" in list(results[0].keys()): + for result in results[0]["patient"]["drug"][0]["openfda"]["unii"]: + unii_list.append(result) + eq_(sorted(unii_list), sorted(expected_result)) def fetch(query): - print('Fetching %s' % query) - data = requests.get(ENDPOINT + query).json() - return data.get('meta'), data.get('results') + print("Fetching %s" % query) + data = requests.get(ENDPOINT + query).json() + return data.get("meta"), data.get("results") def fetch_headers(query): - print('Fetching headers for %s' % query) - data = requests.get(query) - return data.headers + print("Fetching headers for %s" % query) + data = requests.get(query) + return data.headers def expect_error(query): - print('Fetching %s' % query) - data = requests.get(ENDPOINT + query).json() - return data.get('error') + print("Fetching %s" % query) + data = requests.get(ENDPOINT + query).json() + return data.get("error") def json(query): - print('Getting %s' % query) - data = requests.get(ENDPOINT + query).json() - return data + print("Getting %s" % query) + data = requests.get(ENDPOINT + query).json() + return data def fetch_counts(query): - meta, results = fetch(query) - return meta, extract_counts(results) + meta, results = fetch(query) + return meta, extract_counts(results) def not_circle(fn): - 'Skip this test when running under CI.' + "Skip this test when running under CI." - def _fn(*args, **kw): - if 'CIRCLECI' in os.environ: - raise SkipTest - fn(*args, **kw) + def _fn(*args, **kw): + if "CIRCLECI" in os.environ: + raise SkipTest + fn(*args, **kw) - return _fn + return _fn def ok_(expr, msg=None): - """Shorthand for assert. 
Saves 3 whole characters! - """ - if not expr: - raise AssertionError(msg) + """Shorthand for assert. Saves 3 whole characters!""" + if not expr: + raise AssertionError(msg) def eq_(a, b, msg=None): - """Shorthand for 'assert a == b, "%r != %r" % (a, b) - """ - if not a == b: - raise AssertionError(msg or "%r != %r" % (a, b)) + """Shorthand for 'assert a == b, "%r != %r" % (a, b)""" + if not a == b: + raise AssertionError(msg or "%r != %r" % (a, b)) def assert_greater(a, b, msg=None): """Just like self.assertTrue(a > b), but with a nicer default message.""" if not a > b: - standardMsg = '%s not greater than %s' % (repr(a), repr(b)) + standardMsg = "%s not greater than %s" % (repr(a), repr(b)) fail(_formatMessage(msg, standardMsg)) + def assert_greater_equal(a, b, msg=None): - """Just like self.assertTrue(a >= b), but with a nicer default message.""" - if not a >= b: - standardMsg = '%s not greater than or equal to %s' % (repr(a), repr(b)) - fail(_formatMessage(msg, standardMsg)) + """Just like self.assertTrue(a >= b), but with a nicer default message.""" + if not a >= b: + standardMsg = "%s not greater than or equal to %s" % (repr(a), repr(b)) + fail(_formatMessage(msg, standardMsg)) def fail(msg): - raise AssertionError(msg) + raise AssertionError(msg) def _formatMessage(msg, standardMsg): - """Honour the longMessage attribute when generating failure messages. - If longMessage is False this means: - * Use only an explicit message if it is provided - * Otherwise use the standard message for the assert - - If longMessage is True: - * Use the standard message - * If an explicit message is provided, plus ' : ' and the explicit message - """ - if msg is None: - return standardMsg - try: - # don't switch to '{}' formatting in Python 2.X - # it changes the way unicode input is handled - return '%s : %s' % (standardMsg, msg) - except UnicodeDecodeError: - return '%s : %s' % (repr(standardMsg), repr(msg)) - + """Honour the longMessage attribute when generating failure messages. 
+ If longMessage is False this means: + * Use only an explicit message if it is provided + * Otherwise use the standard message for the assert + + If longMessage is True: + * Use the standard message + * If an explicit message is provided, plus ' : ' and the explicit message + """ + if msg is None: + return standardMsg + try: + # don't switch to '{}' formatting in Python 2.X + # it changes the way unicode input is handled + return "%s : %s" % (standardMsg, msg) + except UnicodeDecodeError: + return "%s : %s" % (repr(standardMsg), repr(msg)) diff --git a/openfda/tests/common_test.py b/openfda/tests/common_test.py index 2829130..581b6ca 100644 --- a/openfda/tests/common_test.py +++ b/openfda/tests/common_test.py @@ -3,28 +3,41 @@ import unittest from openfda import common from six.moves import range -class TestCommonMethods(unittest.TestCase): - def test_extract_date(self): - assert common.extract_date("20121110") == '2012-11-10' - assert common.extract_date("201211103422") == '2012-11-10' - assert common.extract_date("20121610") == '2012-01-10' - assert common.extract_date("20120010") == '2012-01-10' - assert common.extract_date("20121132") == '2012-11-01' - assert common.extract_date("2012000001") == '2012-01-01' - assert common.extract_date("20561132") == '1900-11-01' - assert common.extract_date("18001132") == '1900-11-01' - assert common.extract_date("") == '1900-01-01' - assert common.extract_date(None) == '1900-01-01' - def test_shell_cmd(self): - tmpFile = '/tmp/'+(''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(32))) - common.shell_cmd('touch %(tmpFile)s' % locals()) - assert len(common.shell_cmd('ls %(tmpFile)s' % locals())) > 0 - assert common.shell_cmd('ls %(tmpFile)s' % locals()).startswith(tmpFile.encode()) +class TestCommonMethods(unittest.TestCase): + + def test_extract_date(self): + assert common.extract_date("20121110") == "2012-11-10" + assert common.extract_date("201211103422") == "2012-11-10" + assert common.extract_date("20121610") == "2012-01-10" + assert common.extract_date("20120010") == "2012-01-10" + assert common.extract_date("20121132") == "2012-11-01" + assert common.extract_date("2012000001") == "2012-01-01" + assert common.extract_date("20561132") == "1900-11-01" + assert common.extract_date("18001132") == "1900-11-01" + assert common.extract_date("") == "1900-01-01" + assert common.extract_date(None) == "1900-01-01" - def test_shell_cmd_quiet(self): - tmpFile = '/tmp/'+(''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(32))) - common.shell_cmd_quiet('touch %(tmpFile)s' % locals()) - assert common.shell_cmd_quiet('ls %(tmpFile)s' % locals()).startswith(tmpFile.encode()) + def test_shell_cmd(self): + tmpFile = "/tmp/" + ( + "".join( + random.choice(string.ascii_uppercase + string.digits) for _ in range(32) + ) + ) + common.shell_cmd("touch %(tmpFile)s" % locals()) + assert len(common.shell_cmd("ls %(tmpFile)s" % locals())) > 0 + assert common.shell_cmd("ls %(tmpFile)s" % locals()).startswith( + tmpFile.encode() + ) + def test_shell_cmd_quiet(self): + tmpFile = "/tmp/" + ( + "".join( + random.choice(string.ascii_uppercase + string.digits) for _ in range(32) + ) + ) + common.shell_cmd_quiet("touch %(tmpFile)s" % locals()) + assert common.shell_cmd_quiet("ls %(tmpFile)s" % locals()).startswith( + tmpFile.encode() + ) diff --git a/openfda/tests/index_util_test.py b/openfda/tests/index_util_test.py index 77f79c5..de3b9f1 100644 --- a/openfda/tests/index_util_test.py +++ b/openfda/tests/index_util_test.py @@ 
-10,62 +10,71 @@ from openfda import index_util -ES_HOST = os.getenv('ES_HOST', default='localhost:9200') +ES_HOST = os.getenv("ES_HOST", default="localhost:9200") def p(obj): - return json.dumps(obj, indent=2, sort_keys=True) + return json.dumps(obj, indent=2, sort_keys=True) def test_fresh_index(): - es = elasticsearch.Elasticsearch(ES_HOST) - try: - es.indices.delete('index_util_test1') - except: - pass - es.indices.create('index_util_test1') - time.sleep(0.1) - - docs = [ - {'user': 'power', 'message': 'hello1', }, - {'user': 'power', 'message': 'hello2'}, - {'user': 'power', 'message': 'hello3'}, - {'user': 'power', 'message': 'hello4'}, - ] - - doc_type = 'user_message' - batch = list(zip(list(range(4)), docs)) - index_util.index_with_checksum(es, 'index_util_test1', doc_type, batch) - - for i in range(4): - fetched = es.get(index='index_util_test1', id=i, doc_type='user_message') - doc = dict(fetched['_source']) - assert docs[i] == doc, (p(docs[i]), p(fetched)) + es = elasticsearch.Elasticsearch(ES_HOST) + try: + es.indices.delete("index_util_test1") + except: + pass + es.indices.create("index_util_test1") + time.sleep(0.1) + + docs = [ + { + "user": "power", + "message": "hello1", + }, + {"user": "power", "message": "hello2"}, + {"user": "power", "message": "hello3"}, + {"user": "power", "message": "hello4"}, + ] + + doc_type = "user_message" + batch = list(zip(list(range(4)), docs)) + index_util.index_with_checksum(es, "index_util_test1", doc_type, batch) + + for i in range(4): + fetched = es.get(index="index_util_test1", id=i, doc_type="user_message") + doc = dict(fetched["_source"]) + assert docs[i] == doc, (p(docs[i]), p(fetched)) def test_replace_some_docs(): - es = elasticsearch.Elasticsearch(ES_HOST) - test_fresh_index() - - # change 2 documents of our 4, and check they are updated. - docs = [ - {'user': 'power', 'message': 'hello5', }, - {'user': 'power', 'message': 'hello2'}, - {'user': 'power', 'message': 'hello6', }, - {'user': 'power', 'message': 'hello4'}, - ] - - doc_type = 'user_message' - batch = list(zip(list(range(4)), docs)) - index_util.index_with_checksum(es, 'index_util_test1', doc_type, batch) - - for i in range(4): - fetched = es.get(index='index_util_test1', id=i, doc_type='user_message') - doc = dict(fetched['_source']) - assert docs[i] == doc, (p(docs[i]), p(fetched)) - - -if __name__ == '__main__': - logging.basicConfig(stream=sys.stderr, level=logging.INFO) - test_fresh_index() - test_replace_some_docs() + es = elasticsearch.Elasticsearch(ES_HOST) + test_fresh_index() + + # change 2 documents of our 4, and check they are updated. 
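    # Judging by its name, index_with_checksum appears to store a per-document
    # checksum so that a re-run only re-indexes documents whose content has
    # changed; here ids 0 and 2 receive new messages while ids 1 and 3 are
    # untouched, and the read-back loop below confirms every document reflects
    # the intended content.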
+ docs = [ + { + "user": "power", + "message": "hello5", + }, + {"user": "power", "message": "hello2"}, + { + "user": "power", + "message": "hello6", + }, + {"user": "power", "message": "hello4"}, + ] + + doc_type = "user_message" + batch = list(zip(list(range(4)), docs)) + index_util.index_with_checksum(es, "index_util_test1", doc_type, batch) + + for i in range(4): + fetched = es.get(index="index_util_test1", id=i, doc_type="user_message") + doc = dict(fetched["_source"]) + assert docs[i] == doc, (p(docs[i]), p(fetched)) + + +if __name__ == "__main__": + logging.basicConfig(stream=sys.stderr, level=logging.INFO) + test_fresh_index() + test_replace_some_docs() diff --git a/openfda/tests/luigi_test.py b/openfda/tests/luigi_test.py index 31a505a..059d104 100644 --- a/openfda/tests/luigi_test.py +++ b/openfda/tests/luigi_test.py @@ -4,35 +4,38 @@ from luigi import interface, scheduler, worker + class SimpleTask(luigi.Task): - def output(self): - return luigi.LocalTarget('/tmp/a-created') + def output(self): + return luigi.LocalTarget("/tmp/a-created") + + def run(self): + os.system('touch "%s"' % self.output().path) - def run(self): - os.system('touch "%s"' % self.output().path) class DepTask(DependencyTriggeredTask): - def requires(self): - return [SimpleTask()] - - def output(self): - return luigi.LocalTarget('/tmp/b-created') - - def run(self): - os.system('touch "%s"' % self.output().path) - -if __name__ == '__main__': - os.system('rm -f /tmp/a-created /tmp/b-created') - interface.setup_interface_logging() - sch = scheduler.CentralPlannerScheduler() - w = worker.Worker(scheduler=sch) - w.add(SimpleTask()) - w.run() - w.add(DepTask()) - w.run() - os.system('rm -f /tmp/a-created /tmp/b-created') - w.add(DepTask()) - w.run() - os.system('rm -f /tmp/a-created') - w.add(DepTask()) - w.run() + def requires(self): + return [SimpleTask()] + + def output(self): + return luigi.LocalTarget("/tmp/b-created") + + def run(self): + os.system('touch "%s"' % self.output().path) + + +if __name__ == "__main__": + os.system("rm -f /tmp/a-created /tmp/b-created") + interface.setup_interface_logging() + sch = scheduler.CentralPlannerScheduler() + w = worker.Worker(scheduler=sch) + w.add(SimpleTask()) + w.run() + w.add(DepTask()) + w.run() + os.system("rm -f /tmp/a-created /tmp/b-created") + w.add(DepTask()) + w.run() + os.system("rm -f /tmp/a-created") + w.add(DepTask()) + w.run() diff --git a/openfda/tests/parallel_test.py b/openfda/tests/parallel_test.py index aa579ab..0f9c367 100644 --- a/openfda/tests/parallel_test.py +++ b/openfda/tests/parallel_test.py @@ -9,168 +9,194 @@ class KeyMapper(parallel.Mapper): - def map(self, key, value, output): - output.add(key, key) + def map(self, key, value, output): + output.add(key, key) + class BadMapper(parallel.Mapper): - def map(self, key, value, output): - raise Exception('I crashed!') + def map(self, key, value, output): + raise Exception("I crashed!") + class CsvMapper(parallel.Mapper): - def map(self, key, value, output): - output.add('code', int(value[0])) + def map(self, key, value, output): + output.add("code", int(value[0])) + class ParallelTest(unittest.TestCase): - def make_files(self, input_prefix, data, input_format): - os.system('mkdir -p "%s"' % input_prefix) - source_files = [] - for i, val in enumerate(data): - filename = os.path.join(input_prefix, '%d.txt' % i) - source_files.append(filename) - with open(filename, 'w') as f: - f.write(val) - - return parallel.Collection(source_files, input_format) - - def run_mr(self, - prefix, input_data, - 
input_format=parallel.LineInput(), - mapper=parallel.IdentityMapper(), - reducer=parallel.IdentityReducer(), - output_format=parallel.LevelDBOutput(), - num_shards=5): - os.system('rm -rf "%s"' % prefix) - source = self.make_files(os.path.join(prefix, 'input'), input_data, input_format) - output_prefix = os.path.join(prefix, 'output') - - parallel.mapreduce(source, - mapper=mapper, - reducer=reducer, - map_workers=1, - output_format=output_format, - output_prefix=output_prefix, - num_shards=num_shards) - - if isinstance(output_format, parallel.LevelDBOutput): - return sorted(list(parallel.ShardedDB.open(output_prefix))) - - if isinstance(output_format, parallel.JSONOutput): - return json.load(open(output_prefix)) - - if isinstance(output_format, parallel.JSONLineOutput): - result = [] - with open(output_prefix, 'r') as input_f: - for line in input_f: - result.append(json.loads(line)) - return result - - def test_identity(self): - results = self.run_mr( - '/tmp/test-identity/', - ['hello' for i in range(10)], - input_format=parallel.FilenameInput()) - - for i in range(10): - key, value = results[i] - assert key.decode() == '/tmp/test-identity/input/%d.txt' % i, (i, results[i]) - assert value == '' - - def test_json_output(self): - results = self.run_mr( - '/tmp/test-json-output/', - ['hello' for i in range(10)], - input_format=parallel.FilenameInput(), - output_format=parallel.JSONOutput(indent=2, sort_keys=True), - num_shards=1) - - for i in range(10): - key = '/tmp/test-json-output/input/%d.txt' % i - assert key in results, key - assert results[key] == '', { 'key' : key, 'value' : results[key] } - - def test_jsonline_output(self): - results = self.run_mr( - '/tmp/test-jsonline-output/', - ['hello' for i in range(10)], - mapper=KeyMapper(), - input_format=parallel.FilenameInput(), - output_format=parallel.JSONLineOutput()) - - for i in range(10): - key = '/tmp/test-jsonline-output/input/%d.txt' % i - assert key in results - - def test_lines_input(self): - input_data = [] - for i in range(10): - input_data.append('\n'.join(['test-line'] * 10)) - - results = self.run_mr('/tmp/test-lineinput', input_data) - - for i in range(10): - key, value = results[i] - assert value == 'test-line', (i, results[i]) - - def test_sum(self): - # 10 files with 100 lines each - results = self.run_mr( - '/tmp/test-sum', - ['\n'.join([str(i) for i in range(100)]) for i in range(10)], - reducer=parallel.SumReducer()) - - results = set(dict(list(map(lambda res: (res[0].decode(), res[1]), results))).values()) - for i in range(100): - assert i * 10 in results - results.remove(i * 10) - assert len(results) == 0, 'Unexpected output: %s' % results - - def test_exception(self): - with self.assertRaises(parallel.MRException) as ctx: - self.run_mr( - '/tmp/test-bad-mapper', - ['hello' for i in range(10)], - mapper=BadMapper()) - - - def test_csv_line_split(self): - dir = '/tmp/openfda-test-csv-line-split' - level_db_prefix = os.path.join(dir, 'leveldb') - file = 'csv_split.csv' - - os.system('rm -rf "%s"' % level_db_prefix) - os.makedirs(dir, exist_ok=True) - shutil.copyfile( - os.path.join(dirname(os.path.abspath(__file__)), 'data/%s' % file), - os.path.join(dir, file)) - - col = parallel.Collection.from_glob( - os.path.join(dir, file), parallel.CSVSplitLineInput(quoting=csv.QUOTE_NONE, delimiter='|', fixed_splits=3)) - splits = list(col) - assert len(splits) == 3 - # Check every split's start and end. Start must be at the beginning of a line, end must be after carriage return or EOF. 
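    # The numbers asserted below are byte offsets into data/csv_split.csv: with
    # fixed_splits=3 the input format seemingly cuts the 196-byte file into
    # three roughly equal chunks and then moves each boundary forward to the
    # next line break, which is why the splits begin at 0, 81 and 169 rather
    # than at exact thirds.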
- assert(splits[0].start_pos) == 0 - assert (splits[0].end_pos) == 81 - assert(splits[1].start_pos) == 81 - assert (splits[1].end_pos) == 169 - assert(splits[2].start_pos) == 169 - assert (splits[2].end_pos) == 196 - - # Run M/R with these splits and ensure the result is as expected - parallel.mapreduce(col, - mapper=CsvMapper(), - reducer=parallel.SumReducer(), - map_workers=len(splits), - output_format=parallel.LevelDBOutput(), - output_prefix=level_db_prefix, - num_shards=len(splits)) - - result = sorted(list(parallel.ShardedDB.open(level_db_prefix))) - print(result) - # if we got the sum right we know all the splits have been processed correctly - assert (result[0][1]) == 401 + 402 + 403 + 404 + 405 + 407 + 408 + 409 + 410 + def make_files(self, input_prefix, data, input_format): + os.system('mkdir -p "%s"' % input_prefix) + source_files = [] + for i, val in enumerate(data): + filename = os.path.join(input_prefix, "%d.txt" % i) + source_files.append(filename) + with open(filename, "w") as f: + f.write(val) + + return parallel.Collection(source_files, input_format) + + def run_mr( + self, + prefix, + input_data, + input_format=parallel.LineInput(), + mapper=parallel.IdentityMapper(), + reducer=parallel.IdentityReducer(), + output_format=parallel.LevelDBOutput(), + num_shards=5, + ): + os.system('rm -rf "%s"' % prefix) + source = self.make_files( + os.path.join(prefix, "input"), input_data, input_format + ) + output_prefix = os.path.join(prefix, "output") + + parallel.mapreduce( + source, + mapper=mapper, + reducer=reducer, + map_workers=1, + output_format=output_format, + output_prefix=output_prefix, + num_shards=num_shards, + ) + + if isinstance(output_format, parallel.LevelDBOutput): + return sorted(list(parallel.ShardedDB.open(output_prefix))) + + if isinstance(output_format, parallel.JSONOutput): + return json.load(open(output_prefix)) + + if isinstance(output_format, parallel.JSONLineOutput): + result = [] + with open(output_prefix, "r") as input_f: + for line in input_f: + result.append(json.loads(line)) + return result + + def test_identity(self): + results = self.run_mr( + "/tmp/test-identity/", + ["hello" for i in range(10)], + input_format=parallel.FilenameInput(), + ) + + for i in range(10): + key, value = results[i] + assert key.decode() == "/tmp/test-identity/input/%d.txt" % i, ( + i, + results[i], + ) + assert value == "" + + def test_json_output(self): + results = self.run_mr( + "/tmp/test-json-output/", + ["hello" for i in range(10)], + input_format=parallel.FilenameInput(), + output_format=parallel.JSONOutput(indent=2, sort_keys=True), + num_shards=1, + ) + + for i in range(10): + key = "/tmp/test-json-output/input/%d.txt" % i + assert key in results, key + assert results[key] == "", {"key": key, "value": results[key]} + + def test_jsonline_output(self): + results = self.run_mr( + "/tmp/test-jsonline-output/", + ["hello" for i in range(10)], + mapper=KeyMapper(), + input_format=parallel.FilenameInput(), + output_format=parallel.JSONLineOutput(), + ) + + for i in range(10): + key = "/tmp/test-jsonline-output/input/%d.txt" % i + assert key in results + + def test_lines_input(self): + input_data = [] + for i in range(10): + input_data.append("\n".join(["test-line"] * 10)) + + results = self.run_mr("/tmp/test-lineinput", input_data) + + for i in range(10): + key, value = results[i] + assert value == "test-line", (i, results[i]) + + def test_sum(self): + # 10 files with 100 lines each + results = self.run_mr( + "/tmp/test-sum", + ["\n".join([str(i) for i in range(100)]) for 
i in range(10)], + reducer=parallel.SumReducer(), + ) + + results = set( + dict(list(map(lambda res: (res[0].decode(), res[1]), results))).values() + ) + for i in range(100): + assert i * 10 in results + results.remove(i * 10) + assert len(results) == 0, "Unexpected output: %s" % results + + def test_exception(self): + with self.assertRaises(parallel.MRException) as ctx: + self.run_mr( + "/tmp/test-bad-mapper", ["hello" for i in range(10)], mapper=BadMapper() + ) + + def test_csv_line_split(self): + dir = "/tmp/openfda-test-csv-line-split" + level_db_prefix = os.path.join(dir, "leveldb") + file = "csv_split.csv" + + os.system('rm -rf "%s"' % level_db_prefix) + os.makedirs(dir, exist_ok=True) + shutil.copyfile( + os.path.join(dirname(os.path.abspath(__file__)), "data/%s" % file), + os.path.join(dir, file), + ) + + col = parallel.Collection.from_glob( + os.path.join(dir, file), + parallel.CSVSplitLineInput( + quoting=csv.QUOTE_NONE, delimiter="|", fixed_splits=3 + ), + ) + splits = list(col) + assert len(splits) == 3 + # Check every split's start and end. Start must be at the beginning of a line, end must be after carriage return or EOF. + assert (splits[0].start_pos) == 0 + assert (splits[0].end_pos) == 81 + assert (splits[1].start_pos) == 81 + assert (splits[1].end_pos) == 169 + assert (splits[2].start_pos) == 169 + assert (splits[2].end_pos) == 196 + + # Run M/R with these splits and ensure the result is as expected + parallel.mapreduce( + col, + mapper=CsvMapper(), + reducer=parallel.SumReducer(), + map_workers=len(splits), + output_format=parallel.LevelDBOutput(), + output_prefix=level_db_prefix, + num_shards=len(splits), + ) + + result = sorted(list(parallel.ShardedDB.open(level_db_prefix))) + print(result) + # if we got the sum right we know all the splits have been processed correctly + assert (result[0][1]) == 401 + 402 + 403 + 404 + 405 + 407 + 408 + 409 + 410 + def main(argv): - unittest.main(argv=argv) + unittest.main(argv=argv) + -if __name__ == '__main__': - app.run() +if __name__ == "__main__": + app.run() diff --git a/openfda/tobacco_problem/pipeline.py b/openfda/tobacco_problem/pipeline.py index 9188163..58e2581 100644 --- a/openfda/tobacco_problem/pipeline.py +++ b/openfda/tobacco_problem/pipeline.py @@ -1,8 +1,8 @@ #!/usr/local/bin/python -''' +""" Pipeline for converting CSV tobacco product problem data to JSON and importing into Elasticsearch. -''' +""" import logging import os @@ -20,102 +20,115 @@ from openfda.common import newest_file_timestamp -TOBACCO_PROBLEM_DOWNLOAD_PAGE = \ - 'https://www.fda.gov/tobacco-products/tobacco-science-research/tobacco-product-problem-reports' -TOBACCO_PROBLEM_EXTRACT_DB = 'tobacco_problem/tobacco_problem.db' -TOBACCO_RAW_DIR = config.data_dir('tobacco_problem/raw') +TOBACCO_PROBLEM_DOWNLOAD_PAGE = "https://www.fda.gov/tobacco-products/tobacco-science-research/tobacco-product-problem-reports" +TOBACCO_PROBLEM_EXTRACT_DB = "tobacco_problem/tobacco_problem.db" +TOBACCO_RAW_DIR = config.data_dir("tobacco_problem/raw") class DownloadTobaccoProblem(luigi.Task): - def requires(self): - return [] - - def output(self): - return luigi.LocalTarget(config.data_dir('tobacco_problem/json/tobacco_problem_raw.json')) - - def run(self): - logging.basicConfig(level=logging.INFO) - output_dir = TOBACCO_RAW_DIR - os.system('mkdir -p %s' % output_dir) - - # Download all csv source files (current year and archived years). 
- soup = BeautifulSoup(urlopen(TOBACCO_PROBLEM_DOWNLOAD_PAGE).read(), 'lxml') - for a in soup.find_all(title=re.compile('\d{4}.*tppr', re.IGNORECASE)): - file_name = a['title'] if '.csv' in a['title'] else (a['title'] + '.csv') - common.download(urljoin('https://www.fda.gov', a['href']), join(output_dir, file_name)) - - # Combine CSV files into a single json. - all_csv_files = [i for i in glob.glob((output_dir + '/*.{}').format('csv'))] - logging.info("Reading csv files: %s", (all_csv_files)) - os.system('mkdir -p %s' % dirname(self.output().path)) - df = pd.concat( - pd.read_csv(f, encoding='cp1252', skiprows=3).rename( - columns={"REPORT_ID (ICSR)": "REPORT_ID", "REPORT ID": "REPORT_ID"}, errors="ignore") - for f - in all_csv_files) - df.to_json(self.output().path, orient='records') - with open(self.output().path, "w") as f: - for row in df.iterrows(): - row[1].to_json(f) - f.write("\n") + def requires(self): + return [] + + def output(self): + return luigi.LocalTarget( + config.data_dir("tobacco_problem/json/tobacco_problem_raw.json") + ) + + def run(self): + logging.basicConfig(level=logging.INFO) + output_dir = TOBACCO_RAW_DIR + os.system("mkdir -p %s" % output_dir) + + # Download all csv source files (current year and archived years). + soup = BeautifulSoup(urlopen(TOBACCO_PROBLEM_DOWNLOAD_PAGE).read(), "lxml") + for a in soup.find_all(title=re.compile("\d{4}.*tppr", re.IGNORECASE)): + file_name = a["title"] if ".csv" in a["title"] else (a["title"] + ".csv") + common.download( + urljoin("https://www.fda.gov", a["href"]), join(output_dir, file_name) + ) + + # Combine CSV files into a single json. + all_csv_files = [i for i in glob.glob((output_dir + "/*.{}").format("csv"))] + logging.info("Reading csv files: %s", (all_csv_files)) + os.system("mkdir -p %s" % dirname(self.output().path)) + df = pd.concat( + pd.read_csv(f, encoding="cp1252", skiprows=3).rename( + columns={"REPORT_ID (ICSR)": "REPORT_ID", "REPORT ID": "REPORT_ID"}, + errors="ignore", + ) + for f in all_csv_files + ) + df.to_json(self.output().path, orient="records") + with open(self.output().path, "w") as f: + for row in df.iterrows(): + row[1].to_json(f) + f.write("\n") class TobaccoProblem2JSONMapper(parallel.Mapper): - rename_map = { - "REPORT_ID": "report_id", - "DATE_SUBMITTED": "date_submitted", - "NUMBER_TOBACCO_PRODUCTS": "number_tobacco_products", - "TOBACCO_PRODUCTS": "tobacco_products", - "NUMBER_HEALTH_PROBLEMS": "number_health_problems", - "REPORTED_HEALTH_PROBLEMS": "reported_health_problems", - "NONUSER_AFFECTED": "nonuser_affected", - "NUMBER_PRODUCT_PROBLEMS": "number_product_problems", - "REPORTED_PRODUCT_PROBLEMS": "reported_product_problems" - } - - def map(self, key, value, output): - def _cleaner(k, v): - ''' Helper function to rename keys and purge any keys that are not in - the map. - ''' - if k in self.rename_map and v is not None: - if "DATE" in k: - return self.rename_map[k], str(arrow.get(v, ['M/D/YYYY', 'D-MMM-YYYY', 'D-MMM-YY']).format("MM/DD/YYYY")) - # Make arrays of these three fields. 
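      # The source CSV apparently packs multiple values into a single cell,
      # delimited by " / "; splitting on that delimiter and round-tripping
      # through set() both builds the array and drops any duplicate entries.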
- if k in ["TOBACCO_PRODUCTS", "REPORTED_HEALTH_PROBLEMS", "REPORTED_PRODUCT_PROBLEMS"]: - return self.rename_map[k], list(set(v.split(' / '))) - else: - return self.rename_map[k], v - - new_value = common.transform_dict(value, _cleaner) - output.add(key, new_value) + rename_map = { + "REPORT_ID": "report_id", + "DATE_SUBMITTED": "date_submitted", + "NUMBER_TOBACCO_PRODUCTS": "number_tobacco_products", + "TOBACCO_PRODUCTS": "tobacco_products", + "NUMBER_HEALTH_PROBLEMS": "number_health_problems", + "REPORTED_HEALTH_PROBLEMS": "reported_health_problems", + "NONUSER_AFFECTED": "nonuser_affected", + "NUMBER_PRODUCT_PROBLEMS": "number_product_problems", + "REPORTED_PRODUCT_PROBLEMS": "reported_product_problems", + } + + def map(self, key, value, output): + def _cleaner(k, v): + """Helper function to rename keys and purge any keys that are not in + the map. + """ + if k in self.rename_map and v is not None: + if "DATE" in k: + return self.rename_map[k], str( + arrow.get(v, ["M/D/YYYY", "D-MMM-YYYY", "D-MMM-YY"]).format( + "MM/DD/YYYY" + ) + ) + # Make arrays of these three fields. + if k in [ + "TOBACCO_PRODUCTS", + "REPORTED_HEALTH_PROBLEMS", + "REPORTED_PRODUCT_PROBLEMS", + ]: + return self.rename_map[k], list(set(v.split(" / "))) + else: + return self.rename_map[k], v + + new_value = common.transform_dict(value, _cleaner) + output.add(key, new_value) class TobaccoProblem2JSON(luigi.Task): - def requires(self): - return DownloadTobaccoProblem() + def requires(self): + return DownloadTobaccoProblem() - def output(self): - return luigi.LocalTarget(config.data_dir(TOBACCO_PROBLEM_EXTRACT_DB)) + def output(self): + return luigi.LocalTarget(config.data_dir(TOBACCO_PROBLEM_EXTRACT_DB)) - def run(self): - parallel.mapreduce( - parallel.Collection.from_glob( - self.input().path, parallel.JSONLineInput()), - mapper=TobaccoProblem2JSONMapper(), - reducer=parallel.IdentityReducer(), - output_prefix=self.output().path, - num_shards=1) + def run(self): + parallel.mapreduce( + parallel.Collection.from_glob(self.input().path, parallel.JSONLineInput()), + mapper=TobaccoProblem2JSONMapper(), + reducer=parallel.IdentityReducer(), + output_prefix=self.output().path, + num_shards=1, + ) class LoadJSON(index_util.LoadJSONBase): - index_name = 'tobaccoproblem' - mapping_file = './schemas/tobaccoproblem_mapping.json' - data_source = TobaccoProblem2JSON() - use_checksum = False - optimize_index = True - last_update_date = lambda _: newest_file_timestamp(TOBACCO_RAW_DIR) + index_name = "tobaccoproblem" + mapping_file = "./schemas/tobaccoproblem_mapping.json" + data_source = TobaccoProblem2JSON() + use_checksum = False + optimize_index = True + last_update_date = lambda _: newest_file_timestamp(TOBACCO_RAW_DIR) -if __name__ == '__main__': - luigi.run() +if __name__ == "__main__": + luigi.run() diff --git a/scripts/convert_json_to_yaml.py b/scripts/convert_json_to_yaml.py index d76dd7b..d2b9214 100644 --- a/scripts/convert_json_to_yaml.py +++ b/scripts/convert_json_to_yaml.py @@ -2,6 +2,10 @@ import os import yaml -print(os.path.dirname(os.path.dirname(os.getcwd())+'\schemas\phishpharm_mapping.json')) -with open(os.path.dirname(os.path.realpath(__file__))+'\..\schemas\phishpharm_mapping.json') as f: - print(yaml.safe_dump(json.load(f))) +print( + os.path.dirname(os.path.dirname(os.getcwd()) + "\schemas\phishpharm_mapping.json") +) +with open( + os.path.dirname(os.path.realpath(__file__)) + "\..\schemas\phishpharm_mapping.json" +) as f: + print(yaml.safe_dump(json.load(f))) diff --git a/scripts/dbreader.py 
b/scripts/dbreader.py index b7e6035..347d8dd 100755 --- a/scripts/dbreader.py +++ b/scripts/dbreader.py @@ -1,11 +1,11 @@ #!/usr/bin/env python -''' +""" Tool for debugging leveldb output. This takes a leveldb file and a start and stop key. Records between start_key and end_key are printed, one per line. -''' +""" import os @@ -18,50 +18,54 @@ from openfda import app, parallel -gflags.DEFINE_string('level_db', None, 'Database file to read.') -gflags.DEFINE_string('sharded_db', None, 'Database file to read.') -gflags.DEFINE_string('start_key', None, 'First key to read.') -gflags.DEFINE_string('end_key', None, 'Last key to read.') -gflags.DEFINE_string('key', None, 'Print just this key.') -gflags.DEFINE_boolean('key_only', False, 'Print only keys') -gflags.DEFINE_boolean('value_only', False, 'Print only values') -gflags.DEFINE_boolean('json_value_out', False, 'Prints values as JSON') +gflags.DEFINE_string("level_db", None, "Database file to read.") +gflags.DEFINE_string("sharded_db", None, "Database file to read.") +gflags.DEFINE_string("start_key", None, "First key to read.") +gflags.DEFINE_string("end_key", None, "Last key to read.") +gflags.DEFINE_string("key", None, "Print just this key.") +gflags.DEFINE_boolean("key_only", False, "Print only keys") +gflags.DEFINE_boolean("value_only", False, "Print only values") +gflags.DEFINE_boolean("json_value_out", False, "Prints values as JSON") + def main(argv): - FLAGS = gflags.FLAGS - if FLAGS.key is not None: - FLAGS.start_key = FLAGS.end_key = FLAGS.key + FLAGS = gflags.FLAGS + if FLAGS.key is not None: + FLAGS.start_key = FLAGS.end_key = FLAGS.key - if FLAGS.level_db: - assert os.path.exists(FLAGS.level_db) - ldb = leveldb.LevelDB(FLAGS.level_db) - if FLAGS.start_key and FLAGS.end_key: - db_iter = ldb.RangeIter(FLAGS.start_key, FLAGS.end_key) - else: - db_iter = ldb.RangeIter() - elif FLAGS.sharded_db: - assert os.path.exists(FLAGS.sharded_db) - db = parallel.ShardedDB.open(FLAGS.sharded_db) - if FLAGS.start_key and FLAGS.end_key: - db_iter = db.range_iter(FLAGS.start_key, FLAGS.end_key) + if FLAGS.level_db: + assert os.path.exists(FLAGS.level_db) + ldb = leveldb.LevelDB(FLAGS.level_db) + if FLAGS.start_key and FLAGS.end_key: + db_iter = ldb.RangeIter(FLAGS.start_key, FLAGS.end_key) + else: + db_iter = ldb.RangeIter() + elif FLAGS.sharded_db: + assert os.path.exists(FLAGS.sharded_db) + db = parallel.ShardedDB.open(FLAGS.sharded_db) + if FLAGS.start_key and FLAGS.end_key: + db_iter = db.range_iter(FLAGS.start_key, FLAGS.end_key) + else: + db_iter = db.__iter__() else: - db_iter = db.__iter__() - else: - print('Must specify --level_db or --sharded_db') - print(FLAGS.GetHelp()) - sys.exit(1) - - cnt = 0 - for (k, v) in db_iter: - if FLAGS.json_value_out: v = json.dumps(pickle.loads(v), indent=2, sort_keys=True) + print("Must specify --level_db or --sharded_db") + print(FLAGS.GetHelp()) + sys.exit(1) - if FLAGS.key_only: print(k) - elif FLAGS.value_only: print(v) - else: print(k, v) - cnt+=1 - print(("Total keys: %s", cnt)) + cnt = 0 + for k, v in db_iter: + if FLAGS.json_value_out: + v = json.dumps(pickle.loads(v), indent=2, sort_keys=True) + if FLAGS.key_only: + print(k) + elif FLAGS.value_only: + print(v) + else: + print(k, v) + cnt += 1 + print(("Total keys: %s", cnt)) -if __name__ == '__main__': - app.run() +if __name__ == "__main__": + app.run() diff --git a/scripts/generate_fields_yaml.py b/scripts/generate_fields_yaml.py index b824682..5443b02 100644 --- a/scripts/generate_fields_yaml.py +++ b/scripts/generate_fields_yaml.py @@ -1,6 +1,6 
@@ #!/usr/bin/env python -''' Tool for making field values for the documentation website (open.fda.gov) +""" Tool for making field values for the documentation website (open.fda.gov) that are stored in the appropriate _fields.yaml file. These fields are determined by looking both the json schema file and the elasticsearch mapping file. @@ -17,7 +17,7 @@ schema keys to their Elasticsearch mapping counterparts by removing the `.items` key string. It can then use the flatten key name to update the schema to have the additional fields needs by the documentation site. -''' +""" import collections import logging @@ -32,261 +32,267 @@ @click.group() def cli(): - pass + pass + # Each field in the schema will have the following data DEFAULTS = { - 'format': None, - 'type': 'string', - 'description': None, - 'possible_values': None, - 'is_exact': False + "format": None, + "type": "string", + "description": None, + "possible_values": None, + "is_exact": False, } -def completely_flatten(input_data, absolute_key='', sep='.'): - ''' A Flattening function that takes a highly nested dictionary and flattens - it into a single depth dictionary with breadcrumb style keys, {a.b.c: val} +def completely_flatten(input_data, absolute_key="", sep="."): + """A Flattening function that takes a highly nested dictionary and flattens + it into a single depth dictionary with breadcrumb style keys, {a.b.c: val} - Please note, this will not work with dictionaries that have lists in them. - ''' - result = [] - for key, value in input_data.items(): - appended_key = sep.join([absolute_key, key]) if absolute_key else key + Please note, this will not work with dictionaries that have lists in them. + """ + result = [] + for key, value in input_data.items(): + appended_key = sep.join([absolute_key, key]) if absolute_key else key - if isinstance(value, dict): - result.extend(list(completely_flatten(value, appended_key, sep=sep).items())) + if isinstance(value, dict): + result.extend( + list(completely_flatten(value, appended_key, sep=sep).items()) + ) - else: - result.append((appended_key, value)) + else: + result.append((appended_key, value)) - return dict(result) + return dict(result) def set_deep(data, path, value): - ''' Helper function that sets a value deep into a dictionary by passing the - deep address as a '.' delimited string of keys, for instance, `a.b.c` is - pointing to {'a': {'b': {'c': 'some value' }}} + """Helper function that sets a value deep into a dictionary by passing the + deep address as a '.' delimited string of keys, for instance, `a.b.c` is + pointing to {'a': {'b': {'c': 'some value' }}} - Please note, this is not a pure function, it mutates the object (`data`) - that is passed into it. Also, this does not work if there are any lists - within the object. - ''' - path = path.split('.') - latest = path.pop() - for key in path: - data = data.setdefault(key, {}) - data.setdefault(latest, value) + Please note, this is not a pure function, it mutates the object (`data`) + that is passed into it. Also, this does not work if there are any lists + within the object. + """ + path = path.split(".") + latest = path.pop() + for key in path: + data = data.setdefault(key, {}) + data.setdefault(latest, value) def get_deep(data, path): - ''' Helper function that gets a value form deep within an object by passing - a '.' delimited string of keys, for instance, `a.b.c` is - pointing to {'a': {'b': {'c': 'some value' }}}. 
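    A minimal sketch of the two helpers together (based on the docstrings
    above; the variable name `data` is illustrative):

        data = {}
        set_deep(data, 'a.b.c', 'some value')   # builds {'a': {'b': {'c': 'some value'}}}
        get_deep(data, 'a.b.c')                 # -> 'some value'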
- ''' - path = path.split('.') - first, rest = path.pop(0), path - result = data.get(first, {}) - for part in rest: - result = result.get(part, {}) + """Helper function that gets a value form deep within an object by passing + a '.' delimited string of keys, for instance, `a.b.c` is + pointing to {'a': {'b': {'c': 'some value' }}}. + """ + path = path.split(".") + first, rest = path.pop(0), path + result = data.get(first, {}) + for part in rest: + result = result.get(part, {}) - return result + return result def get_exact_fields(mapping): - ''' Takes a elasticsearch mapping file, flattens it, filters out exact fields - and returns a unique list of keys that can be consumed by get_deep() - ''' - result = {} - exact_lists = '_exact' - exact_fields = '.fields' - flat_map = completely_flatten(mapping) - flat_map = [key for key, value in flat_map.items() - if exact_lists in key or exact_fields in key] - for key in flat_map: - # Only interested in everything before the _exact or .fields - part = key.split(exact_lists if exact_lists in key else exact_fields).pop(0) - # And after the Elasticsearch type name for the mapping (which is the root) - part = '.'.join(part.split('.')[1:]) - result[part] = True - - return result + """Takes a elasticsearch mapping file, flattens it, filters out exact fields + and returns a unique list of keys that can be consumed by get_deep() + """ + result = {} + exact_lists = "_exact" + exact_fields = ".fields" + flat_map = completely_flatten(mapping) + flat_map = [ + key + for key, value in flat_map.items() + if exact_lists in key or exact_fields in key + ] + for key in flat_map: + # Only interested in everything before the _exact or .fields + part = key.split(exact_lists if exact_lists in key else exact_fields).pop(0) + # And after the Elasticsearch type name for the mapping (which is the root) + part = ".".join(part.split(".")[1:]) + result[part] = True + + return result def _prefix(key): - ''' Helper function that removes the last entry from a '.' delimited string. - ''' - return '.'.join(key.split('.')[:-1]) + """Helper function that removes the last entry from a '.' delimited string.""" + return ".".join(key.split(".")[:-1]) def get_schema_fields(schema): - result = {} - for field, value in completely_flatten(schema).items(): - if value == 'object': - continue + result = {} + for field, value in completely_flatten(schema).items(): + if value == "object": + continue + + field = _prefix(field) + result[field] = True - field = _prefix(field) - result[field] = True + return list(result.keys()) - return list(result.keys()) def get_mapping_fields(mapping): - result = {} - for field, value in completely_flatten(mapping).items(): - # And after the Elasticsearch type name for the mapping (which is the root) - # We also do not need the last key (type) - part = '.'.join(field.split('.')[1:-1]) - result[part] = True + result = {} + for field, value in completely_flatten(mapping).items(): + # And after the Elasticsearch type name for the mapping (which is the root) + # We also do not need the last key (type) + part = ".".join(field.split(".")[1:-1]) + result[part] = True - return result + return result def generate_es_to_schema_index(schema): - ''' Elasticsearch does not natively support the array type (expressed as - items in JSON Schema), so we need an index with the `items` keys missing - in order to lookup from Elasticsearch mappings to JSON Schema. 
- ''' - flattened_schema = completely_flatten(schema) - result = {} - for key in flattened_schema.keys(): - es_key = key.replace('.items', '') - short_key = _prefix(key) - result[short_key] = _prefix(es_key) - return result + """Elasticsearch does not natively support the array type (expressed as + items in JSON Schema), so we need an index with the `items` keys missing + in order to lookup from Elasticsearch mappings to JSON Schema. + """ + flattened_schema = completely_flatten(schema) + result = {} + for key in flattened_schema.keys(): + es_key = key.replace(".items", "") + short_key = _prefix(key) + result[short_key] = _prefix(es_key) + return result # YAML hackery to preserve item ordering and pretty string formatting def tweak_yaml(): - _mapping_tag = yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG + _mapping_tag = yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG - def dict_constructor(loader, node): - return collections.OrderedDict(loader.construct_pairs(node)) + def dict_constructor(loader, node): + return collections.OrderedDict(loader.construct_pairs(node)) - yaml.add_constructor(_mapping_tag, dict_constructor) + yaml.add_constructor(_mapping_tag, dict_constructor) - def ordered_dict_serializer(self, data): - return self.represent_mapping('tag:yaml.org,2002:map', list(data.items())) + def ordered_dict_serializer(self, data): + return self.represent_mapping("tag:yaml.org,2002:map", list(data.items())) - SafeDumper.add_representer(collections.OrderedDict, ordered_dict_serializer) - # We want blanks for null values - SafeDumper.add_representer( - type(None), - lambda dumper, value: dumper.represent_scalar(u'tag:yaml.org,2002:null', '') - ) + SafeDumper.add_representer(collections.OrderedDict, ordered_dict_serializer) + # We want blanks for null values + SafeDumper.add_representer( + type(None), + lambda dumper, value: dumper.represent_scalar("tag:yaml.org,2002:null", ""), + ) def make_yaml(input_data, file_name): - tweak_yaml() - data_str = json.dumps(input_data, sort_keys=True) - data = json.loads(data_str, object_pairs_hook=collections.OrderedDict) - with open(file_name, 'w') as out: - yaml.safe_dump(data, - out, - indent=2, - default_flow_style=False, - allow_unicode=True) + tweak_yaml() + data_str = json.dumps(input_data, sort_keys=True) + data = json.loads(data_str, object_pairs_hook=collections.OrderedDict) + with open(file_name, "w") as out: + yaml.safe_dump( + data, out, indent=2, default_flow_style=False, allow_unicode=True + ) @cli.command() -@click.option('--mapping', help='ES Mapping file') -@click.option('--schema', help='JSON Schema file') -@click.option('--filename', help='Output file name for the fields file') +@click.option("--mapping", help="ES Mapping file") +@click.option("--schema", help="JSON Schema file") +@click.option("--filename", help="Output file name for the fields file") def generate(mapping, schema, filename): - mapping = json.load(open(mapping)) - schema = json.load(open(schema)) - schema.pop('$schema', 0) + mapping = json.load(open(mapping)) + schema = json.load(open(schema)) + schema.pop("$schema", 0) - # Index for which keys need to be marked as having exact counterparts - exact_index = get_exact_fields(mapping) - # Index to lookup a non-array version of key name (so it maps to ES) - es_items_index = generate_es_to_schema_index(schema) - # All the fields to process from the schema - schema_fields = get_schema_fields(schema) + # Index for which keys need to be marked as having exact counterparts + exact_index = get_exact_fields(mapping) + # Index to 
lookup a non-array version of key name (so it maps to ES) + es_items_index = generate_es_to_schema_index(schema) + # All the fields to process from the schema + schema_fields = get_schema_fields(schema) - for field in schema_fields: - type_field = field + '.' + 'type' - # No need expand these complex types with DEFAULT attributes - if get_deep(schema, type_field) in ['object', 'array']: - continue - if not field: - continue + for field in schema_fields: + type_field = field + "." + "type" + # No need expand these complex types with DEFAULT attributes + if get_deep(schema, type_field) in ["object", "array"]: + continue + if not field: + continue - es_field = es_items_index.get(field, None) - is_exact = True if es_field in exact_index else False + es_field = es_items_index.get(field, None) + is_exact = True if es_field in exact_index else False - # Expand each of the keys to match DEFAULTS. - # set_deep() will not overwrite data if it does exist, but will add if it - # does not - for key, value in DEFAULTS.items(): - full_key = field + '.' + key - if is_exact and key == 'is_exact': - value = True + # Expand each of the keys to match DEFAULTS. + # set_deep() will not overwrite data if it does exist, but will add if it + # does not + for key, value in DEFAULTS.items(): + full_key = field + "." + key + if is_exact and key == "is_exact": + value = True - set_deep(schema, full_key, value) + set_deep(schema, full_key, value) - make_yaml(schema, filename) + make_yaml(schema, filename) @cli.command() -@click.option('--host', - help='Base URL for the API to test', - default='http://localhost:8000') -@click.option('--mapping', help='ES Mapping file') -@click.option('--endpoint', help='Endpoint to test, i.e. /drug/event') +@click.option( + "--host", help="Base URL for the API to test", default="http://localhost:8000" +) +@click.option("--mapping", help="ES Mapping file") +@click.option("--endpoint", help="Endpoint to test, i.e. /drug/event") def test_api_keys(host, mapping, endpoint): - ''' Utility that tests an API to make sure that all fields in the mapping - file are queryable via the API. It will spit a list of non 200 keys to - console. - - Please note, it is possible to get false positives, since the API responds - with 404s when there is no data in the response. For instance, the query - `/device/510k.json?search=_exists_:ssp_indicator` returns nothing, but it - is a valid field from the source, it just happens to never have data. - ''' - mapping = json.load(open(mapping)) - exact_index = get_exact_fields(mapping) - flat_map = get_mapping_fields(mapping) - url = '%s%s.json?search=_exists_:%s' - for key in flat_map.keys(): - # Remove non queryable fields from key name - key = key.replace('properties.', '').replace('.fields', '') - responses = [] - responses.append( - (requests.get(url % (host, endpoint, key)).status_code, key)) - - if key in exact_index: - key = key + '.exact' - responses.append( - (requests.get(url % (host, endpoint, key)).status_code, key)) - - missing_keys = [] - for resp, req in responses: - if resp != 200: - missing_keys.append(req) - else: - logging.info('%s: Ok!', req) - - if missing_keys: - logging.info('The following keys returned a non 200 response: %s', - missing_keys) - else: - logging.info('All keys in the mapping are returned by API') + """Utility that tests an API to make sure that all fields in the mapping + file are queryable via the API. It will spit a list of non 200 keys to + console. 
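    A typical invocation (the mapping path is only illustrative, and the
    subcommand is spelled test-api-keys or test_api_keys depending on the
    click version) might be:

        python scripts/generate_fields_yaml.py test-api-keys \
            --mapping schemas/<endpoint>_mapping.json --endpoint /device/510k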
+ + Please note, it is possible to get false positives, since the API responds + with 404s when there is no data in the response. For instance, the query + `/device/510k.json?search=_exists_:ssp_indicator` returns nothing, but it + is a valid field from the source, it just happens to never have data. + """ + mapping = json.load(open(mapping)) + exact_index = get_exact_fields(mapping) + flat_map = get_mapping_fields(mapping) + url = "%s%s.json?search=_exists_:%s" + for key in flat_map.keys(): + # Remove non queryable fields from key name + key = key.replace("properties.", "").replace(".fields", "") + responses = [] + responses.append((requests.get(url % (host, endpoint, key)).status_code, key)) + + if key in exact_index: + key = key + ".exact" + responses.append( + (requests.get(url % (host, endpoint, key)).status_code, key) + ) + + missing_keys = [] + for resp, req in responses: + if resp != 200: + missing_keys.append(req) + else: + logging.info("%s: Ok!", req) + + if missing_keys: + logging.info("The following keys returned a non 200 response: %s", missing_keys) + else: + logging.info("All keys in the mapping are returned by API") + @cli.command() -@click.option('--mapping', help='ES Mapping file') +@click.option("--mapping", help="ES Mapping file") def flat_mapping(mapping): - mapping = json.load(open(mapping)) - flat_map = get_mapping_fields(mapping) - for key in sorted(flat_map.keys()): - # Remove non queryable fields from key name - key = key.replace('properties.', '').replace('.fields', '') - if 'exact' not in key: - print(key) - -if __name__ == '__main__': - logging.basicConfig( - format='%(created)f %(filename)s:%(lineno)s [%(funcName)s] %(message)s', - level=logging.INFO, - stream=sys.stderr) - - cli() + mapping = json.load(open(mapping)) + flat_map = get_mapping_fields(mapping) + for key in sorted(flat_map.keys()): + # Remove non queryable fields from key name + key = key.replace("properties.", "").replace(".fields", "") + if "exact" not in key: + print(key) + + +if __name__ == "__main__": + logging.basicConfig( + format="%(created)f %(filename)s:%(lineno)s [%(funcName)s] %(message)s", + level=logging.INFO, + stream=sys.stderr, + ) + + cli() diff --git a/scripts/generate_mapping_from_sections.py b/scripts/generate_mapping_from_sections.py index f05a007..0d9c2ef 100644 --- a/scripts/generate_mapping_from_sections.py +++ b/scripts/generate_mapping_from_sections.py @@ -1,8 +1,8 @@ #!/usr/bin/python -'''This utility prints out a mapping string that is syntactically correct +"""This utility prints out a mapping string that is syntactically correct to be used in the schemas/spl_mapping.json file. 
-''' +""" import logging import sys @@ -13,9 +13,9 @@ # TODO(hansnelsen): Add writing directly to schemas/spl_mapping.json once it is # pretty def generate_mapping(): - sections = open('../openfda/spl/data/sections.csv', 'r') - mapping_list = [] - openfda = ''' + sections = open("../openfda/spl/data/sections.csv", "r") + mapping_list = [] + openfda = """ "openfda": { "properties": { "application_number": { @@ -201,8 +201,8 @@ def generate_mapping(): "type": "boolean" } } - }''' - mapping_header = '''{ + }""" + mapping_header = """{ "spl": { "_source": { "includes": [ @@ -229,28 +229,31 @@ def generate_mapping(): "type": "date", "format": "basic_date||date" }, - ''' - mapping_footer = ''', + """ + mapping_footer = """, "@timestamp": { "type": "date", "format": "basic_date||date" } } } - }''' - mapping_list.append(mapping_header) + }""" + mapping_list.append(mapping_header) - for row in sections: - name = row.split(',')[1]\ - .replace(':', '')\ - .replace(' & ', ' and ')\ - .replace('/', ' or ')\ - .replace(' ', '_')\ - .lower()\ - .replace('spl_unclassified', 'spl_unclassified_section')\ - .strip() + for row in sections: + name = ( + row.split(",")[1] + .replace(":", "") + .replace(" & ", " and ") + .replace("/", " or ") + .replace(" ", "_") + .lower() + .replace("spl_unclassified", "spl_unclassified_section") + .strip() + ) - row_string = ''' "%(name)s": { + row_string = ( + """ "%(name)s": { "type": "string", "index": "analyzed" }, @@ -258,22 +261,23 @@ def generate_mapping(): "type": "string", "index": "no", "include_in_all": false - },''' % locals() - mapping_list.append(row_string) + },""" + % locals() + ) + mapping_list.append(row_string) - mapping_list.append(openfda) - mapping_list.append(mapping_footer) + mapping_list.append(openfda) + mapping_list.append(mapping_footer) - try: - mapping_string = '\n'.join(mapping_list) - json.loads(mapping_string) - return mapping_string - except: - logging.info('It appears that something is wrong with your json string') + try: + mapping_string = "\n".join(mapping_list) + json.loads(mapping_string) + return mapping_string + except: + logging.info("It appears that something is wrong with your json string") -if __name__ == '__main__': - fmt_string = '%(created)f %(filename)s:%(lineno)s [%(funcName)s] %(message)s' - logging.basicConfig(stream=sys.stderr, - format=fmt_string, - level=logging.DEBUG) - print(generate_mapping()) + +if __name__ == "__main__": + fmt_string = "%(created)f %(filename)s:%(lineno)s [%(funcName)s] %(message)s" + logging.basicConfig(stream=sys.stderr, format=fmt_string, level=logging.DEBUG) + print(generate_mapping()) diff --git a/scripts/generate_schema.py b/scripts/generate_schema.py index 983c872..1135001 100644 --- a/scripts/generate_schema.py +++ b/scripts/generate_schema.py @@ -1,57 +1,59 @@ import json + def byteify(input): - if isinstance(input, dict): - return {byteify(key): byteify(value) - for key, value in iter(input.items())} - elif isinstance(input, list): - return [byteify(element) for element in input] - elif isinstance(input, str): - return input.encode('utf-8') - else: - return input + if isinstance(input, dict): + return {byteify(key): byteify(value) for key, value in iter(input.items())} + elif isinstance(input, list): + return [byteify(element) for element in input] + elif isinstance(input, str): + return input.encode("utf-8") + else: + return input schema_json = {"type": "object", "properties": {}} + def generate_fields(obj): - if 'properties' in obj: - field_obj = {'type': 'object'} - prop_obj = {} - for 
item, val in obj['properties'].items(): - if item.endswith('_exact'): - pass - else: - prop_obj[item] = generate_fields(val) - field_obj['properties'] = prop_obj - elif 'fields' in obj: - exact = obj['fields']['exact'] - field_obj = {'type': exact['type']} - if 'format' in exact: - if 'date' in exact['format']: - field_obj['format'] = 'date' - field_obj['type'] = 'string' - else: - field_obj['format'] = exact['format'] - else: - field_obj = {'type': obj['type']} - if 'format' in obj: - if 'date' in obj['format']: - field_obj['format'] = 'date' - field_obj['type'] = 'string' - else: - field_obj['format'] = obj['format'] - return field_obj - -with open('../schemas/substancedata_mapping.json') as json_file: - map_file = byteify(json.load(json_file)) - - for field, value in map_file['substancedata']['properties'].items(): - if field.endswith('_exact'): - pass + if "properties" in obj: + field_obj = {"type": "object"} + prop_obj = {} + for item, val in obj["properties"].items(): + if item.endswith("_exact"): + pass + else: + prop_obj[item] = generate_fields(val) + field_obj["properties"] = prop_obj + elif "fields" in obj: + exact = obj["fields"]["exact"] + field_obj = {"type": exact["type"]} + if "format" in exact: + if "date" in exact["format"]: + field_obj["format"] = "date" + field_obj["type"] = "string" + else: + field_obj["format"] = exact["format"] else: - schema_json['properties'][field] = generate_fields(value) + field_obj = {"type": obj["type"]} + if "format" in obj: + if "date" in obj["format"]: + field_obj["format"] = "date" + field_obj["type"] = "string" + else: + field_obj["format"] = obj["format"] + return field_obj + + +with open("../schemas/substancedata_mapping.json") as json_file: + map_file = byteify(json.load(json_file)) + + for field, value in map_file["substancedata"]["properties"].items(): + if field.endswith("_exact"): + pass + else: + schema_json["properties"][field] = generate_fields(value) -with open('../schemas/substancedata_schema.json', 'w') as f: +with open("../schemas/substancedata_schema.json", "w") as f: json.dump(schema_json, f, indent=2, sort_keys=True) diff --git a/scripts/generate_tests.py b/scripts/generate_tests.py index 7e60f1a..ffe3099 100644 --- a/scripts/generate_tests.py +++ b/scripts/generate_tests.py @@ -1,4 +1,4 @@ -''' +""" Given a set of test descriptions from tests_config.json and an endpoint, generate a default test file for the endpoint. @@ -9,7 +9,7 @@ While this is intended as a one-off test generator, it maybe useful if many similar tests need to be generated for an endpoint, or to create a set of tests for a new endpoint. 
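    A sketch of how it is run (the endpoint argument and output path are
    illustrative; the argument must match an entry in tests_config.json, and
    the generated test is printed to stdout):

        python scripts/generate_tests.py /drug/event > test_endpoint.py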
-''' +""" import json import os @@ -17,59 +17,72 @@ import requests -data = json.loads(open('./openfda/deploy/tests_config.json').read())[0] +data = json.loads(open("./openfda/deploy/tests_config.json").read())[0] endpoint = sys.argv[1] types = data[endpoint] total_idx = iter(list(range(100))) count_idx = iter(list(range(100))) + def print_total(name, query, count=None): - if not count: - data = requests.get('http://localhost:8000%s' % query).json() - count = data['meta']['results']['total'] + if not count: + data = requests.get("http://localhost:8000%s" % query).json() + count = data["meta"]["results"]["total"] - print(''' + print( + """ def test_%s_%d(): assert_total('%s', %d) -''' % (name, next(total_idx), query, count), end=' ') +""" + % (name, next(total_idx), query, count), + end=" ", + ) + def print_count(name, query): - cmd = 'curl -s "http://localhost:8000%s" | jq -r ".results[].term"' % query - counts = os.popen(cmd).read().split('\n')[:5] - print(''' + cmd = 'curl -s "http://localhost:8000%s" | jq -r ".results[].term"' % query + counts = os.popen(cmd).read().split("\n")[:5] + print( + """ def test_%s_%d(): assert_count_top('%s', %s) -''' % (name, next(count_idx), query, counts), end=' ') +""" + % (name, next(count_idx), query, counts), + end=" ", + ) + -print('''from nose.tools import * +print( + """from nose.tools import * import requests from openfda.tests.api_test_helpers import * from nose.tools import * -''') +""" +) for name, tests in types.items(): - if name == 'openfda_key_check': - for t in tests: - print_total('openfda', t) + if name == "openfda_key_check": + for t in tests: + print_total("openfda", t) - if name == 'not_analyzed_check': - for t in tests: - print_total('not_analyzed', t) + if name == "not_analyzed_check": + for t in tests: + print_total("not_analyzed", t) - if name == 'date_check': - for t in tests: - print_total('date', t) + if name == "date_check": + for t in tests: + print_total("date", t) - if name == 'date_range_check': - for t in tests: - print_total('date_range', t) + if name == "date_range_check": + for t in tests: + print_total("date_range", t) - if name == 'exact_count': - for t in tests: - print_count('exact_count', t) + if name == "exact_count": + for t in tests: + print_count("exact_count", t) - if name == 'openfda_exact_count': - for t in tests: - print_count('openfda_exact_count', t) + if name == "openfda_exact_count": + for t in tests: + print_count("openfda_exact_count", t) diff --git a/setup.py b/setup.py index f95e5c3..db76565 100644 --- a/setup.py +++ b/setup.py @@ -4,43 +4,47 @@ import os setuptools.setup( - name='openfda', - version='1.0', - maintainer='openFDA', - maintainer_email='open@fda.hhs.gov', - url='http://github.com/fda/openfda', - python_requires='>=3.6', - install_requires=[ - 'arrow', - 'boto', - 'click', - 'elasticsearch<=7.15.1', - 'leveldb', - 'luigi<=2.8.13', - 'lxml', - 'mock<=2.0.0', - 'nose2[coverage_plugin]', - 'python-gflags', - 'requests', - 'setproctitle', - 'simplejson', - 'xmltodict', - 'dictsearch', - 'usaddress', - 'python-crfsuite==0.9.8' - ], - dependency_links=[ - 'file://' + os.path.join(os.getcwd(), 'dependencies', - 'python-crfsuite-0.9.8.tar.gz') + '#python_crfsuite-0.9.8' - ], - description=('A research project to provide open APIs, raw data downloads, ' - 'documentation and examples, and a developer community for an ' - 'important collection of FDA public datasets.'), - packages = ['openfda', - 'openfda.annotation_table', - 'openfda.faers', - 'openfda.spl', - ], - zip_safe=False, - test_suite = 
'nose2.collector.collector', + name="openfda", + version="1.0", + maintainer="openFDA", + maintainer_email="open@fda.hhs.gov", + url="http://github.com/fda/openfda", + python_requires=">=3.6", + install_requires=[ + "arrow", + "boto", + "click", + "elasticsearch<=7.15.1", + "leveldb", + "luigi<=2.8.13", + "lxml", + "mock<=2.0.0", + "nose2[coverage_plugin]", + "python-gflags", + "requests", + "setproctitle", + "simplejson", + "xmltodict", + "dictsearch", + "usaddress", + "python-crfsuite==0.9.8", + ], + dependency_links=[ + "file://" + + os.path.join(os.getcwd(), "dependencies", "python-crfsuite-0.9.8.tar.gz") + + "#python_crfsuite-0.9.8" + ], + description=( + "A research project to provide open APIs, raw data downloads, " + "documentation and examples, and a developer community for an " + "important collection of FDA public datasets." + ), + packages=[ + "openfda", + "openfda.annotation_table", + "openfda.faers", + "openfda.spl", + ], + zip_safe=False, + test_suite="nose2.collector.collector", ) From a147cff8e69a613eddd9687a06f98d4d6b476be9 Mon Sep 17 00:00:00 2001 From: "Basli, Adel" Date: Sat, 9 Nov 2024 14:10:12 +0100 Subject: [PATCH 2/2] fix minor errors --- openfda/adae/pipeline.py | 2 +- openfda/adae/tests/pipeline_test.py | 2 +- openfda/annotation_table/tests/pipeline_test.py | 1 - openfda/deploy/tests/device510k/test_endpoint.py | 1 - openfda/deploy/tests/deviceclass/test_endpoint.py | 1 - openfda/deploy/tests/devicepma/test_endpoint.py | 1 - openfda/deploy/tests/devicerecall/test_endpoint.py | 1 - openfda/deploy/tests/devicereglist/test_endpoint.py | 1 - openfda/deploy/tests/drugevent/test_endpoint.py | 1 - openfda/deploy/tests/druglabel/test_endpoint.py | 1 - openfda/deploy/tests/enforcement/test_endpoint.py | 1 - openfda/deploy/tests/issues/test_endpoint.py | 1 - openfda/device_harmonization/pipeline.py | 2 -- openfda/device_pma/transform.py | 2 -- openfda/device_udi/tests/pipeline_test.py | 1 - openfda/download_util.py | 3 +-- openfda/drugsfda/pipeline.py | 2 +- openfda/faers/annotate.py | 1 - openfda/faers/tests/annotate_test.py | 1 - openfda/ndc/annotate.py | 2 -- openfda/res/tests/extract_test.py | 1 - openfda/spl/pipeline.py | 2 -- openfda/spl/tests/extract_test.py | 1 - openfda/spl/tests/process_barcodes_test.py | 4 +++- openfda/tests/api_test.py | 1 - 25 files changed, 7 insertions(+), 30 deletions(-) diff --git a/openfda/adae/pipeline.py b/openfda/adae/pipeline.py index 61a07db..222a7d5 100644 --- a/openfda/adae/pipeline.py +++ b/openfda/adae/pipeline.py @@ -520,7 +520,7 @@ def run(self): for subdir, dirs, files in os.walk(input_dir): for file in files: if file.endswith(".xml"): - if not file in NULLIFIED: + if file not in NULLIFIED: input_shards.append(os.path.join(subdir, file)) else: logging.info("Skipping a nullified case: " + file) diff --git a/openfda/adae/tests/pipeline_test.py b/openfda/adae/tests/pipeline_test.py index 770426e..22bf70b 100644 --- a/openfda/adae/tests/pipeline_test.py +++ b/openfda/adae/tests/pipeline_test.py @@ -8,7 +8,7 @@ import luigi from mock import MagicMock -from openfda.adae.pipeline import ExtractXML, SyncS3, XML2JSONMapper +from openfda.adae.pipeline import ExtractXML, XML2JSONMapper from openfda.adae.annotate import normalize_product_ndc from openfda.tests.api_test_helpers import * diff --git a/openfda/annotation_table/tests/pipeline_test.py b/openfda/annotation_table/tests/pipeline_test.py index b3514cd..df4783e 100644 --- a/openfda/annotation_table/tests/pipeline_test.py +++ b/openfda/annotation_table/tests/pipeline_test.py @@ -6,7 
+6,6 @@ import simplejson import openfda.annotation_table.pipeline -from openfda import spl from openfda.annotation_table.pipeline import * from openfda.spl.pipeline import * from openfda.tests.api_test_helpers import * diff --git a/openfda/deploy/tests/device510k/test_endpoint.py b/openfda/deploy/tests/device510k/test_endpoint.py index bb9d601..7f8199f 100644 --- a/openfda/deploy/tests/device510k/test_endpoint.py +++ b/openfda/deploy/tests/device510k/test_endpoint.py @@ -1,4 +1,3 @@ -import requests from openfda.tests.api_test_helpers import * diff --git a/openfda/deploy/tests/deviceclass/test_endpoint.py b/openfda/deploy/tests/deviceclass/test_endpoint.py index 0c5b3d8..f405955 100644 --- a/openfda/deploy/tests/deviceclass/test_endpoint.py +++ b/openfda/deploy/tests/deviceclass/test_endpoint.py @@ -1,4 +1,3 @@ -import requests from openfda.tests.api_test_helpers import * diff --git a/openfda/deploy/tests/devicepma/test_endpoint.py b/openfda/deploy/tests/devicepma/test_endpoint.py index c4c9542..ad92f44 100644 --- a/openfda/deploy/tests/devicepma/test_endpoint.py +++ b/openfda/deploy/tests/devicepma/test_endpoint.py @@ -1,4 +1,3 @@ -import requests from openfda.tests.api_test_helpers import * diff --git a/openfda/deploy/tests/devicerecall/test_endpoint.py b/openfda/deploy/tests/devicerecall/test_endpoint.py index 854aad0..f00fcb9 100644 --- a/openfda/deploy/tests/devicerecall/test_endpoint.py +++ b/openfda/deploy/tests/devicerecall/test_endpoint.py @@ -1,4 +1,3 @@ -import requests from openfda.tests.api_test_helpers import * diff --git a/openfda/deploy/tests/devicereglist/test_endpoint.py b/openfda/deploy/tests/devicereglist/test_endpoint.py index eb469a6..2af4c4d 100644 --- a/openfda/deploy/tests/devicereglist/test_endpoint.py +++ b/openfda/deploy/tests/devicereglist/test_endpoint.py @@ -1,4 +1,3 @@ -import requests from openfda.tests.api_test_helpers import * diff --git a/openfda/deploy/tests/drugevent/test_endpoint.py b/openfda/deploy/tests/drugevent/test_endpoint.py index e407c2a..3a4dd47 100644 --- a/openfda/deploy/tests/drugevent/test_endpoint.py +++ b/openfda/deploy/tests/drugevent/test_endpoint.py @@ -1,4 +1,3 @@ -import requests from openfda.tests.api_test_helpers import * diff --git a/openfda/deploy/tests/druglabel/test_endpoint.py b/openfda/deploy/tests/druglabel/test_endpoint.py index fa988d5..06c2fd7 100644 --- a/openfda/deploy/tests/druglabel/test_endpoint.py +++ b/openfda/deploy/tests/druglabel/test_endpoint.py @@ -1,4 +1,3 @@ -import requests from openfda.tests.api_test_helpers import * diff --git a/openfda/deploy/tests/enforcement/test_endpoint.py b/openfda/deploy/tests/enforcement/test_endpoint.py index ff23bf7..727ad5a 100644 --- a/openfda/deploy/tests/enforcement/test_endpoint.py +++ b/openfda/deploy/tests/enforcement/test_endpoint.py @@ -1,4 +1,3 @@ -import requests from openfda.tests.api_test_helpers import * diff --git a/openfda/deploy/tests/issues/test_endpoint.py b/openfda/deploy/tests/issues/test_endpoint.py index bad77b7..dfea201 100644 --- a/openfda/deploy/tests/issues/test_endpoint.py +++ b/openfda/deploy/tests/issues/test_endpoint.py @@ -1,5 +1,4 @@ from openfda.tests.api_test_helpers import * -import requests from openfda.tests.api_test_helpers import * diff --git a/openfda/device_harmonization/pipeline.py b/openfda/device_harmonization/pipeline.py index c4c5ec4..d560377 100644 --- a/openfda/device_harmonization/pipeline.py +++ b/openfda/device_harmonization/pipeline.py @@ -30,10 +30,8 @@ import logging import os from os.path import basename, dirname, join 
-import sys import luigi -import simplejson as json from openfda import common, config, parallel from openfda.parallel import mapreduce, Collection, Mapper, PivotReducer diff --git a/openfda/device_pma/transform.py b/openfda/device_pma/transform.py index 67df405..785d296 100644 --- a/openfda/device_pma/transform.py +++ b/openfda/device_pma/transform.py @@ -6,8 +6,6 @@ """ import arrow -import csv -import simplejson as json from openfda import common, device_common diff --git a/openfda/device_udi/tests/pipeline_test.py b/openfda/device_udi/tests/pipeline_test.py index ae6b490..521e24d 100644 --- a/openfda/device_udi/tests/pipeline_test.py +++ b/openfda/device_udi/tests/pipeline_test.py @@ -9,7 +9,6 @@ import luigi from mock import MagicMock from openfda.device_udi.pipeline import ( - SyncS3DeviceUDI, ExtractXML, XML2JSONMapper, UDIAnnotateMapper, diff --git a/openfda/download_util.py b/openfda/download_util.py index 22a3a57..718d960 100644 --- a/openfda/download_util.py +++ b/openfda/download_util.py @@ -6,8 +6,7 @@ import glob import logging -import os -from os.path import basename, dirname, join +from os.path import dirname from openfda import common diff --git a/openfda/drugsfda/pipeline.py b/openfda/drugsfda/pipeline.py index 21c5202..acb2cfa 100644 --- a/openfda/drugsfda/pipeline.py +++ b/openfda/drugsfda/pipeline.py @@ -671,7 +671,7 @@ def add_te(self, products, app_key): te_json = self.te_dict.get(key) if te_json.get("te_code"): product["te_code"] = te_json["te_code"].rstrip() - if not "marketing_status" in product and "marketing_status" in te_json: + if "marketing_status" not in product and "marketing_status" in te_json: product["marketing_status"] = te_json["marketing_status"].rstrip() return products diff --git a/openfda/faers/annotate.py b/openfda/faers/annotate.py index d7089ab..e50ef6f 100644 --- a/openfda/faers/annotate.py +++ b/openfda/faers/annotate.py @@ -3,7 +3,6 @@ import collections import re -import leveldb import simplejson as json from openfda import parallel diff --git a/openfda/faers/tests/annotate_test.py b/openfda/faers/tests/annotate_test.py index d3dd15a..9562ec8 100644 --- a/openfda/faers/tests/annotate_test.py +++ b/openfda/faers/tests/annotate_test.py @@ -1,6 +1,5 @@ #!/usr/bin/python -import os import simplejson as json import unittest diff --git a/openfda/ndc/annotate.py b/openfda/ndc/annotate.py index 0940a3b..bebe91c 100644 --- a/openfda/ndc/annotate.py +++ b/openfda/ndc/annotate.py @@ -2,8 +2,6 @@ import simplejson as json from openfda import parallel -from openfda import common -from openfda.spl.fix_date import validate_date def read_json_file(json_file): diff --git a/openfda/res/tests/extract_test.py b/openfda/res/tests/extract_test.py index dbcf0be..9ee7474 100644 --- a/openfda/res/tests/extract_test.py +++ b/openfda/res/tests/extract_test.py @@ -1,6 +1,5 @@ #!/usr/bin/python -import os import unittest import xmltodict diff --git a/openfda/spl/pipeline.py b/openfda/spl/pipeline.py index d47e233..6aa702b 100644 --- a/openfda/spl/pipeline.py +++ b/openfda/spl/pipeline.py @@ -24,8 +24,6 @@ from openfda.common import ProcessException from openfda.parallel import NullOutput from openfda.spl import annotate -from openfda.tasks import AlwaysRunTask -from bs4 import BeautifulSoup import urllib.request RUN_DIR = dirname(dirname(os.path.abspath(__file__))) diff --git a/openfda/spl/tests/extract_test.py b/openfda/spl/tests/extract_test.py index fa41804..3fbfe00 100644 --- a/openfda/spl/tests/extract_test.py +++ b/openfda/spl/tests/extract_test.py @@ -1,6 +1,5 @@ 
#!/usr/bin/env python -import os import unittest from openfda.spl import extract diff --git a/openfda/spl/tests/process_barcodes_test.py b/openfda/spl/tests/process_barcodes_test.py index b098333..7fe2b52 100644 --- a/openfda/spl/tests/process_barcodes_test.py +++ b/openfda/spl/tests/process_barcodes_test.py @@ -1,6 +1,8 @@ #!/usr/bin/env python import unittest -import shutil, tempfile, json +import shutil +import tempfile +import json from os import path from openfda.test_common import data_filename from openfda.spl.process_barcodes import XML2JSON diff --git a/openfda/tests/api_test.py b/openfda/tests/api_test.py index 6babf96..c79ebd5 100644 --- a/openfda/tests/api_test.py +++ b/openfda/tests/api_test.py @@ -12,7 +12,6 @@ import os import pprint -import requests from openfda.tests.api_test_helpers import *