From ebd0e59ff3454ea782fbac811380c96905f375d3 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Thu, 5 Oct 2023 09:00:18 -0400 Subject: [PATCH 01/63] Update _version.py (#86) --- dags/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/_version.py b/dags/_version.py index fe7b0e06..40bcdb68 100644 --- a/dags/_version.py +++ b/dags/_version.py @@ -1 +1 @@ -version = "0.10.3-dev" +version = "0.10.3" From b66eb64fcf42d803656cc2b392daa5fd4942f6fd Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Thu, 5 Oct 2023 19:47:36 -0400 Subject: [PATCH 02/63] Update _version.py --- dags/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/_version.py b/dags/_version.py index 40bcdb68..b8ac7e54 100644 --- a/dags/_version.py +++ b/dags/_version.py @@ -1 +1 @@ -version = "0.10.3" +version = "0.10.4-dev" From e7e5f7c89b1ba096b40600bca79dcc9b1ef7617e Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Tue, 31 Oct 2023 09:05:49 -0400 Subject: [PATCH 03/63] Rti merge (#84) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. 
* Update requirements.txt * Update .gitignore * Update dug_utils.py Handle Error when curie not found in validate * Update __init__.py * Update config.yaml * Update dev-config.yaml * Update docker-compose.yaml * fixed docker-compose * adding back postgres volume to docker compose * env correction , docker compose updates --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent --- .env | 8 +- .gitignore | 4 +- Makefile | 5 +- README.md | 2 +- bin/docker_backend/docker-compose.yaml | 4 +- bin/roger | 2 +- cli.py | 9 +- dags/dug_helpers/dug_utils.py | 31 ++++-- dags/roger/config/__init__.py | 4 + dags/roger/config/config.yaml | 4 +- dags/roger/{ => config}/dev-config.yaml | 4 +- dags/roger/core/bulkload.py | 24 +++-- dags/roger/core/redis_graph.py | 55 +++++------ dags/test_metadata.yaml | 124 ++++++++++++++++++++++++ docker-compose.yaml | 62 ++++++------ requirements.txt | 8 +- roger-cli-steps.md | 27 ++++++ tests/test_redis_query.cypher | 5 + 18 files changed, 283 insertions(+), 99 deletions(-) rename dags/roger/{ => config}/dev-config.yaml (98%) create mode 100644 dags/test_metadata.yaml create mode 100644 roger-cli-steps.md create mode 100644 tests/test_redis_query.cypher diff --git a/.env b/.env index 7bc31d98..9e2ba0e3 100644 --- a/.env +++ b/.env @@ -15,9 +15,9 @@ ELASTIC_USERNAME=elastic NBOOST_API_HOST=nboost -REDIS_PASSWORD=12345 -REDIS_HOST=redis +REDIS_PASSWORD=weak +REDIS_HOST=merge-redis-master REDIS_PORT=6379 - TRANQL_ACCESS_LOG=access.log -TRANQL_ERROR_LOG=error.log \ No newline at end of file +TRANQL_ERROR_LOG=error.log +ROGER_DUG__INPUTS_DATA__SETS=topmed:v1.0 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 065d4e8b..8ca8ea91 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ # Git ignore bioler plate from https://github.com/github/gitignore/blob/master/Python.gitignore - # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -108,6 +107,7 @@ celerybeat.pid *.sage.py # Environments +.secrets-env .venv env/ venv/ @@ -149,4 +149,4 @@ cython_debug/ dags/roger/data local_storage logs -tests/integration/data/bulk/ \ No newline at end of file +tests/integration/data/bulk/ diff --git a/Makefile b/Makefile index 644a6d01..ef227aa4 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,5 @@ -PYTHON = PYTHONPATH=dags /usr/bin/env python3 +PYTHON = $(shell which python3) +PYTHONPATH = dags VERSION_FILE = ./dags/_version.py VERSION = $(shell cut -d " " -f 3 ${VERSION_FILE}) DOCKER_REPO = docker.io @@ -17,10 +18,12 @@ help: mk_dirs: mkdir -p {logs,plugins} mkdir -p local_storage/elastic + mkdir -p local_storage/redis rm_dirs: rm -rf logs/* rm -rf local_storage/elastic/* + rm -rf local_storage/redis/* rm -rf ./dags/roger/data/* #install: Install application along with required packages to local environment diff --git a/README.md b/README.md index 5ee4a69b..92ed1652 100644 --- a/README.md +++ b/README.md @@ -533,7 +533,7 @@ Open localhost:8080 in a browser. 
Then run: ``` -python tranql_translator.py +python tranql_translate.py ``` The Airflow interface shows the workflow: ![image](https://user-images.githubusercontent.com/306971/97787955-b968f680-1b8b-11eb-86cc-4d93842eafd3.png) diff --git a/bin/docker_backend/docker-compose.yaml b/bin/docker_backend/docker-compose.yaml index 695363b0..d87c6ae6 100644 --- a/bin/docker_backend/docker-compose.yaml +++ b/bin/docker_backend/docker-compose.yaml @@ -22,9 +22,9 @@ services: - roger-network environment: - REDIS_PASSWORD=$ROGERENV_REDISGRAPH_PASSWORD - entrypoint: /usr/local/bin/gunicorn --workers=2 --bind=0.0.0.0:8081 --name=tranql --timeout=600 tranql.api:app + entrypoint: /usr/local/bin/gunicorn --workers=2 --bind=0.0.0.0:8001 --name=tranql --timeout=600 tranql.api:app ports: - - 8081:8081 + - 8001:8001 volumes: - ./tranql-schema.yaml:/tranql/tranql/conf/schema.yaml ################################################################################# diff --git a/bin/roger b/bin/roger index 55fd4697..4626df88 100755 --- a/bin/roger +++ b/bin/roger @@ -10,7 +10,7 @@ export PYTHONPATH=$ROGER_HOME:$ROGER_HOME/../kgx export DB_NAME=test roger () { - python $ROGER_HOME/roger/core.py $* + python $ROGER_HOME/dags/roger/core.py $* } kgx () { diff --git a/cli.py b/cli.py index 7b6b609f..be77525a 100644 --- a/cli.py +++ b/cli.py @@ -4,12 +4,15 @@ from dug_helpers.dug_utils import DugUtil, get_topmed_files, get_dbgap_files, get_sparc_files, get_anvil_files, get_nida_files import sys import argparse +import os +import time log = get_logger() if __name__ == "__main__": - + start = time.time() + log.info(f"Start TIME:{start}") parser = argparse.ArgumentParser(description='Roger common cli tool.') """ Common CLI. """ parser.add_argument('-d', '--data-root', help="Root of data hierarchy", default=None) @@ -102,4 +105,8 @@ if args.validate_concepts: DugUtil.validate_indexed_concepts(config=config) + end = time.time() + time_elapsed = end - start + log.info(f"Completion TIME:{time_elapsed}") + sys.exit (0) diff --git a/dags/dug_helpers/dug_utils.py b/dags/dug_helpers/dug_utils.py index 85b40d8f..e55fdb34 100644 --- a/dags/dug_helpers/dug_utils.py +++ b/dags/dug_helpers/dug_utils.py @@ -381,10 +381,17 @@ def _search_elements(self, curie, search_term): query=search_term )) ids_dict = [] - for element_type in response: - all_elements_ids = [e['id'] for e in - reduce(lambda x, y: x + y['elements'], response[element_type], [])] - ids_dict += all_elements_ids + if 'total_items' in response: + if response['total_items'] == 0: + log.error(f"No search elements returned for variable search: {self.variables_index}.") + log.error(f"Concept id : {curie}, Search term: {search_term}") + raise Exception(f"Validation error - Did not find {curie} for" + f"Search term: {search_term}") + else: + for element_type in response: + all_elements_ids = [e['id'] for e in + reduce(lambda x, y: x + y['elements'], response[element_type], [])] + ids_dict += all_elements_ids return ids_dict def crawl_concepts(self, concepts, data_set_name): @@ -511,18 +518,24 @@ def validate_indexed_concepts(self, elements, concepts): searched_element_ids = self._search_elements(curie, search_term) - present = bool(len([x for x in sample_elements[curie] if x in searched_element_ids])) - if not present: - log.error(f"Did not find expected variable {element.id} in search result.") + if curie not in sample_elements: + log.error(f"Did not find Curie id {curie} in Elements.") log.error(f"Concept id : {concept.id}, Search term: {search_term}") raise Exception(f"Validation 
error - Did not find {element.id} for" f" Concept id : {concept.id}, Search term: {search_term}") + else: + present = bool(len([x for x in sample_elements[curie] if x in searched_element_ids])) + if not present: + log.error(f"Did not find expected variable {element.id} in search result.") + log.error(f"Concept id : {concept.id}, Search term: {search_term}") + raise Exception(f"Validation error - Did not find {element.id} for" + f" Concept id : {concept.id}, Search term: {search_term}") def clear_index(self, index_id): - exists = self.search_obj.es.indices.exists(index_id) + exists = self.search_obj.es.indices.exists(index=index_id) if exists: log.info(f"Deleting index {index_id}") - response = self.event_loop.run_until_complete(self.search_obj.es.indices.delete(index_id)) + response = self.event_loop.run_until_complete(self.search_obj.es.indices.delete(index=index_id)) log.info(f"Cleared Elastic : {response}") log.info("Re-initializing the indicies") self.index_obj.init_indices() diff --git a/dags/roger/config/__init__.py b/dags/roger/config/__init__.py index 6a6d757b..403b25b1 100644 --- a/dags/roger/config/__init__.py +++ b/dags/roger/config/__init__.py @@ -144,6 +144,8 @@ class ElasticsearchConfig(DictLike): username: str = "elastic" password: str = "" nboost_host: str = "" + scheme: str = "http" + ca_path: str = "" @@ -174,6 +176,8 @@ def to_dug_conf(self) -> DugConfig: elastic_host=self.elasticsearch.host, elastic_password=self.elasticsearch.password, elastic_username=self.elasticsearch.username, + elastic_scheme=self.elasticsearch.scheme, + elastic_ca_path=self.elasticsearch.ca_path, redis_host=self.redisgraph.host, redis_password=self.redisgraph.password, redis_port=self.redisgraph.port, diff --git a/dags/roger/config/config.yaml b/dags/roger/config/config.yaml index 92503030..ae370235 100644 --- a/dags/roger/config/config.yaml +++ b/dags/roger/config/config.yaml @@ -92,6 +92,8 @@ elasticsearch: username: elastic password: "" nboost_host: "" + scheme: "http" + ca_path: "" validation: queries: @@ -154,4 +156,4 @@ lakefs_config: enabled: false access_key_id: "" secret_access_key: "" - host: "" \ No newline at end of file + host: "" diff --git a/dags/roger/dev-config.yaml b/dags/roger/config/dev-config.yaml similarity index 98% rename from dags/roger/dev-config.yaml rename to dags/roger/config/dev-config.yaml index 67fd6d5e..91fc0af3 100644 --- a/dags/roger/dev-config.yaml +++ b/dags/roger/config/dev-config.yaml @@ -13,7 +13,7 @@ data_root: "/Users/schreepc/Projects/helxplatform/roger/roger/test/data" dug_data_root: dug_helpers/dug_data/topmed_data base_data_uri: https://stars.renci.org/var/kgx_data/trapi-1.0/ kgx: - biolink_model_version: 1.5.0 + biolink_model_version: test #https://github.com/RedisGraph/redisgraph-bulk-loader/blob/master/redisgraph_bulk_loader/bulk_insert.py#L43 bulk_loader: @@ -115,4 +115,4 @@ validation: - var: "[N-]=[N+]=[N-]" - var: "[Ag+]" - var: "[Zn+2]" - - var: "[C-]#[O+]" \ No newline at end of file + - var: "[C-]#[O+]" diff --git a/dags/roger/core/bulkload.py b/dags/roger/core/bulkload.py index 772b7cf9..1eca92db 100644 --- a/dags/roger/core/bulkload.py +++ b/dags/roger/core/bulkload.py @@ -341,20 +341,26 @@ def insert (self): args = [] if len(nodes) > 0: bulk_path_root = storage.bulk_path('nodes') + os.path.sep - nodes_with_type = [ - f"{ x.replace(bulk_path_root, '').split('.')[0].replace('~', ':')} {x}" - for x in nodes] + nodes_with_type = [] + for x in nodes: + """ + These lines prep nodes bulk load by: + 1) appending to labels 'biolink.' 
+ 2) combine labels to create a multilabel redis node i.e. "biolink.OrganismalEntity:biolink.SubjectOfInvestigation" + """ + file_name_type_part = x.replace(bulk_path_root, '').split('.')[0].split('~')[1] + all_labels = "biolink." + file_name_type_part + ":" + ":".join([f'biolink.{v.lstrip("biolink:")}' for v in self.biolink.toolkit.get_ancestors("biolink:" + file_name_type_part, reflexive=False, formatted=True )] ) + nodes_with_type.append(f"{all_labels} {x}") args.extend(("-N " + " -N ".join(nodes_with_type)).split()) if len(edges) > 0: bulk_path_root = storage.bulk_path('edges') + os.path.sep - edges_with_type = [ - f"{x.replace(bulk_path_root, '').strip(os.path.sep).split('.')[0].replace('~', ':')} {x}" - for x in edges] + edges_with_type = [f"biolink.{x.replace(bulk_path_root, '').strip(os.path.sep).split('.')[0].split('~')[1]} {x}" + for x in edges] + # Edge label now no longer has 'biolink:' args.extend(("-R " + " -R ".join(edges_with_type)).split()) args.extend([f"--separator={self.separator}"]) - args.extend([f"--host={redisgraph['host']}"]) - args.extend([f"--port={redisgraph['port']}"]) - args.extend([f"--password={redisgraph['password']}"]) + log.debug(f"--redis-url=redis://:{redisgraph['password']}@{redisgraph['host']}:{redisgraph['port']}") + args.extend([f"--redis-url=redis://:{redisgraph['password']}@{redisgraph['host']}:{redisgraph['port']}"]) args.extend(['--enforce-schema']) args.extend([f"{redisgraph['graph']}"]) """ standalone_mode=False tells click not to sys.exit() """ diff --git a/dags/roger/core/redis_graph.py b/dags/roger/core/redis_graph.py index 6710d3fc..ca65ddce 100644 --- a/dags/roger/core/redis_graph.py +++ b/dags/roger/core/redis_graph.py @@ -1,51 +1,44 @@ -"Graph abstraction layer over redisgraph python module" - import copy import redis -from redisgraph import Node, Edge, Graph +# from redisgraph import Node, Edge, Graph +# https://redis-py.readthedocs.io/en/v4.5.1/redismodules.html#redisgraph-commands +from redis.commands.graph.node import Node +from redis.commands.graph.edge import Edge + from roger.logger import get_logger logger = get_logger () - class RedisGraph: - """ Graph abstraction over RedisGraph - - A thin wrapper but provides us some options. - """ - - def __init__(self, host='localhost', port=6379, graph='default', - password=''): + """ Graph abstraction over RedisGraph. A thin wrapper but provides us some options. """ + + def __init__(self, host='localhost', port=6379, graph='default', password=''): """ Construct a connection to Redis Graph. """ self.r = redis.Redis(host=host, port=port, password=password) - self.redis_graph = Graph(graph, self.r) + self.redis_graph = self.r.graph(graph) def add_node (self, identifier=None, label=None, properties=None): """ Add a node with the given label and properties. """ - logger.debug ( - f"--adding node id:{identifier} label:{label} prop:{properties}") + logger.debug (f"--adding node id:{identifier} label:{label} prop:{properties}") if identifier and properties: properties['id'] = identifier - node = Node(node_id=identifier, alias=identifier, - label=label, properties=properties) + node = Node(node_id=identifier, alias=identifier, label=label, properties=properties) self.redis_graph.add_node(node) return node def get_edge (self, start, end, predicate=None): - "Get an edge from the graph with the specified start and end ids" + """ Get an edge from the graph with the specified start and end identifiers. 
""" result = None for edge in self.redis_graph.edges: if edge.src_node.id == start and edge.dest_node.id == end: result = edge break return result - + def add_edge (self, start, predicate, end, properties={}): - "Add edge with given predicate and properties btw start and end nodes" - logger.debug ( - f"--adding edge start:{start} pred:{predicate} " - f"end:{end} prop:{properties}") + """ Add an edge with the given predicate and properties between start and end nodes. """ + logger.debug (f"--adding edge start:{start} pred:{predicate} end:{end} prop:{properties}") if isinstance(start, str) and isinstance(end, str): start = Node(node_id = start, label='thing') end = Node(node_id = end, label='thing') @@ -56,13 +49,11 @@ def add_edge (self, start, predicate, end, properties={}): return edge def has_node (self, identifier): - "Does the graph have a node with this ID" return identifier in self.redis_graph.nodes def get_node (self, identifier, properties=None): - "Retrieve the node with the given id" return self.redis_graph.nodes[identifier] - + def commit (self): """ Commit modifications to the graph. """ self.redis_graph.commit() @@ -70,13 +61,13 @@ def commit (self): def query (self, query): """ Query and return result set. """ result = self.redis_graph.query(query) - result.pretty_print() + print(result) return result - + def delete (self): """ Delete the named graph. """ self.redis_graph.delete() - + def test (): rg = RedisGraph () p = { 'a' : 4, @@ -91,10 +82,10 @@ def test (): if last is not None: rg.add_edge (node, 'link', last) last = node - rg.commit () - rg.query ("MATCH (obj:yeah)-[:link]->(j:yeah) RETURN obj.a, obj.b, obj.x") - rg.query ("MATCH (a) RETURN a") + rg.commit () + rg.query ("""MATCH (obj:yeah)-[:link]->(j:yeah) RETURN obj.a, obj.b, obj.x""") + rg.query ("""MATCH (a) RETURN a""") rg.delete () # rg.query ("""MATCH (a { id : 'chemical_substance' }) RETURN a""") -#test () +#test () \ No newline at end of file diff --git a/dags/test_metadata.yaml b/dags/test_metadata.yaml new file mode 100644 index 00000000..54d508c4 --- /dev/null +++ b/dags/test_metadata.yaml @@ -0,0 +1,124 @@ +# This is a file that lists the data to be used for testing purposes +# It contains a reduced set of the metadata.yaml file +kgx: + versions: + - files: + - biolink-v1.0.json + - ctd-v1.0.json + - gtopdb-v1.0.json + - hetio-v1.0.json + - hgnc-v1.0.json + - hmdb-v1.0.json + - kegg-v1.0.json + - mychem-v1.0.json + - ontological-hierarchy-v1.0.json + - panther-v1.0.json + - foodb-v1.0.json + - pharos-v1.0.json + - intact-v1.0.json + - human-goa-v1.0.json + - uberongraph-v1.0.json + - viral-proteome-v1.0.json + version: v1.0 + name: baseline-graph + format: json + - files: + - biolink-v2.0.json + - ctd-v2.0.json + - gtopdb-v2.0.json + - hetio-v2.0.json + - hgnc-v2.0.json + - hmdb-v2.0.json + - kegg-v2.0.json + - mychem-v2.0.json + - ontological-hierarchy-v2.0.json + - panther-v2.0.json + - foodb-v2.0.json + - pharos-v2.0.json + - intact-v2.0.json + - human-goa-v2.0.json + - uberongraph-v2.0.json + - viral-proteome-v2.0.json + version: v2.0 + name: baseline-graph + format: json + - files: + - heal/sparc/curation-export-processed.json + version: v2.0 + name: sparc-kgx + format: json + - files: + - Biolink_edges_v3.0.jsonl + - Biolink_nodes_v3.0.jsonl + - CTD_edges_v3.0.jsonl + - CTD_nodes_v3.0.jsonl + - DrugCentral_edges_v3.0.jsonl + - DrugCentral_nodes_v3.0.jsonl + - GtoPdb_edges_v3.0.jsonl + - GtoPdb_nodes_v3.0.jsonl + - Hetio_edges_v3.0.jsonl + - Hetio_nodes_v3.0.jsonl + - HGNC_edges_v3.0.jsonl + - 
HGNC_nodes_v3.0.jsonl + - HMDB_edges_v3.0.jsonl + - HMDB_nodes_v3.0.jsonl + - HumanGOA_edges_v3.0.jsonl + - HumanGOA_nodes_v3.0.jsonl + - IntAct_edges_v3.0.jsonl + - IntAct_nodes_v3.0.jsonl + - OntologicalHierarchy_edges_v3.0.jsonl + - OntologicalHierarchy_nodes_v3.0.jsonl + - PANTHER_edges_v3.0.jsonl + - PANTHER_nodes_v3.0.jsonl + - PHAROS_edges_v3.0.jsonl + - PHAROS_nodes_v3.0.jsonl + - UberGraph_edges_v3.0.jsonl + - UberGraph_nodes_v3.0.jsonl + version: v3.0 + name: baseline-graph + format: jsonl + - version: test + files: + - hgnc_nodes.jsonl + - hgnc_edges.jsonl + name: test + - version: v3.0 + name: cde-graph + format: jsonl + files: + - cde/annotated_edges_v3.0.jsonl + - cde/annotated_nodes_v3.0.jsonl +dug_inputs: + versions: + - name: bdc + version: v1.0 + files: + s3: + - "bdc/v1.0/bdc_dbgap_data_dicts.tar.gz" + stars: + - "bdc_dbgap_data_dicts.tar.gz" + format: dbGaP + - name: nida + version: v1.0 + files: + s3: + - "nida/v1.0/nida-12studies.tar.gz" + stars: + - "nida-12studies.tar.gz" + format: nida + - name: sparc + version: v1.0 + files: + s3: + - "sparc/v1.0/sparc-dbgap-xml-formatted.tar.gz" + stars: + - "sparc-dbgap-xml-formatted.tar.gz" + format: sparc + - name: anvil + version: v1.0 + files: + s3: + - "bdc/v1.0/anvil_dbgap_data_dicts.tar.gz" + stars: + - "anvil_dbgap_data_dicts.tar.gz" + format: anvil \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml index 15e4b88f..7c698ed6 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -52,18 +52,20 @@ x-airflow-common: AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' AIRFLOW__CORE__LOAD_EXAMPLES: 'false' - ROGER_DUG__INPUTS_DATA__SETS: topmed,bdc-dbGaP + ROGER_DUG__INPUTS_DATA__SETS: "$ROGER_DUG__INPUTS_DATA__SETS" ROGER_ELASTICSEARCH_HOST: "$ELASTIC_API_HOST" ROGER_ELASTICSEARCH_PASSWORD: "$ELASTIC_PASSWORD" ROGER_ELASTICSEARCH_NBOOST__HOST: "$NBOOST_API_HOST" ROGER_REDISGRAPH_HOST: "$REDIS_HOST" ROGER_REDISGRAPH_PASSWORD: "$REDIS_PASSWORD" ROGER_KGX_DATASET__VERSION: "v3.0" + ROGER_DATA_DIR: "/opt/airflow/share/data" volumes: - ./dags:/opt/airflow/dags - ./logs:/opt/airflow/logs - ./plugins:/opt/airflow/plugins - user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}" + - ./data:/opt/airflow/share/data + user: root depends_on: redis: condition: service_healthy @@ -79,24 +81,14 @@ services: POSTGRES_DB: airflow volumes: - postgres-db-volume:/var/lib/postgresql/data + - ${DATA_DIR}/elastic:/elastic + - ${DATA_DIR}/redis:/redis healthcheck: test: ["CMD", "pg_isready", "-U", "airflow"] interval: 5s retries: 5 restart: always - redis: - image: redislabs/redisgraph:2.4.1 - command: "redis-server --requirepass $REDIS_PASSWORD --loadmodule /usr/lib/redis/modules/redisgraph.so" - ports: - - $REDIS_PORT:$REDIS_PORT - healthcheck: - test: ["CMD", "redis-cli", "ping"] - interval: 5s - timeout: 30s - retries: 50 - restart: always - airflow-webserver: <<: *airflow-common command: webserver @@ -141,17 +133,33 @@ services: retries: 5 restart: always + redis: + # image: redislabs/redisgraph:2.10.9 #Alternative Image + user: root + image: 'redis/redis-stack:6.2.4-v2' + command: "redis-server --requirepass $REDIS_PASSWORD --loadmodule /opt/redis-stack/lib/redisgraph.so" + environment: + - REDIS_ARGS=--requirepass $REDIS_PASSWORD + volumes: + - $DATA_DIR/redis:/data # FIX RDB Error on local + ports: + - $REDIS_PORT:$REDIS_PORT + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 30s + retries: 50 + restart: always + dug: - image: cschreep/dug:develop + image: 
containers.renci.org/helxplatform/dug:latest depends_on: - elasticsearch - redis - - nboost restart: always environment: ELASTIC_API_HOST: "$ELASTIC_API_HOST" ELASTIC_PASSWORD: "$ELASTIC_PASSWORD" - NBOOST_API_HOST: "$NBOOST_API_HOST" REDIS_HOST: "$REDIS_HOST" REDIS_PASSWORD: "$REDIS_PASSWORD" FLASK_ENV: "development" @@ -159,35 +167,32 @@ services: entrypoint: [ "gunicorn", "--workers=$API_WORKERS", "--name=dug", "--bind=0.0.0.0:$API_PORT", "--timeout=$API_TIMEOUT", - "--log-level=DEBUG", "--enable-stdio-inheritance", "--reload", "dug.api:app" ] + "--log-level=DEBUG", "-k", "uvicorn.workers.UvicornWorker", "--reload", "dug.server:APP"] ports: - $API_PORT:$API_PORT elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:7.6.1 + user: root + image: docker.elastic.co/elasticsearch/elasticsearch:8.5.2 environment: - ELASTIC_PASSWORD=$ELASTIC_PASSWORD - discovery.type=single-node - xpack.security.enabled=true + - ingest.geoip.downloader.enabled=false volumes: - $DATA_DIR/elastic:/usr/share/elasticsearch/data ports: - '9200:9200' - '9300:9300' - nboost: - image: koursaros/nboost:0.3.9-pt - ports: - - '8000:8000' - tranql: - image: helxplatform/tranql-app:develop-0.0.56 + image: containers.renci.org/helxplatform/tranql:rti-merge ports: - - '8081:8081' + - '8001:8001' entrypoint: [ "gunicorn", "--workers=4", - "--bind=0.0.0.0:8081", + "--bind=0.0.0.0:8001", "--timeout=300", "--access-logfile=$TRANQL_ACCESS_LOG", "--error-logfile=$TRANQL_ERROR_LOG", @@ -198,6 +203,5 @@ services: - REDIS_PASSWORD=$REDIS_PASSWORD volumes: - ./tranql-schema.yaml:/tranql/tranql/conf/schema.yaml - volumes: - postgres-db-volume: + postgres-db-volume: \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index c964fce7..54918df6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,12 @@ -apache-airflow==2.5.0 boto3==1.18.23 botocore==1.21.23 #black==21.10b0 +elasticsearch==8.5.2 flatten-dict -redisgraph==2.4.1 -redisgraph-bulk-loader==0.9.5 +redisgraph-bulk-loader==0.12.3 pytest PyYAML -git+https://github.com/helxplatform/dug@2.11.2 -elasticsearch==7.11.0 +git+https://github.com/helxplatform/dug@2.12.0 orjson kg-utils==0.0.6 bmt==1.1.0 diff --git a/roger-cli-steps.md b/roger-cli-steps.md new file mode 100644 index 00000000..8e132746 --- /dev/null +++ b/roger-cli-steps.md @@ -0,0 +1,27 @@ +# Deployment with Roger CLI + +## QUICK Local Set Up + +This is list steps to produce a local deployment of Roger. This set up does NOT use airflow and instead only uses the Roger CLI via **Makefile** commands. + +### Prerequsite Steps + +- Set up Roger dependencies by ensuring that the `.env` has all the correct information. 
+- Run the following docker compose commands + - `docker compose up tranql -d`: starts up tranql which is the API handlerfor redis graph in the `graph` stage + - `docker compose up redis -d`: starts up redis which will be used via redis graph for the `graph` stage + - `docker compose up dug -d`: starts up dug API to work as the API handler for elastic search in the `index` stage + - `docker compose up elasticsearch -d`: starts up elastic search for the `index` stage + +### Roger CLI Steps + +1) `python3 -m venv ~/.environments/roger` +2) `source ~/.environments/roger/bin/activate` +3) `pip install -r requirements.txt` +4) `export PYTHONPATH=$PWD/dags` +5) Change the elasticsearch and redisgraph `host` values to localhost in `dags/roger/config/config.yaml` +6) Get the S3 Bucket credentials (access_key, bucket, host, secret_key) and export them as environment variables with ROGER_S3_ in the front of the value like: `ROGER_S3_ACCESS__KEY=XXXXKEYXXXX` +7) `cd bin/` and here either run `make all` OR separate the commands into three steps: + 1) `make annotate`: executes the CLI related commands found in `bin/dug_annotate/Makefile` + 2) `make graph`: executes the CLI related commands found in `bin/roger_graph_build/Makefile` + 3) `make index`: executes the CLI related commands found in `bin/dug_index/Makefile` diff --git a/tests/test_redis_query.cypher b/tests/test_redis_query.cypher new file mode 100644 index 00000000..509df1fa --- /dev/null +++ b/tests/test_redis_query.cypher @@ -0,0 +1,5 @@ +MATCH (c{id:'HP:0032316'}) return c + +MATCH (disease:`Disease` {`id`: 'MONDO:0004979'}) WITH disease MATCH (disease)-[e1_disease_phenotypic_feature]-(phenotypic_feature:`PhenotypicFeature` {}) +WITH disease AS disease, phenotypic_feature AS phenotypic_feature, collect(e1_disease_phenotypic_feature) AS e1_disease_phenotypic_feature +RETURN disease,phenotypic_feature,e1_disease_phenotypic_feature,labels(disease) AS type__disease,labels(phenotypic_feature) AS type__phenotypic_feature,[edge in e1_disease_phenotypic_feature | type(edge)] AS type__e1_disease_phenotypic_feature,[edge in e1_disease_phenotypic_feature | [startNode(edge).id, endNode(edge).id]] AS id_pairs__e1_disease_phenotypic_feature \ No newline at end of file From fee790d4299f05b5d4ab0a9a50c257f4b0f9cd1f Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 1 Nov 2023 12:29:22 -0400 Subject: [PATCH 04/63] adding v5.0 --- dags/metadata.yaml | 12 ++++++++++++ dags/roger/config/config.yaml | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/dags/metadata.yaml b/dags/metadata.yaml index 47fedcb5..0cedb6a0 100644 --- a/dags/metadata.yaml +++ b/dags/metadata.yaml @@ -97,6 +97,18 @@ kgx: files: - cde/annotated_edges_v4.0.jsonl - cde/annotated_nodes_v4.0.jsonl + - version: v5.0 + name: baseline-graph + format: jsonl + files: + - baseline-5.0/edges_v5.0.jsonl + - baseline-5.0/nodes_v5.0.jsonl + - version: v5.0 + name: cde-graph + format: jsonl + files: + - cde/annotated_edges_v5.0.jsonl + - cde/annotated_nodes_v5.0.jsonl dug_inputs: versions: - name: bdc diff --git a/dags/roger/config/config.yaml b/dags/roger/config/config.yaml index ae370235..30eddcd1 100644 --- a/dags/roger/config/config.yaml +++ b/dags/roger/config/config.yaml @@ -16,7 +16,7 @@ annotation_base_data_uri: https://stars.renci.org/var/dug/ kgx: biolink_model_version: v3.1.2 - dataset_version: v4.0 + dataset_version: v5.0 merge_db_id: 1 merge_db_temp_dir: workspace data_sets: From e15a37305d40adb0c428f70258f8b2dcd6d80b83 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Mon, 6 
Nov 2023 15:37:43 -0500 Subject: [PATCH 05/63] cde-links branch --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 54918df6..504042af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ flatten-dict redisgraph-bulk-loader==0.12.3 pytest PyYAML -git+https://github.com/helxplatform/dug@2.12.0 +git+https://github.com/helxplatform/dug@cde-links-2 orjson kg-utils==0.0.6 bmt==1.1.0 From 6aac2fd81e40dadfdd3c4ef5493ebbf177ca2969 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Mon, 6 Nov 2023 16:53:19 -0500 Subject: [PATCH 06/63] pin linkml --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 504042af..17a59d90 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,4 @@ git+https://github.com/helxplatform/dug@cde-links-2 orjson kg-utils==0.0.6 bmt==1.1.0 +linkml-runtime==1.6.0 \ No newline at end of file From 487b2c1ab24a5323dcc33e7528a724431e9dc69e Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Tue, 7 Nov 2023 13:15:16 -0500 Subject: [PATCH 07/63] Update config.yaml collection_action to action --- dags/roger/config/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/roger/config/config.yaml b/dags/roger/config/config.yaml index 30eddcd1..49bc927b 100644 --- a/dags/roger/config/config.yaml +++ b/dags/roger/config/config.yaml @@ -85,7 +85,7 @@ indexing: desc: "summary" collection_name: "cde_category" collection_id: "cde_category" - collection_action: "files" + action: "files" elasticsearch: host: elasticsearch From bb61ab9596e3af267cc1ffd190ef9bef60a0ef43 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 7 Nov 2023 19:55:44 -0500 Subject: [PATCH 08/63] pop total items before result --- dags/dug_helpers/dug_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dags/dug_helpers/dug_utils.py b/dags/dug_helpers/dug_utils.py index e55fdb34..f90c591c 100644 --- a/dags/dug_helpers/dug_utils.py +++ b/dags/dug_helpers/dug_utils.py @@ -388,6 +388,7 @@ def _search_elements(self, curie, search_term): raise Exception(f"Validation error - Did not find {curie} for" f"Search term: {search_term}") else: + del response['total_items'] for element_type in response: all_elements_ids = [e['id'] for e in reduce(lambda x, y: x + y['elements'], response[element_type], [])] From 48650b82f4343e5cd04938aee7fc9b8748b7acb1 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 8 Nov 2023 15:31:22 -0500 Subject: [PATCH 09/63] print extracted elements --- dags/dug_helpers/dug_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dags/dug_helpers/dug_utils.py b/dags/dug_helpers/dug_utils.py index f90c591c..4d417e04 100644 --- a/dags/dug_helpers/dug_utils.py +++ b/dags/dug_helpers/dug_utils.py @@ -435,12 +435,15 @@ def crawl_concepts(self, concepts, data_set_name): casting_config = query['casting_config'] tranql_source = query['tranql_source'] dug_element_type = query['output_dug_type'] - extracted_dug_elements += crawler.expand_to_dug_element( + new_elements = crawler.expand_to_dug_element( concept=concept, casting_config=casting_config, dug_element_type=dug_element_type, tranql_source=tranql_source ) + log.debug("extracted:") + log.debug(str(list([el.get_searchable_dict() for el in new_elements]))) + extracted_dug_elements += new_elements concept.clean() percent_complete = int((counter / total) * 100) if percent_complete % 10 == 0: From 
3cfe3f5f7545a6915c20749db7e641153c6d0ce4 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Thu, 21 Dec 2023 15:40:41 -0500 Subject: [PATCH 10/63] Update requirements.txt --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 17a59d90..f2f2d16b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,8 +6,8 @@ flatten-dict redisgraph-bulk-loader==0.12.3 pytest PyYAML -git+https://github.com/helxplatform/dug@cde-links-2 +git+https://github.com/helxplatform/dug@patch-nrslv-resp orjson kg-utils==0.0.6 bmt==1.1.0 -linkml-runtime==1.6.0 \ No newline at end of file +linkml-runtime==1.6.0 From 21889ed9c0bbdb8db13da74b13ce5e95dea3c2e7 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Thu, 4 Jan 2024 13:30:21 -0500 Subject: [PATCH 11/63] Keep edge provenance (#94) * Update kgx.py * Update kgx.py * Update kgx.py can't delete edge keys while looping over them. * just collect then update * Update requirements.txt (#93) --- dags/roger/models/kgx.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dags/roger/models/kgx.py b/dags/roger/models/kgx.py index 62a57a61..a8976a00 100644 --- a/dags/roger/models/kgx.py +++ b/dags/roger/models/kgx.py @@ -478,6 +478,13 @@ def merge(self): edges['subject'] + edges['predicate'] + edges['object'] + edges.get("biolink:primary_knowledge_source", "")) + keys_to_del = set() + for key in edges: + if key.startswith('biolink:'): + keys_to_del.add(key) + for k in keys_to_del: + edges[k.replace('biolink:', '')] = edges[k] + del edges[k] stream.write(json.dumps(edges).decode('utf-8') + '\n') write_merge_metric['edges_writing_time'] = time.time() - start_edge_jsonl From 2eb136ad917472bd7e4711a68ce9e90fa64da63f Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Thu, 18 Apr 2024 13:04:27 -0400 Subject: [PATCH 12/63] Pipeline parameterize restructure (#95) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' 
* adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * Parameterized annotate tasks with input_data_path and output_data_path * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. * Parameterized annotate tasks with input_data_path and output_data_path (#85) * adding lakefs changes to roger-2.0 * point avalon to vg1 branch * change avalon dep * update airflow * fix avalon tag typo * update jenkins to tag version on main branch only * update jenkins to tag version * update jenkins to tag version * psycopg2 installation * add cncf k8s req * use airflow non-slim * simplified for testing * simplified for testing * change dag name * Erroneous parameter passed, should not be None * adding pre-exec * adding pre-exec * adding pre-exec * typo preexec * typo preexec * fix context * get files from repo * get files from repo * get files from repo * get files from repo * First shot at moving pipeline into base class and implementing. Anvil pipeline not complete * Syntax fix, docker image version bump to airflow 2.7.2-python3.11 * update storage dir * update remove dir code * update remove dir code * remote path to * * fix input dir for annotators * fix input dir for annotators * fix input dir for annotators * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * adding branch info on lakefs config * callback push to branch * back to relative import * reformat temp branch name based on unique task id * add logging * add logging * convert posix path to str for avalon * add extra / to root path * New dag created using DugPipeline subclasses * EmptyOperator imported from wrong place * import and syntax fixes * utterly silly syntax error * Added anvil to default input data sets for testing purposes * adding / to local path * commit meta task args empty string * add merge logic * add merge logic * upstream task dir pull for downstream task * Switched from subdag to taskgroup because latest Airflow depricated subdag * Added BACPAC pipeline object * Temporarily ignoring configuration variable for enabled datasets for testing * Passed dag in to create task group to see if it helps dag errors * Fixed silly syntax error * adding input / output dir params for make kgx * Trying different syntax to make taskgroups work. * adding input / output dir params for make kgx * Parsing, syntax, pylint fixes * adding input / output dir params for make kgx * Added pipeline name to task group name to ensure uniqueness * oops, moved something out of scope. Fixed * Filled out pipeline with methods from dug_utils. 
Needs data path changes * Finished implementing input_data_path and output_data_path handling, pylint cleanup * Update requirements.txt * adding toggle to avoid sending config obj * adding toggle to avoid sending config obj * disable to string for test * control pipelines for testing * add self to anvil get files * add log stream to make it available * typo fix * correcting branch id * adding source repo * adding source repo * patch name-resolver response * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * dug pipeline edit * recurisvely find recursively * recurisvely find recursively * setup output path for crawling * all task functions should have input and output params * adding annotation as upstream for validate index * revamp create task , and task wrapper * add validate concepts index task * adding concept validation * add index_variables task as dependecy for validate concepts * add index_variables task as dependecy for validate concepts * await client exist * await client exist * concepts not getting picked up for indexing * concepts not getting picked up for indexing * fix search elements * converting annotation output to json * json format annotation outputs * adding support for json format elements and concepts read * json back to dug objects * fixing index valriables with json objects * indetation and new line for better change detection :? * indetation and new line for better change detection * treat dictionary concepts as dictionary * read concepts json as a dict * concepts files are actually file paths * debug message * make output jsonable * clear up dir after commit , and delete unmerged branch even if no changes * don`t clear indexes, parallel dataset processing will be taxed * memory leak? * memory leak? * memory leak? 
* dumping pickles to debug locally * find out why concepts are being added to every other element * find out why concepts are being added to every other element * pointless shuffle 🤷‍♂️ * revert back in time * back to sanitize dug * output just json for annotation * adding jsonpickle * jsonpickle 🥒 * unpickle for index * unpickle for validate index * crawling fixes * crawling fixes * crawling validation fixes * fix index concepts * fix makekgx * adding other bdc pipelines * adding pipeline paramters to be able to configure per instance * fix * add input dataset for pipelines * Adding README to document how to create data set-specific pipelines * catchup on base.py * Added dbgap and nida pipelines * fix import errors * annotator modules added by passing config val (#90) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 --------- Co-authored-by: YaphetKG * Add heal parsers (#96) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix 
globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Add heal parsers (#97) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Radx pipeline (#99) * point to large download * fix schema path * debug bulk input dir * fix schema read * fix schema read * fix schema read * commenting steup dir for test * adding logs * fix path stuff * add commented stuff back in * testing radx parser * adding parser * skip indexing vars with no id * adding indexes as part of bulk loader paramters * fix id index cli arg * fix local cli * dug latest --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Michael T. 
Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> --- .env | 6 +- .gitignore | 2 + Dockerfile | 10 +- Jenkinsfile | 32 +- dags/annotate.py | 103 -- dags/annotate_and_index.py | 44 + dags/dug_helpers/dug_utils.py | 201 ++-- dags/index_dag.py | 28 - dags/knowledge_graph_build.py | 102 ++ dags/roger/config/__init__.py | 42 +- dags/roger/config/config.yaml | 17 +- dags/roger/config/dev-config.yaml | 2 +- dags/roger/core/base.py | 36 +- dags/roger/core/bulkload.py | 71 +- dags/roger/core/storage.py | 200 ++-- dags/roger/models/kgx.py | 55 +- dags/roger/pipelines/README.md | 99 ++ dags/roger/pipelines/__init__.py | 28 + dags/roger/pipelines/anvil.py | 24 + dags/roger/pipelines/bacpac.py | 8 + dags/roger/pipelines/base.py | 964 ++++++++++++++++++ dags/roger/pipelines/bdc.py | 19 + dags/roger/pipelines/crdc.py | 19 + dags/roger/pipelines/ctn.py | 10 + dags/roger/pipelines/db_gap.py | 10 + .../roger/pipelines/heal_research_programs.py | 16 + dags/roger/pipelines/heal_studies.py | 16 + dags/roger/pipelines/kfdrc.py | 19 + dags/roger/pipelines/nida.py | 18 + dags/roger/pipelines/radx.py | 8 + dags/roger/pipelines/sparc.py | 17 + dags/roger/pipelines/topmed.py | 41 + dags/roger/tasks.py | 393 ++++++- dags/tranql_translate.py | 43 - requirements.txt | 5 +- 35 files changed, 2238 insertions(+), 470 deletions(-) delete mode 100755 dags/annotate.py create mode 100644 dags/annotate_and_index.py delete mode 100755 dags/index_dag.py create mode 100644 dags/knowledge_graph_build.py create mode 100644 dags/roger/pipelines/README.md create mode 100644 dags/roger/pipelines/__init__.py create mode 100644 dags/roger/pipelines/anvil.py create mode 100644 dags/roger/pipelines/bacpac.py create mode 100644 dags/roger/pipelines/base.py create mode 100644 dags/roger/pipelines/bdc.py create mode 100644 dags/roger/pipelines/crdc.py create mode 100644 dags/roger/pipelines/ctn.py create mode 100644 dags/roger/pipelines/db_gap.py create mode 100644 dags/roger/pipelines/heal_research_programs.py create mode 100644 dags/roger/pipelines/heal_studies.py create mode 100644 dags/roger/pipelines/kfdrc.py create mode 100644 dags/roger/pipelines/nida.py create mode 100644 dags/roger/pipelines/radx.py create mode 100644 dags/roger/pipelines/sparc.py create mode 100644 dags/roger/pipelines/topmed.py delete mode 100755 dags/tranql_translate.py diff --git a/.env b/.env index 9e2ba0e3..2b42e8d7 100644 --- a/.env +++ b/.env @@ -9,9 +9,9 @@ DATA_DIR=./local_storage DUG_LOG_LEVEL=INFO -ELASTIC_PASSWORD=12345 -ELASTIC_API_HOST=elasticsearch -ELASTIC_USERNAME=elastic +ELASTICSEARCH_PASSWORD=12345 +ELASTICSEARCH_HOST=elasticsearch +ELASTICSEARCH_USERNAME=elastic NBOOST_API_HOST=nboost diff --git a/.gitignore b/.gitignore index 8ca8ea91..6c46fe7f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ # Git ignore bioler plate from https://github.com/github/gitignore/blob/master/Python.gitignore +.secret-env +.vscode/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/Dockerfile b/Dockerfile index 5556709c..49c1fd26 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,9 @@ -FROM apache/airflow:2.5.0-python3.10 +FROM apache/airflow:2.7.2-python3.11 + USER root RUN apt-get update && \ - apt-get install -y git gcc python3-dev nano vim + apt-get install -y git nano vim COPY requirements.txt requirements.txt USER airflow -# dependency resolution taking hours eventually failing, -# @TODO fix click lib dependency -RUN pip install -r requirements.txt && \ - pip uninstall -y elasticsearch-dsl +RUN pip 
install -r requirements.txt RUN rm -f requirements.txt diff --git a/Jenkinsfile b/Jenkinsfile index 3372b9e0..a68ba92b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -70,31 +70,15 @@ spec: steps { script { container(name: 'kaniko', shell: '/busybox/sh') { - kaniko.buildAndPush("./Dockerfile", ["$IMAGE_NAME:$TAG1", "$IMAGE_NAME:$TAG2", "$IMAGE_NAME:$TAG3", "$IMAGE_NAME:$TAG4"]) + if (env.BRANCH_NAME == "main") { + // Tag with latest and version iff when pushed to master + kaniko.buildAndPush("./Dockerfile", ["$IMAGE_NAME:$TAG1", "$IMAGE_NAME:$TAG2", "$IMAGE_NAME:$TAG3", "$IMAGE_NAME:$TAG4"]) + } else { + kaniko.buildAndPush("./Dockerfile", ["$IMAGE_NAME:$TAG1", "$IMAGE_NAME:$TAG2"]) + } } } - } - // post { - // always { - // archiveArtifacts artifacts: 'image.tar', onlyIfSuccessful: true - // } - // } - } - // stage('Publish') { - // steps { - // script { - // container(name: 'crane', shell: '/busybox/sh') { - // def imageTagsPushAlways = ["$IMAGE_NAME:$TAG1", "$IMAGE_NAME:$TAG2"] - // def imageTagsPushForDevelopBranch = ["$IMAGE_NAME:$TAG3"] - // def imageTagsPushForMasterBranch = ["$IMAGE_NAME:$TAG3", "$IMAGE_NAME:$TAG4"] - // image.publish( - // imageTagsPushAlways, - // imageTagsPushForDevelopBranch, - // imageTagsPushForMasterBranch - // ) - // } - // } - // } - // } + } + } } } diff --git a/dags/annotate.py b/dags/annotate.py deleted file mode 100755 index e0d4a2e4..00000000 --- a/dags/annotate.py +++ /dev/null @@ -1,103 +0,0 @@ -import os - -from airflow.models import DAG -from airflow.operators.empty import EmptyOperator - -from dug_helpers.dug_utils import ( - DugUtil, - get_topmed_files, - get_dbgap_files, - get_nida_files, - get_sparc_files, - get_anvil_files, - get_cancer_data_commons_files, - get_kids_first_files, - get_sprint_files, - get_bacpac_files, - get_heal_study_files, - get_heal_research_program_files - ) -from roger.tasks import default_args, create_python_task - -DAG_ID = 'annotate_dug' - -""" Build the workflow's tasks and DAG. """ -with DAG( - dag_id=DAG_ID, - default_args=default_args, - schedule_interval=None -) as dag: - - """Build workflow tasks.""" - intro = EmptyOperator(task_id='Intro', dag=dag) - - # Unzip and get files, avoid this because - # 1. it takes a bit of time making the dag itself, webserver hangs - # 2. Every task in this dag would still need to execute this part making it redundant - # 3. tasks like intro would fail because they don't have the data dir mounted. 
- - make_kg_tagged = create_python_task(dag, "make_tagged_kgx", DugUtil.make_kg_tagged) - - dummy_stepover = EmptyOperator(task_id="continue") - - #intro >> run_printlog - envspec = os.getenv("ROGER_DUG__INPUTS_DATA__SETS","topmed") - data_sets = envspec.split(",") - - clear_annotation_items = create_python_task(dag, "clear_annotation_files", DugUtil.clear_annotation_cached) - - for i, data_set in enumerate(data_sets): - annotate_files = None - if data_set.startswith("bdc"): - prepare_files = create_python_task(dag, "get_dbgap_data", get_dbgap_files) - annotate_files = create_python_task(dag, "annotate_db_gap_files", DugUtil.annotate_db_gap_files) - elif data_set.startswith("nida"): - prepare_files = create_python_task(dag, "get_nida_files", get_nida_files) - annotate_files = create_python_task(dag, "annotate_nida_files", DugUtil.annotate_nida_files) - elif data_set.startswith("sparc"): - prepare_files = create_python_task(dag, "get_sparc_files", get_sparc_files) - annotate_files = create_python_task(dag, "annotate_sparc_files", DugUtil.annotate_sparc_files) - elif data_set.startswith("topmed"): - prepare_files = create_python_task(dag, "get_topmed_data", get_topmed_files) - annotate_files = create_python_task(dag, "annotate_topmed_files", DugUtil.annotate_topmed_files) - elif data_set.startswith("anvil"): - prepare_files = create_python_task(dag, "get_anvil_data", get_anvil_files) - annotate_files = create_python_task(dag, "annotate_anvil_files", DugUtil.annotate_anvil_files) - elif data_set.startswith("crdc"): - prepare_files = create_python_task(dag, "get_cancer_commons_files", get_cancer_data_commons_files) - annotate_files = create_python_task(dag, "annotate_cancer_commons_files", - DugUtil.annotate_cancer_commons_files) - elif data_set.startswith("kfdrc"): - prepare_files = create_python_task(dag, "get_kids_first_files", get_kids_first_files) - annotate_files = create_python_task(dag, "annotate_kids_first_files", - DugUtil.annotate_kids_first_files) - elif data_set.startswith("sprint"): - prepare_files = create_python_task(dag, "get_sprint_files", get_sprint_files) - annotate_files = create_python_task(dag, "annotate_sprint_files", - DugUtil.annotate_sprint_files) - elif data_set.startswith("bacpac"): - prepare_files = create_python_task(dag, "get_bacpac_files", get_bacpac_files) - annotate_files = create_python_task(dag, "annotate_bacpac_files", - DugUtil.annotate_bacpac_files) - - elif data_set.startswith("heal-studies"): - prepare_files = create_python_task(dag, "get_heal_study_files", get_heal_study_files) - annotate_files = create_python_task(dag, "annotate_heal_study_files", - DugUtil.annotate_heal_study_files) - # elif data_set.startswith("heal-mds-imports"): - # prepare_files = create_python_task(dag, "get_heal_mds_imports", get_heal_study_files) - - elif data_set.startswith("heal-research-programs"): - prepare_files = create_python_task(dag, "get_heal_research_program_files", get_heal_research_program_files) - annotate_files = create_python_task(dag, "annotate_heal_research_program_files", - DugUtil.annotate_heal_research_program_files) - - - intro >> prepare_files - prepare_files >> clear_annotation_items - - if annotate_files: - clear_annotation_items >> annotate_files - annotate_files >> dummy_stepover - - dummy_stepover >> make_kg_tagged diff --git a/dags/annotate_and_index.py b/dags/annotate_and_index.py new file mode 100644 index 00000000..884cd149 --- /dev/null +++ b/dags/annotate_and_index.py @@ -0,0 +1,44 @@ +"""DAG which performs Dug annotate and index operations + 
+This DAG differes slightly from prior versions of the same functionality in +Roger not only in that the annotation and indexing happen in the same DAG, but +also those tasks are broken out into sub-DAGs organized by dataset. Each dataset +has a subdag for all tasks. +""" + +import os + +from airflow.models import DAG +from airflow.operators.empty import EmptyOperator +from roger.tasks import default_args, create_pipeline_taskgroup + +env_enabled_datasets = os.getenv( + "ROGER_DUG__INPUTS_DATA__SETS", "topmed,anvil").split(",") + +with DAG( + dag_id='annotate_and_index', + default_args=default_args, + schedule_interval=None +) as dag: + init = EmptyOperator(task_id="init", dag=dag) + finish = EmptyOperator(task_id="finish", dag=dag) + + from roger import pipelines + from roger.config import config + envspec = os.getenv("ROGER_DUG__INPUTS_DATA__SETS","topmed:v2.0") + data_sets = envspec.split(",") + pipeline_names = {x.split(':')[0]: x.split(':')[1] for x in data_sets} + for pipeline_class in pipelines.get_pipeline_classes(pipeline_names): + # Only use pipeline classes that are in the enabled datasets list and + # that have a properly defined pipeline_name attribute + + # TODO + # Overriding environment variable just to see if this is working. + # name = getattr(pipeline_class, 'pipeline_name', '*not defined*') + # if not name in env_enabled_datasets: + # continue + + # Do the thing to add the pipeline's subdag to the dag in the right way + # . . . + + init >> create_pipeline_taskgroup(dag, pipeline_class, config) >> finish diff --git a/dags/dug_helpers/dug_utils.py b/dags/dug_helpers/dug_utils.py index 4d417e04..52db9624 100644 --- a/dags/dug_helpers/dug_utils.py +++ b/dags/dug_helpers/dug_utils.py @@ -11,8 +11,9 @@ from typing import Union, List import requests -from dug.core import get_parser, get_plugin_manager, DugConcept -from dug.core.annotate import DugAnnotator, ConceptExpander +from dug.core import get_parser, get_annotator, get_plugin_manager, DugConcept +from dug.core.annotators._base import Annotator +from dug.core.concept_expander import ConceptExpander from dug.core.crawler import Crawler from dug.core.factory import DugFactory from dug.core.parsers import Parser, DugElement @@ -44,7 +45,7 @@ def __init__(self, config: RogerConfig, to_string=True): self.string_handler = logging.StreamHandler(self.log_stream) log.addHandler(self.string_handler) - self.annotator: DugAnnotator = self.factory.build_annotator() + self.annotator_name: str = config.annotation.annotator_type self.tranqlizer: ConceptExpander = self.factory.build_tranqlizer() @@ -85,7 +86,8 @@ def __exit__(self, exc_type, exc_val, exc_tb): log.error(f"{exc_val} {exc_val} {exc_tb}") log.exception("Got an exception") - def annotate_files(self, parser_name, parsable_files): + def annotate_files(self, parser_name, parsable_files, + output_data_path=None): """ Annotates a Data element file using a Dug parser. :param parser_name: Name of Dug parser to use. 
@@ -94,14 +96,16 @@ def annotate_files(self, parser_name, parsable_files): """ dug_plugin_manager = get_plugin_manager() parser: Parser = get_parser(dug_plugin_manager.hook, parser_name) - output_base_path = storage.dug_annotation_path('') + annotator: Annotator = get_annotator(dug_plugin_manager.hook, annotator_name=self.annotator_name, config=self.config.to_dug_conf()) + if not output_data_path: + output_data_path = storage.dug_annotation_path('') log.info("Parsing files") for parse_file in parsable_files: log.debug("Creating Dug Crawler object") crawler = Crawler( crawl_file=parse_file, parser=parser, - annotator=self.annotator, + annotator=annotator, tranqlizer='', tranql_queries=[], http_session=self.cached_session @@ -109,7 +113,7 @@ def annotate_files(self, parser_name, parsable_files): # configure output space. current_file_name = '.'.join(os.path.basename(parse_file).split('.')[:-1]) - elements_file_path = os.path.join(output_base_path, current_file_name) + elements_file_path = os.path.join(output_data_path, current_file_name) elements_file_name = 'elements.pickle' concepts_file_name = 'concepts.pickle' @@ -566,141 +570,224 @@ def clear_annotation_cached(config=None, to_string=False): dug.cached_session.cache.clear() @staticmethod - def annotate_db_gap_files(config=None, to_string=False, files=None): + def annotate_db_gap_files(config=None, to_string=False, input_data_path=None, output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_dd_xml_objects() + if not input_data_path: + files = storage.dug_dd_xml_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "DbGaP" dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_anvil_files(config=None, to_string=False, files=None): + def annotate_anvil_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_anvil_objects() + if not input_data_path: + files = storage.dug_anvil_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "Anvil" dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_cancer_commons_files(config=None, to_string=False, files=None): + def annotate_cancer_commons_files(config=None, to_string=False, + input_data_path=None, + output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_crdc_objects() + if not input_data_path: + files = storage.dug_crdc_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "crdc" dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_kids_first_files(config=None, to_string=False, files=None): + def annotate_kids_first_files(config=None, to_string=False, + input_data_path=None, 
output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_kfdrc_objects() + if not input_data_path: + files = storage.dug_kfdrc_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "kfdrc" dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_nida_files(config=None, to_string=False, files=None): + def annotate_nida_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_nida_objects() + if not input_data_path: + files = storage.dug_nida_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "NIDA" dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_sparc_files(config=None, to_string=False, files=None): + def annotate_sparc_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_sparc_objects() + if not input_data_path: + files = storage.dug_sparc_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "SciCrunch" dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_sprint_files(config=None, to_string=False, files=None): + def annotate_sprint_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_sprint_objects() + if not input_data_path: + files = storage.dug_sprint_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "SPRINT" dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_topmed_files(config=None, to_string=False, files=None): + def annotate_topmed_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_topmed_objects() + if not input_data_path: + files = storage.dug_topmed_objects( + input_data_path=None) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "TOPMedTag" log.info(files) dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_bacpac_files(config=None, to_string=False, files=None): + def annotate_bacpac_files(config=None, to_string=False, + input_data_path=None, 
output_data_path=None): + + log.info(f"Input data path is: {input_data_path}") with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_bacpac_objects() + files = storage.dug_bacpac_objects( + input_data_path=input_data_path) + parser_name = "BACPAC" log.info(files) dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_heal_study_files(config=None, to_string=False, files=None): + def annotate_heal_study_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_heal_study_objects() + if not input_data_path: + files = storage.dug_heal_study_objects( + input_data_path=None) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "heal-studies" log.info(files) dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_heal_research_program_files(config=None, to_string=False, files=None): + def annotate_heal_research_program_files(config=None, to_string=False, + input_data_path=None, + output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_heal_research_program_objects() - + if not input_data_path: + files = storage.dug_heal_research_program_objects( + input_data_path=None) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "heal-research" log.info(files) dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def make_kg_tagged(config=None, to_string=False): + def make_kg_tagged(config=None, to_string=False, input_data_path=None, output_data_path=None): with Dug(config, to_string=to_string) as dug: - output_base_path = storage.dug_kgx_path("") - storage.clear_dir(output_base_path) + output_base_path = output_data_path + if not output_data_path: + output_base_path = storage.dug_kgx_path("") + storage.clear_dir(output_base_path) log.info("Starting building KGX files") - elements_files = storage.dug_elements_objects() + if not input_data_path: + elements_files = storage.dug_elements_objects() + else: + import glob + glob_pattern = str(input_data_path / "**" / 'elements.pickle') + elements_files = glob.glob(glob_pattern, recursive=True) + log.info(f"making kgx files for the following pickles: {elements_files}") for file in elements_files: elements = storage.read_object(file) if "topmed_" in file: @@ -930,5 +1017,3 @@ def get_heal_study_files(config: RogerConfig, to_string=False) -> List[str]: def get_heal_research_program_files(config: RogerConfig, to_string=False) -> List[str]: return get_versioned_files(config, "heal-research", "heal-research-programs", data_store=config.dug_inputs.data_source, unzip=True) - - diff --git a/dags/index_dag.py b/dags/index_dag.py deleted file mode 100755 index 6ddc4b02..00000000 --- a/dags/index_dag.py +++ /dev/null @@ -1,28 +0,0 @@ -from airflow.models import DAG -from airflow.operators.empty import EmptyOperator - -from roger.tasks import get_executor_config, 
default_args, create_python_task -from dug_helpers.dug_utils import DugUtil - -""" Build the workflow's tasks and DAG. """ -with DAG( - dag_id='index_dug', - default_args=default_args, - schedule_interval=None -) as dag: - - """ Build the workflow tasks. """ - intro = EmptyOperator(task_id='Intro') - index_variables = create_python_task (dag, "IndexVariables", DugUtil.index_variables) - validate_index_variables = create_python_task(dag,"ValidateIndexVariables", DugUtil.validate_indexed_variables) - crawl_tags = create_python_task(dag, "CrawlConcepts", DugUtil.crawl_tranql) - index_concepts = create_python_task(dag, "IndexConcepts", DugUtil.index_concepts) - dummy_stepover = EmptyOperator(task_id="continue") - index_extracted_dug_elements = create_python_task(dag, "IndexExtractedElements", DugUtil.index_extracted_elements) - validate_index_concepts = create_python_task(dag, "ValidateIndexConcepts", DugUtil.validate_indexed_concepts) - finish = EmptyOperator(task_id='Finish') - """ Build the DAG. """ - intro >> index_variables >> validate_index_variables >> finish - intro >> crawl_tags >> index_concepts >> dummy_stepover - intro >> crawl_tags >> index_extracted_dug_elements >> dummy_stepover - dummy_stepover >> validate_index_concepts >> finish \ No newline at end of file diff --git a/dags/knowledge_graph_build.py b/dags/knowledge_graph_build.py new file mode 100644 index 00000000..de28aa78 --- /dev/null +++ b/dags/knowledge_graph_build.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +# + +""" +An Airflow workflow for the Roger Translator KGX data pipeline. +""" + +from airflow.models import DAG +from airflow.operators.empty import EmptyOperator +import roger +from roger.tasks import default_args, create_python_task +from roger.config import config + +""" Build the workflow's tasks and DAG. """ +with DAG( + dag_id='knowledge_graph_build', + default_args=default_args, + schedule_interval=None +) as dag: + + """ Build the workflow tasks. """ + intro = EmptyOperator(task_id='Intro') + + # Merge nodes needs inputs from two sources + # 1. baseline and/or CDE KGX files from LakeFS (External repo) + # 2. 
Infer which local kgx files are needed based on dug_inputs and grab them from the current repo + + # build the annotate and index pipeline output locations + #lakefs://yk-heal/main/annotate_and_index/crdc_dataset_pipeline_task_group.make_kgx_crdc/ + working_repo = config.lakefs_config.repo + branch = config.lakefs_config.branch + kgx_repos = config.kgx.data_sets + input_repos = [{ + 'name': repo.split(':')[0], + 'branch': repo.split(':')[1], + 'path': '*' + } for repo in kgx_repos] + + # Figure out a way to extract paths + get_path_on_lakefs = lambda d: f"annotate_and_index/{d}_dataset_pipeline_task_group.make_kgx_{d}/" + + + for dataset in config.dug_inputs.data_sets: + dataset_name = dataset.split(":")[0] + # add datasets from the other pipeline + input_repos.append( + { + 'name': working_repo, + 'branch': branch, + 'path': get_path_on_lakefs(dataset_name) + } + ) + + merge_nodes = create_python_task (dag, name="MergeNodes", + a_callable=roger.merge_nodes, + external_repos=input_repos + ) + + # The rest of these guys can just operate on the local lakefs repo/branch + # we need to add input dir and output dir similar to what we did for dug tasks + + create_nodes_schema = create_python_task(dag, + name="CreateNodesSchema", + a_callable=roger.create_nodes_schema + ) + create_edges_schema = create_python_task(dag, + name="CreateEdgesSchema", + a_callable=roger.create_edges_schema) + + create_bulk_load_nodes = create_python_task(dag, + name="CreateBulkLoadNodes", + a_callable=roger.create_bulk_nodes) + create_bulk_load_edges = create_python_task(dag, + name="CreateBulkLoadEdges", + a_callable=roger.create_bulk_edges) + bulk_load = create_python_task(dag, + name="BulkLoad", + a_callable=roger.bulk_load, + no_output_files=True) + check_tranql = create_python_task(dag, + name="CheckTranql", + a_callable=roger.check_tranql, + no_output_files=True) + validate = create_python_task(dag, + name="Validate", + a_callable=roger.validate, + no_output_files=True) + + + """ Build the DAG. """ + merge_nodes.set_upstream(intro) + create_nodes_schema.set_upstream(merge_nodes) + create_edges_schema.set_upstream(merge_nodes) + create_bulk_load_nodes.set_upstream(create_nodes_schema) + create_bulk_load_nodes.set_upstream(merge_nodes) + create_bulk_load_edges.set_upstream(create_edges_schema) + create_bulk_load_edges.set_upstream(merge_nodes) + bulk_load.set_upstream(create_bulk_load_nodes) + bulk_load.set_upstream(create_bulk_load_edges) + validate.set_upstream(bulk_load) + check_tranql.set_upstream(bulk_load) + diff --git a/dags/roger/config/__init__.py b/dags/roger/config/__init__.py index 403b25b1..ac9eb23a 100644 --- a/dags/roger/config/__init__.py +++ b/dags/roger/config/__init__.py @@ -26,6 +26,21 @@ def __post_init__(self): self.port = int(self.port) +@dataclass +class LakefsConfig(DictLike): + host: str + access_key_id: str + secret_access_key: str + branch: str + repo: str + enabled: bool = False + + def __post_init__(self): + if isinstance(self.enabled, str): + self.enabled = self.enabled.lower() == "true" + + + @dataclass class LoggingConfig(DictLike): level: str = "DEBUG" @@ -35,10 +50,8 @@ class LoggingConfig(DictLike): @dataclass class KgxConfig(DictLike): biolink_model_version: str = "1.5.0" - dataset_version: str = "v1.0" - merge_db_id: int = 1 merge_db_temp_dir: str = "workspace" - data_sets: List = field(default_factory=lambda: ['baseline-graph']) + data_sets: List = field(default_factory=lambda: ['baseline-graph:v5.0']) def __post_init__(self): # Convert strings to list. 
In cases where this is passed as env variable with a single value @@ -77,7 +90,18 @@ class BulkLoaderConfig(DictLike): @dataclass class AnnotationConfig(DictLike): - annotator: str = "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" + annotator_type: str = "monarch" + annotator_args: dict = field( + default_factory=lambda: { + "monarch": { + "url": "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" + }, + "sapbert": { + "classification_url": "https://med-nemo.apps.renci.org/annotate/", + "annotator_url": "https://babel-sapbert.apps.renci.org/annotate/", + }, + } + ) normalizer: str = "https://nodenormalization-sri.renci.org/get_normalized_nodes?curie=" synonym_service: str = "https://onto.renci.org/synonyms/" ontology_metadata: str = "https://api.monarchinitiative.org/api/bioentity/" @@ -116,7 +140,7 @@ class IndexingConfig(DictLike): "anat_to_disease": ["anatomical_entity", "disease"], "anat_to_pheno": ["anatomical_entity", "phenotypic_feature"], }) - tranql_endpoint: str = "http://tranql:8081/tranql/query?dynamic_id_resolution=true&asynchronous=false" + tranql_endpoint: str = "http://tranql-service/tranql/query?dynamic_id_resolution=true&asynchronous=false" # by default skips node to element queries node_to_element_queries: dict = field(default_factory=lambda: {}) element_mapping: str = "" @@ -170,6 +194,7 @@ def __init__(self, **kwargs): self.annotation_base_data_uri: str = kwargs.pop("annotation_base_data_uri", "") self.validation = kwargs.pop("validation") self.dag_run = kwargs.pop('dag_run', None) + self.lakefs_config = LakefsConfig(**kwargs.pop("lakefs_config")) def to_dug_conf(self) -> DugConfig: return DugConfig( @@ -183,9 +208,8 @@ def to_dug_conf(self) -> DugConfig: redis_port=self.redisgraph.port, nboost_host=self.elasticsearch.nboost_host, preprocessor=self.annotation.preprocessor, - annotator={ - 'url': self.annotation.annotator, - }, + annotator_type=self.annotation.annotator_type, + annotator_args=self.annotation.annotator_args, normalizer={ 'url': self.annotation.normalizer, }, @@ -328,7 +352,7 @@ def update(new_value: Dict): def __str__(self): flat = flatten(Config.__instance__) for k in flat: - if 'PASSWORD' in k or 'password' in k: + if 'PASSWORD' in k or 'password' in k or 'key' in k.lower(): flat[k] = '******' flat = unflatten(flat) result = json.dumps(flat) diff --git a/dags/roger/config/config.yaml b/dags/roger/config/config.yaml index 49bc927b..e9402ce4 100644 --- a/dags/roger/config/config.yaml +++ b/dags/roger/config/config.yaml @@ -16,11 +16,9 @@ annotation_base_data_uri: https://stars.renci.org/var/dug/ kgx: biolink_model_version: v3.1.2 - dataset_version: v5.0 - merge_db_id: 1 merge_db_temp_dir: workspace data_sets: - - baseline-graph + - baseline-graph:v5.0 dug_inputs: data_source: s3 @@ -44,10 +42,17 @@ bulk_loader: annotation: clear_http_cache: false - annotator: "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" + annotator_type: monarch + annotator_args: + monarch: + url: "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" + sapbert: + classification_url: 
"https://med-nemo.apps.renci.org/annotate/" + annotator_url: "https://babel-sapbert.apps.renci.org/annotate/" normalizer: "https://nodenormalization-dev.apps.renci.org/get_normalized_nodes?conflate=false&description=true&curie=" synonym_service: "https://name-resolution-sri.renci.org/reverse_lookup" ontology_metadata: "https://api.monarchinitiative.org/api/bioentity/" + preprocessor: debreviator: BMI: "body mass index" @@ -72,7 +77,7 @@ indexing: "small_molecule_to_disease": ["small_molecule", "disease"] "chemical_mixture_to_disease": ["chemical_mixture", "disease"] "phen_to_anat": ["phenotypic_feature", "anatomical_entity"] - tranql_endpoint: "http://tranql:8081/tranql/query?dynamic_id_resolution=true&asynchronous=false" + tranql_endpoint: "http://tranql-service/tranql/query?dynamic_id_resolution=true&asynchronous=false" node_to_element_queries: enabled: false cde: @@ -157,3 +162,5 @@ lakefs_config: access_key_id: "" secret_access_key: "" host: "" + branch: "" + repo: "" diff --git a/dags/roger/config/dev-config.yaml b/dags/roger/config/dev-config.yaml index 91fc0af3..bece11a8 100644 --- a/dags/roger/config/dev-config.yaml +++ b/dags/roger/config/dev-config.yaml @@ -54,7 +54,7 @@ indexing: "phen_to_anat": ["phenotypic_feature", "anatomical_entity"] "anat_to_disease": ["anatomical_entity", "disease"] "anat_to_pheno": ["anatomical_entity", "phenotypic_feature"] - tranql_endpoint: "http://tranql:8081/tranql/query?dynamic_id_resolution=true&asynchronous=false" + tranql_endpoint: "http://tranql-service/tranql/query?dynamic_id_resolution=true&asynchronous=false" elasticsearch: host: elasticsearch diff --git a/dags/roger/core/base.py b/dags/roger/core/base.py index 6696f81d..7ba9409a 100644 --- a/dags/roger/core/base.py +++ b/dags/roger/core/base.py @@ -73,62 +73,68 @@ def create_schema(to_string=False, config=None): output = (o1 + o2 ) if to_string else None return output -def create_edges_schema(to_string=False, config=None): +def create_edges_schema(to_string=False, config=None, input_data_path=None, output_data_path=None): "Create edges schema on KGX object" output = None with Roger(to_string, config=config) as roger: - roger.kgx.create_edges_schema() + roger.kgx.create_edges_schema( + input_data_path=input_data_path, + output_data_path=output_data_path + ) output = roger.log_stream.getvalue() if to_string else None return output -def create_nodes_schema(to_string=False, config=None): +def create_nodes_schema(to_string=False, config=None, input_data_path=None, output_data_path=None): "Create nodes schema on KGX object" output = None with Roger(to_string, config=config) as roger: - roger.kgx.create_nodes_schema() + roger.kgx.create_nodes_schema(input_data_path=input_data_path, + output_data_path=output_data_path) output = roger.log_stream.getvalue() if to_string else None return output -def merge_nodes(to_string=False, config=None): +def merge_nodes(to_string=False, config=None, input_data_path=None, output_data_path=None): "Run KGX merge" output = None with Roger (to_string, config=config) as roger: - roger.kgx.merge() + roger.kgx.merge(input_path=input_data_path, output_path=output_data_path) output = roger.log_stream.getvalue () if to_string else None return output -def create_bulk_load(to_string=False, config=None): +def create_bulk_load(to_string=False, config=None, input_data_path=None, output_data_path=None): "Generate bulk load files" o1 = create_bulk_nodes(to_string=to_string, config=config) o2 = create_bulk_edges(to_string=to_string, config=config) output = (o1 + o2) if to_string 
else None return output -def create_bulk_nodes(to_string=False, config=None): +def create_bulk_nodes(to_string=False, config=None, input_data_path=None, output_data_path=None): "Generate bulk node CSV file" output = None with Roger(to_string, config=config) as roger: - roger.bulk.create_nodes_csv_file() + log.info("input path: %s", input_data_path) + log.info("output path: %s", output_data_path) + roger.bulk.create_nodes_csv_file(input_data_path, output_data_path) output = roger.log_stream.getvalue() if to_string else None return output -def create_bulk_edges(to_string=False, config=None): +def create_bulk_edges(to_string=False, config=None, input_data_path=None, output_data_path=None): "Create bulk edges CSV file" output = None with Roger(to_string, config=config) as roger: - roger.bulk.create_edges_csv_file() + roger.bulk.create_edges_csv_file(input_data_path, output_data_path) output = roger.log_stream.getvalue() if to_string else None return output -def bulk_load(to_string=False, config=None): +def bulk_load(to_string=False, config=None, input_data_path=None, output_data_path=None): "Run bulk load insert process" output = None with Roger (to_string, config=config) as roger: - roger.bulk.insert() + roger.bulk.insert(input_data_path=input_data_path) output = roger.log_stream.getvalue () if to_string else None return output -def validate (to_string=False, config=None): +def validate (to_string=False, config=None, input_data_path=None, output_data_path=None): "Run bulk validate process" output = None with Roger (to_string, config=config) as roger: @@ -136,7 +142,7 @@ def validate (to_string=False, config=None): output = roger.log_stream.getvalue () if to_string else None return output -def check_tranql(to_string=False, config=None): +def check_tranql(to_string=False, config=None, input_data_path=None, output_data_path=None): "Tranql server smoke check" output = None with Roger(to_string, config=config) as roger: diff --git a/dags/roger/core/bulkload.py b/dags/roger/core/bulkload.py index 1eca92db..0f0fecee 100644 --- a/dags/roger/core/bulkload.py +++ b/dags/roger/core/bulkload.py @@ -33,31 +33,24 @@ def __init__(self, biolink, config=None): self.separator =(chr(separator) if isinstance(separator, int) else separator) - def tables_up_to_date (self): - return storage.is_up_to_date ( - source=[ - storage.schema_path (f"{SchemaType.PREDICATE.value}-schema.json"), - storage.schema_path (f"{SchemaType.PREDICATE.value}-schema.json") - ] + storage.merged_objects (), - targets=glob.glob (storage.bulk_path ("nodes/**.csv")) + \ - glob.glob (storage.bulk_path ("edges/**.csv"))) - - def create_nodes_csv_file(self): - if self.tables_up_to_date (): - log.info ("up to date.") - return + def create (self): + """Used in the CLI on args.create_bulk""" + self.create_nodes_csv_file() + self.create_edges_csv_file() + + def create_nodes_csv_file(self, input_data_path=None, output_data_path=None): # clear out previous data - bulk_path = storage.bulk_path("nodes") + bulk_path = storage.bulk_path("nodes", output_data_path) if os.path.exists(bulk_path): shutil.rmtree(bulk_path) - categories_schema = storage.read_schema (SchemaType.CATEGORY) + categories_schema = storage.read_schema (SchemaType.CATEGORY, input_data_path) state = defaultdict(lambda: None) log.info(f"processing nodes") """ Write node data for bulk load. 
""" categories = defaultdict(lambda: []) category_error_nodes = set() - merged_nodes_file = storage.merge_path("nodes.jsonl") + merged_nodes_file = storage.merged_objects('nodes', input_data_path) counter = 1 for node in storage.json_line_iter(merged_nodes_file): if not node['category']: @@ -74,7 +67,7 @@ def create_nodes_csv_file(self): f"Showing first 10: {list(category_error_nodes)[:10]}.") # flush every 100K if counter % 100_000 == 0: - self.write_bulk(storage.bulk_path("nodes"), + self.write_bulk(storage.bulk_path("nodes", output_data_path), categories, categories_schema, state=state, is_relation=False) # reset variables. @@ -83,22 +76,19 @@ def create_nodes_csv_file(self): counter += 1 # write back if any thing left. if len(categories): - self.write_bulk(storage.bulk_path("nodes"), + self.write_bulk(storage.bulk_path("nodes", output_data_path), categories, categories_schema, state=state, is_relation=False) - def create_edges_csv_file(self): + def create_edges_csv_file(self, input_data_path=None, output_data_path=None): """ Write predicate data for bulk load. """ - if self.tables_up_to_date (): - log.info ("up to date.") - return # Clear out previous data - bulk_path = storage.bulk_path("edges") + bulk_path = storage.bulk_path("edges", output_data_path) if os.path.exists(bulk_path): shutil.rmtree(bulk_path) - predicates_schema = storage.read_schema(SchemaType.PREDICATE) + predicates_schema = storage.read_schema(SchemaType.PREDICATE, input_data_path) predicates = defaultdict(lambda: []) - edges_file = storage.merge_path('edges.jsonl') + edges_file = storage.merged_objects('edges', input_data_path) counter = 1 state = {} for edge in storage.json_line_iter(edges_file): @@ -106,14 +96,14 @@ def create_edges_csv_file(self): # write out every 100K , to avoid large predicate dict. 
if counter % 100_000 == 0: self.write_bulk( - storage.bulk_path("edges"),predicates, predicates_schema, + storage.bulk_path("edges", output_data_path),predicates, predicates_schema, state=state, is_relation=True) predicates = defaultdict(lambda : []) counter += 1 # if there are some items left (if loop ended before counter reached the # specified value) if len(predicates): - self.write_bulk(storage.bulk_path("edges"), predicates, + self.write_bulk(storage.bulk_path("edges", output_data_path), predicates, predicates_schema,state=state, is_relation=True) @staticmethod @@ -316,17 +306,10 @@ def write_bulk(self, bulk_path, obj_map, schema, state={}, state['processed_id'] = processed_objects_id state['called_times'] = called_x_times - def insert (self): - - redisgraph = { - 'host': os.getenv('REDIS_HOST'), - 'port': os.getenv('REDIS_PORT', '6379'), - 'password': os.getenv('REDIS_PASSWORD'), - 'graph': os.getenv('REDIS_GRAPH'), - } + def insert (self, input_data_path=None): redisgraph = self.config.redisgraph - nodes = sorted(glob.glob (storage.bulk_path ("nodes/**.csv*"))) - edges = sorted(glob.glob (storage.bulk_path ("edges/**.csv*"))) + nodes = sorted(glob.glob (storage.bulk_path ("**/nodes/**.csv*", input_data_path), recursive=True)) + edges = sorted(glob.glob (storage.bulk_path ("**/edges/**.csv*", input_data_path), recursive=True)) graph = redisgraph['graph'] log.info(f"bulk loading \n nodes: {nodes} \n edges: {edges}") @@ -340,8 +323,9 @@ def insert (self): log.info ("bulk loading graph: %s", str(graph)) args = [] if len(nodes) > 0: - bulk_path_root = storage.bulk_path('nodes') + os.path.sep + bulk_path_root = glob.glob(storage.bulk_path('**/nodes', path=input_data_path), recursive=True)[0] + os.path.sep nodes_with_type = [] + collect_labels = set() for x in nodes: """ These lines prep nodes bulk load by: @@ -350,24 +334,29 @@ def insert (self): """ file_name_type_part = x.replace(bulk_path_root, '').split('.')[0].split('~')[1] all_labels = "biolink." + file_name_type_part + ":" + ":".join([f'biolink.{v.lstrip("biolink:")}' for v in self.biolink.toolkit.get_ancestors("biolink:" + file_name_type_part, reflexive=False, formatted=True )] ) + collect_labels.add("biolink." 
+ file_name_type_part) + for v in self.biolink.toolkit.get_ancestors("biolink:" + file_name_type_part, reflexive=False, + formatted=True): + collect_labels.add(f'biolink.{v.lstrip("biolink:")}') nodes_with_type.append(f"{all_labels} {x}") args.extend(("-N " + " -N ".join(nodes_with_type)).split()) if len(edges) > 0: - bulk_path_root = storage.bulk_path('edges') + os.path.sep + bulk_path_root = glob.glob(storage.bulk_path('**/edges', path=input_data_path), recursive=True)[0] + os.path.sep edges_with_type = [f"biolink.{x.replace(bulk_path_root, '').strip(os.path.sep).split('.')[0].split('~')[1]} {x}" for x in edges] # Edge label now no longer has 'biolink:' args.extend(("-R " + " -R ".join(edges_with_type)).split()) args.extend([f"--separator={self.separator}"]) - log.debug(f"--redis-url=redis://:{redisgraph['password']}@{redisgraph['host']}:{redisgraph['port']}") args.extend([f"--redis-url=redis://:{redisgraph['password']}@{redisgraph['host']}:{redisgraph['port']}"]) args.extend(['--enforce-schema']) + for lbl in collect_labels: + args.extend([f'-i `{lbl}`:id', f'-f {lbl}:name', f'-f {lbl}:synonyms']) args.extend([f"{redisgraph['graph']}"]) """ standalone_mode=False tells click not to sys.exit() """ log.debug(f"Calling bulk_insert with extended args: {args}") try: bulk_insert(args, standalone_mode=False) - self.add_indexes() + # self.add_indexes() except Exception as e: log.error(f"Unexpected {e.__class__.__name__}: {e}") raise diff --git a/dags/roger/core/storage.py b/dags/roger/core/storage.py index 48034b07..c1a3e808 100644 --- a/dags/roger/core/storage.py +++ b/dags/roger/core/storage.py @@ -85,7 +85,7 @@ def read_object(path, key=None): elif path.endswith(".pickle"): with open(file=path, mode="rb") as stream: obj = pickle.load(stream) - elif path.endswith(".jsonl"): + elif path.endswith(".jsonl") or path.endswith('.txt'): obj = read_data(path) return obj @@ -117,11 +117,11 @@ def write_object (obj, path, key=None): yaml.dump (obj, outfile) elif path.endswith (".json"): with open (path, "w", encoding='utf-8') as stream: - stream.write(str(json.dumps (obj).decode('utf-8'))) + stream.write(str(json.dumps (obj, option=json.OPT_INDENT_2).decode('utf-8'))) elif path.endswith(".pickle"): with open (path, "wb") as stream: pickle.dump(obj, file=stream) - elif path.endswith(".jsonl"): + elif path.endswith(".jsonl") or path.endswith('.txt'): with open (path, "w", encoding="utf-8") as stream: stream.write(obj) else: @@ -152,30 +152,50 @@ def kgx_path(name): :path name: Name of the KGX object. """ return str(ROGER_DATA_DIR / "kgx" / name) -def kgx_objects(format_="json"): +def kgx_objects(format_="json", path=None): """ A list of KGX objects. """ kgx_pattern = kgx_path(f"**.{format_}") - return sorted(glob.glob (kgx_pattern)) + if path: + kgx_pattern = f"{path}/**/*.{format_}" + return sorted(glob.glob (kgx_pattern, recursive=True)) -def merge_path(name): +def merge_path(name, path: Path=None): """ Form a merged KGX object path. :path name: Name of the merged KGX object. """ - return str(ROGER_DATA_DIR / 'merge' / name) - -def merged_objects(): + if path is None: + # create output dir + if not os.path.exists(ROGER_DATA_DIR / 'merge'): + os.makedirs(ROGER_DATA_DIR / 'merge') + return str(ROGER_DATA_DIR / 'merge' / name) + return str(path.joinpath(name)) + +def merged_objects(file_type, path=None): """ A list of merged KGX objects. 
""" - merged_pattern = merge_path("**.json") - return sorted(glob.glob (merged_pattern)) + if not path: + merged_pattern = merge_path(f"**/{file_type}.jsonl") + else: + merged_pattern = merge_path(f"**/{file_type}.jsonl", path=path) + # this thing should always return one edges or nodes file (based on file_type) + try: + return sorted(glob.glob(merged_pattern, recursive=True))[0] + except IndexError: + raise ValueError(f"Could not find merged KGX of type {file_type} in {merged_pattern}") -def schema_path(name): + +def schema_path(name, path=None): """ Path to a schema object. :param name: Name of the object to get a path for. """ - return str(ROGER_DATA_DIR / 'schema' / name) + if not path: + return str(ROGER_DATA_DIR / 'schema' / name) + return str (path / 'schema' / name) -def bulk_path(name): +def bulk_path(name, path=None): """ Path to a bulk load object. :param name: Name of the object. """ - return str(ROGER_DATA_DIR / 'bulk' / name) + if not path: + return str(ROGER_DATA_DIR / 'bulk' / name) + else: + return str(path / name) def metrics_path(name): """ @@ -194,10 +214,14 @@ def dug_annotation_path(name): def dug_expanded_concepts_path(name): return str(ROGER_DATA_DIR / 'dug' / 'expanded_concepts' / name) -def dug_expanded_concept_objects(): - file_pattern = dug_expanded_concepts_path( - os.path.join('*','expanded_concepts.pickle')) - return sorted(glob.glob(file_pattern)) +def dug_expanded_concept_objects(data_path=None, format="pickle"): + "Return a list of files containing expaneded concept objects" + if data_path: + file_pattern = os.path.join(data_path, '**', f'expanded_concepts.{format}') + else: + file_pattern = dug_expanded_concepts_path( + os.path.join('*',f'expanded_concepts.{format}')) + return sorted(glob.glob(file_pattern, recursive=True)) def dug_extracted_elements_objects(): file_pattern = dug_expanded_concepts_path( @@ -212,17 +236,25 @@ def dug_kgx_objects(): dug_kgx_pattern = dug_kgx_path("**.json") return sorted(glob.glob(dug_kgx_pattern)) -def dug_concepts_objects(): +def dug_concepts_objects(data_path, format="pickle"): """ A list of dug annotation Objects. """ - concepts_file_path = dug_annotation_path( - os.path.join('*','concepts.pickle')) - return sorted(glob.glob(concepts_file_path)) + if not data_path: + concepts_file_path = dug_annotation_path( + os.path.join('*',f'concepts.{format}')) + else: + concepts_file_path = os.path.join( + data_path, '**', f'concepts.{format}') + return sorted(glob.glob(concepts_file_path, recursive=True)) -def dug_elements_objects(): +def dug_elements_objects(data_path=None, format='pickle'): """ A list of dug annotation Objects. 
""" - concepts_file_path = dug_annotation_path( - os.path.join('*', 'elements.pickle')) - return sorted(glob.glob(concepts_file_path)) + if not data_path: + concepts_file_pattern = dug_annotation_path( + os.path.join('*', f'elements.{format}')) + else: + concepts_file_pattern = os.path.join( + data_path, '**', f'elements.{format}') + return sorted(glob.glob(concepts_file_pattern, recursive=True)) def dug_input_files_path(name) -> pathlib.Path: path = ROGER_DATA_DIR / "dug" / "input_files" / name @@ -233,22 +265,13 @@ def dug_input_files_path(name) -> pathlib.Path: log.info(f"Input file path: {path} already exists") return path -def dug_topmed_path(name): - """ Topmed source files""" - return dug_input_files_path('topmed') / name - -def dug_topmed_objects(): - topmed_file_pattern = str(dug_topmed_path("topmed_*.csv")) +def dug_topmed_objects(input_data_path=None): + "Return list of TOPMed source files" + if not input_data_path: + input_data_path = str(dug_input_files_path('topmed')) + topmed_file_pattern = os.path.join(input_data_path, "topmed_*.csv") return sorted(glob.glob(topmed_file_pattern)) -def dug_nida_path(name): - """ NIDA source files""" - return dug_input_files_path('nida') / name - -def dug_sparc_path(name): - """ NIDA source files""" - return dug_input_files_path('sparc') / name - def dug_anvil_path(): """Anvil source files""" return dug_input_files_path('anvil') @@ -281,62 +304,78 @@ def dug_kfdrc_path(): """Anvil source files""" return dug_input_files_path('kfdrc') -def dug_nida_objects(): - nida_file_pattern = str(dug_nida_path("NIDA-*.xml")) +def dug_nida_objects(input_data_path=None): + "Return list of NIDA source files" + if not input_data_path: + input_data_path = str(dug_input_files_path('nida')) + nida_file_pattern = os.path.join(input_data_path, "NIDA-*.xml") return sorted(glob.glob(nida_file_pattern)) -def dug_sparc_objects(): - file_pattern = str(dug_sparc_path("scicrunch/*.xml")) +def dug_sparc_objects(input_data_path=None): + if not input_data_path: + input_data_path = str(dug_input_files_path('sparc')) + file_pattern = os.path.join(input_data_path, "scicrunch/*.xml") return sorted(glob.glob(file_pattern)) -def dug_anvil_objects(): - file_path = dug_anvil_path() +def dug_anvil_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_anvil_path() files = get_files_recursive( lambda file_name: ( not file_name.startswith('GapExchange_') and file_name.endswith('.xml')), - file_path) + input_data_path) return sorted([str(f) for f in files]) -def dug_sprint_objects(): - file_path = dug_sprint_path() +def dug_sprint_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_sprint_path() files = get_files_recursive( - lambda file_name: file_name.endswith('.xml'), file_path) + lambda file_name: file_name.endswith('.xml'), input_data_path) return sorted([str(f) for f in files]) -def dug_bacpac_objects(): - file_path = dug_bacpac_path() +def dug_bacpac_objects(input_data_path=None): + "Return list of BACPAC source files" + if not input_data_path: + input_data_path = dug_bacpac_path() files = get_files_recursive( - lambda file_name: file_name.endswith('.xml'), file_path) + lambda file_name: file_name.endswith('.xml'), input_data_path) return sorted([str(f) for f in files]) -def dug_crdc_objects(): - file_path = dug_crdc_path() +def dug_crdc_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_crdc_path() files = get_files_recursive( lambda file_name: ( not file_name.startswith('GapExchange_') and 
file_name.endswith('.xml')), - file_path) + input_data_path) return sorted([str(f) for f in files]) -def dug_heal_study_objects(): - file_path = dug_heal_study_path() - files = get_files_recursive(lambda file_name : file_name.endswith('.xml'), file_path) +def dug_heal_study_objects(input_data_path=None): + "Return list of HEAL study source files" + if not input_data_path: + input_data_path = dug_heal_study_path() + files = get_files_recursive(lambda file_name : file_name.endswith('.xml'), + input_data_path) return sorted([str(f) for f in files]) -def dug_heal_research_program_objects(): - file_path = dug_heal_research_program_path() - files = get_files_recursive(lambda file_name : file_name.endswith('.xml'), file_path) +def dug_heal_research_program_objects(input_data_path=None): + "Return list of HEAL research program source files" + if not input_data_path: + input_data_path = dug_heal_research_program_path() + files = get_files_recursive(lambda file_name : file_name.endswith('.xml'), + input_data_path) return sorted([str(f) for f in files]) - -def dug_kfdrc_objects(): - file_path = dug_kfdrc_path() +def dug_kfdrc_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_kfdrc_path() files = get_files_recursive( lambda file_name: ( not file_name.startswith('GapExchange_') and file_name.endswith('.xml')), - file_path) + input_data_path) return sorted([str(f) for f in files]) @@ -356,23 +395,26 @@ def get_files_recursive(file_name_filter, current_dir): file_paths += [child] return file_paths -def dug_dd_xml_objects(): - file_path = dug_dd_xml_path() +def dug_dd_xml_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_dd_xml_path() files = get_files_recursive( lambda file_name: ( not file_name.startswith('._') and file_name.endswith('.xml')), - file_path) + input_data_path) return sorted([str(f) for f in files]) def copy_file_to_dir(file_location, dir_name): return shutil.copy(file_location, dir_name) -def read_schema (schema_type: SchemaType): +def read_schema (schema_type: SchemaType, path=None): """ Read a schema object. :param schema_type: Schema type of the object to read. """ - path = schema_path (f"{schema_type.value}-schema.json") - return read_object (path) + if path is not None: + path = path / '**' + location = glob.glob(schema_path (f"{schema_type.value}-schema.json", path=path), recursive=True)[0] + return read_object (location) def get_uri (path, key): """ Build a URI. @@ -392,17 +434,7 @@ def read_relative_object (path): def trunc(text, limit): return ('..' + text[-limit-2:]) if len(text) > limit else text -def is_up_to_date (source, targets): - target_time_list = [ - os.stat (f).st_mtime for f in targets if os.path.exists(f)] - if len(target_time_list) == 0: - log.debug (f"no targets found") - return False - source = [ os.stat (f).st_mtime for f in source if os.path.exists (f) ] - if len(source) == 0: - log.debug ("no source found. 
up to date") - return True - return max(source) < min(target_time_list) + def json_line_iter(jsonl_file_path): f = open(file=jsonl_file_path, mode='r', encoding='utf-8') diff --git a/dags/roger/models/kgx.py b/dags/roger/models/kgx.py index a8976a00..e80e65db 100644 --- a/dags/roger/models/kgx.py +++ b/dags/roger/models/kgx.py @@ -8,9 +8,8 @@ from collections import defaultdict from xxhash import xxh64_hexdigest import orjson as json -import redis import ntpath -from kg_utils.merging import GraphMerger, MemoryGraphMerger, DiskGraphMerger +from kg_utils.merging import DiskGraphMerger from kg_utils.constants import * from roger.config import get_default_config @@ -43,18 +42,11 @@ def __init__(self, biolink=None, config=None): self.merger = DiskGraphMerger(temp_directory=self.temp_directory, chunk_size=5_000_000) self.biolink_version = self.config.kgx.biolink_model_version - self.merge_db_id = self.config.kgx.merge_db_id - self.merge_db_name = f'db{self.merge_db_id}' log.debug(f"Trying to get biolink version : {self.biolink_version}") if biolink is None: self.biolink = BiolinkModel(self.biolink_version) else: self.biolink = biolink - self.redis_conn = redis.Redis( - host=self.config.redisgraph.host, - port=self.config.redisgraph.port, - password=self.config.redisgraph.password, - db=self.merge_db_id) self.enable_metrics = self.config.get('enable_metrics', False) def get_kgx_json_format(self, files: list, dataset_version: str): @@ -301,7 +293,7 @@ def fetch_dug_kgx(self): log.info("Done copying dug KGX files.") return all_kgx_files - def create_nodes_schema(self): + def create_nodes_schema(self, input_data_path=None, output_data_path=None): """ Extracts schema for nodes based on biolink leaf types :return: @@ -309,7 +301,7 @@ def create_nodes_schema(self): category_schemas = defaultdict(lambda: None) category_error_nodes = set() - merged_nodes_file = storage.merge_path("nodes.jsonl") + merged_nodes_file = storage.merged_objects("nodes", input_data_path) log.info(f"Processing : {merged_nodes_file}") counter = 0 for node in storage.json_line_iter(merged_nodes_file): @@ -356,15 +348,15 @@ def create_nodes_schema(self): f"These will be treated as {BiolinkModel.root_type}.") # Write node schemas. - self.write_schema(category_schemas, SchemaType.CATEGORY) + self.write_schema(category_schemas, SchemaType.CATEGORY, output_path=output_data_path) - def create_edges_schema(self): + def create_edges_schema(self, input_data_path=None, output_data_path=None): """ Create unified schema for all edges in an edges jsonl file. :return: """ predicate_schemas = defaultdict(lambda: None) - merged_edges_file = storage.merge_path("edges.jsonl") + merged_edges_file = storage.merged_objects("edges", input_data_path) """ Infer predicate schemas. """ for edge in storage.json_line_iter(merged_edges_file): predicate = edge['predicate'] @@ -378,7 +370,7 @@ def create_edges_schema(self): previous_type = predicate_schemas[predicate][k] predicate_schemas[predicate][k] = compare_types( previous_type, current_type) - self.write_schema(predicate_schemas, SchemaType.PREDICATE) + self.write_schema(predicate_schemas, SchemaType.PREDICATE, output_path=output_data_path) def create_schema (self): """Determine the schema of each type of object. @@ -403,31 +395,40 @@ def schema_up_to_date (self): f"{SchemaType.PREDICATE.value}-schema.json") ]) - def write_schema(self, schema, schema_type: SchemaType): + def write_schema(self, schema, schema_type: SchemaType ,output_path=None): """ Output the schema file. 
:param schema: Schema to get keys from. :param schema_type: Type of schema to write. """ - file_name = storage.schema_path (f"{schema_type.value}-schema.json") + file_name = storage.schema_path (f"{schema_type.value}-schema.json", output_path) log.info("writing schema: %s", file_name) dictionary = { k : v for k, v in schema.items () } storage.write_object (dictionary, file_name) - def merge(self): + def merge(self, input_path=None, output_path=None): """ This version uses the disk merging from the kg_utils module """ - data_set_version = self.config.get('kgx', {}).get('dataset_version') + metrics = {} start = time.time() - json_format_files = storage.kgx_objects("json") - jsonl_format_files = storage.kgx_objects("jsonl") + + log.info(f"Input path = {input_path}, Output path = {output_path}") + + if input_path: + json_format_files = storage.kgx_objects("json", input_path) + jsonl_format_files = storage.kgx_objects("jsonl", input_path) + else: + json_format_files = storage.kgx_objects("json") + jsonl_format_files = storage.kgx_objects("jsonl") # Create lists of the nodes and edges files in both json and jsonl # formats jsonl_node_files = {file for file in jsonl_format_files - if "node" in file} + if "node" in file.split('/')[-1]} jsonl_edge_files = {file for file in jsonl_format_files - if "edge" in file} + if "edge" in file.split('/')[-1]} + log.info(f"Jsonl edge files : {jsonl_edge_files}") + log.info(f"Jsonl node files : {jsonl_node_files}") # Create all the needed iterators and sets thereof jsonl_node_iterators = [storage.jsonl_iter(file_name) @@ -449,13 +450,16 @@ def merge(self): self.merger.merge_nodes(node_iterators) merged_nodes = self.merger.get_merged_nodes_jsonl() + self.merger.merge_edges(edge_iterators) merged_edges = self.merger.get_merged_edges_jsonl() write_merge_metric = {} t = time.time() start_nodes_jsonl = time.time() - nodes_file_path = storage.merge_path("nodes.jsonl") + + + nodes_file_path = storage.merge_path("nodes.jsonl", output_path) # stream out nodes to nodes.jsonl file with open(nodes_file_path, 'w') as stream: @@ -468,7 +472,7 @@ def merge(self): start_edge_jsonl = time.time() # stream out edges to edges.jsonl file - edges_file_path = storage.merge_path("edges.jsonl") + edges_file_path = storage.merge_path("edges.jsonl", output_path) with open(edges_file_path, 'w') as stream: for edges in merged_edges: edges = json.loads(edges) @@ -496,3 +500,4 @@ def merge(self): if self.enable_metrics: metricsfile_path = storage.metrics_path('merge_metrics.yaml') storage.write_object(metrics, metricsfile_path) + diff --git a/dags/roger/pipelines/README.md b/dags/roger/pipelines/README.md new file mode 100644 index 00000000..e77e6a29 --- /dev/null +++ b/dags/roger/pipelines/README.md @@ -0,0 +1,99 @@ +# Building custom Dug data pipelines + +The pipelines submodule is where data pipelines can be defined for specific data +sets with specific, custom behaviors for each one. In previous versions of the +code, customizations for each pipeline were spread across several modules. With +this instantiation, the customizations for each data set pipeline are +consolidated into a single overridden subclass of the DataPipeline class. + +## What the base pipeline does + +The function `roger.tasks.create_pipeline_taskgroup`, when called with the given +data pipeline class, will emit an Airflow task group with the following +structure. 
If Airflow is not being used, another executor should use a similarly
+structured set of calls and dependencies to ensure that the task pipeline
+executes fully and in order.
+
+```mermaid
+graph TD;
+    annotate-->index_variables;
+    annotate-->validate_index_variables;
+    index_variables-->validate_index_variables;
+    annotate-->make_kg_tagged;
+    annotate-->crawl_tranql;
+    annotate-->index_concepts;
+    crawl_tranql-->validate_index_concepts;
+    index_concepts-->validate_index_concepts;
+    annotate-->validate_index_concepts;
+```
+The pipeline steps are briefly described below.
+
+### annotate
+
+By default, `annotate` will call the `get_objects` method to collect a list of
+parsable files. For each of these files, a Dug Crawler object will be created
+which will apply the parser returned by the pipeline class's `get_parser_name`
+method. (This by default will return `parser_name` if it's defined, or will fall
+back to `pipeline_name`.) The results will be written to `elements.json` and
+`concepts.json` as appropriate.
+
+### index_variables
+
+This will load the `elements.json` files from `annotate` and pass them to the
+indexer built from a DugFactory object. (This is sending them to ElasticSearch
+for indexing under the hood.)
+
+### make_kg_tagged
+
+All `elements.json` files will be loaded, and based on the annotations, a
+Translator-compliant knowledge graph will be written to a `_kgx.json` file.
+
+### index_concepts
+
+The `concepts.json` files are read and submitted to ElasticSearch using the
+indexer object derived from the embedded DugFactory object.
+
+### validate_index_concepts
+
+Concepts from `concepts.json` are double-checked to ensure that the ES indexing
+process actually worked.
+
+## Defining a basic pipeline, with no customizations
+
+Simple pipelines, such as that for the BACPAC dataset, need very little
+customization. All pipelines must define a `pipeline_name`, which will be used
+as the default value for a number of other parameters if they are not
+defined. In the case of BACPAC, a difference in case means that both the
+`pipeline_name` and the `parser_name` need to be defined.
+
+```python
+from roger.pipelines import DugPipeline
+
+class BacPacPipeline(DugPipeline):
+    "Pipeline for BACPAC data set"
+    pipeline_name = "bacpac"
+    parser_name = "BACPAC"
+```
+
+This is the full extent of the code needed to adapt the DugPipeline object to
+BACPAC. Other data sets have more specific customizations that need more custom
+code or variables defined.
+
+## More extensive customization
+
+Because the base pipeline (defined in `roger/pipelines/base.py:DugPipeline`) is
+inherited as a subclass for customizing, effectively any part of the pipeline
+that isn't part of Dug proper can be overridden. Here are some common
+customizations that are expected to be necessary for many parts of the process:
+
+### get_objects
+
+The `get_objects` method by default looks in the `input_data_path` that is
+passed to it, and if that is None, loads the default from the `ROGER_DATA_DIR`
+environment variable. By default, it reads all files with the `.xml` extension
+recursively anywhere in that directory or its subdirectories.
+
+One example customization is the anvil data pipeline, which additionally
+excludes any file that starts with 'GapExchange_'. Any overridden method should
+accept an optional `input_data_path` parameter and return a list of files,
+sorted in the order that they should be processed.
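For illustration, here is a minimal sketch of such an override. It closely follows the `dags/roger/pipelines/anvil.py` module added in this change, except that it looks up the files directory through the base class's `get_files_dir()` accessor rather than assuming a `files_dir` attribute has been set:

```python
from roger.core import storage
from roger.pipelines import DugPipeline


class AnvilPipeline(DugPipeline):
    "Pipeline for the AnVIL data set"
    pipeline_name = 'anvil'
    parser_name = 'Anvil'

    def get_objects(self, input_data_path=None):
        # Fall back to this data set's default input directory when the task
        # does not pass an explicit path.
        if not input_data_path:
            input_data_path = storage.dug_input_files_path(
                self.get_files_dir())
        # Keep only .xml files and skip the GapExchange_ release metadata.
        files = storage.get_files_recursive(
            lambda file_name: (
                not file_name.startswith('GapExchange_')
                and file_name.endswith('.xml')),
            input_data_path)
        # Return plain string paths in a stable, sorted order.
        return sorted([str(f) for f in files])
```

Other data sets can reuse this shape, swapping in their own filename predicate while keeping the optional `input_data_path` parameter and the sorted list of string paths as the return value.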
diff --git a/dags/roger/pipelines/__init__.py b/dags/roger/pipelines/__init__.py new file mode 100644 index 00000000..d6664b8e --- /dev/null +++ b/dags/roger/pipelines/__init__.py @@ -0,0 +1,28 @@ +"Modules for individual datasets" + +import pkgutil +from pathlib import Path +import importlib + +from .base import DugPipeline + +def get_pipeline_classes(pipeline_names_dict): + """Return a list of all defined pipeline classes + """ + + base_path = Path(__file__).resolve().parent + + for (_, mod_name, _) in pkgutil.iter_modules([base_path]): + if mod_name == 'base': + continue + + # No need to actuall get the module symbol, once it's imported, it will + # show up below in __subclasses__. + importlib.import_module(f"{__name__}.{mod_name}") + pipeline_list = [] + + for subclass in DugPipeline.__subclasses__(): + if getattr(subclass, 'pipeline_name') and getattr(subclass, 'pipeline_name') in pipeline_names_dict.keys(): + subclass.input_version = pipeline_names_dict[getattr(subclass, 'pipeline_name')] + pipeline_list.append(subclass) + return pipeline_list diff --git a/dags/roger/pipelines/anvil.py b/dags/roger/pipelines/anvil.py new file mode 100644 index 00000000..baa82c05 --- /dev/null +++ b/dags/roger/pipelines/anvil.py @@ -0,0 +1,24 @@ +"Pipeline for anvil data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class AnvilPipeline(DugPipeline): + "Pipeline for Anvil data set" + pipeline_name = 'anvil' + parser_name = 'Anvil' + + def get_objects(self, input_data_path=None): + """Retrieve anvil objects + + This code is imported from roger.core.storage.dug_anvil_objects + """ + if not input_data_path: + input_data_path = storage.dug_input_files_path( + self.files_dir) + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/bacpac.py b/dags/roger/pipelines/bacpac.py new file mode 100644 index 00000000..495ba3b9 --- /dev/null +++ b/dags/roger/pipelines/bacpac.py @@ -0,0 +1,8 @@ +"Pipeline for BACPAC data" + +from roger.pipelines import DugPipeline + +class BacPacPipeline(DugPipeline): + "Pipeline for BACPAC data set" + pipeline_name = "bacpac" + parser_name = "BACPAC" diff --git a/dags/roger/pipelines/base.py b/dags/roger/pipelines/base.py new file mode 100644 index 00000000..2a1b605e --- /dev/null +++ b/dags/roger/pipelines/base.py @@ -0,0 +1,964 @@ +"Base class for implementing a dataset annotate, crawl, and index pipeline" + +import os +import asyncio +from io import StringIO +import logging +import re +import hashlib +import traceback +from functools import reduce +from pathlib import Path +import tarfile +from typing import Union +import jsonpickle + +import requests + +from dug.core import get_parser, get_annotator, get_plugin_manager, DugConcept +from dug.core.concept_expander import ConceptExpander +from dug.core.crawler import Crawler +from dug.core.factory import DugFactory +from dug.core.parsers import Parser, DugElement +from dug.core.annotators import Annotator +from dug.core.async_search import Search +from dug.core.index import Index + +from roger.config import RogerConfig +from roger.core import storage +from roger.models.biolink import BiolinkModel +from roger.logger import get_logger + +from utils.s3_utils import S3Utils + +log = get_logger() + +class PipelineException(Exception): + "Exception raised from DugPipeline and related classes" + +def make_edge(subj, + obj, + 
predicate='biolink:related_to', + predicate_label='related to', + relation='biolink:related_to', + relation_label='related to' + ): + """Create an edge between two nodes. + + :param subj: The identifier of the subject. + :param pred: The predicate linking the subject and object. + :param obj: The object of the relation. + :param predicate: Biolink compatible edge type. + :param predicate_label: Edge label. + :param relation: Ontological edge type. + :param relation_label: Ontological edge type label. + :returns: Returns and edge. + """ + edge_id = hashlib.md5( + f'{subj}{predicate}{obj}'.encode('utf-8')).hexdigest() + return { + "subject": subj, + "predicate": predicate, + "predicate_label": predicate_label, + "id": edge_id, + "relation": relation, + "relation_label": relation_label, + "object": obj, + "provided_by": "renci.bdc.semanticsearch.annotator" + } + +class FileFetcher: + """A basic remote file fetcher class + """ + + def __init__( + self, + remote_host: str, + remote_dir: Union[str, Path], + local_dir: Union[str, Path] = "." + ): + self.remote_host = remote_host + if isinstance(remote_dir, str): + self.remote_dir = remote_dir.rstrip("/") + else: + self.remote_dir = str(remote_dir.as_posix()) + self.local_dir = Path(local_dir).resolve() + + def __call__(self, remote_file_path: Union[str, Path]) -> Path: + remote_path = self.remote_dir + "/" + remote_file_path + local_path = self.local_dir / remote_file_path + url = f"{self.remote_host}{remote_path}" + log.debug("Fetching %s", url) + try: + response = requests.get(url, allow_redirects=True, timeout=60) + except Exception as e: + log.error("Unexpected %s: %s", e.__class__.__name__, str(e)) + raise RuntimeError(f"Unable to fetch {url}") from e + + log.debug("Response: %d", response.status_code) + if response.status_code != 200: + log.debug("Unable to fetch %s: %d", url, response.status_code) + raise RuntimeError(f"Unable to fetch {url}") + + with local_path.open('wb') as file_obj: + file_obj.write(response.content) + return local_path + +class DugPipeline(): + "Base class for dataset pipelines" + + pipeline_name = None + unzip_source = True + input_version = "" + + def __init__(self, config: RogerConfig, to_string=False): + "Set instance variables and check to make sure we're overriden" + if not self.pipeline_name: + raise PipelineException( + "Subclass must at least define pipeline_name as class var") + self.config = config + self.bl_toolkit = BiolinkModel() + dug_conf = config.to_dug_conf() + self.element_mapping = config.indexing.element_mapping + self.factory = DugFactory(dug_conf) + self.cached_session = self.factory.build_http_session() + self.event_loop = asyncio.new_event_loop() + self.log_stream = StringIO() + if to_string: + self.string_handler = logging.StreamHandler(self.log_stream) + log.addHandler(self.string_handler) + self.s3_utils = S3Utils(self.config.s3_config) + + self.tranqlizer: ConceptExpander = self.factory.build_tranqlizer() + + graph_name = self.config["redisgraph"]["graph"] + source = f"redis:{graph_name}" + self.tranql_queries: dict = self.factory.build_tranql_queries(source) + self.node_to_element_queries: list = ( + self.factory.build_element_extraction_parameters(source)) + + indexing_config = config.indexing + self.variables_index = indexing_config.get('variables_index') + self.concepts_index = indexing_config.get('concepts_index') + self.kg_index = indexing_config.get('kg_index') + + self.search_obj: Search = self.factory.build_search_obj([ + self.variables_index, + self.concepts_index, + 
self.kg_index, + ]) + self.index_obj: Index = self.factory.build_indexer_obj([ + self.variables_index, + self.concepts_index, + self.kg_index, + + ]) + + def __enter__(self): + self.event_loop = asyncio.new_event_loop() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # close elastic search connection + self.event_loop.run_until_complete(self.search_obj.es.close()) + # close async loop + if self.event_loop.is_running() and not self.event_loop.is_closed(): + self.event_loop.close() + if exc_type or exc_val or exc_tb: + traceback.print_exc() + log.error("%s %s %s", exc_val, exc_val, exc_tb) + log.exception("Got an exception") + + def get_data_format(self): + """Access method for data_format parameter + + Defaults to pipeline_name unless self.data_format is set. This method + can also be overriden + """ + return getattr(self, 'data_format', self.pipeline_name) + + def get_files_dir(self): + """Access method for files_dir parameter + + Defaults to pipeline_name unless self.files_dir is set. This method can + also be overriden. + """ + return getattr(self, 'files_dir', self.pipeline_name) + + def get_parser_name(self): + """Access method for parser_name + + Defaults to pipeline_name unless self.parser_name is set. This method + can also be overriden. + """ + return getattr(self, 'parser_name', self.pipeline_name) + + def get_annotator_name(self): + """ + Access method for annotator_name + Defaults to annotator_monarch unless specified using annotation.annotator_type in the configuration file. + """ + return self.config.annotation.annotator_type + + + def get_parser(self): + dug_plugin_manager = get_plugin_manager() + parser: Parser = get_parser(dug_plugin_manager.hook, + self.get_parser_name()) + return parser + + def get_annotator(self): + dug_plugin_manager = get_plugin_manager() + annotator: Annotator = get_annotator( + dug_plugin_manager.hook, + self.get_annotator_name(), + self.config.to_dug_conf() + ) + return annotator + + def annotate_files(self, parsable_files, output_data_path=None): + """ + Annotates a Data element file using a Dug parser. + :param parser_name: Name of Dug parser to use. + :param parsable_files: Files to parse. + :return: None. + """ + if not output_data_path: + output_data_path = storage.dug_annotation_path('') + log.info("Parsing files") + for _, parse_file in enumerate(parsable_files): + log.debug("Creating Dug Crawler object on parse_file %s at %d of %d", + parse_file, _ , len(parsable_files)) + parser = self.get_parser() + annotator = self.get_annotator() + crawler = Crawler( + crawl_file=parse_file, + parser=parser, + annotator=annotator, + tranqlizer='', + tranql_queries=[], + http_session=self.cached_session + ) + + # configure output space. + current_file_name = '.'.join( + os.path.basename(parse_file).split('.')[:-1]) + elements_file_path = os.path.join( + output_data_path, current_file_name) + elements_file = os.path.join(elements_file_path, 'elements.txt') + concepts_file = os.path.join(elements_file_path, 'concepts.txt') + # This is a file that the crawler will later populate. We start here + # by creating an empty elements file. + # This also creates output dir if it doesn't exist. + # elements_json = os.path.join(elements_file_path, + # 'element_file.json') + # log.debug("Creating empty file: %s", elements_json) + # storage.write_object({}, elements_json) + + # Use the specified parser to parse the parse_file into elements. 
+ log.debug("Parser is %s", str(parser)) + elements = parser(parse_file) + log.debug("Parsed elements: %s", str(elements)) + + # This inserts the list of elements into the crawler where + # annotate_elements expects to find it. Maybe in some future version + # of Dug this could be a parameter instead of an attribute? + crawler.elements = elements + + # @TODO propose for Dug to make this a crawler class init param(??) + crawler.crawlspace = elements_file_path + log.debug("Crawler annotator: %s", str(crawler.annotator)) + crawler.annotate_elements() + + # Extract out the concepts gotten out of annotation + # Extract out the elements + non_expanded_concepts = crawler.concepts + # The elements object will have been modified by annotate_elements, + # so we want to make sure to catch those modifications. + elements = crawler.elements + + # Write pickles of objects to file + log.info("Parsed and annotated: %s", parse_file) + + storage.write_object(jsonpickle.encode(elements, indent=2), elements_file) + log.info("Serialized annotated elements to : %s", elements_file) + + storage.write_object(jsonpickle.encode(non_expanded_concepts, indent=2), concepts_file) + log.info("Serialized annotated concepts to : %s", concepts_file) + + def convert_to_kgx_json(self, elements, written_nodes=None): + """ + Given an annotated and normalized set of study variables, + generate a KGX compliant graph given the normalized annotations. + Write that grpah to a graph database. + See BioLink Model for category descriptions. + https://biolink.github.io/biolink-model/notes.html + """ + if written_nodes is None: + written_nodes = set() + graph = { + "nodes": [], + "edges": [] + } + edges = graph['edges'] + nodes = graph['nodes'] + + for _, element in enumerate(elements): + # DugElement means a variable (Study variable...) + if not isinstance(element, DugElement): + continue + study_id = element.collection_id + if study_id not in written_nodes: + nodes.append({ + "id": study_id, + "category": ["biolink:Study"], + "name": study_id + }) + written_nodes.add(study_id) + + # connect the study and the variable. + edges.append(make_edge( + subj=element.id, + relation_label='part of', + relation='BFO:0000050', + obj=study_id, + predicate='biolink:part_of', + predicate_label='part of')) + edges.append(make_edge( + subj=study_id, + relation_label='has part', + relation="BFO:0000051", + obj=element.id, + predicate='biolink:has_part', + predicate_label='has part')) + + # a node for the variable. Should be BL compatible + variable_node = { + "id": element.id, + "name": element.name, + "category": ["biolink:StudyVariable"], + # bulk loader parsing issue + "description": ( + element.description.replace("'", '`').replace('\n', ' ')) + } + if element.id not in written_nodes: + nodes.append(variable_node) + written_nodes.add(element.id) + + for identifier, metadata in element.concepts.items(): + identifier_object = metadata.identifiers.get(identifier) + # This logic is treating DBGap files. + # First item in current DBGap xml files is a topmed tag, + # This is treated as a DugConcept Object. But since its not + # a concept we get from annotation (?) its never added to + # variable.concepts.items (Where variable is a DugElement obj) + # The following logic is trying to extract types, and for the + # aformentioned topmed tag it adds + # `biolink:InfomrmationContentEntity` + # Maybe a better solution could be adding types on + # DugConcept objects + # More specifically Biolink compatible types (?) 
+ # + if identifier_object: + category = identifier_object.types + elif identifier.startswith("TOPMED.TAG:"): + category = ["biolink:InformationContentEntity"] + else: + continue + if identifier not in written_nodes: + if isinstance(category, str): + bl_element = self.bl_toolkit.toolkit.get_element( + category) + category = [bl_element.class_uri or bl_element.slot_uri] + nodes.append({ + "id": identifier, + "category": category, + "name": metadata.name + }) + written_nodes.add(identifier) + # related to edge + edges.append(make_edge( + subj=element.id, + obj=identifier + )) + # related to edge + edges.append(make_edge( + subj=identifier, + obj=element.id)) + return graph + + def make_tagged_kg(self, elements): + """ Make a Translator standard knowledge graph representing + tagged study variables. + :param variables: The variables to model. + :param tags: The tags characterizing the variables. + :returns: dict with nodes and edges modeling a Translator/Biolink KG. + """ + graph = { + "nodes": [], + "edges": [] + } + edges = graph['edges'] + nodes = graph['nodes'] + + # Create graph elements to model tags and their + # links to identifiers gathered by semantic tagging + tag_map = {} + # @TODO extract this into config or maybe dug ?? + topmed_tag_concept_type = "TOPMed Phenotype Concept" + nodes_written = set() + for tag in elements: + if not (isinstance(tag, DugConcept) + and tag.type == topmed_tag_concept_type): + continue + tag_id = tag.id + tag_map[tag_id] = tag + nodes.append({ + "id": tag_id, + "name": tag.name, + "description": tag.description.replace("'", "`"), + "category": ["biolink:InformationContentEntity"] + }) + + # Link ontology identifiers we've found for this tag via nlp. + for identifier, metadata in tag.identifiers.items(): + if isinstance(metadata.types, str): + bl_element = self.bl_toolkit.toolkit.get_element( + metadata.types) + category = [bl_element.class_uri or bl_element.slot_uri] + else: + category = metadata.types + synonyms = metadata.synonyms if metadata.synonyms else [] + nodes.append({ + "id": identifier, + "name": metadata.label, + "category": category, + "synonyms": synonyms + }) + nodes_written.add(identifier) + edges.append(make_edge( + subj=tag_id, + obj=identifier)) + edges.append(make_edge( + subj=identifier, + obj=tag_id)) + + concepts_graph = self.convert_to_kgx_json(elements, + written_nodes=nodes_written) + graph['nodes'] += concepts_graph['nodes'] + graph['edges'] += concepts_graph['edges'] + + return graph + + def index_elements(self, elements_file): + "Submit elements_file to ElasticSearch for indexing " + log.info("Indexing %s...", str(elements_file)) + elements =jsonpickle.decode(storage.read_object(elements_file)) + count = 0 + total = len(elements) + # Index Annotated Elements + log.info("found %d from elements files.", len(elements)) + for element in elements: + count += 1 + # Only index DugElements as concepts will be + # indexed differently in next step + if not isinstance(element, DugConcept): + # override data-type with mapping values + if element.type.lower() in self.element_mapping: + element.type = self.element_mapping[element.type.lower()] + if not element.id: + # no id no indexing + continue + # Use the Dug Index object to submit the element to ES + self.index_obj.index_element( + element, index=self.variables_index) + percent_complete = (count / total) * 100 + if percent_complete % 10 == 0: + log.info("%d %%", percent_complete) + log.info("Done indexing %s.", elements_file) + + def validate_indexed_element_file(self, elements_file): 
+ "After submitting elements for indexing, verify that they're available" + elements = [x for x in jsonpickle.decode(storage.read_object(elements_file)) + if not isinstance(x, DugConcept)] + # Pick ~ 10 % + sample_size = int(len(elements) * 0.1) + + # random.choices(elements, k=sample_size) + test_elements = elements[:sample_size] + log.info("Picked %d from %s for validation.", len(test_elements), + elements_file) + for element in test_elements: + # Pick a concept + concepts = [element.concepts[curie] for curie in element.concepts + if element.concepts[curie].name] + + if len(concepts): + # Pick the first concept + concept = concepts[0] + curie = concept.id + search_term = re.sub(r'[^a-zA-Z0-9_\ ]+', '', concept.name) + log.debug("Searching for Concept: %s and Search term: %s", + str(curie), search_term) + all_elements_ids = self._search_elements(curie, search_term) + present = element.id in all_elements_ids + if not present: + log.error("Did not find expected variable %s in search " + "result.", str(element.id)) + log.error("Concept id : %s, Search term: %s", + str(concept.id), search_term) + raise PipelineException( + f"Validation exception - did not find variable " + f"{element.id} from {str(elements_file)}" + f"when searching variable index with Concept ID : " + f"{concept.id} using Search Term : {search_term} ") + else: + log.info( + "%s has no concepts annotated. Skipping validation for it.", + str(element.id)) + + def _search_elements(self, curie, search_term): + "Asynchronously call a search on the curie and search term" + response = self.event_loop.run_until_complete(self.search_obj.search_vars_unscored( + concept=curie, + query=search_term + )) + ids_dict = [] + if 'total_items' in response: + if response['total_items'] == 0: + log.error(f"No search elements returned for variable search: {self.variables_index}.") + log.error(f"Concept id : {curie}, Search term: {search_term}") + raise Exception(f"Validation error - Did not find {curie} for" + f"Search term: {search_term}") + else: + del response['total_items'] + for element_type in response: + all_elements_ids = [e['id'] for e in + reduce(lambda x, y: x + y['elements'], response[element_type], [])] + ids_dict += all_elements_ids + return ids_dict + + def crawl_concepts(self, concepts, data_set_name, output_path=None): + """Adds tranql KG to Concepts + + Terms grabbed from KG are also added as search terms + :param concepts: + :param data_set_name: + :return: + """ + # TODO crawl dir seems to be storaing crawling info to avoid re-crawling, but is that consting us much? , it was when tranql was slow, but + # might right to consider getting rid of it. 
+ crawl_dir = storage.dug_crawl_path('crawl_output') + output_file_name = os.path.join(data_set_name, + 'expanded_concepts.txt') + extracted_dug_elements_file_name = os.path.join(data_set_name, + 'extracted_graph_elements.txt') + if not output_path: + output_file = storage.dug_expanded_concepts_path(output_file_name) + extracted_output_file = storage.dug_expanded_concepts_path( + extracted_dug_elements_file_name + ) + else: + output_file = os.path.join(output_path, output_file_name) + extracted_output_file = os.path.join( output_path, extracted_dug_elements_file_name) + + Path(crawl_dir).mkdir(parents=True, exist_ok=True) + extracted_dug_elements = [] + log.debug("Creating Dug Crawler object") + crawler = Crawler( + crawl_file="", + parser=None, + annotator=None, + tranqlizer=self.tranqlizer, + tranql_queries=self.tranql_queries, + http_session=self.cached_session, + ) + crawler.crawlspace = crawl_dir + counter = 0 + total = len(concepts) + for concept in concepts.values(): + counter += 1 + try: + crawler.expand_concept(concept) + concept.set_search_terms() + concept.set_optional_terms() + except Exception as e: + log.error(concept) + raise e + for query in self.node_to_element_queries: + log.info(query) + casting_config = query['casting_config'] + tranql_source = query['tranql_source'] + dug_element_type = query['output_dug_type'] + extracted_dug_elements += crawler.expand_to_dug_element( + concept=concept, + casting_config=casting_config, + dug_element_type=dug_element_type, + tranql_source=tranql_source + ) + concept.clean() + percent_complete = int((counter / total) * 100) + if percent_complete % 10 == 0: + log.info("%d%%", percent_complete) + log.info("Crawling %s done", data_set_name) + storage.write_object(obj=jsonpickle.encode(concepts, indent=2), path=output_file) + log.info ("Concepts serialized to %s", output_file) + storage.write_object(obj=jsonpickle.encode(extracted_dug_elements, indent=2), + path=extracted_output_file) + log.info("Extracted elements serialized to %s", extracted_output_file) + + def _index_concepts(self, concepts): + "Submit concepts to ElasticSearch for indexing" + log.info("Indexing Concepts") + total = len(concepts) + count = 0 + for concept_id, concept in concepts.items(): + count += 1 + self.index_obj.index_concept(concept, index=self.concepts_index) + # Index knowledge graph answers for each concept + for kg_answer_id, kg_answer in concept.kg_answers.items(): + self.index_obj.index_kg_answer( + concept_id=concept_id, + kg_answer=kg_answer, + index=self.kg_index, + id_suffix=kg_answer_id + ) + percent_complete = int((count / total) * 100) + if percent_complete % 10 == 0: + log.info("%s %%", percent_complete) + log.info("Done Indexing concepts") + + def _validate_indexed_concepts(self, elements, concepts): + """ + Validates linked concepts are searchable + :param elements: Annotated dug elements + :param concepts: Crawled (expanded) concepts + :return: + """ + # 1 . Find concepts with KG <= 10% of all concepts, + # <= because we might have no results for some concepts from tranql + sample_concepts = {key: value for key, value + in concepts.items() if value.kg_answers} + if len(concepts) == 0: + log.info("No Concepts found.") + return + log.info("Found only %d Concepts with Knowledge graph out of %d. %d%%", + len(sample_concepts), len(concepts), + (len(sample_concepts) / len(concepts)) * 100) + # 2. 
pick elements that have concepts in the sample concepts set + sample_elements = {} + for element in elements: + if isinstance(element, DugConcept): + continue + for concept in element.concepts: + # add elements that have kg + if concept in sample_concepts: + sample_elements[concept] = sample_elements.get( + concept, set()) + sample_elements[concept].add(element.id) + + # Time for some validation + for curie in concepts: + concept = concepts[curie] + if not concept.kg_answers: + continue + search_terms = [] + for key in concept.kg_answers: + kg_object = concept.kg_answers[key] + search_terms += kg_object.get_node_names() + search_terms += kg_object.get_node_synonyms() + # reduce(lambda x,y: x + y, [[node.get("name")] + # + node.get("synonyms", []) + # for node in concept.kg_answers[ + # "knowledge_graph"]["nodes"]], []) + # validation here is that for any of these nodes we should get back + # the variable. + # make unique + search_terms_cap = 10 + search_terms = list(set(search_terms))[:search_terms_cap] + log.debug("Using %d Search terms for concept %s", len(search_terms), + str(curie)) + for search_term in search_terms: + # avoids elastic failure due to some reserved characters + # 'search_phase_execution_exception', + # 'token_mgr_error: Lexical error ... + search_term = re.sub(r'[^a-zA-Z0-9_\ ]+', '', search_term) + + searched_element_ids = self._search_elements(curie, search_term) + + if curie not in sample_elements: + log.error("Did not find Curie id %s in Elements.", + str(curie)) + log.error("Concept id : %s, Search term: %s", + str(concept.id), search_term) + raise PipelineException( + f"Validation error - Did not find {curie} for " + f"Concept id : {concept.id}, " + f"Search term: {search_term}") + + present = bool([x for x in sample_elements[curie] + if x in searched_element_ids]) + if not present: + log.error("Did not find expected variable %s " + "in search result.", + str(curie)) + log.error("Concept id : %s, Search term: %s", + str(concept.id), search_term) + raise PipelineException( + f"Validation error - Did not find {curie} for" + f" Concept id : {concept.id}, " + f"Search term: {search_term}") + + def clear_index(self, index_id): + "Delete the index specified by index_id from ES" + exists = self.event_loop.run_until_complete(self.search_obj.es.indices.exists(index=index_id)) + if exists: + log.info("Deleting index %s", str(index_id)) + response = self.event_loop.run_until_complete( + self.search_obj.es.indices.delete(index=index_id)) + log.info("Cleared Elastic : %s", str(response)) + log.info("Re-initializing the indicies") + self.index_obj.init_indices() + + def clear_variables_index(self): + "Delete the variables index from ES" + self.clear_index(self.variables_index) + + def clear_kg_index(self): + "Delete the KG index from ES" + self.clear_index(self.kg_index) + + def clear_concepts_index(self): + "Delete the concepts index from ES" + self.clear_index(self.concepts_index) + + #### + # Methods above this are directly from what used to be + # dug_helpers.dug_utils.Dug. Methods below are consolidated from what used + # to be dug_helpers.dug_utils.DugUtil. These are intented to be the "top + # level" interface to Roger, which Airflow DAGs or other orchestrators can + # call directly. 
+ + def _fetch_s3_file(self, filename, output_dir): + "Fetch a file from s3 to output_dir" + log.info("Fetching %s", filename) + output_name = filename.split('/')[-1] + output_path = output_dir / output_name + self.s3_utils.get( + str(filename), + str(output_path), + ) + if self.unzip_source: + log.info("Unzipping %s", str(output_path)) + with tarfile.open(str(output_path)) as tar: + tar.extractall(path=output_dir) + return output_path + + def _fetch_remote_file(self, filename, output_dir, current_version): + "Fetch a file from a location using FileFetcher" + log.info("Fetching %s", filename) + # fetch from stars + remote_host = self.config.annotation_base_data_uri + fetch = FileFetcher( + remote_host=remote_host, + remote_dir=current_version, + local_dir=output_dir) + output_path = fetch(filename) + if self.unzip_source: + log.info("Unzipping %s", str(output_path)) + with tarfile.open(str(output_path)) as tar: + tar.extractall(path=output_dir) + return output_path + + def get_versioned_files(self): + """ Fetches a dug input data files to input file directory + """ + meta_data = storage.read_relative_object("../../metadata.yaml") + output_dir: Path = storage.dug_input_files_path( + self.get_files_dir()) + data_store = self.config.dug_inputs.data_source + + # clear dir + storage.clear_dir(output_dir) + data_sets = self.config.dug_inputs.data_sets + log.info("dataset: %s", data_sets) + pulled_files = [] + for data_set in data_sets: + data_set_name, current_version = data_set.split(':') + for item in meta_data["dug_inputs"]["versions"]: + if (item["version"] == current_version and + item["name"] == data_set_name and + item["format"] == self.get_data_format()): + if data_store == "s3": + for filename in item["files"]["s3"]: + pulled_files.append( + self._fetch_s3_file(filename, output_dir)) + else: + for filename in item["files"]["stars"]: + pulled_files.append( + self.fetch_remote_file(filename, output_dir, + current_version)) + return [str(filename) for filename in pulled_files] + + def get_objects(self, input_data_path=None): + """Retrieve initial source objects for parsing + + This is a default method that will be overridden by subclasses + frequently, it is expected. + """ + if not input_data_path: + input_data_path = storage.dug_input_files_path( + self.get_files_dir()) + files = storage.get_files_recursive( + lambda file_name: file_name.endswith('.xml'), + input_data_path) + return sorted([str(f) for f in files]) + + def annotate(self, to_string=False, files=None, input_data_path=None, + output_data_path=None): + "Annotate files with the appropriate parsers and crawlers" + if files is None: + files = self.get_objects(input_data_path=input_data_path) + self.annotate_files(parsable_files=files, + output_data_path=output_data_path) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def index_variables(self, to_string=False, element_object_files=None, + input_data_path=None, output_data_path=None): + """Index variables from element object files for pipeline + + if element_object_files is specified, only those files are + indexed. Otherwise, if the input_data_path is supplied, elements files + under that path are indexed. If neither is supplied, the default + directory is searched for index files and those are indexed. 
+ """ + # self.clear_variables_index() + if element_object_files is None: + element_object_files = storage.dug_elements_objects(input_data_path,format='txt') + for file_ in element_object_files: + self.index_elements(file_) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def validate_indexed_variables(self, to_string=None, + element_object_files=None, + input_data_path=None, + output_data_path=None): + "Validate output from index variables task for pipeline" + if not element_object_files: + element_object_files = storage.dug_elements_objects(input_data_path, format='txt') + for file_ in element_object_files: + log.info("Validating %s", str(file_)) + self.validate_indexed_element_file(file_) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def validate_indexed_concepts(self, config=None, to_string=None, input_data_path=None, output_data_path=None): + """ + Entry for validate concepts + """ + get_data_set_name = lambda file: os.path.split(os.path.dirname(file))[-1] + expanded_concepts_files_dict = { + get_data_set_name(file): file for file in storage.dug_expanded_concept_objects(data_path=input_data_path, format='txt') + } + annotated_elements_files_dict = { + get_data_set_name(file): file for file in storage.dug_elements_objects(data_path=input_data_path, format='txt') + } + try: + assert len(expanded_concepts_files_dict) == len(annotated_elements_files_dict) + except: + log.error("Files Annotated Elements files and Expanded concepts files, should be pairs") + if len(expanded_concepts_files_dict) > len(annotated_elements_files_dict): + log.error("Some Annotated Elements files (from load_and_annotate task) are missing") + else: + log.error("Some Expanded Concepts files (from crawl task) are missing") + log.error(f"Annotated Datasets : {list(annotated_elements_files_dict.keys())}") + log.error(f"Expanded Concepts Datasets: {list(expanded_concepts_files_dict.keys())}") + exit(-1) + for data_set_name in annotated_elements_files_dict: + log.debug(f"Reading concepts and elements for dataset {data_set_name}") + elements_file_path = annotated_elements_files_dict[data_set_name] + concepts_file_path = expanded_concepts_files_dict[data_set_name] + dug_elements = jsonpickle.decode(storage.read_object(elements_file_path)) + dug_concepts = jsonpickle.decode(storage.read_object(concepts_file_path)) + log.debug(f"Read {len(dug_elements)} elements, and {len(dug_concepts)} Concepts") + log.info(f"Validating {data_set_name}") + self._validate_indexed_concepts(elements=dug_elements, concepts=dug_concepts) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def make_kg_tagged(self, to_string=False, elements_files=None, + input_data_path=None, output_data_path=None): + "Create tagged knowledge graphs from elements" + if not output_data_path: + output_data_path = storage.dug_kgx_path("") + storage.clear_dir(output_data_path) + log.info("Starting building KGX files") + + if not elements_files: + elements_files = storage.dug_elements_objects(input_data_path, format='txt') + log.info(f"found {len(elements_files)} files : {elements_files}") + for file_ in elements_files: + elements = jsonpickle.decode(storage.read_object(file_)) + if "topmed_" in file_: + kg = self.make_tagged_kg(elements) + else: + kg = self.convert_to_kgx_json(elements) + dug_base_file_name = file_.split(os.path.sep)[-2] + output_file_path = os.path.join(output_data_path, + dug_base_file_name + '_kgx.json') + storage.write_object(kg, 
output_file_path) + log.info("Wrote %d and %d edges, to %s", len(kg['nodes']), + len(kg['edges']), output_file_path) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def crawl_tranql(self, to_string=False, concept_files=None, + input_data_path=None, output_data_path=None): + "Perform the tranql crawl" + if not concept_files: + concept_files = storage.dug_concepts_objects(input_data_path, format='txt') + + if output_data_path: + crawl_dir = os.path.join(output_data_path, 'crawl_output') + expanded_concepts_dir = os.path.join(output_data_path, + 'expanded_concepts') + else: + crawl_dir = storage.dug_crawl_path('crawl_output') + expanded_concepts_dir = storage.dug_expanded_concepts_path("") + log.info("Clearing crawl output dir %s", crawl_dir) + storage.clear_dir(crawl_dir) + + log.info("Clearing expanded concepts dir: %s", expanded_concepts_dir) + storage.clear_dir(expanded_concepts_dir) + + log.info("Crawling Dug Concepts, found %d file(s).", + len(concept_files)) + for file_ in concept_files: + objects = storage.read_object(file_) + objects = objects or {} + if not objects: + log.info(f'no concepts in {file_}') + data_set = jsonpickle.decode(objects) + original_variables_dataset_name = os.path.split( + os.path.dirname(file_))[-1] + self.crawl_concepts(concepts=data_set, + data_set_name=original_variables_dataset_name, output_path= output_data_path) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def index_concepts(self, to_string=False, + input_data_path=None, output_data_path=None): + "Index concepts from expanded concept files" + # These are concepts that have knowledge graphs from tranql + # clear out concepts and kg indicies from previous runs + # self.clear_concepts_index() + # self.clear_kg_index() + expanded_concepts_files = storage.dug_expanded_concept_objects( + input_data_path, format="txt") + for file_ in expanded_concepts_files: + concepts = jsonpickle.decode(storage.read_object(file_)) + self._index_concepts(concepts=concepts) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log diff --git a/dags/roger/pipelines/bdc.py b/dags/roger/pipelines/bdc.py new file mode 100644 index 00000000..bc30cf44 --- /dev/null +++ b/dags/roger/pipelines/bdc.py @@ -0,0 +1,19 @@ +"Pipeline for BDC-dbGap data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class bdcPipeline(DugPipeline): + "Pipeline for BDC-dbGap data set" + pipeline_name = "bdc" + parser_name = "dbgap" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_dd_xml_path() + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('._') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/crdc.py b/dags/roger/pipelines/crdc.py new file mode 100644 index 00000000..2143cf7b --- /dev/null +++ b/dags/roger/pipelines/crdc.py @@ -0,0 +1,19 @@ +"Pipeline for Cancer Commons data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class CRDCPipeline(DugPipeline): + "Pipeline for Cancer Commons data set" + pipeline_name = "crdc" + parser_name = "crdc" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_crdc_path() + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + 
return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/ctn.py b/dags/roger/pipelines/ctn.py new file mode 100644 index 00000000..25918062 --- /dev/null +++ b/dags/roger/pipelines/ctn.py @@ -0,0 +1,10 @@ +"Pipeline for Clinical trials network data" + +from roger.pipelines import DugPipeline + +class CTNPipeline(DugPipeline): + "Pipeline for Clinical trials nework data set" + pipeline_name = "ctn" + parser_name = "ctn" + + diff --git a/dags/roger/pipelines/db_gap.py b/dags/roger/pipelines/db_gap.py new file mode 100644 index 00000000..7c1db504 --- /dev/null +++ b/dags/roger/pipelines/db_gap.py @@ -0,0 +1,10 @@ +"Dug pipeline for dbGaP data set" + +from roger.pipelines import DugPipeline + +class dbGaPPipeline(DugPipeline): + "Pipeline for the dbGaP data set" + + pipeline_name = 'dbGaP' + parser_name = 'DbGaP' + files_dir = 'db_gap' diff --git a/dags/roger/pipelines/heal_research_programs.py b/dags/roger/pipelines/heal_research_programs.py new file mode 100644 index 00000000..d95a4497 --- /dev/null +++ b/dags/roger/pipelines/heal_research_programs.py @@ -0,0 +1,16 @@ +"Pipeline for Heal-studies data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class HealResearchProgramPipeline(DugPipeline): + "Pipeline for Heal-research-programs data set" + pipeline_name = "heal-research-programs" + parser_name = "heal-research" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_heal_research_program_path() + files = storage.get_files_recursive(lambda file_name: file_name.endswith('.xml'), + input_data_path) + return sorted([str(f) for f in files]) \ No newline at end of file diff --git a/dags/roger/pipelines/heal_studies.py b/dags/roger/pipelines/heal_studies.py new file mode 100644 index 00000000..cf78c042 --- /dev/null +++ b/dags/roger/pipelines/heal_studies.py @@ -0,0 +1,16 @@ +"Pipeline for Heal-studies data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class HealStudiesPipeline(DugPipeline): + "Pipeline for Heal-studies data set" + pipeline_name = "heal-studies" + parser_name = "heal-studies" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_heal_study_path() + files = storage.get_files_recursive(lambda file_name: file_name.endswith('.xml'), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/kfdrc.py b/dags/roger/pipelines/kfdrc.py new file mode 100644 index 00000000..bcb0b7ac --- /dev/null +++ b/dags/roger/pipelines/kfdrc.py @@ -0,0 +1,19 @@ +"Pipeline for KDFRC data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class kfdrcPipeline(DugPipeline): + "Pipeline for KDFRC data set" + pipeline_name = "kfdrc" + parser_name = "kfdrc" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_kfdrc_path() + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/nida.py b/dags/roger/pipelines/nida.py new file mode 100644 index 00000000..b2e841bd --- /dev/null +++ b/dags/roger/pipelines/nida.py @@ -0,0 +1,18 @@ +"NIDA data set pipeline definition" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class NIDAPipeline(DugPipeline): + "NIDA data pipeline" + + pipeline_name = 'nida' + parser_name = 
'NIDA' + + def get_objects(self, input_data_path=None): + "Return list of NIDA source files" + if not input_data_path: + input_data_path = storage.dug_input_files_path( + self.get_files_dir()) + files = sorted(storage.get_files_recursive(lambda x: 'NIDA-' in x , input_data_path)) + return files diff --git a/dags/roger/pipelines/radx.py b/dags/roger/pipelines/radx.py new file mode 100644 index 00000000..32491fb0 --- /dev/null +++ b/dags/roger/pipelines/radx.py @@ -0,0 +1,8 @@ +"Pipeline for BACPAC data" + +from roger.pipelines import DugPipeline + +class RadxPipeline(DugPipeline): + "Pipeline for BACPAC data set" + pipeline_name = "radx" + parser_name = "radx" diff --git a/dags/roger/pipelines/sparc.py b/dags/roger/pipelines/sparc.py new file mode 100644 index 00000000..d1c9c950 --- /dev/null +++ b/dags/roger/pipelines/sparc.py @@ -0,0 +1,17 @@ +"Pipeline for Sparc data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class SparcPipeline(DugPipeline): + "Pipeline for Sparc data set" + pipeline_name = "sparc" + parser_name = "SciCrunch" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_heal_study_path() + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/topmed.py b/dags/roger/pipelines/topmed.py new file mode 100644 index 00000000..90b3e515 --- /dev/null +++ b/dags/roger/pipelines/topmed.py @@ -0,0 +1,41 @@ +"Pipeline for Topmed data" + +from roger.pipelines import DugPipeline +from roger.pipelines.base import log, os +import jsonpickle +from roger.core import storage +from roger.logger import logger +class TopmedPipeline(DugPipeline): + "Pipeline for Topmed data set" + pipeline_name = "topmed" + parser_name = "TOPMedTag" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = str(storage.dug_input_files_path('topmed')) + files =storage.get_files_recursive( + lambda file_name: file_name.endswith('.csv'), + input_data_path) + return sorted([str(x) for x in files]) + + def make_kg_tagged(self, to_string=False, elements_files=None, + input_data_path=None, output_data_path=None): + "Create tagged knowledge graphs from elements" + log.info("Override base.make_kg_tagged called") + if not output_data_path: + output_data_path = storage.dug_kgx_path("") + storage.clear_dir(output_data_path) + if not elements_files: + elements_files = storage.dug_elements_objects(input_data_path, format='txt') + for file_ in elements_files: + elements = jsonpickle.decode(storage.read_object(file_)) + kg = self.make_tagged_kg(elements) + dug_base_file_name = file_.split(os.path.sep)[-2] + output_file_path = os.path.join(output_data_path, + dug_base_file_name + '_kgx.json') + storage.write_object(kg, output_file_path) + log.info("Wrote %d and %d edges, to %s", len(kg['nodes']), + len(kg['edges']), output_file_path) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + diff --git a/dags/roger/tasks.py b/dags/roger/tasks.py index c2367e32..f5451140 100755 --- a/dags/roger/tasks.py +++ b/dags/roger/tasks.py @@ -1,10 +1,28 @@ +"Tasks and methods related to Airflow implementations of Roger" + import os from airflow.operators.python import PythonOperator +from airflow.operators.empty import EmptyOperator +from airflow.utils.task_group import TaskGroup from airflow.utils.dates import days_ago +from airflow.models import DAG +from airflow.models.dag import DagContext +from 
airflow.models.taskinstance import TaskInstance +from typing import Union +from pathlib import Path +import glob +import shutil -from roger.config import config +from roger.config import config, RogerConfig from roger.logger import get_logger +from roger.pipelines.base import DugPipeline +from avalon.mainoperations import put_files, LakeFsWrapper, get_files +from lakefs_sdk.configuration import Configuration +from lakefs_sdk.models.merge import Merge +from functools import partial + +logger = get_logger() default_args = { 'owner': 'RENCI', @@ -21,21 +39,35 @@ def task_wrapper(python_callable, **kwargs): """ # get dag config provided dag_run = kwargs.get('dag_run') - dag_conf = {} - logger = get_logger() - if dag_run: - dag_conf = dag_run.conf - # remove this since to send every other argument to the python callable. - del kwargs['dag_run'] + pass_conf = kwargs.get('pass_conf', True) + if config.lakefs_config.enabled: + # get input path + input_data_path = generate_dir_name_from_task_instance(kwargs['ti'], + roger_config=config, + suffix='input') + # get output path from task id run id dag id combo + output_data_path = generate_dir_name_from_task_instance(kwargs['ti'], + roger_config=config, + suffix='output') + else: + input_data_path, output_data_path = None, None + # cast it to a path object + func_args = { + 'input_data_path': input_data_path, + 'output_data_path': output_data_path, + 'to_string': kwargs.get('to_string') + } + logger.info(f"Task function args: {func_args}") # overrides values config.dag_run = dag_run - return python_callable(to_string=False, config=config) - + if pass_conf: + return python_callable(config=config, **func_args) + return python_callable(**func_args) def get_executor_config(data_path='/opt/airflow/share/data'): """ Get an executor configuration. :param annotations: Annotations to attach to the executor. - :returns: Returns a KubernetesExecutor if K8s is configured and None otherwise. + :returns: Returns a KubernetesExecutor if K8s configured, None otherwise. 
""" env_var_prefix = config.OS_VAR_PREFIX # based on environment set on scheduler pod, make secrets for worker pod @@ -70,28 +102,343 @@ def get_executor_config(data_path='/opt/airflow/share/data'): } return k8s_executor_config +def init_lakefs_client(config: RogerConfig) -> LakeFsWrapper: + configuration = Configuration() + configuration.username = config.lakefs_config.access_key_id + configuration.password = config.lakefs_config.secret_access_key + configuration.host = config.lakefs_config.host + the_lake = LakeFsWrapper(configuration=configuration) + return the_lake + + +def pagination_helper(page_fetcher, **kwargs): + """Helper function to iterate over paginated results""" + while True: + resp = page_fetcher(**kwargs) + yield from resp.results + if not resp.pagination.has_more: + break + kwargs['after'] = resp.pagination.next_offset + + +def avalon_commit_callback(context: DagContext, **kwargs): + client: LakeFsWrapper = init_lakefs_client(config=config) + # now files have been processed, + # this part should + # get the out path of the task + local_path = str(generate_dir_name_from_task_instance(context['ti'], + roger_config=config, + suffix='output')).rstrip('/') + '/' + task_id = context['ti'].task_id + dag_id = context['ti'].dag_id + run_id = context['ti'].run_id + # run id looks like 2023-10-18T17:35:14.890186+00:00 + # normalized to 2023_10_18T17_35_14_890186_00_00 + # since lakefs branch id must consist of letters, digits, underscores and dashes, + # and cannot start with a dash + run_id_normalized = run_id.replace('-','_').replace(':','_').replace('+','_').replace('.','_') + dag_id_normalized = dag_id.replace('-','_').replace(':','_').replace('+','_').replace('.','_') + task_id_normalized = task_id.replace('-','_').replace(':','_').replace('+','_').replace('.','_') + temp_branch_name = f'{dag_id_normalized}_{task_id_normalized}_{run_id_normalized}' + # remote path to upload the files to. + remote_path = f'{dag_id}/{task_id}/' + + # merge destination branch + branch = config.lakefs_config.branch + repo = config.lakefs_config.repo + # This part pushes to a temp branch on the repo + + # now we have the output path lets do some pushing but where ? + # right now lets stick to using one repo , + + # issue Vladmir pointed out if uploads to a single lakefs branch have not + # been finalized with commit, + # this would cause dirty commits if parallel tasks target the same branch. + + # solution: Lakefs team suggested we commit to a different temp branch per + # task, and merge that branch. + # this callback function will do that for now. + + # 1. put files into a temp branch. + # 2. make sure a commit happens. + # 3. merge that branch to master branch. 
+ logger.info("Pushing local path %s to %s@%s in %s dir", + local_path, repo, temp_branch_name, remote_path) + put_files( + local_path=local_path, + remote_path=remote_path, + task_name=task_id, + task_args=[""], + pipeline_id=dag_id, + task_docker_image="docker-image", + s3storage=False, + lake_fs_client=client, + branch=temp_branch_name, + repo=repo, + # @TODO figure out how to pass real commit id here + commit_id=branch + ) + + # see what changes are going to be pushed from this branch to main branch + for diff in pagination_helper(client._client.refs_api.diff_refs, + repository=repo, left_ref=branch, + right_ref=temp_branch_name): + logger.info("Diff: " + str(diff)) + + try: + # merging temp branch to working branch + # the current working branch wins incase of conflicts + merge = Merge(**{"strategy": "source-wins"}) + client._client.refs_api.merge_into_branch(repository=repo, + source_ref=temp_branch_name, + destination_branch=branch, + merge=merge + ) + + logger.info(f"merged branch {temp_branch_name} into {branch}") + except Exception as e: + # remove temp + logger.error(e) + # delete temp branch + finally: + client._client.branches_api.delete_branch( + repository=repo, + branch=temp_branch_name + ) + logger.info(f"deleted temp branch {temp_branch_name}") + logger.info(f"deleting local dir {local_path}") + files_to_clean = glob.glob(local_path + '**', recursive=True) + [local_path] + + clean_up(context, **kwargs) + +def clean_up(context: DagContext, **kwargs): + input_dir = str(generate_dir_name_from_task_instance(context['ti'], + roger_config=config, + suffix='output')).rstrip('/') + '/' + output_dir = str(generate_dir_name_from_task_instance(context['ti'], + roger_config=config, + suffix='input')).rstrip('/') + '/' + files_to_clean = glob.glob(input_dir + '**', recursive=True) + [input_dir] + files_to_clean += glob.glob(output_dir + '**', recursive=True) + [output_dir] + for f in files_to_clean: + if os.path.exists(f): + shutil.rmtree(f) + +def generate_dir_name_from_task_instance(task_instance: TaskInstance, + roger_config: RogerConfig, suffix:str): + # if lakefs is not enabled just return none so methods default to using + # local dir structure. + if not roger_config.lakefs_config.enabled: + return None + root_data_dir = os.getenv("ROGER_DATA_DIR").rstrip('/') + task_id = task_instance.task_id + dag_id = task_instance.dag_id + run_id = task_instance.run_id + try_number = task_instance._try_number + return Path( + f"{root_data_dir}/{dag_id}_{task_id}_{run_id}_{try_number}_{suffix}") + +def setup_input_data(context, exec_conf): + logger.info(""" + - Figures out the task name and id, + - find its data dependencies + - clean up and create in and out dir + - put dependency data in input dir + - if for some reason data was not found raise an exception + """) + # Serves as a location where files the task will work on are placed. + # computed as ROGER_DATA_DIR + /current task instance name_input_dir + + input_dir = str(generate_dir_name_from_task_instance( + context['ti'], roger_config=config, suffix="input")) + # Clear up files from previous run etc... + + # create input dir + os.makedirs(input_dir, exist_ok=True) + + # Download files from lakefs and store them in this new input_path + client = init_lakefs_client(config=config) + repos = exec_conf['repos'] + # if no external repo is provided we assume to get the upstream task dataset. 
+ if not repos or len(repos) == 0: + # merge destination branch + branch = config.lakefs_config.branch + repo = config.lakefs_config.repo + task_instance: TaskInstance = context['ti'] + # get upstream ids + upstream_ids = task_instance.task.upstream_task_ids + dag_id = task_instance.dag_id + # calculate remote dirs using dag_id + upstreams + repos = [{ + 'repo': repo, + 'branch': branch, + 'path': f'{dag_id}/{upstream_id}' + } for upstream_id in upstream_ids] -def create_python_task (dag, name, a_callable, func_kwargs=None): + # input_repo = exec_conf['input_repo'] + # input_branch = exec_conf['input_branch'] + # If input repo is provided use that as source of files + for repo in repos: + if not repo.get('path'): + # get all if path is not specified + repo['path'] = '*' + logger.info(f"repos : {repos}") + for r in repos: + logger.info("downloading %s from %s@%s to %s", + r['path'], r['repo'], r['branch'], input_dir) + # create path to download to ... + if not os.path.exists(input_dir + f'/{r["repo"]}'): + os.mkdir(input_dir + f'/{r["repo"]}') + get_files( + local_path=input_dir + f'/{r["repo"]}', + remote_path=r['path'], + branch=r['branch'], + repo=r['repo'], + changes_only=False, + lake_fs_client=client + ) + + +def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos = {}, pass_conf=True, no_output_files=False): """ Create a python task. :param func_kwargs: additional arguments for callable. :param dag: dag to add task to. :param name: The name of the task. :param a_callable: The code to run in this task. """ + # these are actual arguments passed down to the task function op_kwargs = { - "python_callable": a_callable, - "to_string": True - } + "python_callable": a_callable, + "to_string": True, + "pass_conf": pass_conf + } + # update / override some of the args passed to the task function by default if func_kwargs is None: - func_kwargs = dict() + func_kwargs = {} op_kwargs.update(func_kwargs) - return PythonOperator( - task_id=name, - python_callable=task_wrapper, - op_kwargs=op_kwargs, - executor_config=get_executor_config(), - dag=dag, - provide_context=True - ) + # Python operator arguments , by default for non-lakefs config this is all we need. + python_operator_args = { + "task_id": name, + "python_callable":task_wrapper, + "executor_config" : get_executor_config(), + "dag": dag, + "provide_context" : True + } + + # if we have lakefs... + if config.lakefs_config.enabled: + + # repo and branch for pre-execution , to download input objects + pre_exec_conf = { + 'repos': [] + } + if external_repos: + # if the task is a root task , beginning of the dag... + # and we want to pull data from a different repo. 
+ pre_exec_conf = { + 'repos': [{ + 'repo': r['name'], + 'branch': r['branch'], + 'path': r.get('path', '*') + } for r in external_repos] + } + + pre_exec = partial(setup_input_data, exec_conf=pre_exec_conf) + # add pre_exec partial function as an argument to python executor conf + python_operator_args['pre_execute'] = pre_exec + python_operator_args['on_failure_callback'] = partial(clean_up, kwargs=op_kwargs) + # if the task has output files, we will add a commit callback + if not no_output_files: + python_operator_args['on_success_callback'] = partial(avalon_commit_callback, kwargs=op_kwargs) + + # add kwargs + python_operator_args["op_kwargs"] = op_kwargs + + return PythonOperator(**python_operator_args) + +def create_pipeline_taskgroup( + dag, + pipeline_class: type, + configparam: RogerConfig, + **kwargs): + """Emit an Airflow dag pipeline for the specified pipeline_class + + Extra kwargs are passed to the pipeline class init call. + """ + name = pipeline_class.pipeline_name + input_dataset_version = pipeline_class.input_version + + with TaskGroup(group_id=f"{name}_dataset_pipeline_task_group") as tg: + with pipeline_class(config=configparam, **kwargs) as pipeline: + pipeline: DugPipeline + annotate_task = create_python_task( + dag, + f"annotate_{name}_files", + pipeline.annotate, + external_repos=[{ + 'name': getattr(pipeline_class, 'pipeline_name'), + 'branch': input_dataset_version + }], + pass_conf=False) + + index_variables_task = create_python_task( + dag, + f"index_{name}_variables", + pipeline.index_variables, + pass_conf=False, + # declare that this task will not generate files. + no_output_files=True) + index_variables_task.set_upstream(annotate_task) + + validate_index_variables_task = create_python_task( + dag, + f"validate_{name}_index_variables", + pipeline.validate_indexed_variables, + pass_conf=False, + # declare that this task will not generate files. + no_output_files=True + ) + validate_index_variables_task.set_upstream([annotate_task, index_variables_task]) + + make_kgx_task = create_python_task( + dag, + f"make_kgx_{name}", + pipeline.make_kg_tagged, + pass_conf=False) + make_kgx_task.set_upstream(annotate_task) + + crawl_task = create_python_task( + dag, + f"crawl_{name}", + pipeline.crawl_tranql, + pass_conf=False) + crawl_task.set_upstream(annotate_task) + + index_concepts_task = create_python_task( + dag, + f"index_{name}_concepts", + pipeline.index_concepts, + pass_conf=False, + # declare that this task will not generate files. + no_output_files=True) + index_concepts_task.set_upstream(crawl_task) + + validate_index_concepts_task = create_python_task( + dag, + f"validate_{name}_index_concepts", + pipeline.validate_indexed_concepts, + pass_conf=False, + # declare that this task will not generate files. + no_output_files=True + ) + validate_index_concepts_task.set_upstream([crawl_task, index_concepts_task, annotate_task]) + + + complete_task = EmptyOperator(task_id=f"complete_{name}") + complete_task.set_upstream( + (make_kgx_task, + validate_index_variables_task, validate_index_concepts_task)) + + return tg diff --git a/dags/tranql_translate.py b/dags/tranql_translate.py deleted file mode 100755 index 53394ba2..00000000 --- a/dags/tranql_translate.py +++ /dev/null @@ -1,43 +0,0 @@ -# -*- coding: utf-8 -*- -# - -""" -An Airflow workflow for the Roger Translator KGX data pipeline. 
-""" - -from airflow.models import DAG -from airflow.operators.empty import EmptyOperator -import roger -from roger.tasks import get_executor_config, default_args, create_python_task - -""" Build the workflow's tasks and DAG. """ -with DAG( - dag_id='tranql_translate', - default_args=default_args, - schedule_interval=None, - concurrency=16, -) as dag: - - """ Build the workflow tasks. """ - intro = EmptyOperator(task_id='Intro') - get_kgx = create_python_task (dag, "GetSource", roger.get_kgx) - create_nodes_schema = create_python_task(dag, "CreateNodesSchema", - roger.create_nodes_schema) - create_edges_schema = create_python_task(dag, "CreateEdgesSchema", - roger.create_edges_schema) - continue_task_bulk_load = EmptyOperator(task_id="continueBulkCreate") - continue_task_validate = EmptyOperator(task_id="continueValidation") - merge_nodes = create_python_task (dag, "MergeNodes", roger.merge_nodes) - create_bulk_load_nodes = create_python_task(dag, "CreateBulkLoadNodes", - roger.create_bulk_nodes) - create_bulk_load_edges = create_python_task(dag, "CreateBulkLoadEdges", - roger.create_bulk_edges) - bulk_load = create_python_task(dag, "BulkLoad", roger.bulk_load) - check_tranql = create_python_task(dag, "CheckTranql", - roger.check_tranql) - validate = create_python_task(dag, "Validate", roger.validate) - finish = EmptyOperator(task_id='Finish') - - """ Build the DAG. """ - intro >> get_kgx >> merge_nodes >> [create_nodes_schema, create_edges_schema ] >> continue_task_bulk_load >> \ - [create_bulk_load_nodes, create_bulk_load_edges] >> bulk_load >> continue_task_validate >>[validate, check_tranql ] >> finish diff --git a/requirements.txt b/requirements.txt index 9868ea75..d7eb73f8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,14 @@ boto3==1.18.23 botocore==1.21.23 -#black==21.10b0 elasticsearch==8.5.2 flatten-dict +jsonpickle redisgraph-bulk-loader==0.12.3 pytest PyYAML -git+https://github.com/helxplatform/dug@2.12.0 +git+https://github.com/helxplatform/dug@2.13.1 orjson kg-utils==0.0.6 bmt==1.1.0 +git+https://github.com/helxplatform/avalon.git@VG3 linkml-runtime==1.6.0 From f6711947b5c1bd4fee14481517da69c7bed91749 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Thu, 18 Apr 2024 13:32:57 -0400 Subject: [PATCH 13/63] pin avalon --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d7eb73f8..d1b1f68f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,5 +10,5 @@ git+https://github.com/helxplatform/dug@2.13.1 orjson kg-utils==0.0.6 bmt==1.1.0 -git+https://github.com/helxplatform/avalon.git@VG3 +git+https://github.com/helxplatform/avalon.git@v1.0.1 linkml-runtime==1.6.0 From 473816a912045a25ccf4b123f967147e9d38773a Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Thu, 6 Jun 2024 16:01:38 -0400 Subject: [PATCH 14/63] deleted jenkins and added workflows --- .github/workflows/build-push-dev-image.yml | 27 +++++++ .github/workflows/build-push-release.yml | 25 +++++++ .github/workflows/code-checks.yml | 38 ++++++++++ .github/workflows/trivy-pr-scan.yml | 23 ++++++ Jenkinsfile | 84 ---------------------- 5 files changed, 113 insertions(+), 84 deletions(-) create mode 100644 .github/workflows/build-push-dev-image.yml create mode 100644 .github/workflows/build-push-release.yml create mode 100644 .github/workflows/code-checks.yml create mode 100644 .github/workflows/trivy-pr-scan.yml delete mode 100644 Jenkinsfile diff --git a/.github/workflows/build-push-dev-image.yml 
b/.github/workflows/build-push-dev-image.yml new file mode 100644 index 00000000..f18a05c6 --- /dev/null +++ b/.github/workflows/build-push-dev-image.yml @@ -0,0 +1,27 @@ +# Workflow responsible for the +# development release processes. +# +name: Build-Push-Dev-Image +on: + push: + branches: + - develop + paths-ignore: + - README.md + - .old_cicd/* + - .github/* + - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks + # Do not build another image on a pull request. + # Any push to develop will trigger a new build however. + pull_request: + branches-ignore: + - '*' + +jobs: + build-push-dev-image: + uses: helxplatform/helx-github-actions/.github/workflows/build-push-dev-image.yml@main + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/build-push-release.yml b/.github/workflows/build-push-release.yml new file mode 100644 index 00000000..942221a3 --- /dev/null +++ b/.github/workflows/build-push-release.yml @@ -0,0 +1,25 @@ +# Workflow responsible for the +# major release processes. +# + +name: Build-Push-Release +on: + push: + branches: + - master + - main + paths-ignore: + - README.md + - .old_cicd/* + - .github/* + - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks + tags-ignore: + - '*' +jobs: + build-push-release: + uses: helxplatform/helx-github-actions/.github/workflows/build-push-release.yml@main + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml new file mode 100644 index 00000000..46f09d3c --- /dev/null +++ b/.github/workflows/code-checks.yml @@ -0,0 +1,38 @@ +# Workflow responsible for core acceptance testing. +# Tests Currently Run: +# - flake8-linter +# - PYTest +# - Bandit +# For PR Vulnerability Scanning a separate workflow will run. +# The build-push-dev-image and build-push-release workflows +# handle the develop and release image storage respectively. 
+# +# + +name: Code-Checks +on: + # push: + # branches-ignore: + # - master + # - main + # - develop + pull_request: + branches: + - develop + - master + - main + types: [ opened, synchronize ] + paths-ignore: + - README.md + - .old_cicd/* + # - .github/* + # - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks + +jobs: + build-push-release: + uses: helxplatform/helx-github-actions/.github/workflows/code-checks.yml@main + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml new file mode 100644 index 00000000..6097214a --- /dev/null +++ b/.github/workflows/trivy-pr-scan.yml @@ -0,0 +1,23 @@ + +name: trivy-pr-scan +on: + pull_request: + branches: + - develop + - master + - main + types: [ opened, synchronize ] + paths-ignore: + - README.md + - .old_cicd/* + - .github/* + - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks + +jobs: + trivy-pr-scan: + uses: helxplatform/helx-github-actions/.github/workflows/trivy-pr-scan.yml@main + secrets: inherit \ No newline at end of file diff --git a/Jenkinsfile b/Jenkinsfile deleted file mode 100644 index a68ba92b..00000000 --- a/Jenkinsfile +++ /dev/null @@ -1,84 +0,0 @@ -library 'pipeline-utils@master' - -pipeline { - agent { - kubernetes { - label 'kaniko-build-agent' - yaml ''' -kind: Pod -metadata: - name: kaniko -spec: - containers: - - name: jnlp - workingDir: /home/jenkins/agent - - name: kaniko - workingDir: /home/jenkins/agent - image: gcr.io/kaniko-project/executor:debug - imagePullPolicy: Always - resources: - requests: - cpu: "512m" - memory: "1024Mi" - ephemeral-storage: "4Gi" - limits: - cpu: "1024m" - memory: "2048Mi" - ephemeral-storage: "8Gi" - command: - - /busybox/cat - tty: true - volumeMounts: - - name: jenkins-docker-cfg - mountPath: /kaniko/.docker - volumes: - - name: jenkins-docker-cfg - projected: - sources: - - secret: - name: rencibuild-imagepull-secret - items: - - key: .dockerconfigjson - path: config.json -''' - } - } - environment { - PATH = "/busybox:/kaniko:/ko-app/:$PATH" - DOCKERHUB_CREDS = credentials("${env.CONTAINERS_REGISTRY_CREDS_ID_STR}") - REGISTRY = "${env.REGISTRY}" - REG_OWNER="helxplatform" - REG_APP="roger" - COMMIT_HASH="${sh(script:"git rev-parse --short HEAD", returnStdout: true).trim()}" - VERSION_FILE="./dags/_version.py" - VERSION="${sh(script:'awk \'{ print $3 }\' ./dags/_version.py | xargs', returnStdout: true).trim()}" - IMAGE_NAME="${REGISTRY}/${REG_OWNER}/${REG_APP}" - TAG1="$BRANCH_NAME" - TAG2="$COMMIT_HASH" - TAG3="$VERSION" - TAG4="latest" - } - stages { - stage('Test') { - steps { - sh ''' - echo "Test stage" - ''' - } - } - stage('Build') { - steps { - script { - container(name: 'kaniko', shell: '/busybox/sh') { - if (env.BRANCH_NAME == "main") { - // Tag with latest and version iff when pushed to master - kaniko.buildAndPush("./Dockerfile", ["$IMAGE_NAME:$TAG1", "$IMAGE_NAME:$TAG2", "$IMAGE_NAME:$TAG3", "$IMAGE_NAME:$TAG4"]) - } else { - kaniko.buildAndPush("./Dockerfile", ["$IMAGE_NAME:$TAG1", "$IMAGE_NAME:$TAG2"]) - } - } - } - } - } - } -} From 98257d9e1a8b163d4ed5d6880a8c8b96610f710e Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Fri, 7 Jun 2024 13:51:21 -0400 Subject: [PATCH 15/63] unlinked helx-actions --- .github/workflows/build-push-dev-image.yml | 63 ++++++++++- .github/workflows/build-push-release.yml | 110 ++++++++++++++++++- .github/workflows/code-checks.yml | 120 ++++++++++++++++++++- .github/workflows/trivy-pr-scan.yml | 50 ++++++++- 
4 files changed, 333 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-push-dev-image.yml b/.github/workflows/build-push-dev-image.yml index f18a05c6..13f8cfb7 100644 --- a/.github/workflows/build-push-dev-image.yml +++ b/.github/workflows/build-push-dev-image.yml @@ -23,5 +23,64 @@ on: jobs: build-push-dev-image: - uses: helxplatform/helx-github-actions/.github/workflows/build-push-dev-image.yml@main - secrets: inherit \ No newline at end of file + runs-on: ubuntu-latest + steps: + + - name: Checkout Code + uses: actions/checkout@v3 + with: + ref: ${{ github.head_ref }} + # fetch-depth: 0 means, get all branches and commits + fetch-depth: 0 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + # Docker Buildx is important to caching in the Build And Push Container + # step + # https://github.com/marketplace/actions/build-and-push-docker-images + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + network=host + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Login to Container Registry + uses: docker/login-action@v3 + with: + registry: containers.renci.org + username: ${{ secrets.CONTAINERHUB_USERNAME }} + password: ${{ secrets.CONTAINERHUB_TOKEN }} + logout: true + + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Push Container + uses: docker/build-push-action@v5 + with: + context: . + push: true + # Push to renci-registry and dockerhub here. + # cache comes from dockerhub. 
+ tags: | + ${{ github.repository }}:develop + ${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + containers.renci.org/${{ github.repository }}:develop + containers.renci.org/${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache-dev + cache-to: type=registry,ref=${{ github.repository }}:buildcache-dev,mode=max \ No newline at end of file diff --git a/.github/workflows/build-push-release.yml b/.github/workflows/build-push-release.yml index 942221a3..07b22d21 100644 --- a/.github/workflows/build-push-release.yml +++ b/.github/workflows/build-push-release.yml @@ -21,5 +21,111 @@ on: - '*' jobs: build-push-release: - uses: helxplatform/helx-github-actions/.github/workflows/build-push-release.yml@main - secrets: inherit \ No newline at end of file + runs-on: ubuntu-latest + steps: + - name: Checkout Code + uses: actions/checkout@v3 + with: + ref: ${{ github.head_ref }} + fetch-depth: 0 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + # https://github.com/marketplace/actions/git-semantic-version + - name: Semver Check + uses: paulhatch/semantic-version@v5.0.3 + id: version + with: + # The prefix to use to identify tags + tag_prefix: "v" + # A string which, if present in a git commit, indicates that a change represents a + # major (breaking) change, supports regular expressions wrapped with '/' + major_pattern: "/breaking:|major:/" + # A string which indicates the flags used by the `major_pattern` regular expression. Supported flags: idgs + major_regexp_flags: "ig" + # Same as above except indicating a minor change, supports regular expressions wrapped with '/' + minor_pattern: "/feat:|feature:|minor:/" + # A string which indicates the flags used by the `minor_pattern` regular expression. Supported flags: idgs + minor_regexp_flags: "ig" + # A string to determine the format of the version output + # version_format: "${major}.${minor}.${patch}-prerelease${increment}" + version_format: "${major}.${minor}.${patch}" + search_commit_body: false + + # Docker Buildx is important to caching in the Build And Push Container + # step + # https://github.com/marketplace/actions/build-and-push-docker-images + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + network=host + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Login to Container Registry + uses: docker/login-action@v3 + with: + registry: containers.renci.org + username: ${{ secrets.CONTAINERHUB_USERNAME }} + password: ${{ secrets.CONTAINERHUB_TOKEN }} + logout: true + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Push Container + uses: docker/build-push-action@v5 + with: + push: true + # Push to renci-registry and dockerhub here. + # cache comes from dockerhub. 
+ tags: | + containers.renci.org/${{ github.repository }}:v${{ steps.version.outputs.version }} + containers.renci.org/${{ github.repository }}:latest + containers.renci.org/${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + ${{ github.repository }}:v${{ steps.version.outputs.version }} + ${{ github.repository }}:latest + ${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache-release + cache-to: type=registry,ref=${{ github.repository }}:buildcache-release,mode=max + +#==========================TAG & RELEASE W/ NOTES ========================= + + # Note: GITHUB_TOKEN is autogenerated feature of github app + # which is auto-enabled when using github actions. + # https://docs.github.com/en/actions/security-guides/automatic-token-authentication + # https://docs.github.com/en/rest/git/tags?apiVersion=2022-11-28#create-a-tag-object + # https://docs.github.com/en/rest/git/refs?apiVersion=2022-11-28#create-a-reference + # This creates a "lightweight" ref tag. + - name: Create Tag for Release + run: | + curl \ + -s --fail -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/${{ github.repository }}/git/refs \ + -d '{"ref":"refs/tags/v${{ steps.version.outputs.version }}","sha":"${{ github.sha }}"}' + +# https://cli.github.com/manual/gh_release_create + - name: Create Release + env: + RELEASE_VERSION: ${{ steps.version.outputs.version }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh release create ${{ env.RELEASE_VERSION }} \ + -t "${{ env.RELEASE_VERSION }}" \ + --generate-notes \ + --latest \ No newline at end of file diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 46f09d3c..401c24cc 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -33,6 +33,120 @@ on: - .githooks jobs: - build-push-release: - uses: helxplatform/helx-github-actions/.github/workflows/code-checks.yml@main - secrets: inherit \ No newline at end of file +############################## flake8-linter ############################## + flake8-linter: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12' + + # Currently actions/setup-python supports caching + # but the cache is not as robust as cache action. + # Here we cache the entire python env which speeds subsequent builds up alot. (alot being scientific term) + # Ref: https://blog.allenai.org/python-caching-in-github-actions-e9452698e98d + - uses: actions/cache@v3 + name: Cache Python + with: + path: ${{ env.pythonLocation }} + key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('pyproject.toml') }} + + - name: Install Requirements + run: | + pip install -r requirements.txt + + - name: Lint with flake8 + run: | + pip install flake8 + flake8 --ignore=E,W src + # We continue on error here until the code is clean + # flake8 --ignore=E,W --exit-zero . 
+ continue-on-error: true + +################################### PYTEST ################################### + pytest: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12' + + - name: Install Requirements + run: | + pip install -r requirements.txt + pip install coverage + pip install . + + - name: Test with pytest + run: | + make test + +############################ Bandit ################################ + bandit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12' + + - name: Install Requirements + run: | + pip install -r requirements.txt + pip install bandit + pip install . + + # Only report high security issues + - name: Test with Bandit + run: | + bandit -r src -n3 -lll + +############################## test-image-build ############################## + test-image-build: + runs-on: ubuntu-latest + # if: ${{ github.actor == 'dependabot[bot]' }} + steps: + - uses: actions/checkout@v3 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Parse Github Reference Name + id: branch + run: | + REF=${{ github.ref_name }} + echo "GHR=${REF%/*}" >> $GITHUB_OUTPUT + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Container + uses: docker/build-push-action@v5 + with: + context: . + push: true + tags: | + ${{ github.repository }}:test_${{ steps.branch.outputs.GHR }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache + cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max \ No newline at end of file diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml index 6097214a..1e7bc060 100644 --- a/.github/workflows/trivy-pr-scan.yml +++ b/.github/workflows/trivy-pr-scan.yml @@ -1,4 +1,3 @@ - name: trivy-pr-scan on: pull_request: @@ -19,5 +18,50 @@ on: jobs: trivy-pr-scan: - uses: helxplatform/helx-github-actions/.github/workflows/trivy-pr-scan.yml@main - secrets: inherit \ No newline at end of file + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + network=host + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Container + uses: docker/build-push-action@v5 + with: + context: . 
+ push: false + load: true + tags: ${{ github.repository }}:vuln-test + cache-from: type=registry,ref=${{ github.repository }}:buildcache + cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max + + # We will not be concerned with Medium and Low vulnerabilities + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + with: + image-ref: '${{ github.repository }}:vuln-test' + format: 'sarif' + severity: 'CRITICAL,HIGH' + ignore-unfixed: true + output: 'trivy-results.sarif' + exit-code: '1' + # Scan results should be viewable in GitHub Security Dashboard + # We still fail the job if results are found, so below will always run + # unless manually canceled. + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + if: '!cancelled()' + with: + sarif_file: 'trivy-results.sarif' \ No newline at end of file From d7b1f7e99670228e59b2b6c750cf340defcf5c29 Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Tue, 11 Jun 2024 13:45:19 -0400 Subject: [PATCH 16/63] testing paths --- .github/workflows/code-checks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 401c24cc..372c8f76 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -80,7 +80,7 @@ jobs: run: | pip install -r requirements.txt pip install coverage - pip install . + pip install ./dags - name: Test with pytest run: | @@ -100,7 +100,7 @@ jobs: run: | pip install -r requirements.txt pip install bandit - pip install . + pip install ./dags/roger # Only report high security issues - name: Test with Bandit From b0d8c923df8798efcb532058d528fe5b96ecb101 Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Tue, 11 Jun 2024 13:53:40 -0400 Subject: [PATCH 17/63] testing again --- .github/workflows/code-checks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 372c8f76..126cca81 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -80,7 +80,7 @@ jobs: run: | pip install -r requirements.txt pip install coverage - pip install ./dags + # pip install ./dags - name: Test with pytest run: | @@ -100,7 +100,7 @@ jobs: run: | pip install -r requirements.txt pip install bandit - pip install ./dags/roger + # pip install ./dags/roger # Only report high security issues - name: Test with Bandit From 9bee0884bb646a3b97d0ff405c10c97d6bdbdf45 Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Tue, 11 Jun 2024 14:28:34 -0400 Subject: [PATCH 18/63] d --- .github/workflows/code-checks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 126cca81..ec18da7a 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -80,7 +80,7 @@ jobs: run: | pip install -r requirements.txt pip install coverage - # pip install ./dags + pip install /dags - name: Test with pytest run: | @@ -100,7 +100,7 @@ jobs: run: | pip install -r requirements.txt pip install bandit - # pip install ./dags/roger + pip install . 
# Only report high security issues - name: Test with Bandit From 2b44c2280b46f94fa5f05037625cb3b5e8253dc9 Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Tue, 11 Jun 2024 14:31:23 -0400 Subject: [PATCH 19/63] tests --- .github/workflows/code-checks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index ec18da7a..35f526b4 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -80,7 +80,7 @@ jobs: run: | pip install -r requirements.txt pip install coverage - pip install /dags + pip install ./tests - name: Test with pytest run: | @@ -100,7 +100,7 @@ jobs: run: | pip install -r requirements.txt pip install bandit - pip install . + pip install ./tests # Only report high security issues - name: Test with Bandit From 7a8cc6dfec7f71589711e116f6b6f7e7c79f0677 Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Wed, 12 Jun 2024 21:56:01 -0400 Subject: [PATCH 20/63] commented out pytest --- .github/workflows/code-checks.yml | 40 +++++++++++++++---------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 35f526b4..1d175672 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -61,30 +61,30 @@ jobs: - name: Lint with flake8 run: | pip install flake8 - flake8 --ignore=E,W src + flake8 --ignore=E,W dag # We continue on error here until the code is clean # flake8 --ignore=E,W --exit-zero . continue-on-error: true ################################### PYTEST ################################### - pytest: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.12' - - - name: Install Requirements - run: | - pip install -r requirements.txt - pip install coverage - pip install ./tests - - - name: Test with pytest - run: | - make test + # pytest: + # runs-on: ubuntu-latest + # steps: + # - uses: actions/checkout@v3 + # - name: Set up Python + # uses: actions/setup-python@v4 + # with: + # python-version: '3.12' + + # - name: Install Requirements + # run: | + # pip install -r requirements.txt + # pip install coverage + # pip install ./tests + + # - name: Test with pytest + # run: | + # make test ############################ Bandit ################################ bandit: @@ -100,7 +100,7 @@ jobs: run: | pip install -r requirements.txt pip install bandit - pip install ./tests + pip install ./dags # Only report high security issues - name: Test with Bandit From 94253c934cf7ed7912f7d195b09705e802345996 Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Wed, 12 Jun 2024 21:58:57 -0400 Subject: [PATCH 21/63] try again for bandit --- .github/workflows/code-checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 1d175672..f22839a9 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -100,7 +100,7 @@ jobs: run: | pip install -r requirements.txt pip install bandit - pip install ./dags + pip install /dags # Only report high security issues - name: Test with Bandit From 7f37f916f466d36c0eea45670ccd8b648ddbb20d Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Wed, 12 Jun 2024 22:21:30 -0400 Subject: [PATCH 22/63] commented out bandit --- .github/workflows/code-checks.yml | 187 +++++++++++++++--------------- 1 file changed, 93 
insertions(+), 94 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index f22839a9..a2e53021 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -4,7 +4,7 @@ # - PYTest # - Bandit # For PR Vulnerability Scanning a separate workflow will run. -# The build-push-dev-image and build-push-release workflows +# The build-push-dev-image and build-push-release workflows # handle the develop and release image storage respectively. # # @@ -20,8 +20,8 @@ on: branches: - develop - master - - main - types: [ opened, synchronize ] + - main + types: [opened, synchronize] paths-ignore: - README.md - .old_cicd/* @@ -33,40 +33,40 @@ on: - .githooks jobs: -############################## flake8-linter ############################## + ############################## flake8-linter ############################## flake8-linter: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.12' - - # Currently actions/setup-python supports caching - # but the cache is not as robust as cache action. - # Here we cache the entire python env which speeds subsequent builds up alot. (alot being scientific term) - # Ref: https://blog.allenai.org/python-caching-in-github-actions-e9452698e98d - - uses: actions/cache@v3 - name: Cache Python - with: - path: ${{ env.pythonLocation }} - key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('pyproject.toml') }} - - - name: Install Requirements - run: | - pip install -r requirements.txt - - - name: Lint with flake8 - run: | - pip install flake8 - flake8 --ignore=E,W dag - # We continue on error here until the code is clean - # flake8 --ignore=E,W --exit-zero . - continue-on-error: true - -################################### PYTEST ################################### + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.12" + + # Currently actions/setup-python supports caching + # but the cache is not as robust as cache action. + # Here we cache the entire python env which speeds subsequent builds up alot. (alot being scientific term) + # Ref: https://blog.allenai.org/python-caching-in-github-actions-e9452698e98d + - uses: actions/cache@v3 + name: Cache Python + with: + path: ${{ env.pythonLocation }} + key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('pyproject.toml') }} + + - name: Install Requirements + run: | + pip install -r requirements.txt + + - name: Lint with flake8 + run: | + pip install flake8 + flake8 --ignore=E,W dag + # We continue on error here until the code is clean + # flake8 --ignore=E,W --exit-zero . 
+ continue-on-error: true + + ################################### PYTEST ################################### # pytest: # runs-on: ubuntu-latest # steps: @@ -86,67 +86,66 @@ jobs: # run: | # make test -############################ Bandit ################################ - bandit: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.12' - - - name: Install Requirements - run: | - pip install -r requirements.txt - pip install bandit - pip install /dags - - # Only report high security issues - - name: Test with Bandit - run: | - bandit -r src -n3 -lll - -############################## test-image-build ############################## + ############################ Bandit ################################ + # bandit: + # runs-on: ubuntu-latest + # steps: + # - uses: actions/checkout@v3 + # - name: Set up Python + # uses: actions/setup-python@v4 + # with: + # python-version: "3.12" + + # - name: Install Requirements + # run: | + # pip install -r requirements.txt + # pip install bandit + # pip install ./dags + + # # Only report high security issues + # - name: Test with Bandit + # run: | + # bandit -r dags -n3 -lll + + ############################## test-image-build ############################## test-image-build: runs-on: ubuntu-latest # if: ${{ github.actor == 'dependabot[bot]' }} steps: - - uses: actions/checkout@v3 - - - name: Set short git commit SHA - id: vars - run: | - echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT - # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ - - - name: Confirm git commit SHA output - run: echo ${{ steps.vars.outputs.short_sha }} - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - logout: true - - - name: Parse Github Reference Name - id: branch - run: | - REF=${{ github.ref_name }} - echo "GHR=${REF%/*}" >> $GITHUB_OUTPUT - - # Notes on Cache: - # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache - - name: Build Container - uses: docker/build-push-action@v5 - with: - context: . - push: true - tags: | - ${{ github.repository }}:test_${{ steps.branch.outputs.GHR }} - cache-from: type=registry,ref=${{ github.repository }}:buildcache - cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max \ No newline at end of file + - uses: actions/checkout@v3 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Parse Github Reference Name + id: branch + run: | + REF=${{ github.ref_name }} + echo "GHR=${REF%/*}" >> $GITHUB_OUTPUT + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Container + uses: docker/build-push-action@v5 + with: + context: . 
+ push: true + tags: | + ${{ github.repository }}:test_${{ steps.branch.outputs.GHR }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache + cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max From e696854fd71cd06a44d1cf721379364f325be59a Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Thu, 13 Jun 2024 21:54:59 -0400 Subject: [PATCH 23/63] changed dag to dags --- .github/workflows/code-checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index a2e53021..d79d0f7a 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -61,7 +61,7 @@ jobs: - name: Lint with flake8 run: | pip install flake8 - flake8 --ignore=E,W dag + flake8 --ignore=E,W dags # We continue on error here until the code is clean # flake8 --ignore=E,W --exit-zero . continue-on-error: true From b90927495de5e79324d7cd17d1be9ac807f60716 Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Fri, 21 Jun 2024 11:51:36 -0400 Subject: [PATCH 24/63] Added fixes --- .github/workflows/code-checks.yml | 36 ++++++------------------------- 1 file changed, 7 insertions(+), 29 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index d79d0f7a..b7f3e6a5 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -11,11 +11,11 @@ name: Code-Checks on: - # push: - # branches-ignore: - # - master - # - main - # - develop + push: + branches-ignore: + - master + - main + - develop pull_request: branches: - develop @@ -25,8 +25,8 @@ on: paths-ignore: - README.md - .old_cicd/* - # - .github/* - # - .github/workflows/* + - .github/* + - .github/workflows/* - LICENSE - .gitignore - .dockerignore @@ -85,28 +85,6 @@ jobs: # - name: Test with pytest # run: | # make test - - ############################ Bandit ################################ - # bandit: - # runs-on: ubuntu-latest - # steps: - # - uses: actions/checkout@v3 - # - name: Set up Python - # uses: actions/setup-python@v4 - # with: - # python-version: "3.12" - - # - name: Install Requirements - # run: | - # pip install -r requirements.txt - # pip install bandit - # pip install ./dags - - # # Only report high security issues - # - name: Test with Bandit - # run: | - # bandit -r dags -n3 -lll - ############################## test-image-build ############################## test-image-build: runs-on: ubuntu-latest From 4532bdad1b1c89ab81c51ca9d05d9ce2e9d40583 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Wed, 7 Aug 2024 13:31:12 -0400 Subject: [PATCH 25/63] Bagel (#103) * bump dug version * adding bdc new pipelines * adding curesc * adding bagel config * add test parser * add score threshold * point to dug develop --- dags/roger/config/__init__.py | 15 ++++++++ dags/roger/config/config.yaml | 22 +++++++++--- dags/roger/pipelines/bdc_pipelines.py | 50 +++++++++++++++++++++++++++ requirements.txt | 2 +- 4 files changed, 83 insertions(+), 6 deletions(-) create mode 100644 dags/roger/pipelines/bdc_pipelines.py diff --git a/dags/roger/config/__init__.py b/dags/roger/config/__init__.py index ac9eb23a..8cb10247 100644 --- a/dags/roger/config/__init__.py +++ b/dags/roger/config/__init__.py @@ -99,6 +99,21 @@ class AnnotationConfig(DictLike): "sapbert": { "classification_url": "https://med-nemo.apps.renci.org/annotate/", "annotator_url": "https://babel-sapbert.apps.renci.org/annotate/", + "score_threshold": 0.8, + "bagel": { + 
"enabled": False, + "url": "https://bagel.apps.renci.org/group_synonyms_openai", + "prompt": "bagel/ask_classes", + "llm_args": { + "llm_model_name": "gpt-4o-2024-05-13", + "organization": "", + "access_key": "", + "llm_model_args": { + "top_p": 0, + "temperature": 0.1 + } + } + } }, } ) diff --git a/dags/roger/config/config.yaml b/dags/roger/config/config.yaml index e9402ce4..86c95adc 100644 --- a/dags/roger/config/config.yaml +++ b/dags/roger/config/config.yaml @@ -1,6 +1,6 @@ redisgraph: username: "" - password: "12345" + password: "weak" host: localhost graph: test port: 6379 @@ -42,13 +42,25 @@ bulk_loader: annotation: clear_http_cache: false - annotator_type: monarch + annotator_type: sapbert annotator_args: monarch: url: "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" sapbert: classification_url: "https://med-nemo.apps.renci.org/annotate/" - annotator_url: "https://babel-sapbert.apps.renci.org/annotate/" + annotator_url: "https://sap-qdrant.apps.renci.org/annotate/" + score_threshold: 0.5 + bagel: + enabled: false + url: "http://localhost:9099/group_synonyms_openai" + prompt: "bagel/ask_classes" + llm_args: + llm_model_name: "gpt-4o-2024-05-13" + organization: + access_key: + llm_model_args: + top_p: 0 + temperature: 0.1 normalizer: "https://nodenormalization-dev.apps.renci.org/get_normalized_nodes?conflate=false&description=true&curie=" synonym_service: "https://name-resolution-sri.renci.org/reverse_lookup" ontology_metadata: "https://api.monarchinitiative.org/api/bioentity/" @@ -93,9 +105,9 @@ indexing: action: "files" elasticsearch: - host: elasticsearch + host: localhost username: elastic - password: "" + password: "12345" nboost_host: "" scheme: "http" ca_path: "" diff --git a/dags/roger/pipelines/bdc_pipelines.py b/dags/roger/pipelines/bdc_pipelines.py new file mode 100644 index 00000000..bf483928 --- /dev/null +++ b/dags/roger/pipelines/bdc_pipelines.py @@ -0,0 +1,50 @@ +"Dug pipeline for dbGaP data set" + +from roger.pipelines import DugPipeline + +class BIOLINCCdbGaPPipeline(DugPipeline): + "Pipeline for the dbGaP data set" + pipeline_name = 'biolincc' + parser_name = 'biolincc' + + +class covid19dbGaPPipeline(DugPipeline): + "Pipeline for the dbGaP data set" + pipeline_name = 'covid19-dbgap' + parser_name = 'covid19' + +class dirDbGaPPipeline(DugPipeline): + pipeline_name = "dir-dbgap" + parser_name = "dir" + +class LungMapDbGaPPipeline(DugPipeline): + pipeline_name = "lungmap-dbgap" + parser_name = "lungmap" + +class nsrrDbGaPPipeline(DugPipeline): + pipeline_name = "nsrr-dbgap" + parser_name = "nsrr" + +class ParentDbGaPPipeline(DugPipeline): + pipeline_name = "parent-dbgap" + parser_name = "parent" + +class PCGCDbGaPPipeline(DugPipeline): + pipeline_name = "pcgc-dbgap" + parser_name = "pcgc" + +class RecoverDbGaPPipeline(DugPipeline): + pipeline_name = "recover-dbgap" + parser_name = "recover" + +class TopmedDBGaPPipeline(DugPipeline): + pipeline_name = "topmed-gen3-dbgap" + parser_name = "topmeddbgap" + +class CureSCPipeline(DugPipeline): + pipeline_name = "curesc-dbgap" + parser_name = "curesc" + +class SmallDataDbGap(DugPipeline): + pipeline_name = "small-data-dbgap" + parser_name = "topmeddbgap" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index d1b1f68f..44c9a395 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ jsonpickle redisgraph-bulk-loader==0.12.3 pytest PyYAML 
-git+https://github.com/helxplatform/dug@2.13.1 +git+https://github.com/helxplatform/dug@develop orjson kg-utils==0.0.6 bmt==1.1.0 From af41d8f60603efbd1e9bebe9879d2fab8ff88e71 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Wed, 7 Aug 2024 13:34:54 -0400 Subject: [PATCH 26/63] Dbgap programs (#104) * bump dug version * adding bdc new pipelines * adding curesc * fix up merge conflict --- dags/roger/pipelines/bdc_pipelines.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dags/roger/pipelines/bdc_pipelines.py b/dags/roger/pipelines/bdc_pipelines.py index bf483928..5a945641 100644 --- a/dags/roger/pipelines/bdc_pipelines.py +++ b/dags/roger/pipelines/bdc_pipelines.py @@ -45,6 +45,4 @@ class CureSCPipeline(DugPipeline): pipeline_name = "curesc-dbgap" parser_name = "curesc" -class SmallDataDbGap(DugPipeline): - pipeline_name = "small-data-dbgap" - parser_name = "topmeddbgap" \ No newline at end of file + From 99a6e7b8285231b1a1b8931de26edafbd5c17cfb Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Thu, 19 Sep 2024 10:06:39 -0400 Subject: [PATCH 27/63] adding bagel config parse to bool --- dags/roger/config/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dags/roger/config/__init__.py b/dags/roger/config/__init__.py index 8cb10247..71111f39 100644 --- a/dags/roger/config/__init__.py +++ b/dags/roger/config/__init__.py @@ -134,6 +134,9 @@ class AnnotationConfig(DictLike): "PATO", "CHEBI", "MONDO", "UBERON", "HP", "MESH", "UMLS" ]) + def __post_init__(self): + self.annotator_args["sapbert"]["bagel"]["enabled"] = self.annotator_args["sapbert"]["bagel"][ + "enabled"].lower() == "true" @dataclass From 05b115ec9afce05ca0b1edb80e66356a16f5dbf1 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Thu, 19 Sep 2024 14:50:13 -0400 Subject: [PATCH 28/63] Dev sync main (#106) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * release sync (#100) * Update _version.py (#86) * Update _version.py * Rti merge (#84) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. 
* Update requirements.txt * Update .gitignore * Update dug_utils.py Handle Error when curie not found in validate * Update __init__.py * Update config.yaml * Update dev-config.yaml * Update docker-compose.yaml * fixed docker-compose * adding back postgres volume to docker compose * env correction , docker compose updates --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent * adding v5.0 * cde-links branch * pin linkml * Update config.yaml collection_action to action * pop total items before result * print extracted elements * Update requirements.txt * Keep edge provenance (#94) * Update kgx.py * Update kgx.py * Update kgx.py can't delete edge keys while looping over them. * just collect then update * Update requirements.txt (#93) * Pipeline parameterize restructure (#95) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * Parameterized annotate tasks with input_data_path and output_data_path * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. * Parameterized annotate tasks with input_data_path and output_data_path (#85) * adding lakefs changes to roger-2.0 * point avalon to vg1 branch * change avalon dep * update airflow * fix avalon tag typo * update jenkins to tag version on main branch only * update jenkins to tag version * update jenkins to tag version * psycopg2 installation * add cncf k8s req * use airflow non-slim * simplified for testing * simplified for testing * change dag name * Erroneous parameter passed, should not be None * adding pre-exec * adding pre-exec * adding pre-exec * typo preexec * typo preexec * fix context * get files from repo * get files from repo * get files from repo * get files from repo * First shot at moving pipeline into base class and implementing. 
Anvil pipeline not complete * Syntax fix, docker image version bump to airflow 2.7.2-python3.11 * update storage dir * update remove dir code * update remove dir code * remote path to * * fix input dir for annotators * fix input dir for annotators * fix input dir for annotators * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * adding branch info on lakefs config * callback push to branch * back to relative import * reformat temp branch name based on unique task id * add logging * add logging * convert posix path to str for avalon * add extra / to root path * New dag created using DugPipeline subclasses * EmptyOperator imported from wrong place * import and syntax fixes * utterly silly syntax error * Added anvil to default input data sets for testing purposes * adding / to local path * commit meta task args empty string * add merge logic * add merge logic * upstream task dir pull for downstream task * Switched from subdag to taskgroup because latest Airflow depricated subdag * Added BACPAC pipeline object * Temporarily ignoring configuration variable for enabled datasets for testing * Passed dag in to create task group to see if it helps dag errors * Fixed silly syntax error * adding input / output dir params for make kgx * Trying different syntax to make taskgroups work. * adding input / output dir params for make kgx * Parsing, syntax, pylint fixes * adding input / output dir params for make kgx * Added pipeline name to task group name to ensure uniqueness * oops, moved something out of scope. Fixed * Filled out pipeline with methods from dug_utils. Needs data path changes * Finished implementing input_data_path and output_data_path handling, pylint cleanup * Update requirements.txt * adding toggle to avoid sending config obj * adding toggle to avoid sending config obj * disable to string for test * control pipelines for testing * add self to anvil get files * add log stream to make it available * typo fix * correcting branch id * adding source repo * adding source repo * patch name-resolver response * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * dug pipeline edit * recurisvely find recursively * recurisvely find recursively * setup output path for crawling * all task functions should have input and output params * adding annotation as upstream for validate index * revamp create task , and task wrapper * add validate concepts index task * adding concept validation * add index_variables task as dependecy for validate concepts * add index_variables task as dependecy for validate concepts * await client exist * await client exist * concepts not getting picked up for indexing * concepts not getting picked up for indexing * fix search elements * converting annotation output to json * json format annotation outputs * adding support for json format elements and concepts read * json back to dug objects * fixing index valriables with json objects * indetation and new line for better change detection :? 
* indetation and new line for better change detection * treat dictionary concepts as dictionary * read concepts json as a dict * concepts files are actually file paths * debug message * make output jsonable * clear up dir after commit , and delete unmerged branch even if no changes * don`t clear indexes, parallel dataset processing will be taxed * memory leak? * memory leak? * memory leak? * dumping pickles to debug locally * find out why concepts are being added to every other element * find out why concepts are being added to every other element * pointless shuffle 🤷‍♂️ * revert back in time * back to sanitize dug * output just json for annotation * adding jsonpickle * jsonpickle 🥒 * unpickle for index * unpickle for validate index * crawling fixes * crawling fixes * crawling validation fixes * fix index concepts * fix makekgx * adding other bdc pipelines * adding pipeline paramters to be able to configure per instance * fix * add input dataset for pipelines * Adding README to document how to create data set-specific pipelines * catchup on base.py * Added dbgap and nida pipelines * fix import errors * annotator modules added by passing config val (#90) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 --------- Co-authored-by: YaphetKG * Add heal parsers (#96) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph 
structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Add heal parsers (#97) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Radx pipeline (#99) * point to large download * fix schema path * debug bulk input dir * fix schema read * fix schema read * fix schema read * commenting steup dir for test * adding logs * fix path stuff * add commented stuff back in * testing radx parser * adding parser * skip indexing vars with no id * adding indexes as part of bulk loader paramters * fix id index cli arg * fix local cli * dug latest --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Michael T. 
Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * pin avalon --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * remove jenkins file * bump apache version * revert airflow version --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> --- Dockerfile | 2 +- dags/roger/config/config.yaml | 2 +- requirements.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 49c1fd26..7760c983 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ FROM apache/airflow:2.7.2-python3.11 USER root RUN apt-get update && \ - apt-get install -y git nano vim + apt-get install -y git nano vim gcc COPY requirements.txt requirements.txt USER airflow RUN pip install -r requirements.txt diff --git a/dags/roger/config/config.yaml b/dags/roger/config/config.yaml index 86c95adc..c407555f 100644 --- a/dags/roger/config/config.yaml +++ b/dags/roger/config/config.yaml @@ -49,7 +49,7 @@ annotation: sapbert: classification_url: "https://med-nemo.apps.renci.org/annotate/" annotator_url: "https://sap-qdrant.apps.renci.org/annotate/" - score_threshold: 0.5 + score_threshold: 0.8 bagel: enabled: false url: "http://localhost:9099/group_synonyms_openai" diff --git a/requirements.txt b/requirements.txt index 44c9a395..3a0ee223 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ jsonpickle redisgraph-bulk-loader==0.12.3 pytest PyYAML -git+https://github.com/helxplatform/dug@develop +git+https://github.com/helxplatform/dug@2.13.2 orjson kg-utils==0.0.6 bmt==1.1.0 From 20585846d170257bf5383ea12c95aacd5b39eb7a Mon Sep 17 00:00:00 2001 From: "Michael T. 
Bacon" Date: Tue, 8 Oct 2024 13:50:17 -0400 Subject: [PATCH 29/63] Updated docker image to 2.10.2 --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 7760c983..9d3ca383 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM apache/airflow:2.7.2-python3.11 +FROM apache/airflow:2.10.2-python3.11 USER root RUN apt-get update && \ From 6dec958663857cdbe4f9c46e4d7d16fc47455269 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 8 Oct 2024 14:55:46 -0400 Subject: [PATCH 30/63] fix kgx path error --- dags/roger/core/storage.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dags/roger/core/storage.py b/dags/roger/core/storage.py index c1a3e808..3e11ecc6 100644 --- a/dags/roger/core/storage.py +++ b/dags/roger/core/storage.py @@ -167,6 +167,8 @@ def merge_path(name, path: Path=None): if not os.path.exists(ROGER_DATA_DIR / 'merge'): os.makedirs(ROGER_DATA_DIR / 'merge') return str(ROGER_DATA_DIR / 'merge' / name) + if not os.path.exists(path): + os.makedirs(ROGER_DATA_DIR / 'merge') return str(path.joinpath(name)) def merged_objects(file_type, path=None): From d3f03ab1a5554d75731edb18832f568f468bca49 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 8 Oct 2024 16:42:53 -0400 Subject: [PATCH 31/63] fix kgx path error --- dags/roger/core/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/roger/core/storage.py b/dags/roger/core/storage.py index 3e11ecc6..4346cd92 100644 --- a/dags/roger/core/storage.py +++ b/dags/roger/core/storage.py @@ -168,7 +168,7 @@ def merge_path(name, path: Path=None): os.makedirs(ROGER_DATA_DIR / 'merge') return str(ROGER_DATA_DIR / 'merge' / name) if not os.path.exists(path): - os.makedirs(ROGER_DATA_DIR / 'merge') + os.makedirs(path) return str(path.joinpath(name)) def merged_objects(file_type, path=None): From 745cf79d5f05d7366fd856dbc63136469e7d972f Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Thu, 10 Oct 2024 17:52:59 -0400 Subject: [PATCH 32/63] Trying out the slim apache images to reduce vulnerability footprint --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 9d3ca383..63350c0f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM apache/airflow:2.10.2-python3.11 +FROM apache/airflow:slim-2.10.2-python3.11 USER root RUN apt-get update && \ From b86807e0fc24c70041f94c06cc7fc2113926514e Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Thu, 10 Oct 2024 18:00:28 -0400 Subject: [PATCH 33/63] Building general package update into dockerfile --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 63350c0f..5175d1e8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,7 @@ FROM apache/airflow:slim-2.10.2-python3.11 USER root RUN apt-get update && \ + apt-get upgrade -y && \ apt-get install -y git nano vim gcc COPY requirements.txt requirements.txt USER airflow From 5dbe502150523b50c4af67394b30ea0494308e90 Mon Sep 17 00:00:00 2001 From: "Michael T. 
Bacon" Date: Thu, 10 Oct 2024 18:09:42 -0400 Subject: [PATCH 34/63] Trying more rigorous dist-upgrade to try to get rid of vulerabilities that won't quit --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 5175d1e8..1933fcc0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ FROM apache/airflow:slim-2.10.2-python3.11 USER root RUN apt-get update && \ - apt-get upgrade -y && \ + apt-get dist-upgrade && \ apt-get install -y git nano vim gcc COPY requirements.txt requirements.txt USER airflow From 3f6e987a9949f4ecedfb1b21bfc0180264b97127 Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Thu, 10 Oct 2024 18:10:58 -0400 Subject: [PATCH 35/63] dist-upgrade also needs -y flag --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 1933fcc0..cda551c1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ FROM apache/airflow:slim-2.10.2-python3.11 USER root RUN apt-get update && \ - apt-get dist-upgrade && \ + apt-get dist-upgrade -y && \ apt-get install -y git nano vim gcc COPY requirements.txt requirements.txt USER airflow From a1673b69102be89642829ea570282de2a24b1490 Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Fri, 11 Oct 2024 12:22:29 -0400 Subject: [PATCH 36/63] Rolling back from slim image --- Dockerfile | 5 +++-- requirements.txt | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index cda551c1..ef772ddd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,9 @@ -FROM apache/airflow:slim-2.10.2-python3.11 +FROM apache/airflow:2.10.2-python3.11 USER root RUN apt-get update && \ - apt-get dist-upgrade -y && \ + apt-get upgrade -y && \ + apt-get uninstall libaom3 -y && \ apt-get install -y git nano vim gcc COPY requirements.txt requirements.txt USER airflow diff --git a/requirements.txt b/requirements.txt index 3a0ee223..651daec0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ elasticsearch==8.5.2 flatten-dict jsonpickle redisgraph-bulk-loader==0.12.3 +setuptools>=66 pytest PyYAML git+https://github.com/helxplatform/dug@2.13.2 From 0464da117b2b587d9ceac4e3e5041f944c073f8f Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Fri, 11 Oct 2024 12:44:36 -0400 Subject: [PATCH 37/63] Fixed apt-get syntax error --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index ef772ddd..d8722b54 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ FROM apache/airflow:2.10.2-python3.11 USER root RUN apt-get update && \ apt-get upgrade -y && \ - apt-get uninstall libaom3 -y && \ + apt-get remove libaom3 -y && \ apt-get install -y git nano vim gcc COPY requirements.txt requirements.txt USER airflow From 47df70d10d91a8ecbe52f648db35be3f31b8c378 Mon Sep 17 00:00:00 2001 From: "Michael T. 
Bacon" Date: Fri, 11 Oct 2024 15:00:46 -0400 Subject: [PATCH 38/63] Reverting develop to 2.7.2 for stability's sake, will revisit after upgrades finish --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index d8722b54..9cc3f819 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM apache/airflow:2.10.2-python3.11 +FROM apache/airflow:2.7.2-python3.11 USER root RUN apt-get update && \ From b721976c5ab507262aec57d4f7576bcdf03db780 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:23:21 -0400 Subject: [PATCH 39/63] Update Dockerfile --- Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 9cc3f819..bd4ef93d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,6 @@ FROM apache/airflow:2.7.2-python3.11 USER root RUN apt-get update && \ apt-get upgrade -y && \ - apt-get remove libaom3 -y && \ apt-get install -y git nano vim gcc COPY requirements.txt requirements.txt USER airflow From 6f505d8604167e2a33ffd4a01cb67ef8023c151b Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:25:08 -0400 Subject: [PATCH 40/63] Update Dockerfile --- Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index bd4ef93d..7760c983 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,6 @@ FROM apache/airflow:2.7.2-python3.11 USER root RUN apt-get update && \ - apt-get upgrade -y && \ apt-get install -y git nano vim gcc COPY requirements.txt requirements.txt USER airflow From be2b07d11cde611e5b325aa2cfa637a248d96954 Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Fri, 11 Oct 2024 15:43:38 -0400 Subject: [PATCH 41/63] Removed post-install package cleanup --- Dockerfile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9cc3f819..7760c983 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,8 +2,6 @@ FROM apache/airflow:2.7.2-python3.11 USER root RUN apt-get update && \ - apt-get upgrade -y && \ - apt-get remove libaom3 -y && \ apt-get install -y git nano vim gcc COPY requirements.txt requirements.txt USER airflow From 73812a1eb405f0d130f0516906958fdafac8ffc5 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Fri, 11 Oct 2024 21:36:29 -0400 Subject: [PATCH 42/63] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 651daec0..3397cb9e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ redisgraph-bulk-loader==0.12.3 setuptools>=66 pytest PyYAML -git+https://github.com/helxplatform/dug@2.13.2 +git+https://github.com/helxplatform/dug@patch-1-nemo-error orjson kg-utils==0.0.6 bmt==1.1.0 From 6d5194ccc1b1f3b5172d60a81e0f70a98454ea3a Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Mon, 21 Oct 2024 16:44:14 -0400 Subject: [PATCH 43/63] test merge issue --- dags/roger/tasks.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dags/roger/tasks.py b/dags/roger/tasks.py index f5451140..1f17f6ae 100755 --- a/dags/roger/tasks.py +++ b/dags/roger/tasks.py @@ -201,10 +201,10 @@ def avalon_commit_callback(context: DagContext, **kwargs): logger.error(e) # delete temp branch finally: - client._client.branches_api.delete_branch( - repository=repo, - branch=temp_branch_name - ) + # client._client.branches_api.delete_branch( + # repository=repo, + 
# branch=temp_branch_name + # ) logger.info(f"deleted temp branch {temp_branch_name}") logger.info(f"deleting local dir {local_path}") files_to_clean = glob.glob(local_path + '**', recursive=True) + [local_path] From 6db88f82879512c86ce335ec5946406d5abb7ad3 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Mon, 21 Oct 2024 17:48:44 -0400 Subject: [PATCH 44/63] fix merge issue --- dags/roger/tasks.py | 3 ++- requirements.txt | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dags/roger/tasks.py b/dags/roger/tasks.py index 1f17f6ae..8823d629 100755 --- a/dags/roger/tasks.py +++ b/dags/roger/tasks.py @@ -176,7 +176,8 @@ def avalon_commit_callback(context: DagContext, **kwargs): branch=temp_branch_name, repo=repo, # @TODO figure out how to pass real commit id here - commit_id=branch + commit_id=branch, + source_branch_name=branch ) # see what changes are going to be pushed from this branch to main branch diff --git a/requirements.txt b/requirements.txt index 3397cb9e..d89eb237 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,5 +11,5 @@ git+https://github.com/helxplatform/dug@patch-1-nemo-error orjson kg-utils==0.0.6 bmt==1.1.0 -git+https://github.com/helxplatform/avalon.git@v1.0.1 +git+https://github.com/helxplatform/avalon.git@develop linkml-runtime==1.6.0 From 5794dc16f61f4f1de424b3bfddb51e8618e96cb4 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Tue, 18 Feb 2025 11:42:37 -0500 Subject: [PATCH 45/63] modify radx pipeline (#111) * modify radx pipeline * pin to dug release --- dags/roger/pipelines/radx.py | 12 +++++++++++- requirements.txt | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/dags/roger/pipelines/radx.py b/dags/roger/pipelines/radx.py index 32491fb0..7ffae159 100644 --- a/dags/roger/pipelines/radx.py +++ b/dags/roger/pipelines/radx.py @@ -1,8 +1,18 @@ "Pipeline for BACPAC data" from roger.pipelines import DugPipeline +from roger.core import storage + class RadxPipeline(DugPipeline): - "Pipeline for BACPAC data set" + "Pipeline for Radx data set" pipeline_name = "radx" parser_name = "radx" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_kfdrc_path() + files = storage.get_files_recursive( + lambda file_name: file_name.endswith('.json'), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/requirements.txt b/requirements.txt index 2c9dbbf4..4877244f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ redisgraph-bulk-loader==0.12.3 setuptools>=66 pytest PyYAML -git+https://github.com/helxplatform/dug@2.13.4 +git+https://github.com/helxplatform/dug@2.13.6 orjson kg-utils==0.0.6 bmt==1.1.0 From dbfb5b2973624eac7e2fd0aaa8e1950943b1ddfd Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Wed, 26 Feb 2025 10:43:39 -0500 Subject: [PATCH 46/63] Heal ingest 2 18 (#112) * rename repos for heal ingest * change lib for bulk loader * change lib for bulk loader * cde?? * cde?? * cde?? * cde?? * cde?? 
* final push * final push --- dags/roger/core/bulkload.py | 5 +-- dags/roger/core/storage.py | 11 ++++--- dags/roger/models/kgx.py | 4 +-- dags/roger/pipelines/base.py | 15 +++++++-- .../roger/pipelines/heal_research_programs.py | 2 +- dags/roger/pipelines/heal_studies.py | 2 +- requirements.txt | 31 ++++++++++--------- 7 files changed, 42 insertions(+), 28 deletions(-) diff --git a/dags/roger/core/bulkload.py b/dags/roger/core/bulkload.py index 0f0fecee..616846dc 100644 --- a/dags/roger/core/bulkload.py +++ b/dags/roger/core/bulkload.py @@ -10,7 +10,7 @@ import requests import redis -from redisgraph_bulk_loader.bulk_insert import bulk_insert +from falkordb_bulk_loader.bulk_insert import bulk_insert from roger.config import get_default_config as get_config from roger.logger import get_logger @@ -347,8 +347,9 @@ def insert (self, input_data_path=None): # Edge label now no longer has 'biolink:' args.extend(("-R " + " -R ".join(edges_with_type)).split()) args.extend([f"--separator={self.separator}"]) - args.extend([f"--redis-url=redis://:{redisgraph['password']}@{redisgraph['host']}:{redisgraph['port']}"]) + args.extend([f"--server-url=redis://:{redisgraph['password']}@{redisgraph['host']}:{redisgraph['port']}"]) args.extend(['--enforce-schema']) + args.extend(['-e']) for lbl in collect_labels: args.extend([f'-i `{lbl}`:id', f'-f {lbl}:name', f'-f {lbl}:synonyms']) args.extend([f"{redisgraph['graph']}"]) diff --git a/dags/roger/core/storage.py b/dags/roger/core/storage.py index e3c9697c..b4869758 100644 --- a/dags/roger/core/storage.py +++ b/dags/roger/core/storage.py @@ -226,10 +226,13 @@ def dug_expanded_concept_objects(data_path=None, format="pickle"): os.path.join('*',f'expanded_concepts.{format}')) return sorted(glob.glob(file_pattern, recursive=True)) -def dug_extracted_elements_objects(): - file_pattern = dug_expanded_concepts_path( - os.path.join('*', 'extracted_graph_elements.pickle')) - return sorted(glob.glob(file_pattern)) +def dug_extracted_elements_objects(data_path=None, format="txt"): + if data_path: + file_pattern = os.path.join(data_path, '**', f'extracted_graph_elements.{format}') + else: + file_pattern = dug_expanded_concepts_path( + os.path.join('*', f'extracted_graph_elements.{format}')) + return sorted(glob.glob(file_pattern, recursive=True)) def dug_crawl_path(name): return str(ROGER_DATA_DIR / 'dug' / 'crawl' / name) diff --git a/dags/roger/models/kgx.py b/dags/roger/models/kgx.py index e80e65db..e35ab07e 100644 --- a/dags/roger/models/kgx.py +++ b/dags/roger/models/kgx.py @@ -310,7 +310,7 @@ def create_nodes_schema(self, input_data_path=None, output_data_path=None): log.info(f"Processing node : {node} counter : {counter}") counter += 1 - if not node['category']: + if not node.get('category'): category_error_nodes.add(node['id']) node['category'] = [BiolinkModel.root_type] @@ -341,7 +341,7 @@ def create_nodes_schema(self, input_data_path=None, output_data_path=None): if len(category_error_nodes): - log.warn(f"some nodes didn't have category assigned. " + log.warning(f"some nodes didn't have category assigned. " f"KGX file has errors." f"Nodes {len(category_error_nodes)}." f"Showing first 10: {list(category_error_nodes)[:10]}." 
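A side note on the kgx.py hunk above: the change from node['category'] to node.get('category') matters because direct indexing raises KeyError when a node record has no "category" key at all, while .get() returns None, so missing and empty categories can both fall back to the Biolink root type. A minimal, self-contained sketch of that defaulting idea (not code from this repository; "biolink:NamedThing" is assumed here as a stand-in for BiolinkModel.root_type):

ROOT_TYPE = "biolink:NamedThing"  # assumption: stands in for BiolinkModel.root_type

def ensure_category(node: dict) -> dict:
    # .get() handles both a missing 'category' key and an empty list;
    # node['category'] would raise KeyError on nodes without the key.
    if not node.get("category"):
        node["category"] = [ROOT_TYPE]
    return node

print(ensure_category({"id": "X:1"}))                               # missing key -> defaulted
print(ensure_category({"id": "X:2", "category": []}))               # empty list  -> defaulted
print(ensure_category({"id": "X:3", "category": ["biolink:Gene"]})) # left unchanged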
diff --git a/dags/roger/pipelines/base.py b/dags/roger/pipelines/base.py index 2a1b605e..9852147c 100644 --- a/dags/roger/pipelines/base.py +++ b/dags/roger/pipelines/base.py @@ -953,12 +953,21 @@ def index_concepts(self, to_string=False, "Index concepts from expanded concept files" # These are concepts that have knowledge graphs from tranql # clear out concepts and kg indicies from previous runs - # self.clear_concepts_index() - # self.clear_kg_index() + self.clear_concepts_index() + self.clear_kg_index() expanded_concepts_files = storage.dug_expanded_concept_objects( input_data_path, format="txt") for file_ in expanded_concepts_files: - concepts = jsonpickle.decode(storage.read_object(file_)) + concepts = jsonpickle.decode(storage.read_object(file_)) self._index_concepts(concepts=concepts) + + if self.config.indexing.node_to_element_queries: + log.info("*******************") + + extracted_elements_files = storage.dug_extracted_elements_objects(data_path=input_data_path) + log.info(f"{extracted_elements_files}") + for file_ in extracted_elements_files: + log.info(f"reading file {file_}") + self.index_elements(file_) output_log = self.log_stream.getvalue() if to_string else '' return output_log diff --git a/dags/roger/pipelines/heal_research_programs.py b/dags/roger/pipelines/heal_research_programs.py index d95a4497..bfec3f83 100644 --- a/dags/roger/pipelines/heal_research_programs.py +++ b/dags/roger/pipelines/heal_research_programs.py @@ -5,7 +5,7 @@ class HealResearchProgramPipeline(DugPipeline): "Pipeline for Heal-research-programs data set" - pipeline_name = "heal-research-programs" + pipeline_name = "heal-mds-research-networks" parser_name = "heal-research" def get_objects(self, input_data_path=None): diff --git a/dags/roger/pipelines/heal_studies.py b/dags/roger/pipelines/heal_studies.py index cf78c042..a08e8115 100644 --- a/dags/roger/pipelines/heal_studies.py +++ b/dags/roger/pipelines/heal_studies.py @@ -5,7 +5,7 @@ class HealStudiesPipeline(DugPipeline): "Pipeline for Heal-studies data set" - pipeline_name = "heal-studies" + pipeline_name = "heal-mds-studies" parser_name = "heal-studies" def get_objects(self, input_data_path=None): diff --git a/requirements.txt b/requirements.txt index 4877244f..743a2b49 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,16 @@ -boto3==1.18.23 -botocore==1.21.23 -elasticsearch==8.5.2 -flatten-dict -jsonpickle -redisgraph-bulk-loader==0.12.3 -setuptools>=66 -pytest -PyYAML -git+https://github.com/helxplatform/dug@2.13.6 -orjson -kg-utils==0.0.6 -bmt==1.1.0 -git+https://github.com/helxplatform/avalon.git@v1.1.0 -linkml-runtime==1.6.0 +boto3==1.18.23 +botocore==1.21.23 +elasticsearch==8.5.2 +flatten-dict +jsonpickle +git+https://github.com/falkordb/falkordb-bulk-loader.git@v1.0.6 +#redisgraph-bulk-loader==0.12.3 +setuptools>=66 +pytest +PyYAML +git+https://github.com/helxplatform/dug@develop +orjson +kg-utils==0.0.6 +bmt==1.1.0 +git+https://github.com/helxplatform/avalon.git@v1.1.0 +linkml-runtime==1.6.0 From 06a4ed1a8090a3c735222909616c7e880d94c575 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 26 Feb 2025 12:04:30 -0500 Subject: [PATCH 47/63] test node type --- dags/roger/core/bulkload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/roger/core/bulkload.py b/dags/roger/core/bulkload.py index 616846dc..a8ddbc15 100644 --- a/dags/roger/core/bulkload.py +++ b/dags/roger/core/bulkload.py @@ -53,7 +53,7 @@ def create_nodes_csv_file(self, input_data_path=None, output_data_path=None): merged_nodes_file = 
storage.merged_objects('nodes', input_data_path) counter = 1 for node in storage.json_line_iter(merged_nodes_file): - if not node['category']: + if not node.get('category'): category_error_nodes.add(node['id']) node['category'] = [BiolinkModel.root_type] index = self.biolink.get_leaf_class(node['category']) From faf49756063c8f0539ae613d3ab0cae5de5a7b53 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Mon, 24 Mar 2025 11:08:40 -0400 Subject: [PATCH 48/63] Heal ingest 2 18 (#114) * rename repos for heal ingest * change lib for bulk loader * change lib for bulk loader * cde?? * cde?? * cde?? * cde?? * cde?? * final push * final push * add study url --- dags/roger/pipelines/base.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dags/roger/pipelines/base.py b/dags/roger/pipelines/base.py index 9852147c..eab9159c 100644 --- a/dags/roger/pipelines/base.py +++ b/dags/roger/pipelines/base.py @@ -308,11 +308,16 @@ def convert_to_kgx_json(self, elements, written_nodes=None): if not isinstance(element, DugElement): continue study_id = element.collection_id + study_link = element.collection_action + study_desc = element.collection_desc + if study_id not in written_nodes: nodes.append({ "id": study_id, "category": ["biolink:Study"], - "name": study_id + "name": study_id, + "url": study_link, + "description": study_desc }) written_nodes.add(study_id) From 048e15f6e231239479ff738b1684dc73efef7401 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Wed, 26 Mar 2025 10:48:36 -0400 Subject: [PATCH 49/63] Bitnami (#117) * checkpoint * pushing docker changes * Executor tests 2.10 (#115) * test out bash operator * task name * pushing * revert python executor , with no excutor config * docker file and requriements * everything latest --------- Co-authored-by: waTeim --- Dockerfile | 34 +++++++++++++++++++++++++++++----- dags/roger/tasks.py | 7 +++++-- requirements.txt | 30 ++++++++++++++---------------- 3 files changed, 48 insertions(+), 23 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7760c983..47d2c13f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,33 @@ -FROM apache/airflow:2.7.2-python3.11 +FROM bitnami/airflow:2.10.5-debian-12-r7 USER root -RUN apt-get update && \ - apt-get install -y git nano vim gcc +RUN apt-get update && apt-get install -y git nano vim gcc rustc cargo +#RUN useradd -u 1001 -ms /bin/bash airflow && chown -R airflow /home/airflow COPY requirements.txt requirements.txt -USER airflow -RUN pip install -r requirements.txt +RUN source /opt/bitnami/airflow/venv/bin/activate && CARGO_HOME=/tmp/.cargo && \ + pip install setuptools wheel && \ + pip install -r requirements.txt + RUN rm -f requirements.txt + +## Vul patches +## Python lib patches on airflow python env +RUN source /opt/bitnami/airflow/venv/bin/activate pip install --upgrade \ + flask-appbuilder==4.5.3 \ + cryptography==44.0.1 \ + werkzeug==3.0.6 \ + urllib3==2.2.2 +RUN source /opt/bitnami/airflow/venv/bin/activate pip uninstall -y \ + apache-airflow-providers-mysql==6.2.0 + +# Uninstall these from non airflow python env +RUN pip install --upgrade \ + flask-appbuilder==4.5.3 \ + cryptography==44.0.1 \ + werkzeug==3.0.6 \ + urllib3==2.2.2 +RUN apt-get autoremove -y vim +RUN apt-get autoremove -y binutils +RUN apt-get autoremove -y linux-libc-dev + +USER airflow diff --git a/dags/roger/tasks.py b/dags/roger/tasks.py index 678f7137..5fe5ff90 100755 --- a/dags/roger/tasks.py +++ b/dags/roger/tasks.py @@ -9,11 +9,13 @@ 
from airflow.models import DAG from airflow.models.dag import DagContext from airflow.models.taskinstance import TaskInstance +from airflow.operators.bash import BashOperator from typing import Union from pathlib import Path import glob import shutil + from roger.config import config, RogerConfig from roger.logger import get_logger from roger.pipelines.base import DugPipeline @@ -307,7 +309,8 @@ def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos = :param dag: dag to add task to. :param name: The name of the task. :param a_callable: The code to run in this task. - """ + """ + # these are actual arguments passed down to the task function op_kwargs = { "python_callable": a_callable, @@ -324,7 +327,7 @@ def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos = python_operator_args = { "task_id": name, "python_callable":task_wrapper, - "executor_config" : get_executor_config(), + # "executor_config" : get_executor_config(), "dag": dag, "provide_context" : True } diff --git a/requirements.txt b/requirements.txt index ef7b40e5..83d7c320 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,14 @@ -boto3==1.18.23 -botocore==1.21.23 -elasticsearch==8.5.2 -flatten-dict -jsonpickle -git+https://github.com/falkordb/falkordb-bulk-loader.git@v1.0.6 -#redisgraph-bulk-loader==0.12.3 -setuptools>=66 -pytest -PyYAML -git+https://github.com/helxplatform/dug@develop -orjson -kg-utils==0.0.6 -bmt==1.1.0 -git+https://github.com/helxplatform/avalon.git@v1.1.0 -linkml-runtime==1.6.0 \ No newline at end of file +elasticsearch==8.5.2 +flatten-dict +jsonpickle +git+https://github.com/falkordb/falkordb-bulk-loader.git@v1.0.6 +setuptools>=66 +pytest +PyYAML +git+https://github.com/helxplatform/dug@2.13.8 +orjson==3.9.15 +git+https://github.com/helxplatform/kg_utils.git@v0.0.10 +git+https://github.com/helxplatform/python-stringcase@1.2.1 +bmt==1.4.4 +git+https://github.com/helxplatform/avalon.git@v1.1.0 + From ddb3663f4f7290f598b6a11781267ec8eaa30cad Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Mon, 19 May 2025 14:15:16 -0400 Subject: [PATCH 50/63] remove clear index --- dags/roger/pipelines/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dags/roger/pipelines/base.py b/dags/roger/pipelines/base.py index eab9159c..24170f69 100644 --- a/dags/roger/pipelines/base.py +++ b/dags/roger/pipelines/base.py @@ -958,8 +958,8 @@ def index_concepts(self, to_string=False, "Index concepts from expanded concept files" # These are concepts that have knowledge graphs from tranql # clear out concepts and kg indicies from previous runs - self.clear_concepts_index() - self.clear_kg_index() + # self.clear_concepts_index() + # self.clear_kg_index() expanded_concepts_files = storage.dug_expanded_concept_objects( input_data_path, format="txt") for file_ in expanded_concepts_files: From 27cc8806fb3ed37ccf43a652133c343c2289596d Mon Sep 17 00:00:00 2001 From: ykale <14827177+yskale@users.noreply.github.com> Date: Mon, 19 May 2025 16:36:22 -0400 Subject: [PATCH 51/63] Update requirements.txt fixed h11 version --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 83d7c320..b47e4b5a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ git+https://github.com/helxplatform/kg_utils.git@v0.0.10 git+https://github.com/helxplatform/python-stringcase@1.2.1 bmt==1.4.4 git+https://github.com/helxplatform/avalon.git@v1.1.0 +h11>=0.16.0 From 
2cea20725c387331becbcfdc9d86eaac406cd450 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Tue, 27 May 2025 11:40:29 -0400 Subject: [PATCH 52/63] fix study name in graph (#121) --- dags/roger/pipelines/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dags/roger/pipelines/base.py b/dags/roger/pipelines/base.py index 24170f69..ba278deb 100644 --- a/dags/roger/pipelines/base.py +++ b/dags/roger/pipelines/base.py @@ -310,12 +310,13 @@ def convert_to_kgx_json(self, elements, written_nodes=None): study_id = element.collection_id study_link = element.collection_action study_desc = element.collection_desc + study_name = element.collection_name or element.collection_id if study_id not in written_nodes: nodes.append({ "id": study_id, "category": ["biolink:Study"], - "name": study_id, + "name": study_name, "url": study_link, "description": study_desc }) From 30c4e8d0e986223091713055bc87a5dd97d35353 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Wed, 28 May 2025 15:03:19 -0400 Subject: [PATCH 53/63] comment clear index. rolledback during merge --- dags/roger/pipelines/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dags/roger/pipelines/base.py b/dags/roger/pipelines/base.py index 09e8284b..d9a41493 100644 --- a/dags/roger/pipelines/base.py +++ b/dags/roger/pipelines/base.py @@ -960,8 +960,8 @@ def index_concepts(self, to_string=False, "Index concepts from expanded concept files" # These are concepts that have knowledge graphs from tranql # clear out concepts and kg indicies from previous runs - self.clear_concepts_index() - self.clear_kg_index() + # self.clear_concepts_index() + # self.clear_kg_index() expanded_concepts_files = storage.dug_expanded_concept_objects( input_data_path, format="txt") for file_ in expanded_concepts_files: From 3a8181156ae67c25d5f13f84355ae9b3b56f293e Mon Sep 17 00:00:00 2001 From: ykale <14827177+yskale@users.noreply.github.com> Date: Tue, 3 Jun 2025 12:02:48 -0400 Subject: [PATCH 54/63] Picsure data ingest test (#118) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test * changed the pipeline name * parser name updated * update pipeline name * update the get_object/files * changed the lakefs dataset name for testing * update the pipelines for new programs in BDC * init parser and crawler objects once and retry * fix logger -> log typo * remove clear index (#119) * update with develop (#124) * Sync develop with main (#116) * Update _version.py (#86) * Update _version.py * Rti merge (#84) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' 
separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent * adding v5.0 * cde-links branch * pin linkml * Update config.yaml collection_action to action * pop total items before result * print extracted elements * Update requirements.txt * Keep edge provenance (#94) * Update kgx.py can't delete edge keys while looping over them. * just collect then update * Update requirements.txt (#93) * Pipeline parameterize restructure (#95) * Add heal parsers (#96) * Add heal parsers (#97) * Radx pipeline (#99) --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * pin avalon * deleted jenkins and added workflows * unlinked helx-actions * testing paths * testing again * d * tests * commented out pytest * try again for bandit * commented out bandit * changed dag to dags * Added fixes * Bagel (#103) * bump dug version * adding bdc new pipelines * adding curesc * adding bagel config * add test parser * add score threshold * point to dug develop * Dbgap programs (#104) * bump dug version * adding bdc new pipelines * adding curesc * fix up merge conflict * adding bagel config parse to bool * Dev sync main (#106) * release sync (#100) * remove jenkins file * bump apache version * revert airflow version * Updated docker image to 2.10.2 * fix kgx path error * fix kgx path error * Trying out the slim apache images to reduce vulnerability footprint * Building general package update into dockerfile * Trying more rigorous dist-upgrade to try to get rid of vulerabilities that won't quit * dist-upgrade also needs -y flag * Rolling back from slim image * Fixed apt-get syntax error * Reverting develop to 2.7.2 for stability's sake, will revisit after upgrades finish * Update Dockerfile * Update Dockerfile * Removed post-install package cleanup * Update requirements.txt * test merge issue * fix merge issue * modify radx pipeline (#111) * Heal ingest 2 18 (#112) * test node type * Heal ingest 2 18 (#114) * Bitnami (#117) * Executor tests 2.10 (#115) --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> Co-authored-by: Patrick Hachicho Co-authored-by: Patrick hachicho <105758539+pchachicho@users.noreply.github.com> Co-authored-by: waTeim * fix study name in graph (#121) * comment clear index. rolledback during merge * Update requirements.txt with Dug V2.13.10 which have all the new changes from new_bdc_parser branch --------- Co-authored-by: YSK Co-authored-by: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. 
Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> Co-authored-by: Patrick Hachicho Co-authored-by: Patrick hachicho <105758539+pchachicho@users.noreply.github.com> Co-authored-by: waTeim --- dags/roger/pipelines/base.py | 36 +++++++++++++++++++-------- dags/roger/pipelines/bdc_pipelines.py | 28 ++++++++++++++------- dags/roger/pipelines/picsure_test.py | 26 +++++++++++++++++++ requirements.txt | 2 +- 4 files changed, 71 insertions(+), 21 deletions(-) create mode 100644 dags/roger/pipelines/picsure_test.py diff --git a/dags/roger/pipelines/base.py b/dags/roger/pipelines/base.py index d9a41493..e108443e 100644 --- a/dags/roger/pipelines/base.py +++ b/dags/roger/pipelines/base.py @@ -216,6 +216,24 @@ def get_annotator(self): ) return annotator + def init_annotator(self, max_retries=5, base_delay=1, max_delay=10): + attempt = 0 + while attempt < max_retries: + try: + log.info("Initializing annotator") + annotator = self.get_annotator() + return annotator # success + except Exception as e: + attempt += 1 + if attempt == max_retries: + log.error("Max retries reached when creating annotator. Failing with error: %s", e) + raise + delay = min(base_delay * (2 ** (attempt - 1)), max_delay) + delay += random.uniform(0, 1) # add jitter + log.warning("Error occurred: %s. Retrying in %.2f seconds...", e, delay) + time.sleep(delay) + + def annotate_files(self, parsable_files, output_data_path=None): """ Annotates a Data element file using a Dug parser. @@ -226,11 +244,14 @@ def annotate_files(self, parsable_files, output_data_path=None): if not output_data_path: output_data_path = storage.dug_annotation_path('') log.info("Parsing files") + log.info("Intializing parser") + parser = self.get_parser() + log.info("Done intializing parser") + annotator = self.init_annotator() + log.info("Done intializing annotator") for _, parse_file in enumerate(parsable_files): log.debug("Creating Dug Crawler object on parse_file %s at %d of %d", - parse_file, _ , len(parsable_files)) - parser = self.get_parser() - annotator = self.get_annotator() + parse_file, _ , len(parsable_files)) crawler = Crawler( crawl_file=parse_file, parser=parser, @@ -247,14 +268,7 @@ def annotate_files(self, parsable_files, output_data_path=None): output_data_path, current_file_name) elements_file = os.path.join(elements_file_path, 'elements.txt') concepts_file = os.path.join(elements_file_path, 'concepts.txt') - # This is a file that the crawler will later populate. We start here - # by creating an empty elements file. - # This also creates output dir if it doesn't exist. - # elements_json = os.path.join(elements_file_path, - # 'element_file.json') - # log.debug("Creating empty file: %s", elements_json) - # storage.write_object({}, elements_json) - + # Use the specified parser to parse the parse_file into elements. 
log.debug("Parser is %s", str(parser)) elements = parser(parse_file) diff --git a/dags/roger/pipelines/bdc_pipelines.py b/dags/roger/pipelines/bdc_pipelines.py index 5a945641..d4c6436d 100644 --- a/dags/roger/pipelines/bdc_pipelines.py +++ b/dags/roger/pipelines/bdc_pipelines.py @@ -4,29 +4,29 @@ class BIOLINCCdbGaPPipeline(DugPipeline): "Pipeline for the dbGaP data set" - pipeline_name = 'biolincc' + pipeline_name = 'bdc-biolincc' parser_name = 'biolincc' class covid19dbGaPPipeline(DugPipeline): "Pipeline for the dbGaP data set" - pipeline_name = 'covid19-dbgap' + pipeline_name = 'bdc-covid19' parser_name = 'covid19' class dirDbGaPPipeline(DugPipeline): - pipeline_name = "dir-dbgap" + pipeline_name = "bdc-dir" parser_name = "dir" class LungMapDbGaPPipeline(DugPipeline): - pipeline_name = "lungmap-dbgap" + pipeline_name = "bdc-lungmap" parser_name = "lungmap" class nsrrDbGaPPipeline(DugPipeline): - pipeline_name = "nsrr-dbgap" + pipeline_name = "bdc-nsrr" parser_name = "nsrr" class ParentDbGaPPipeline(DugPipeline): - pipeline_name = "parent-dbgap" + pipeline_name = "bdc-parent" parser_name = "parent" class PCGCDbGaPPipeline(DugPipeline): @@ -34,15 +34,25 @@ class PCGCDbGaPPipeline(DugPipeline): parser_name = "pcgc" class RecoverDbGaPPipeline(DugPipeline): - pipeline_name = "recover-dbgap" + pipeline_name = "bdc-recover" parser_name = "recover" class TopmedDBGaPPipeline(DugPipeline): - pipeline_name = "topmed-gen3-dbgap" + pipeline_name = "bdc-topmed" parser_name = "topmeddbgap" class CureSCPipeline(DugPipeline): - pipeline_name = "curesc-dbgap" + pipeline_name = "bdc-curesc" parser_name = "curesc" +class HeartFailurePipeline(DugPipeline): + pipeline_name = "bdc-heartfailure" + parser_name = "heartfailure" +class ImagingPipeline(DugPipeline): + pipeline_name = "bdc-imaging" + parser_name = "imaging" + +class RedsPipeline(DugPipeline): + pipeline_name = "bdc-reds" + parser_name = "reds" \ No newline at end of file diff --git a/dags/roger/pipelines/picsure_test.py b/dags/roger/pipelines/picsure_test.py new file mode 100644 index 00000000..bea4469f --- /dev/null +++ b/dags/roger/pipelines/picsure_test.py @@ -0,0 +1,26 @@ +from roger.pipelines import DugPipeline +from roger.core import storage +from roger.logger import logger + + +class PicSure(DugPipeline): + "Pipeline for BACPAC data set" + pipeline_name = "bdc-test6" #lakefs + parser_name = "dbgap" + + def get_objects(self, input_data_path=None): + """Retrieve anvil objects + + This code is imported from roger.core.storage.dug_anvil_objects + """ + if not input_data_path: + input_data_path = storage.dug_input_files_path( + self.files_dir) + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + logger.info("**********") + logger.info(files) + return sorted([str(f) for f in files]) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 96cd3ef4..871f52c2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ git+https://github.com/falkordb/falkordb-bulk-loader.git@v1.0.6 setuptools>=66 pytest PyYAML -git+https://github.com/helxplatform/dug@2.13.8 +git+https://github.com/helxplatform/dug@2.13.10 orjson==3.9.15 git+https://github.com/helxplatform/kg_utils.git@v0.0.10 git+https://github.com/helxplatform/python-stringcase@1.2.1 From 01827386a0385351db0b5b9fce356a6fc8490cbd Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Tue, 3 Jun 2025 16:40:13 -0400 Subject: 
[PATCH 55/63] fix dug version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 871f52c2..c113288f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ git+https://github.com/falkordb/falkordb-bulk-loader.git@v1.0.6 setuptools>=66 pytest PyYAML -git+https://github.com/helxplatform/dug@2.13.10 +git+https://github.com/helxplatform/dug@v2.13.10 orjson==3.9.15 git+https://github.com/helxplatform/kg_utils.git@v0.0.10 git+https://github.com/helxplatform/python-stringcase@1.2.1 From 14d09f3e67b26565e2f2a61eebb9c937f76d2172 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Tue, 9 Sep 2025 11:46:56 -0400 Subject: [PATCH 56/63] Picsure data ingest test (#127) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test * changed the pipeline name * parser name updated * update pipeline name * update the get_object/files * changed the lakefs dataset name for testing * update the pipelines for new programs in BDC * init parser and crawler objects once and retry * fix logger -> log typo * remove clear index (#119) * update with develop (#124) * Sync develop with main (#116) * Update _version.py (#86) * Update _version.py * Rti merge (#84) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. * Update requirements.txt * Update .gitignore * Update dug_utils.py Handle Error when curie not found in validate * Update __init__.py * Update config.yaml * Update dev-config.yaml * Update docker-compose.yaml * fixed docker-compose * adding back postgres volume to docker compose * env correction , docker compose updates --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent * adding v5.0 * cde-links branch * pin linkml * Update config.yaml collection_action to action * pop total items before result * print extracted elements * Update requirements.txt * Keep edge provenance (#94) * Update kgx.py * Update kgx.py * Update kgx.py can't delete edge keys while looping over them. 
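The "can't delete edge keys while looping over them" note above refers to a general Python constraint: removing keys from a dict while iterating over it raises "RuntimeError: dictionary changed size during iteration". A minimal stand-alone sketch of the collect-then-update approach follows; the edge data and key names are invented for illustration and are not the actual kgx.py structures.

    # Drop selected keys from edge dicts without mutating a dict
    # while iterating over it: collect first, delete afterwards.
    edges = [
        {"subject": "A", "object": "B", "predicate": "biolink.related_to", "tmp_flag": 1},
        {"subject": "B", "object": "C", "predicate": "biolink.related_to"},
    ]
    keys_to_drop = {"tmp_flag"}  # hypothetical bookkeeping key

    for edge in edges:
        doomed = [k for k in edge if k in keys_to_drop]  # collect ...
        for k in doomed:                                 # ... then update
            del edge[k]

    print(edges)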
* just collect then update * Update requirements.txt (#93) * Pipeline parameterize restructure (#95) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * Parameterized annotate tasks with input_data_path and output_data_path * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. * Parameterized annotate tasks with input_data_path and output_data_path (#85) * adding lakefs changes to roger-2.0 * point avalon to vg1 branch * change avalon dep * update airflow * fix avalon tag typo * update jenkins to tag version on main branch only * update jenkins to tag version * update jenkins to tag version * psycopg2 installation * add cncf k8s req * use airflow non-slim * simplified for testing * simplified for testing * change dag name * Erroneous parameter passed, should not be None * adding pre-exec * adding pre-exec * adding pre-exec * typo preexec * typo preexec * fix context * get files from repo * get files from repo * get files from repo * get files from repo * First shot at moving pipeline into base class and implementing. 
Anvil pipeline not complete * Syntax fix, docker image version bump to airflow 2.7.2-python3.11 * update storage dir * update remove dir code * update remove dir code * remote path to * * fix input dir for annotators * fix input dir for annotators * fix input dir for annotators * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * adding branch info on lakefs config * callback push to branch * back to relative import * reformat temp branch name based on unique task id * add logging * add logging * convert posix path to str for avalon * add extra / to root path * New dag created using DugPipeline subclasses * EmptyOperator imported from wrong place * import and syntax fixes * utterly silly syntax error * Added anvil to default input data sets for testing purposes * adding / to local path * commit meta task args empty string * add merge logic * add merge logic * upstream task dir pull for downstream task * Switched from subdag to taskgroup because latest Airflow depricated subdag * Added BACPAC pipeline object * Temporarily ignoring configuration variable for enabled datasets for testing * Passed dag in to create task group to see if it helps dag errors * Fixed silly syntax error * adding input / output dir params for make kgx * Trying different syntax to make taskgroups work. * adding input / output dir params for make kgx * Parsing, syntax, pylint fixes * adding input / output dir params for make kgx * Added pipeline name to task group name to ensure uniqueness * oops, moved something out of scope. Fixed * Filled out pipeline with methods from dug_utils. Needs data path changes * Finished implementing input_data_path and output_data_path handling, pylint cleanup * Update requirements.txt * adding toggle to avoid sending config obj * adding toggle to avoid sending config obj * disable to string for test * control pipelines for testing * add self to anvil get files * add log stream to make it available * typo fix * correcting branch id * adding source repo * adding source repo * patch name-resolver response * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * dug pipeline edit * recurisvely find recursively * recurisvely find recursively * setup output path for crawling * all task functions should have input and output params * adding annotation as upstream for validate index * revamp create task , and task wrapper * add validate concepts index task * adding concept validation * add index_variables task as dependecy for validate concepts * add index_variables task as dependecy for validate concepts * await client exist * await client exist * concepts not getting picked up for indexing * concepts not getting picked up for indexing * fix search elements * converting annotation output to json * json format annotation outputs * adding support for json format elements and concepts read * json back to dug objects * fixing index valriables with json objects * indetation and new line for better change detection :? 
* indetation and new line for better change detection * treat dictionary concepts as dictionary * read concepts json as a dict * concepts files are actually file paths * debug message * make output jsonable * clear up dir after commit , and delete unmerged branch even if no changes * don`t clear indexes, parallel dataset processing will be taxed * memory leak? * memory leak? * memory leak? * dumping pickles to debug locally * find out why concepts are being added to every other element * find out why concepts are being added to every other element * pointless shuffle 🤷‍♂️ * revert back in time * back to sanitize dug * output just json for annotation * adding jsonpickle * jsonpickle 🥒 * unpickle for index * unpickle for validate index * crawling fixes * crawling fixes * crawling validation fixes * fix index concepts * fix makekgx * adding other bdc pipelines * adding pipeline paramters to be able to configure per instance * fix * add input dataset for pipelines * Adding README to document how to create data set-specific pipelines * catchup on base.py * Added dbgap and nida pipelines * fix import errors * annotator modules added by passing config val (#90) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 --------- Co-authored-by: YaphetKG * Add heal parsers (#96) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph 
structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Add heal parsers (#97) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Radx pipeline (#99) * point to large download * fix schema path * debug bulk input dir * fix schema read * fix schema read * fix schema read * commenting steup dir for test * adding logs * fix path stuff * add commented stuff back in * testing radx parser * adding parser * skip indexing vars with no id * adding indexes as part of bulk loader paramters * fix id index cli arg * fix local cli * dug latest --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Michael T. 
Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * pin avalon * deleted jenkins and added workflows * unlinked helx-actions * testing paths * testing again * d * tests * commented out pytest * try again for bandit * commented out bandit * changed dag to dags * Added fixes * Bagel (#103) * bump dug version * adding bdc new pipelines * adding curesc * adding bagel config * add test parser * add score threshold * point to dug develop * Dbgap programs (#104) * bump dug version * adding bdc new pipelines * adding curesc * fix up merge conflict * adding bagel config parse to bool * Dev sync main (#106) * release sync (#100) * Update _version.py (#86) * Update _version.py * Rti merge (#84) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. * Update requirements.txt * Update .gitignore * Update dug_utils.py Handle Error when curie not found in validate * Update __init__.py * Update config.yaml * Update dev-config.yaml * Update docker-compose.yaml * fixed docker-compose * adding back postgres volume to docker compose * env correction , docker compose updates --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent * adding v5.0 * cde-links branch * pin linkml * Update config.yaml collection_action to action * pop total items before result * print extracted elements * Update requirements.txt * Keep edge provenance (#94) * Update kgx.py * Update kgx.py * Update kgx.py can't delete edge keys while looping over them. * just collect then update * Update requirements.txt (#93) * Pipeline parameterize restructure (#95) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. 
* Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * Parameterized annotate tasks with input_data_path and output_data_path * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. * Parameterized annotate tasks with input_data_path and output_data_path (#85) * adding lakefs changes to roger-2.0 * point avalon to vg1 branch * change avalon dep * update airflow * fix avalon tag typo * update jenkins to tag version on main branch only * update jenkins to tag version * update jenkins to tag version * psycopg2 installation * add cncf k8s req * use airflow non-slim * simplified for testing * simplified for testing * change dag name * Erroneous parameter passed, should not be None * adding pre-exec * adding pre-exec * adding pre-exec * typo preexec * typo preexec * fix context * get files from repo * get files from repo * get files from repo * get files from repo * First shot at moving pipeline into base class and implementing. Anvil pipeline not complete * Syntax fix, docker image version bump to airflow 2.7.2-python3.11 * update storage dir * update remove dir code * update remove dir code * remote path to * * fix input dir for annotators * fix input dir for annotators * fix input dir for annotators * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * adding branch info on lakefs config * callback push to branch * back to relative import * reformat temp branch name based on unique task id * add logging * add logging * convert posix path to str for avalon * add extra / to root path * New dag created using DugPipeline subclasses * EmptyOperator imported from wrong place * import and syntax fixes * utterly silly syntax error * Added anvil to default input data sets for testing purposes * adding / to local path * commit meta task args empty string * add merge logic * add merge logic * upstream task dir pull for downstream task * Switched from subdag to taskgroup because latest Airflow depricated subdag * Added BACPAC pipeline object * Temporarily ignoring configuration variable for enabled datasets for testing * Passed dag in to create task group to see if it helps dag errors * Fixed silly syntax error * adding input / output dir params for make kgx * Trying different syntax to make taskgroups work. 
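The "Switched from subdag to taskgroup" and "Trying different syntax to make taskgroups work" items above refer to Airflow's TaskGroup API, which replaces the deprecated SubDAG mechanism for grouping tasks. A minimal, self-contained sketch of that wiring is shown below; the DAG id, group id and task ids are placeholders, and this is not the actual create_pipeline_taskgroup implementation.

    from datetime import datetime

    from airflow.models import DAG
    from airflow.operators.empty import EmptyOperator
    from airflow.utils.task_group import TaskGroup

    with DAG(dag_id="taskgroup_sketch", start_date=datetime(2024, 1, 1),
             schedule_interval=None) as dag:
        init = EmptyOperator(task_id="init")
        finish = EmptyOperator(task_id="finish")

        # One group per data set, mirroring the per-pipeline grouping
        # described above; "demo_set" is a placeholder name.
        with TaskGroup(group_id="demo_set_pipeline") as group:
            annotate = EmptyOperator(task_id="annotate")
            index = EmptyOperator(task_id="index")
            annotate >> index

        init >> group >> finish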
* adding input / output dir params for make kgx * Parsing, syntax, pylint fixes * adding input / output dir params for make kgx * Added pipeline name to task group name to ensure uniqueness * oops, moved something out of scope. Fixed * Filled out pipeline with methods from dug_utils. Needs data path changes * Finished implementing input_data_path and output_data_path handling, pylint cleanup * Update requirements.txt * adding toggle to avoid sending config obj * adding toggle to avoid sending config obj * disable to string for test * control pipelines for testing * add self to anvil get files * add log stream to make it available * typo fix * correcting branch id * adding source repo * adding source repo * patch name-resolver response * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * dug pipeline edit * recurisvely find recursively * recurisvely find recursively * setup output path for crawling * all task functions should have input and output params * adding annotation as upstream for validate index * revamp create task , and task wrapper * add validate concepts index task * adding concept validation * add index_variables task as dependecy for validate concepts * add index_variables task as dependecy for validate concepts * await client exist * await client exist * concepts not getting picked up for indexing * concepts not getting picked up for indexing * fix search elements * converting annotation output to json * json format annotation outputs * adding support for json format elements and concepts read * json back to dug objects * fixing index valriables with json objects * indetation and new line for better change detection :? * indetation and new line for better change detection * treat dictionary concepts as dictionary * read concepts json as a dict * concepts files are actually file paths * debug message * make output jsonable * clear up dir after commit , and delete unmerged branch even if no changes * don`t clear indexes, parallel dataset processing will be taxed * memory leak? * memory leak? * memory leak? 
* dumping pickles to debug locally * find out why concepts are being added to every other element * find out why concepts are being added to every other element * pointless shuffle 🤷‍♂️ * revert back in time * back to sanitize dug * output just json for annotation * adding jsonpickle * jsonpickle 🥒 * unpickle for index * unpickle for validate index * crawling fixes * crawling fixes * crawling validation fixes * fix index concepts * fix makekgx * adding other bdc pipelines * adding pipeline paramters to be able to configure per instance * fix * add input dataset for pipelines * Adding README to document how to create data set-specific pipelines * catchup on base.py * Added dbgap and nida pipelines * fix import errors * annotator modules added by passing config val (#90) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 --------- Co-authored-by: YaphetKG * Add heal parsers (#96) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix 
globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Add heal parsers (#97) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Radx pipeline (#99) * point to large download * fix schema path * debug bulk input dir * fix schema read * fix schema read * fix schema read * commenting steup dir for test * adding logs * fix path stuff * add commented stuff back in * testing radx parser * adding parser * skip indexing vars with no id * adding indexes as part of bulk loader paramters * fix id index cli arg * fix local cli * dug latest --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * pin avalon --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * remove jenkins file * bump apache version * revert airflow version --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. 
Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * Updated docker image to 2.10.2 * fix kgx path error * fix kgx path error * Trying out the slim apache images to reduce vulnerability footprint * Building general package update into dockerfile * Trying more rigorous dist-upgrade to try to get rid of vulerabilities that won't quit * dist-upgrade also needs -y flag * Rolling back from slim image * Fixed apt-get syntax error * Reverting develop to 2.7.2 for stability's sake, will revisit after upgrades finish * Update Dockerfile * Update Dockerfile * Removed post-install package cleanup * Update requirements.txt * test merge issue * fix merge issue * modify radx pipeline (#111) * modify radx pipeline * pin to dug release * Heal ingest 2 18 (#112) * rename repos for heal ingest * change lib for bulk loader * change lib for bulk loader * cde?? * cde?? * cde?? * cde?? * cde?? * final push * final push * test node type * Heal ingest 2 18 (#114) * rename repos for heal ingest * change lib for bulk loader * change lib for bulk loader * cde?? * cde?? * cde?? * cde?? * cde?? * final push * final push * add study url * Bitnami (#117) * checkpoint * pushing docker changes * Executor tests 2.10 (#115) * test out bash operator * task name * pushing * revert python executor , with no excutor config * docker file and requriements * everything latest --------- Co-authored-by: waTeim --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> Co-authored-by: Patrick Hachicho Co-authored-by: Patrick hachicho <105758539+pchachicho@users.noreply.github.com> Co-authored-by: waTeim * fix study name in graph (#121) * comment clear index. rolledback during merge --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> Co-authored-by: Patrick Hachicho Co-authored-by: Patrick hachicho <105758539+pchachicho@users.noreply.github.com> Co-authored-by: waTeim * Update requirements.txt with Dug V2.13.10 which have all the new changes from new_bdc_parser branch * fix req --------- Co-authored-by: YSK Co-authored-by: ykale <14827177+yskale@users.noreply.github.com> Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> Co-authored-by: Patrick Hachicho Co-authored-by: Patrick hachicho <105758539+pchachicho@users.noreply.github.com> Co-authored-by: waTeim --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c113288f..8b6ca0b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ git+https://github.com/falkordb/falkordb-bulk-loader.git@v1.0.6 setuptools>=66 pytest PyYAML -git+https://github.com/helxplatform/dug@v2.13.10 +git+https://github.com/helxplatform/dug@2.13.11 orjson==3.9.15 git+https://github.com/helxplatform/kg_utils.git@v0.0.10 git+https://github.com/helxplatform/python-stringcase@1.2.1 From 691753886b72d1185628b1b442cb49c168e047db Mon Sep 17 00:00:00 2001 From: "Vladimir G." 
<117946722+vladimir2217@users.noreply.github.com> Date: Tue, 9 Sep 2025 11:47:55 -0400 Subject: [PATCH 57/63] DUG-529 Added support to process diffs in annotate_and_index (#110) * added params * changed to do_nothing, download diff * pass dag params to setupinputdata * added more logging * dag_test * dag_test * dag_test * dag_test * dag_test * dag_test * Added setup_input_data support for diff * lets try to run it * fix params * fix params * fix params another attempt * reverted do_nothing placeholder and some debug logging --- dags/annotate_and_index.py | 41 +++++++++++++++++++++++++++++- dags/roger/pipelines/base.py | 3 +-- dags/roger/tasks.py | 49 ++++++++++++++++++++++++++---------- 3 files changed, 77 insertions(+), 16 deletions(-) diff --git a/dags/annotate_and_index.py b/dags/annotate_and_index.py index 884cd149..b82c019d 100644 --- a/dags/annotate_and_index.py +++ b/dags/annotate_and_index.py @@ -10,7 +10,8 @@ from airflow.models import DAG from airflow.operators.empty import EmptyOperator -from roger.tasks import default_args, create_pipeline_taskgroup +from airflow.operators.python import PythonOperator +from roger.tasks import default_args, create_pipeline_taskgroup, logger, create_python_task env_enabled_datasets = os.getenv( "ROGER_DUG__INPUTS_DATA__SETS", "topmed,anvil").split(",") @@ -18,11 +19,19 @@ with DAG( dag_id='annotate_and_index', default_args=default_args, + params= + { + "repository_id": None, + "branch_name": None, + "commitid_from": None, + "commitid_to": None + }, schedule_interval=None ) as dag: init = EmptyOperator(task_id="init", dag=dag) finish = EmptyOperator(task_id="finish", dag=dag) + from roger import pipelines from roger.config import config envspec = os.getenv("ROGER_DUG__INPUTS_DATA__SETS","topmed:v2.0") @@ -42,3 +51,33 @@ # . . . 
init >> create_pipeline_taskgroup(dag, pipeline_class, config) >> finish + + + + +with DAG( + dag_id='dag_test', + default_args=default_args, + params= + { + "repository_id": None, + "branch_name": None, + "commitid_from": None, + "commitid_to": None + }, + schedule_interval=None +) as dag: + + init = EmptyOperator(task_id="init", dag=dag) + finish = EmptyOperator(task_id="finish", dag=dag) + + def print_context(ds=None, **kwargs): + print(">>>All kwargs") + print(kwargs) + print(">>>All ds") + print(ds) + + + init >> create_python_task(dag, "get_from_lakefs", print_context) >> finish + + #run_this = PythonOperator(task_id="print_the_context", python_callable=print_context) \ No newline at end of file diff --git a/dags/roger/pipelines/base.py b/dags/roger/pipelines/base.py index e108443e..5c1df4cc 100644 --- a/dags/roger/pipelines/base.py +++ b/dags/roger/pipelines/base.py @@ -840,8 +840,7 @@ def annotate(self, to_string=False, files=None, input_data_path=None, "Annotate files with the appropriate parsers and crawlers" if files is None: files = self.get_objects(input_data_path=input_data_path) - self.annotate_files(parsable_files=files, - output_data_path=output_data_path) + self.annotate_files(parsable_files=files, output_data_path=output_data_path) output_log = self.log_stream.getvalue() if to_string else '' return output_log diff --git a/dags/roger/tasks.py b/dags/roger/tasks.py index 5fe5ff90..2749a523 100755 --- a/dags/roger/tasks.py +++ b/dags/roger/tasks.py @@ -250,6 +250,9 @@ def setup_input_data(context, exec_conf): - put dependency data in input dir - if for some reason data was not found raise an exception """) + logger.info(">>> context") + logger.info(context) + # Serves as a location where files the task will work on are placed. # computed as ROGER_DATA_DIR + /current task instance name_input_dir @@ -263,6 +266,17 @@ def setup_input_data(context, exec_conf): # Download files from lakefs and store them in this new input_path client = init_lakefs_client(config=config) repos = exec_conf['repos'] + dag_params = context["params"] + + if dag_params.get("repository_id"): + logger.info(">>> repository_id supplied. Overriding repo.") + repos=[{ + 'repo': dag_params.get("repository_id"), + 'branch': dag_params.get("branch_name"), + 'commitid_from': dag_params.get("commitid_from"), + 'commitid_to': dag_params.get("commitid_to") + }] + # if no external repo is provided we assume to get the upstream task dataset. if not repos or len(repos) == 0: # merge destination branch @@ -276,7 +290,9 @@ def setup_input_data(context, exec_conf): repos = [{ 'repo': repo, 'branch': branch, - 'path': f'{dag_id}/{upstream_id}' + 'path': f'{dag_id}/{upstream_id}', + 'commitid_from': None, + 'commitid_to': None } for upstream_id in upstream_ids] # input_repo = exec_conf['input_repo'] @@ -287,20 +303,27 @@ def setup_input_data(context, exec_conf): # get all if path is not specified repo['path'] = '*' logger.info(f"repos : {repos}") + + logger.info(">>> start of downloading data") for r in repos: - logger.info("downloading %s from %s@%s to %s", - r['path'], r['repo'], r['branch'], input_dir) # create path to download to ... 
if not os.path.exists(input_dir + f'/{r["repo"]}'): os.mkdir(input_dir + f'/{r["repo"]}') - get_files( - local_path=input_dir + f'/{r["repo"]}', - remote_path=r['path'], - branch=r['branch'], - repo=r['repo'], - changes_only=False, - lake_fs_client=client - ) + + if not dag_params.get("repository_id"): + logger.info("downloading %s from %s@%s to %s", r['path'], r['repo'], r['branch'], input_dir) + get_files( + local_path=input_dir + f'/{r["repo"]}', + remote_path=r['path'], + branch=r['branch'], + repo=r['repo'], + changes_only=r.get("commitid_from") is not None, + changes_from=r.get("commitid_from"), + changes_to=r.get("commitid_to"), + lake_fs_client=client + ) + logger.info(">>> end of downloading data") + def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos = {}, pass_conf=True, no_output_files=False): @@ -349,7 +372,7 @@ def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos = 'path': r.get('path', '*') } for r in external_repos] } - + pre_exec = partial(setup_input_data, exec_conf=pre_exec_conf) # add pre_exec partial function as an argument to python executor conf python_operator_args['pre_execute'] = pre_exec @@ -400,7 +423,7 @@ def create_pipeline_taskgroup( validate_index_variables_task = create_python_task( dag, f"validate_{name}_index_variables", - pipeline.validate_indexed_variables, + pipeline.validate_indexed_variables, pass_conf=False, # declare that this task will not generate files. no_output_files=True From 23ab7a109f76289898cc140b9badf297a1a37acd Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Fri, 19 Sep 2025 11:56:32 -0400 Subject: [PATCH 58/63] Picsure data ingest test (#128) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test * changed the pipeline name * parser name updated * update pipeline name * update the get_object/files * changed the lakefs dataset name for testing * update the pipelines for new programs in BDC * init parser and crawler objects once and retry * fix logger -> log typo * remove clear index (#119) * update with develop (#124) * Sync develop with main (#116) * Update _version.py (#86) * Update _version.py * Rti merge (#84) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' 
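The "init parser and crawler objects once and retry" item above corresponds to the init_annotator helper added to dags/roger/pipelines/base.py earlier in this series, which retries annotator construction with exponential backoff and jitter. A generic, stand-alone sketch of that retry pattern (not the Roger code itself; the factory callable is hypothetical) is:

    import random
    import time

    def retry_with_backoff(factory, max_retries=5, base_delay=1, max_delay=10):
        """Call factory() until it succeeds or max_retries is reached,
        doubling the delay each attempt (capped at max_delay) and adding
        jitter so concurrent workers do not retry in lockstep."""
        for attempt in range(1, max_retries + 1):
            try:
                return factory()
            except Exception as exc:
                if attempt == max_retries:
                    raise
                delay = min(base_delay * (2 ** (attempt - 1)), max_delay)
                delay += random.uniform(0, 1)  # jitter
                print(f"attempt {attempt} failed ({exc}); retrying in {delay:.2f}s")
                time.sleep(delay)

    # Hypothetical usage:
    # annotator = retry_with_backoff(lambda: build_annotator_client())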
* adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. * Update requirements.txt * Update .gitignore * Update dug_utils.py Handle Error when curie not found in validate * Update __init__.py * Update config.yaml * Update dev-config.yaml * Update docker-compose.yaml * fixed docker-compose * adding back postgres volume to docker compose * env correction , docker compose updates --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent * adding v5.0 * cde-links branch * pin linkml * Update config.yaml collection_action to action * pop total items before result * print extracted elements * Update requirements.txt * Keep edge provenance (#94) * Update kgx.py * Update kgx.py * Update kgx.py can't delete edge keys while looping over them. * just collect then update * Update requirements.txt (#93) * Pipeline parameterize restructure (#95) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * Parameterized annotate tasks with input_data_path and output_data_path * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. 
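Referring back to the DUG-529 patch earlier in this series, which adds repository_id, branch_name, commitid_from and commitid_to params to the annotate_and_index DAG: one way to supply those params is through the DAG-run conf, for example via Airflow's stable REST API. The sketch below is illustrative only; the URL, credentials, repository and commit ids are placeholders.

    import requests

    # Trigger annotate_and_index with an explicit lakeFS repo/commit range.
    resp = requests.post(
        "http://localhost:8080/api/v1/dags/annotate_and_index/dagRuns",
        auth=("airflow", "airflow"),            # placeholder credentials
        json={
            "conf": {
                "repository_id": "example-repo",  # placeholder repo
                "branch_name": "main",
                "commitid_from": "abc123",        # placeholder commits
                "commitid_to": "def456",
            }
        },
        timeout=30,
    )
    resp.raise_for_status()
    print(resp.json())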
* Parameterized annotate tasks with input_data_path and output_data_path (#85) * adding lakefs changes to roger-2.0 * point avalon to vg1 branch * change avalon dep * update airflow * fix avalon tag typo * update jenkins to tag version on main branch only * update jenkins to tag version * update jenkins to tag version * psycopg2 installation * add cncf k8s req * use airflow non-slim * simplified for testing * simplified for testing * change dag name * Erroneous parameter passed, should not be None * adding pre-exec * adding pre-exec * adding pre-exec * typo preexec * typo preexec * fix context * get files from repo * get files from repo * get files from repo * get files from repo * First shot at moving pipeline into base class and implementing. Anvil pipeline not complete * Syntax fix, docker image version bump to airflow 2.7.2-python3.11 * update storage dir * update remove dir code * update remove dir code * remote path to * * fix input dir for annotators * fix input dir for annotators * fix input dir for annotators * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * adding branch info on lakefs config * callback push to branch * back to relative import * reformat temp branch name based on unique task id * add logging * add logging * convert posix path to str for avalon * add extra / to root path * New dag created using DugPipeline subclasses * EmptyOperator imported from wrong place * import and syntax fixes * utterly silly syntax error * Added anvil to default input data sets for testing purposes * adding / to local path * commit meta task args empty string * add merge logic * add merge logic * upstream task dir pull for downstream task * Switched from subdag to taskgroup because latest Airflow depricated subdag * Added BACPAC pipeline object * Temporarily ignoring configuration variable for enabled datasets for testing * Passed dag in to create task group to see if it helps dag errors * Fixed silly syntax error * adding input / output dir params for make kgx * Trying different syntax to make taskgroups work. * adding input / output dir params for make kgx * Parsing, syntax, pylint fixes * adding input / output dir params for make kgx * Added pipeline name to task group name to ensure uniqueness * oops, moved something out of scope. Fixed * Filled out pipeline with methods from dug_utils. 
Needs data path changes * Finished implementing input_data_path and output_data_path handling, pylint cleanup * Update requirements.txt * adding toggle to avoid sending config obj * adding toggle to avoid sending config obj * disable to string for test * control pipelines for testing * add self to anvil get files * add log stream to make it available * typo fix * correcting branch id * adding source repo * adding source repo * patch name-resolver response * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * dug pipeline edit * recurisvely find recursively * recurisvely find recursively * setup output path for crawling * all task functions should have input and output params * adding annotation as upstream for validate index * revamp create task , and task wrapper * add validate concepts index task * adding concept validation * add index_variables task as dependecy for validate concepts * add index_variables task as dependecy for validate concepts * await client exist * await client exist * concepts not getting picked up for indexing * concepts not getting picked up for indexing * fix search elements * converting annotation output to json * json format annotation outputs * adding support for json format elements and concepts read * json back to dug objects * fixing index valriables with json objects * indetation and new line for better change detection :? * indetation and new line for better change detection * treat dictionary concepts as dictionary * read concepts json as a dict * concepts files are actually file paths * debug message * make output jsonable * clear up dir after commit , and delete unmerged branch even if no changes * don`t clear indexes, parallel dataset processing will be taxed * memory leak? * memory leak? * memory leak? 
* dumping pickles to debug locally * find out why concepts are being added to every other element * find out why concepts are being added to every other element * pointless shuffle 🤷‍♂️ * revert back in time * back to sanitize dug * output just json for annotation * adding jsonpickle * jsonpickle 🥒 * unpickle for index * unpickle for validate index * crawling fixes * crawling fixes * crawling validation fixes * fix index concepts * fix makekgx * adding other bdc pipelines * adding pipeline paramters to be able to configure per instance * fix * add input dataset for pipelines * Adding README to document how to create data set-specific pipelines * catchup on base.py * Added dbgap and nida pipelines * fix import errors * annotator modules added by passing config val (#90) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 --------- Co-authored-by: YaphetKG * Add heal parsers (#96) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix 
globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Add heal parsers (#97) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Radx pipeline (#99) * point to large download * fix schema path * debug bulk input dir * fix schema read * fix schema read * fix schema read * commenting steup dir for test * adding logs * fix path stuff * add commented stuff back in * testing radx parser * adding parser * skip indexing vars with no id * adding indexes as part of bulk loader paramters * fix id index cli arg * fix local cli * dug latest --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Michael T. 
Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * pin avalon * deleted jenkins and added workflows * unlinked helx-actions * testing paths * testing again * d * tests * commented out pytest * try again for bandit * commented out bandit * changed dag to dags * Added fixes * Bagel (#103) * bump dug version * adding bdc new pipelines * adding curesc * adding bagel config * add test parser * add score threshold * point to dug develop * Dbgap programs (#104) * bump dug version * adding bdc new pipelines * adding curesc * fix up merge conflict * adding bagel config parse to bool * Dev sync main (#106) * release sync (#100) * Update _version.py (#86) * Update _version.py * Rti merge (#84) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. * Update requirements.txt * Update .gitignore * Update dug_utils.py Handle Error when curie not found in validate * Update __init__.py * Update config.yaml * Update dev-config.yaml * Update docker-compose.yaml * fixed docker-compose * adding back postgres volume to docker compose * env correction , docker compose updates --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent * adding v5.0 * cde-links branch * pin linkml * Update config.yaml collection_action to action * pop total items before result * print extracted elements * Update requirements.txt * Keep edge provenance (#94) * Update kgx.py * Update kgx.py * Update kgx.py can't delete edge keys while looping over them. * just collect then update * Update requirements.txt (#93) * Pipeline parameterize restructure (#95) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. 
* Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * Parameterized annotate tasks with input_data_path and output_data_path * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. * Parameterized annotate tasks with input_data_path and output_data_path (#85) * adding lakefs changes to roger-2.0 * point avalon to vg1 branch * change avalon dep * update airflow * fix avalon tag typo * update jenkins to tag version on main branch only * update jenkins to tag version * update jenkins to tag version * psycopg2 installation * add cncf k8s req * use airflow non-slim * simplified for testing * simplified for testing * change dag name * Erroneous parameter passed, should not be None * adding pre-exec * adding pre-exec * adding pre-exec * typo preexec * typo preexec * fix context * get files from repo * get files from repo * get files from repo * get files from repo * First shot at moving pipeline into base class and implementing. Anvil pipeline not complete * Syntax fix, docker image version bump to airflow 2.7.2-python3.11 * update storage dir * update remove dir code * update remove dir code * remote path to * * fix input dir for annotators * fix input dir for annotators * fix input dir for annotators * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * adding branch info on lakefs config * callback push to branch * back to relative import * reformat temp branch name based on unique task id * add logging * add logging * convert posix path to str for avalon * add extra / to root path * New dag created using DugPipeline subclasses * EmptyOperator imported from wrong place * import and syntax fixes * utterly silly syntax error * Added anvil to default input data sets for testing purposes * adding / to local path * commit meta task args empty string * add merge logic * add merge logic * upstream task dir pull for downstream task * Switched from subdag to taskgroup because latest Airflow depricated subdag * Added BACPAC pipeline object * Temporarily ignoring configuration variable for enabled datasets for testing * Passed dag in to create task group to see if it helps dag errors * Fixed silly syntax error * adding input / output dir params for make kgx * Trying different syntax to make taskgroups work. 
* adding input / output dir params for make kgx * Parsing, syntax, pylint fixes * adding input / output dir params for make kgx * Added pipeline name to task group name to ensure uniqueness * oops, moved something out of scope. Fixed * Filled out pipeline with methods from dug_utils. Needs data path changes * Finished implementing input_data_path and output_data_path handling, pylint cleanup * Update requirements.txt * adding toggle to avoid sending config obj * adding toggle to avoid sending config obj * disable to string for test * control pipelines for testing * add self to anvil get files * add log stream to make it available * typo fix * correcting branch id * adding source repo * adding source repo * patch name-resolver response * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * dug pipeline edit * recurisvely find recursively * recurisvely find recursively * setup output path for crawling * all task functions should have input and output params * adding annotation as upstream for validate index * revamp create task , and task wrapper * add validate concepts index task * adding concept validation * add index_variables task as dependecy for validate concepts * add index_variables task as dependecy for validate concepts * await client exist * await client exist * concepts not getting picked up for indexing * concepts not getting picked up for indexing * fix search elements * converting annotation output to json * json format annotation outputs * adding support for json format elements and concepts read * json back to dug objects * fixing index valriables with json objects * indetation and new line for better change detection :? * indetation and new line for better change detection * treat dictionary concepts as dictionary * read concepts json as a dict * concepts files are actually file paths * debug message * make output jsonable * clear up dir after commit , and delete unmerged branch even if no changes * don`t clear indexes, parallel dataset processing will be taxed * memory leak? * memory leak? * memory leak? 
* dumping pickles to debug locally * find out why concepts are being added to every other element * find out why concepts are being added to every other element * pointless shuffle 🤷‍♂️ * revert back in time * back to sanitize dug * output just json for annotation * adding jsonpickle * jsonpickle 🥒 * unpickle for index * unpickle for validate index * crawling fixes * crawling fixes * crawling validation fixes * fix index concepts * fix makekgx * adding other bdc pipelines * adding pipeline paramters to be able to configure per instance * fix * add input dataset for pipelines * Adding README to document how to create data set-specific pipelines * catchup on base.py * Added dbgap and nida pipelines * fix import errors * annotator modules added by passing config val (#90) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 --------- Co-authored-by: YaphetKG * Add heal parsers (#96) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix 
globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Add heal parsers (#97) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Radx pipeline (#99) * point to large download * fix schema path * debug bulk input dir * fix schema read * fix schema read * fix schema read * commenting steup dir for test * adding logs * fix path stuff * add commented stuff back in * testing radx parser * adding parser * skip indexing vars with no id * adding indexes as part of bulk loader paramters * fix id index cli arg * fix local cli * dug latest --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * pin avalon --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * remove jenkins file * bump apache version * revert airflow version --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. 
Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * Updated docker image to 2.10.2 * fix kgx path error * fix kgx path error * Trying out the slim apache images to reduce vulnerability footprint * Building general package update into dockerfile * Trying more rigorous dist-upgrade to try to get rid of vulerabilities that won't quit * dist-upgrade also needs -y flag * Rolling back from slim image * Fixed apt-get syntax error * Reverting develop to 2.7.2 for stability's sake, will revisit after upgrades finish * Update Dockerfile * Update Dockerfile * Removed post-install package cleanup * Update requirements.txt * test merge issue * fix merge issue * modify radx pipeline (#111) * modify radx pipeline * pin to dug release * Heal ingest 2 18 (#112) * rename repos for heal ingest * change lib for bulk loader * change lib for bulk loader * cde?? * cde?? * cde?? * cde?? * cde?? * final push * final push * test node type * Heal ingest 2 18 (#114) * rename repos for heal ingest * change lib for bulk loader * change lib for bulk loader * cde?? * cde?? * cde?? * cde?? * cde?? * final push * final push * add study url * Bitnami (#117) * checkpoint * pushing docker changes * Executor tests 2.10 (#115) * test out bash operator * task name * pushing * revert python executor , with no excutor config * docker file and requriements * everything latest --------- Co-authored-by: waTeim --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> Co-authored-by: Patrick Hachicho Co-authored-by: Patrick hachicho <105758539+pchachicho@users.noreply.github.com> Co-authored-by: waTeim * fix study name in graph (#121) * comment clear index. rolledback during merge --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> Co-authored-by: Patrick Hachicho Co-authored-by: Patrick hachicho <105758539+pchachicho@users.noreply.github.com> Co-authored-by: waTeim * Update requirements.txt with Dug V2.13.10 which have all the new changes from new_bdc_parser branch * fix req --------- Co-authored-by: YSK Co-authored-by: ykale <14827177+yskale@users.noreply.github.com> Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. 
Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> Co-authored-by: Patrick Hachicho Co-authored-by: Patrick hachicho <105758539+pchachicho@users.noreply.github.com> Co-authored-by: waTeim From faccf8641e190ee6145e7d4fdd4d1c39da9beb3f Mon Sep 17 00:00:00 2001 From: Jeff Waller Date: Mon, 3 Nov 2025 16:11:02 -0500 Subject: [PATCH 59/63] Dug 547 (#129) * Add landing zone for 3rd party specializations; add falkordb dockerfile * pinning to a version --------- Co-authored-by: YaphetKG --- ext/containers/FalkorDB/Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 ext/containers/FalkorDB/Dockerfile diff --git a/ext/containers/FalkorDB/Dockerfile b/ext/containers/FalkorDB/Dockerfile new file mode 100644 index 00000000..e929a199 --- /dev/null +++ b/ext/containers/FalkorDB/Dockerfile @@ -0,0 +1,5 @@ +FROM falkordb/falkordb-server:v4.14.4-alpine +# Remove Go if present (apk-installed or tarball) +RUN apk del --no-network go 2>/dev/null || true \ + && rm -rf /usr/local/go /usr/lib/go /root/go /go 2>/dev/null || true +RUN apk add bash From c6ab5e07d74d531a2530971cf540ddb14ceb24be Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Wed, 12 Nov 2025 08:52:09 -0500 Subject: [PATCH 60/63] fix recover (#131) Co-authored-by: YSK --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8b6ca0b2..63275110 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ git+https://github.com/falkordb/falkordb-bulk-loader.git@v1.0.6 setuptools>=66 pytest PyYAML -git+https://github.com/helxplatform/dug@2.13.11 +git+https://github.com/helxplatform/dug@develop orjson==3.9.15 git+https://github.com/helxplatform/kg_utils.git@v0.0.10 git+https://github.com/helxplatform/python-stringcase@1.2.1 From 086ef6abfa62a741aa927df4c0c64f5188ab2076 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Fri, 14 Nov 2025 15:35:06 -0500 Subject: [PATCH 61/63] Airflow 3.1.1 (#130) * airflow-3.1.1 changes * fix * last docker file changes * patch libs * strip newline from node descriptions * add elastic container and redis containers to make dags load * fix workflow --- .env | 31 ++- .github/workflows/trivy-pr-scan.yml | 2 +- Dockerfile | 103 +++++++--- dags/annotate_and_index.py | 9 +- dags/knowledge_graph_build.py | 2 +- dags/roger/config/__init__.py | 8 +- dags/roger/core/bulkload.py | 3 + dags/roger/tasks.py | 209 ++++++++------------ docker-compose.yaml | 286 ++++++++++++---------------- requirements.txt | 12 +- 10 files changed, 314 insertions(+), 351 deletions(-) diff --git a/.env b/.env index 2b42e8d7..28994cc6 100644 --- a/.env +++ b/.env @@ -1,23 +1,14 @@ -AIRFLOW_UID=502 -AIRFLOW_GID=0 -API_WORKERS=4 -API_PORT=5551 -API_TIMEOUT=10 +KGX_DATA_SETS="bdc-studies-kgx:v1.0" +INPUT_DATA_SETS="heal-mds-studies:v1.0" -DATA_DIR=./local_storage +LAKEFS_ACCESS_KEY="" +LAKEFS_SECRET_KEY="" +LAKEFS_REPO="" +LAKEFS_BRANCH="" +LAKEFS_URL="https://lakefs.apps.renci.org" -DUG_LOG_LEVEL=INFO - -ELASTICSEARCH_PASSWORD=12345 -ELASTICSEARCH_HOST=elasticsearch -ELASTICSEARCH_USERNAME=elastic - -NBOOST_API_HOST=nboost - -REDIS_PASSWORD=weak -REDIS_HOST=merge-redis-master -REDIS_PORT=6379 -TRANQL_ACCESS_LOG=access.log -TRANQL_ERROR_LOG=error.log -ROGER_DUG__INPUTS_DATA__SETS=topmed:v1.0 \ No newline at end of file +BIOMEGATRON_URL="https://med-nemo.apps.renci.org/annotate" +SAPBERT_URL="https://sap-qdrant.apps.renci.org/annotate" 
+NODE_NORM_URL="https://nodenormalization-sri.renci.org/get_normalized_nodes?conflate=false&description=true&curie=" +NAME_RES_URL="https://name-resolution-sri.renci.org/reverse_lookup" \ No newline at end of file diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml index 1e7bc060..b8935164 100644 --- a/.github/workflows/trivy-pr-scan.yml +++ b/.github/workflows/trivy-pr-scan.yml @@ -61,7 +61,7 @@ jobs: # We still fail the job if results are found, so below will always run # unless manually canceled. - name: Upload Trivy scan results to GitHub Security tab - uses: github/codeql-action/upload-sarif@v2 + uses: github/codeql-action/upload-sarif@v3 if: '!cancelled()' with: sarif_file: 'trivy-results.sarif' \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 47d2c13f..7323534b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,33 +1,74 @@ -FROM bitnami/airflow:2.10.5-debian-12-r7 - -USER root -RUN apt-get update && apt-get install -y git nano vim gcc rustc cargo -#RUN useradd -u 1001 -ms /bin/bash airflow && chown -R airflow /home/airflow -COPY requirements.txt requirements.txt -RUN source /opt/bitnami/airflow/venv/bin/activate && CARGO_HOME=/tmp/.cargo && \ - pip install setuptools wheel && \ - pip install -r requirements.txt - -RUN rm -f requirements.txt - -## Vul patches -## Python lib patches on airflow python env -RUN source /opt/bitnami/airflow/venv/bin/activate pip install --upgrade \ - flask-appbuilder==4.5.3 \ - cryptography==44.0.1 \ - werkzeug==3.0.6 \ - urllib3==2.2.2 -RUN source /opt/bitnami/airflow/venv/bin/activate pip uninstall -y \ - apache-airflow-providers-mysql==6.2.0 - -# Uninstall these from non airflow python env -RUN pip install --upgrade \ - flask-appbuilder==4.5.3 \ - cryptography==44.0.1 \ - werkzeug==3.0.6 \ - urllib3==2.2.2 -RUN apt-get autoremove -y vim -RUN apt-get autoremove -y binutils -RUN apt-get autoremove -y linux-libc-dev +# Use a Debian-based image for better compatibility +FROM python:3.11.14-slim-trixie +# Set Airflow version and home directory +ARG AIRFLOW_VERSION=3.1.1 +ARG AIRFLOW_HOME=/opt/airflow + +# Environment variables +ENV AIRFLOW_HOME=${AIRFLOW_HOME} +ENV AIRFLOW__CORE__LOAD_EXAMPLES=False +ENV AIRFLOW__CORE__EXECUTOR=LocalExecutor +ENV AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres:5432/airflow +ENV PYTHONUNBUFFERED=1 + +# Create airflow user and directories +RUN useradd --uid 50000 --home-dir ${AIRFLOW_HOME} --create-home airflow && \ + mkdir -p ${AIRFLOW_HOME}/dags ${AIRFLOW_HOME}/logs ${AIRFLOW_HOME}/plugins ${AIRFLOW_HOME}/config + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + libpq-dev \ + libffi-dev \ + libssl-dev \ + curl \ + tini \ + tzdata \ + git \ + && rm -rf /var/lib/apt/lists/* + +# Upgrade pip tools +RUN pip install --no-cache-dir --upgrade pip setuptools wheel + +# Install Airflow (with PostgreSQL, Celery, Redis support) +RUN pip install --no-cache-dir \ + "apache-airflow[postgres,celery,redis,fab]==${AIRFLOW_VERSION}" \ + "apache-airflow-providers-cncf-kubernetes" \ + --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-3.11.txt" + +# Optional: install extra packages +RUN pip install --no-cache-dir psycopg2-binary redis + +COPY ./requirements.txt /tmp/requirements.txt + +RUN pip install -r /tmp/requirements.txt + +RUN rm /tmp/requirements.txt + + + +RUN apt-get purge -y --auto-remove \ + build-essential \ + libpq-dev 
\ + libffi-dev \ + libssl-dev \ + curl \ + git && \ + apt-get clean + +# Set ownership +RUN chown -R airflow:airflow ${AIRFLOW_HOME} + +# Switch to airflow user USER airflow +WORKDIR ${AIRFLOW_HOME} + +# Expose Airflow webserver port +EXPOSE 8080 + +# Use tini for signal handling +ENTRYPOINT ["/usr/bin/tini", "--"] + +# Default command +CMD ["airflow", "webserver"] diff --git a/dags/annotate_and_index.py b/dags/annotate_and_index.py index b82c019d..e4cdfd98 100644 --- a/dags/annotate_and_index.py +++ b/dags/annotate_and_index.py @@ -26,7 +26,7 @@ "commitid_from": None, "commitid_to": None }, - schedule_interval=None + # schedule_interval=None ) as dag: init = EmptyOperator(task_id="init", dag=dag) finish = EmptyOperator(task_id="finish", dag=dag) @@ -65,7 +65,7 @@ "commitid_from": None, "commitid_to": None }, - schedule_interval=None + # schedule_interval=None ) as dag: init = EmptyOperator(task_id="init", dag=dag) @@ -80,4 +80,7 @@ def print_context(ds=None, **kwargs): init >> create_python_task(dag, "get_from_lakefs", print_context) >> finish - #run_this = PythonOperator(task_id="print_the_context", python_callable=print_context) \ No newline at end of file + #run_this = PythonOperator(task_id="print_the_context", python_callable=print_context) + +if __name__ == "__main__": + dag.test() \ No newline at end of file diff --git a/dags/knowledge_graph_build.py b/dags/knowledge_graph_build.py index de28aa78..d94d2262 100644 --- a/dags/knowledge_graph_build.py +++ b/dags/knowledge_graph_build.py @@ -15,7 +15,7 @@ with DAG( dag_id='knowledge_graph_build', default_args=default_args, - schedule_interval=None + # schedule_interval=None ) as dag: """ Build the workflow tasks. """ diff --git a/dags/roger/config/__init__.py b/dags/roger/config/__init__.py index 71111f39..cc017fca 100644 --- a/dags/roger/config/__init__.py +++ b/dags/roger/config/__init__.py @@ -3,7 +3,7 @@ import warnings from dataclasses import dataclass, field from pathlib import Path -from typing import Dict, Optional, List +from typing import Dict, Optional, List, Union import yaml from dug.config import Config as DugConfig @@ -33,7 +33,7 @@ class LakefsConfig(DictLike): secret_access_key: str branch: str repo: str - enabled: bool = False + enabled: Union[bool, str] = False def __post_init__(self): if isinstance(self.enabled, str): @@ -135,8 +135,8 @@ class AnnotationConfig(DictLike): ]) def __post_init__(self): - self.annotator_args["sapbert"]["bagel"]["enabled"] = self.annotator_args["sapbert"]["bagel"][ - "enabled"].lower() == "true" + self.annotator_args["sapbert"]["bagel"]["enabled"] = str(self.annotator_args["sapbert"]["bagel"][ + "enabled"]).lower() == "true" @dataclass diff --git a/dags/roger/core/bulkload.py b/dags/roger/core/bulkload.py index a8ddbc15..17bd179f 100644 --- a/dags/roger/core/bulkload.py +++ b/dags/roger/core/bulkload.py @@ -53,6 +53,9 @@ def create_nodes_csv_file(self, input_data_path=None, output_data_path=None): merged_nodes_file = storage.merged_objects('nodes', input_data_path) counter = 1 for node in storage.json_line_iter(merged_nodes_file): + if node.get('description'): + node['description'] = node['description'].replace('\n', + ' ') if not node.get('category'): category_error_nodes.add(node['id']) node['category'] = [BiolinkModel.root_type] diff --git a/dags/roger/tasks.py b/dags/roger/tasks.py index 2749a523..cd828383 100755 --- a/dags/roger/tasks.py +++ b/dags/roger/tasks.py @@ -1,20 +1,21 @@ -"Tasks and methods related to Airflow implementations of Roger" +# Tasks and methods related to Airflow 
implementations of Roger import os - -from airflow.operators.python import PythonOperator -from airflow.operators.empty import EmptyOperator -from airflow.utils.task_group import TaskGroup -from airflow.utils.dates import days_ago -from airflow.models import DAG -from airflow.models.dag import DagContext -from airflow.models.taskinstance import TaskInstance -from airflow.operators.bash import BashOperator +from datetime import datetime +from functools import partial from typing import Union from pathlib import Path import glob import shutil +# Airflow 3.x - prefer provider imports and new public types +from airflow.providers.standard.operators.python import PythonOperator +from airflow.operators.empty import EmptyOperator +from airflow.utils.task_group import TaskGroup +from airflow.models import DAG +from airflow.models.taskinstance import TaskInstance +from airflow.providers.standard.operators.bash import BashOperator +from airflow.utils.context import Context # type: ignore from roger.config import config, RogerConfig from roger.logger import get_logger @@ -22,13 +23,12 @@ from avalon.mainoperations import put_files, LakeFsWrapper, get_files from lakefs_sdk.configuration import Configuration from lakefs_sdk.models.merge import Merge -from functools import partial logger = get_logger() default_args = { 'owner': 'RENCI', - 'start_date': days_ago(1) + 'start_date': datetime(2025, 1, 1) } @@ -44,13 +44,17 @@ def task_wrapper(python_callable, **kwargs): pass_conf = kwargs.get('pass_conf', True) if config.lakefs_config.enabled: # get input path - input_data_path = generate_dir_name_from_task_instance(kwargs['ti'], - roger_config=config, - suffix='input') + input_data_path = generate_dir_name_from_task_instance( + kwargs['ti'], + roger_config=config, + suffix='input' + ) # get output path from task id run id dag id combo - output_data_path = generate_dir_name_from_task_instance(kwargs['ti'], - roger_config=config, - suffix='output') + output_data_path = generate_dir_name_from_task_instance( + kwargs['ti'], + roger_config=config, + suffix='output' + ) else: input_data_path, output_data_path = None, None # cast it to a path object @@ -66,6 +70,7 @@ def task_wrapper(python_callable, **kwargs): return python_callable(config=config, **func_args) return python_callable(**func_args) + def get_executor_config(data_path='/opt/airflow/share/data'): """ Get an executor configuration. :param annotations: Annotations to attach to the executor. @@ -73,12 +78,11 @@ def get_executor_config(data_path='/opt/airflow/share/data'): """ env_var_prefix = config.OS_VAR_PREFIX # based on environment set on scheduler pod, make secrets for worker pod - # this ensures passwords don't leak as pod templates. 
secrets_map = [{ "secret_name_ref": "ELASTIC_SEARCH_PASSWORD_SECRET", "secret_key_ref": "ELASTIC_SEARCH_PASSWORD_SECRET_KEY", "env_var_name": f"{env_var_prefix}ELASTIC__SEARCH_PASSWORD" - },{ + }, { "secret_name_ref": "REDIS_PASSWORD_SECRET", "secret_key_ref": "REDIS_PASSWORD_SECRET_KEY", "env_var_name": f"{env_var_prefix}REDISGRAPH_PASSWORD" @@ -92,8 +96,8 @@ def get_executor_config(data_path='/opt/airflow/share/data'): "name": secret["env_var_name"], "valueFrom": { "secretKeyRef": { - "name": secret_name, - "key": secret_key_name + "name": secret_name, + "key": secret_key_name } }}) @@ -104,6 +108,7 @@ def get_executor_config(data_path='/opt/airflow/share/data'): } return k8s_executor_config + def init_lakefs_client(config: RogerConfig) -> LakeFsWrapper: configuration = Configuration() configuration.username = config.lakefs_config.access_key_id @@ -123,47 +128,28 @@ def pagination_helper(page_fetcher, **kwargs): kwargs['after'] = resp.pagination.next_offset -def avalon_commit_callback(context: DagContext, **kwargs): - client: LakeFsWrapper = init_lakefs_client(config=config) +def avalon_commit_callback(context: Context, **kwargs): + client: LakeFsWrapper = init_lakefs_client(config=config) # now files have been processed, # this part should # get the out path of the task - local_path = str(generate_dir_name_from_task_instance(context['ti'], - roger_config=config, - suffix='output')).rstrip('/') + '/' + local_path = str(generate_dir_name_from_task_instance( + context['ti'], + roger_config=config, + suffix='output')).rstrip('/') + '/' task_id = context['ti'].task_id dag_id = context['ti'].dag_id run_id = context['ti'].run_id - # run id looks like 2023-10-18T17:35:14.890186+00:00 - # normalized to 2023_10_18T17_35_14_890186_00_00 - # since lakefs branch id must consist of letters, digits, underscores and dashes, - # and cannot start with a dash - run_id_normalized = run_id.replace('-','_').replace(':','_').replace('+','_').replace('.','_') - dag_id_normalized = dag_id.replace('-','_').replace(':','_').replace('+','_').replace('.','_') - task_id_normalized = task_id.replace('-','_').replace(':','_').replace('+','_').replace('.','_') + # normalize run/dag/task ids for branch name + run_id_normalized = run_id.replace('-', '_').replace(':', '_').replace('+', '_').replace('.', '_') + dag_id_normalized = dag_id.replace('-', '_').replace(':', '_').replace('+', '_').replace('.', '_') + task_id_normalized = task_id.replace('-', '_').replace(':', '_').replace('+', '_').replace('.', '_') temp_branch_name = f'{dag_id_normalized}_{task_id_normalized}_{run_id_normalized}' - # remote path to upload the files to. remote_path = f'{dag_id}/{task_id}/' - # merge destination branch branch = config.lakefs_config.branch repo = config.lakefs_config.repo - # This part pushes to a temp branch on the repo - # now we have the output path lets do some pushing but where ? - # right now lets stick to using one repo , - - # issue Vladmir pointed out if uploads to a single lakefs branch have not - # been finalized with commit, - # this would cause dirty commits if parallel tasks target the same branch. - - # solution: Lakefs team suggested we commit to a different temp branch per - # task, and merge that branch. - # this callback function will do that for now. - - # 1. put files into a temp branch. - # 2. make sure a commit happens. - # 3. merge that branch to master branch. 
logger.info("Pushing local path %s to %s@%s in %s dir", local_path, repo, temp_branch_name, remote_path) put_files( @@ -182,27 +168,22 @@ def avalon_commit_callback(context: DagContext, **kwargs): source_branch_name=branch ) - # see what changes are going to be pushed from this branch to main branch for diff in pagination_helper(client._client.refs_api.diff_refs, repository=repo, left_ref=branch, right_ref=temp_branch_name): logger.info("Diff: " + str(diff)) - + try: - # merging temp branch to working branch - # the current working branch wins incase of conflicts merge = Merge(**{"strategy": "source-wins"}) client._client.refs_api.merge_into_branch(repository=repo, - source_ref=temp_branch_name, - destination_branch=branch, - merge=merge - ) + source_ref=temp_branch_name, + destination_branch=branch, + merge=merge + ) logger.info(f"merged branch {temp_branch_name} into {branch}") except Exception as e: - # remove temp logger.error(e) - # delete temp branch finally: client._client.branches_api.delete_branch( repository=repo, @@ -211,38 +192,43 @@ def avalon_commit_callback(context: DagContext, **kwargs): logger.info(f"deleted temp branch {temp_branch_name}") logger.info(f"deleting local dir {local_path}") - files_to_clean = glob.glob(local_path + '**', recursive=True) + [local_path] + # cleanup local dirs clean_up(context, **kwargs) -def clean_up(context: DagContext, **kwargs): - input_dir = str(generate_dir_name_from_task_instance(context['ti'], - roger_config=config, - suffix='output')).rstrip('/') + '/' - output_dir = str(generate_dir_name_from_task_instance(context['ti'], - roger_config=config, - suffix='input')).rstrip('/') + '/' + +def clean_up(context: Context, **kwargs): + input_dir = str(generate_dir_name_from_task_instance( + context['ti'], + roger_config=config, + suffix='output')).rstrip('/') + '/' + output_dir = str(generate_dir_name_from_task_instance( + context['ti'], + roger_config=config, + suffix='input')).rstrip('/') + '/' files_to_clean = glob.glob(input_dir + '**', recursive=True) + [input_dir] files_to_clean += glob.glob(output_dir + '**', recursive=True) + [output_dir] for f in files_to_clean: if os.path.exists(f): shutil.rmtree(f) + def generate_dir_name_from_task_instance(task_instance: TaskInstance, - roger_config: RogerConfig, suffix:str): + roger_config: RogerConfig, suffix: str): # if lakefs is not enabled just return none so methods default to using # local dir structure. if not roger_config.lakefs_config.enabled: return None - root_data_dir = os.getenv("ROGER_DATA_DIR").rstrip('/') + root_data_dir = os.getenv("ROGER_DATA_DIR").rstrip('/') task_id = task_instance.task_id dag_id = task_instance.dag_id run_id = task_instance.run_id - try_number = task_instance._try_number + try_number = task_instance.try_number return Path( f"{root_data_dir}/{dag_id}_{task_id}_{run_id}_{try_number}_{suffix}") -def setup_input_data(context, exec_conf): + +def setup_input_data(context: Context, exec_conf): logger.info(""" - Figures out the task name and id, - find its data dependencies @@ -253,40 +239,29 @@ def setup_input_data(context, exec_conf): logger.info(">>> context") logger.info(context) - # Serves as a location where files the task will work on are placed. - # computed as ROGER_DATA_DIR + /current task instance name_input_dir - input_dir = str(generate_dir_name_from_task_instance( context['ti'], roger_config=config, suffix="input")) - # Clear up files from previous run etc... 
- - # create input dir os.makedirs(input_dir, exist_ok=True) - # Download files from lakefs and store them in this new input_path client = init_lakefs_client(config=config) - repos = exec_conf['repos'] - dag_params = context["params"] + repos = exec_conf.get('repos', []) + dag_params = context.get("params", {}) if dag_params.get("repository_id"): logger.info(">>> repository_id supplied. Overriding repo.") - repos=[{ + repos = [{ 'repo': dag_params.get("repository_id"), 'branch': dag_params.get("branch_name"), 'commitid_from': dag_params.get("commitid_from"), 'commitid_to': dag_params.get("commitid_to") }] - # if no external repo is provided we assume to get the upstream task dataset. if not repos or len(repos) == 0: - # merge destination branch branch = config.lakefs_config.branch repo = config.lakefs_config.repo task_instance: TaskInstance = context['ti'] - # get upstream ids upstream_ids = task_instance.task.upstream_task_ids dag_id = task_instance.dag_id - # calculate remote dirs using dag_id + upstreams repos = [{ 'repo': repo, 'branch': branch, @@ -295,18 +270,13 @@ def setup_input_data(context, exec_conf): 'commitid_to': None } for upstream_id in upstream_ids] - # input_repo = exec_conf['input_repo'] - # input_branch = exec_conf['input_branch'] - # If input repo is provided use that as source of files - for repo in repos: - if not repo.get('path'): - # get all if path is not specified - repo['path'] = '*' + for r in repos: + if not r.get('path'): + r['path'] = '*' logger.info(f"repos : {repos}") logger.info(">>> start of downloading data") for r in repos: - # create path to download to ... if not os.path.exists(input_dir + f'/{r["repo"]}'): os.mkdir(input_dir + f'/{r["repo"]}') @@ -325,46 +295,39 @@ def setup_input_data(context, exec_conf): logger.info(">>> end of downloading data") - -def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos = {}, pass_conf=True, no_output_files=False): +def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos=None, pass_conf=True, no_output_files=False): """ Create a python task. :param func_kwargs: additional arguments for callable. :param dag: dag to add task to. :param name: The name of the task. :param a_callable: The code to run in this task. - """ - + """ + + if external_repos is None: + external_repos = {} + # these are actual arguments passed down to the task function op_kwargs = { "python_callable": a_callable, "to_string": True, "pass_conf": pass_conf } - # update / override some of the args passed to the task function by default if func_kwargs is None: func_kwargs = {} op_kwargs.update(func_kwargs) - - # Python operator arguments , by default for non-lakefs config this is all we need. python_operator_args = { - "task_id": name, - "python_callable":task_wrapper, - # "executor_config" : get_executor_config(), - "dag": dag, - "provide_context" : True + "task_id": name, + "python_callable": task_wrapper, + # executor_config example left commented; fill if needed + "dag": dag, } - # if we have lakefs... if config.lakefs_config.enabled: - - # repo and branch for pre-execution , to download input objects pre_exec_conf = { 'repos': [] } if external_repos: - # if the task is a root task , beginning of the dag... - # and we want to pull data from a different repo. 
pre_exec_conf = { 'repos': [{ 'repo': r['name'], @@ -374,18 +337,19 @@ def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos = } pre_exec = partial(setup_input_data, exec_conf=pre_exec_conf) - # add pre_exec partial function as an argument to python executor conf + # pre_execute will be called with context -> partial keeps exec_conf fixed python_operator_args['pre_execute'] = pre_exec - python_operator_args['on_failure_callback'] = partial(clean_up, kwargs=op_kwargs) - # if the task has output files, we will add a commit callback + + # pass fixed kwargs into partials so resulting callback accepts (context,) + python_operator_args['on_failure_callback'] = partial(clean_up, **op_kwargs) if not no_output_files: - python_operator_args['on_success_callback'] = partial(avalon_commit_callback, kwargs=op_kwargs) - - # add kwargs + python_operator_args['on_success_callback'] = partial(avalon_commit_callback, **op_kwargs) + python_operator_args["op_kwargs"] = op_kwargs return PythonOperator(**python_operator_args) + def create_pipeline_taskgroup( dag, pipeline_class: type, @@ -416,7 +380,6 @@ def create_pipeline_taskgroup( f"index_{name}_variables", pipeline.index_variables, pass_conf=False, - # declare that this task will not generate files. no_output_files=True) index_variables_task.set_upstream(annotate_task) @@ -425,9 +388,8 @@ def create_pipeline_taskgroup( f"validate_{name}_index_variables", pipeline.validate_indexed_variables, pass_conf=False, - # declare that this task will not generate files. no_output_files=True - ) + ) validate_index_variables_task.set_upstream([annotate_task, index_variables_task]) make_kgx_task = create_python_task( @@ -441,7 +403,7 @@ def create_pipeline_taskgroup( dag, f"crawl_{name}", pipeline.crawl_tranql, - pass_conf=False) + pass_conf=False) crawl_task.set_upstream(annotate_task) index_concepts_task = create_python_task( @@ -449,7 +411,6 @@ def create_pipeline_taskgroup( f"index_{name}_concepts", pipeline.index_concepts, pass_conf=False, - # declare that this task will not generate files. no_output_files=True) index_concepts_task.set_upstream(crawl_task) @@ -458,12 +419,10 @@ def create_pipeline_taskgroup( f"validate_{name}_index_concepts", pipeline.validate_indexed_concepts, pass_conf=False, - # declare that this task will not generate files. no_output_files=True ) validate_index_concepts_task.set_upstream([crawl_task, index_concepts_task, annotate_task]) - complete_task = EmptyOperator(task_id=f"complete_{name}") complete_task.set_upstream( (make_kgx_task, diff --git a/docker-compose.yaml b/docker-compose.yaml index 7c698ed6..fd81f5f5 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,207 +1,167 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# +version: '3.8' -# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. -# -# WARNING: This configuration is for local development. Do not use it in a production deployment. -# -# This configuration supports basic configuration using environment variables or an .env file -# The following variables are supported: -# -# AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. -# Default: apache/airflow:master-python3.8 -# AIRFLOW_UID - User ID in Airflow containers -# Default: 50000 -# AIRFLOW_GID - Group ID in Airflow containers -# Default: 50000 -# _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account. -# Default: airflow -# _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account. -# Default: airflow -# -# Feel free to modify this file to suit your needs. ---- -version: '3' -x-airflow-common: - &airflow-common - build: - dockerfile: Dockerfile - context: . - environment: - &airflow-common-env - AIRFLOW__CORE__EXECUTOR: CeleryExecutor - AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow - AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow - AIRFLOW__CELERY__BROKER_URL: redis://:$REDIS_PASSWORD@redis:$REDIS_PORT/0 +x-airflow-common: &airflow-common + build: . + environment: &airflow_common_environment + AIRFLOW__CORE__EXECUTOR: LocalExecutor + AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres:5432/airflow AIRFLOW__CORE__FERNET_KEY: '' AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' AIRFLOW__CORE__LOAD_EXAMPLES: 'false' - - ROGER_DUG__INPUTS_DATA__SETS: "$ROGER_DUG__INPUTS_DATA__SETS" - ROGER_ELASTICSEARCH_HOST: "$ELASTIC_API_HOST" - ROGER_ELASTICSEARCH_PASSWORD: "$ELASTIC_PASSWORD" - ROGER_ELASTICSEARCH_NBOOST__HOST: "$NBOOST_API_HOST" - ROGER_REDISGRAPH_HOST: "$REDIS_HOST" - ROGER_REDISGRAPH_PASSWORD: "$REDIS_PASSWORD" - ROGER_KGX_DATASET__VERSION: "v3.0" - ROGER_DATA_DIR: "/opt/airflow/share/data" + AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session' + AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true' + AIRFLOW__CORE__SIMPLE_AUTH_MANAGER_USERS: 'admin:Admin' + ROGER_ELASTICSEARCH_HOST: "elasticsearch" + ROGER_ELASTICSEARCH_PASSWORD: "" + ROGER_ELASTICSEARCH_SCHEME: "http" + ROGER_ELASTICSEARCH_USERNAME: "elastic" + ROGER_REDISGRAPH_GRAPH: "test" + ROGER_REDISGRAPH_HOST: "redis-stack" + ROGER_REDISGRAPH_PASSWORD: "" + ROGER_REDISGRAPH_PORT: "6379" + ROGER_KGX_DATA__SETS: ${KGX_DATA_SETS} + ROGER_LAKEFS__CONFIG_ACCESS__KEY__ID: ${LAKEFS_ACCESS_KEY} + ROGER_LAKEFS__CONFIG_BRANCH: ${LAKEFS_BRANCH} + ROGER_LAKEFS__CONFIG_ENABLED: "true" + ROGER_LAKEFS__CONFIG_HOST: ${LAKEFS_URL} + ROGER_LAKEFS__CONFIG_REPO: ${LAKEFS_REPO} + ROGER_LAKEFS__CONFIG_SECRET__ACCESS__KEY: ${LAKEFS_SECRET_KEY} + ROGER_DUG__INPUTS_DATA__SETS: ${INPUT_DATA_SETS} + ROGER_ANNOTATION_ANNOTATOR__ARGS_SAPBERT_CLASSIFICATION__URL: ${BIOMEGATRON_URL} + ROGER_ANNOTATION_ANNOTATOR__ARGS_SAPBERT_ANNOTATOR__URL : ${SAPBERT_URL} + ROGER_ANNOTATION_NORMALIZER : ${NODE_NORM_URL} + ROGER_ANNOTATION_SYNONYM__SERVICE : ${NAME_RES_URL} volumes: - ./dags:/opt/airflow/dags - ./logs:/opt/airflow/logs - ./plugins:/opt/airflow/plugins - - ./data:/opt/airflow/share/data - user: root + - ./config:/opt/airflow/config depends_on: - redis: - condition: service_healthy postgres: condition: service_healthy + networks: + - airflow-network services: postgres: - image: postgres:13 + image: postgres:15-alpine environment: POSTGRES_USER: airflow 
POSTGRES_PASSWORD: airflow POSTGRES_DB: airflow volumes: - postgres-db-volume:/var/lib/postgresql/data - - ${DATA_DIR}/elastic:/elastic - - ${DATA_DIR}/redis:/redis healthcheck: test: ["CMD", "pg_isready", "-U", "airflow"] - interval: 5s + interval: 10s retries: 5 - restart: always + start_period: 5s + ports: + - "5432:5432" + networks: + - airflow-network - airflow-webserver: - <<: *airflow-common - command: webserver + # --- NEW: Elasticsearch Service --- + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:8.11.1 + environment: + - discovery.type=single-node + - xpack.security.enabled=false + - ES_JAVA_OPTS=-Xms512m -Xmx512m # Limit RAM usage for dev + volumes: + - elasticsearch-data:/usr/share/elasticsearch/data ports: - - 8080:8080 + - "9200:9200" # REST API + - "9300:9300" # Internal transport + networks: + - airflow-network + + # --- NEW: Redis Stack (includes RedisGraph/FalkorDB) --- + redis-stack: + image: redis/redis-stack:latest + volumes: + - redis-stack-data:/data + ports: + - "6379:6379" # Redis port + - "8001:8001" # RedisInsight UI healthcheck: - test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] + test: [ "CMD", "redis-cli", "ping" ] interval: 10s - timeout: 10s retries: 5 - restart: always - - airflow-scheduler: - <<: *airflow-common - command: scheduler - restart: always - - airflow-worker: - <<: *airflow-common - command: celery worker - restart: always + start_period: 5s + networks: + - airflow-network airflow-init: <<: *airflow-common - command: version - environment: - <<: *airflow-common-env - _AIRFLOW_DB_UPGRADE: 'true' - _AIRFLOW_WWW_USER_CREATE: 'true' - _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} - _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} + entrypoint: /bin/bash + command: + - -c + - | + mkdir -p /opt/airflow/logs /opt/airflow/dags /opt/airflow/plugins + airflow db migrate + depends_on: + postgres: + condition: service_healthy - flower: + airflow-webserver: <<: *airflow-common - command: celery flower + command: airflow api-server ports: - - 5555:5555 + - "8080:8080" healthcheck: - test: ["CMD", "curl", "--fail", "http://localhost:5555/"] - interval: 10s + test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] + interval: 30s timeout: 10s retries: 5 - restart: always + start_period: 30s + depends_on: + airflow-init: + condition: service_completed_successfully - redis: - # image: redislabs/redisgraph:2.10.9 #Alternative Image - user: root - image: 'redis/redis-stack:6.2.4-v2' - command: "redis-server --requirepass $REDIS_PASSWORD --loadmodule /opt/redis-stack/lib/redisgraph.so" - environment: - - REDIS_ARGS=--requirepass $REDIS_PASSWORD - volumes: - - $DATA_DIR/redis:/data # FIX RDB Error on local - ports: - - $REDIS_PORT:$REDIS_PORT + airflow-scheduler: + <<: *airflow-common + command: airflow scheduler healthcheck: - test: ["CMD", "redis-cli", "ping"] - interval: 5s - timeout: 30s - retries: 50 - restart: always + test: ["CMD", "airflow", "jobs", "check", "--job-type", "SchedulerJob", "--hostname", "$${HOSTNAME}"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s + depends_on: + airflow-init: + condition: service_completed_successfully - dug: - image: containers.renci.org/helxplatform/dug:latest + airflow-triggerer: + <<: *airflow-common + command: airflow triggerer + healthcheck: + test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s depends_on: - - 
elasticsearch - - redis - restart: always - environment: - ELASTIC_API_HOST: "$ELASTIC_API_HOST" - ELASTIC_PASSWORD: "$ELASTIC_PASSWORD" - REDIS_HOST: "$REDIS_HOST" - REDIS_PASSWORD: "$REDIS_PASSWORD" - FLASK_ENV: "development" - PYTHONUNBUFFERED: "TRUE" - entrypoint: [ "gunicorn", - "--workers=$API_WORKERS", "--name=dug", - "--bind=0.0.0.0:$API_PORT", "--timeout=$API_TIMEOUT", - "--log-level=DEBUG", "-k", "uvicorn.workers.UvicornWorker", "--reload", "dug.server:APP"] - ports: - - $API_PORT:$API_PORT + airflow-init: + condition: service_completed_successfully - elasticsearch: - user: root - image: docker.elastic.co/elasticsearch/elasticsearch:8.5.2 - environment: - - ELASTIC_PASSWORD=$ELASTIC_PASSWORD - - discovery.type=single-node - - xpack.security.enabled=true - - ingest.geoip.downloader.enabled=false - volumes: - - $DATA_DIR/elastic:/usr/share/elasticsearch/data - ports: - - '9200:9200' - - '9300:9300' + airflow-dag-processor: + <<: *airflow-common + command: airflow dag-processor + healthcheck: + test: [ "CMD", "airflow", "jobs", "check", "--job-type", "DagProcessorJob", "--hostname", "$${HOSTNAME}" ] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s + depends_on: + airflow-init: + condition: service_completed_successfully - tranql: - image: containers.renci.org/helxplatform/tranql:rti-merge - ports: - - '8001:8001' - entrypoint: [ - "gunicorn", - "--workers=4", - "--bind=0.0.0.0:8001", - "--timeout=300", - "--access-logfile=$TRANQL_ACCESS_LOG", - "--error-logfile=$TRANQL_ERROR_LOG", - "--log-level=debug", - "tranql.api:app", - ] - environment: - - REDIS_PASSWORD=$REDIS_PASSWORD - volumes: - - ./tranql-schema.yaml:/tranql/tranql/conf/schema.yaml volumes: - postgres-db-volume: \ No newline at end of file + postgres-db-volume: + elasticsearch-data: # <-- New volume for Elasticsearch + redis-stack-data: +networks: + airflow-network: + driver: bridge \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 63275110..c8c705fa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -elasticsearch==8.5.2 +elasticsearch>=8.5.2 flatten-dict jsonpickle git+https://github.com/falkordb/falkordb-bulk-loader.git@v1.0.6 @@ -6,9 +6,15 @@ setuptools>=66 pytest PyYAML git+https://github.com/helxplatform/dug@develop -orjson==3.9.15 +orjson>=3.9.15 git+https://github.com/helxplatform/kg_utils.git@v0.0.10 git+https://github.com/helxplatform/python-stringcase@1.2.1 bmt==1.4.4 -git+https://github.com/helxplatform/avalon.git@v1.1.0 +git+https://github.com/helxplatform/avalon.git@lakefs-1.71.0 h11>=0.16.0 +starlette>=0.49.1 +datetime +aiohttp +#--- patch +werkzeug==3.0.6 +cryptography>=44.0.1 \ No newline at end of file From e913111312103badd913d3b2425184b8995c7da3 Mon Sep 17 00:00:00 2001 From: Griffin Roupe <31631417+frostyfan109@users.noreply.github.com> Date: Wed, 7 Jan 2026 13:00:57 -0500 Subject: [PATCH 62/63] Update urllib version requirement to address vulnerability (#134) * pin min urllib version * upgrade packages to mitigate vulns * remove werkzeug constraint from requirements * add werkzeug constraint back, upgrade to 3.1.4 --- Dockerfile | 2 +- requirements.txt | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7323534b..47f32824 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ FROM python:3.11.14-slim-trixie # Set Airflow version and home directory -ARG AIRFLOW_VERSION=3.1.1 +ARG AIRFLOW_VERSION=3.1.5 ARG AIRFLOW_HOME=/opt/airflow # Environment variables diff --git 
a/requirements.txt b/requirements.txt index 44e6ffae..def8e43b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,9 @@ git+https://github.com/helxplatform/avalon.git@lakefs-1.71.0 h11>=0.16.0 starlette>=0.49.1 datetime -aiohttp #--- patch -werkzeug==3.0.6 -cryptography>=44.0.1 \ No newline at end of file +aiohttp>=3.13.3 +werkzeug==3.1.4 +cryptography>=44.0.1 +urllib3>=2.6.2 +marshmallow==3.26.2 # upgrade from 3.26.1 specified in airflow constraints file \ No newline at end of file From 61133245c83e22803d90477b9a6ff1cc3be7e5f0 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Fri, 16 Jan 2026 15:44:59 -0500 Subject: [PATCH 63/63] bump recent --- requirements.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 975229df..3e9ac3d6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,8 +16,9 @@ starlette>=0.49.1 datetime #--- patch aiohttp>=3.13.3 -werkzeug==3.1.4 +werkzeug==3.1.5 cryptography>=44.0.1 -urllib3>=2.6.2 +urllib3>=2.6.3 +jaraco.context==6.1.0 marshmallow==3.26.2 # upgrade from 3.26.1 specified in airflow constraints file
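
The dependency pins above (werkzeug, urllib3, aiohttp, cryptography, jaraco.context, marshmallow) can be sanity-checked against a built image with a short script such as the sketch below. This is an illustrative check, not part of the patch series: the file name check_pins.py is hypothetical, the expected versions simply mirror the requirements.txt entries as of the final patch, and exact (==) pins are treated as minimums for the comparison.

# check_pins.py -- minimal sketch: confirm the security-sensitive pins above resolved
# in the built Airflow image. Expected versions mirror requirements.txt as of
# [PATCH 63/63]; exact pins (==) are treated as minimums here for simplicity.
from importlib.metadata import PackageNotFoundError, version

EXPECTED = {
    "werkzeug": "3.1.5",        # == pin
    "urllib3": "2.6.3",         # >= pin
    "aiohttp": "3.13.3",        # >= pin
    "cryptography": "44.0.1",   # >= pin
    "jaraco.context": "6.1.0",  # == pin
    "marshmallow": "3.26.2",    # == pin, overrides the Airflow constraints file
}

def version_tuple(v: str) -> tuple:
    # Crude numeric comparison; adequate for the simple X.Y.Z pins listed above.
    return tuple(int(part) for part in v.split(".") if part.isdigit())

for package, expected in EXPECTED.items():
    try:
        installed = version(package)
    except PackageNotFoundError:
        print(f"{package}: not installed")
        continue
    ok = version_tuple(installed) >= version_tuple(expected)
    print(f"{package}: installed {installed}, expected >= {expected} -> {'ok' if ok else 'TOO OLD'}")

Running it inside the webserver or scheduler container (for example via docker compose exec into airflow-webserver or airflow-scheduler, the service names defined in the compose file above) ensures the check sees the same environment the DAGs use.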