From ebd0e59ff3454ea782fbac811380c96905f375d3 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Thu, 5 Oct 2023 09:00:18 -0400 Subject: [PATCH 01/63] Update _version.py (#86) --- dags/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/_version.py b/dags/_version.py index fe7b0e06..40bcdb68 100644 --- a/dags/_version.py +++ b/dags/_version.py @@ -1 +1 @@ -version = "0.10.3-dev" +version = "0.10.3" From b66eb64fcf42d803656cc2b392daa5fd4942f6fd Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Thu, 5 Oct 2023 19:47:36 -0400 Subject: [PATCH 02/63] Update _version.py --- dags/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/_version.py b/dags/_version.py index 40bcdb68..b8ac7e54 100644 --- a/dags/_version.py +++ b/dags/_version.py @@ -1 +1 @@ -version = "0.10.3" +version = "0.10.4-dev" From e7e5f7c89b1ba096b40600bca79dcc9b1ef7617e Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Tue, 31 Oct 2023 09:05:49 -0400 Subject: [PATCH 03/63] Rti merge (#84) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. 
* Update requirements.txt * Update .gitignore * Update dug_utils.py Handle Error when curie not found in validate * Update __init__.py * Update config.yaml * Update dev-config.yaml * Update docker-compose.yaml * fixed docker-compose * adding back postgres volume to docker compose * env correction , docker compose updates --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent --- .env | 8 +- .gitignore | 4 +- Makefile | 5 +- README.md | 2 +- bin/docker_backend/docker-compose.yaml | 4 +- bin/roger | 2 +- cli.py | 9 +- dags/dug_helpers/dug_utils.py | 31 ++++-- dags/roger/config/__init__.py | 4 + dags/roger/config/config.yaml | 4 +- dags/roger/{ => config}/dev-config.yaml | 4 +- dags/roger/core/bulkload.py | 24 +++-- dags/roger/core/redis_graph.py | 55 +++++------ dags/test_metadata.yaml | 124 ++++++++++++++++++++++++ docker-compose.yaml | 62 ++++++------ requirements.txt | 8 +- roger-cli-steps.md | 27 ++++++ tests/test_redis_query.cypher | 5 + 18 files changed, 283 insertions(+), 99 deletions(-) rename dags/roger/{ => config}/dev-config.yaml (98%) create mode 100644 dags/test_metadata.yaml create mode 100644 roger-cli-steps.md create mode 100644 tests/test_redis_query.cypher diff --git a/.env b/.env index 7bc31d98..9e2ba0e3 100644 --- a/.env +++ b/.env @@ -15,9 +15,9 @@ ELASTIC_USERNAME=elastic NBOOST_API_HOST=nboost -REDIS_PASSWORD=12345 -REDIS_HOST=redis +REDIS_PASSWORD=weak +REDIS_HOST=merge-redis-master REDIS_PORT=6379 - TRANQL_ACCESS_LOG=access.log -TRANQL_ERROR_LOG=error.log \ No newline at end of file +TRANQL_ERROR_LOG=error.log +ROGER_DUG__INPUTS_DATA__SETS=topmed:v1.0 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 065d4e8b..8ca8ea91 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ # Git ignore bioler plate from https://github.com/github/gitignore/blob/master/Python.gitignore - # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -108,6 +107,7 @@ celerybeat.pid *.sage.py # Environments +.secrets-env .venv env/ venv/ @@ -149,4 +149,4 @@ cython_debug/ dags/roger/data local_storage logs -tests/integration/data/bulk/ \ No newline at end of file +tests/integration/data/bulk/ diff --git a/Makefile b/Makefile index 644a6d01..ef227aa4 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,5 @@ -PYTHON = PYTHONPATH=dags /usr/bin/env python3 +PYTHON = $(shell which python3) +PYTHONPATH = dags VERSION_FILE = ./dags/_version.py VERSION = $(shell cut -d " " -f 3 ${VERSION_FILE}) DOCKER_REPO = docker.io @@ -17,10 +18,12 @@ help: mk_dirs: mkdir -p {logs,plugins} mkdir -p local_storage/elastic + mkdir -p local_storage/redis rm_dirs: rm -rf logs/* rm -rf local_storage/elastic/* + rm -rf local_storage/redis/* rm -rf ./dags/roger/data/* #install: Install application along with required packages to local environment diff --git a/README.md b/README.md index 5ee4a69b..92ed1652 100644 --- a/README.md +++ b/README.md @@ -533,7 +533,7 @@ Open localhost:8080 in a browser. 
Then run: ``` -python tranql_translator.py +python tranql_translate.py ``` The Airflow interface shows the workflow: ![image](https://user-images.githubusercontent.com/306971/97787955-b968f680-1b8b-11eb-86cc-4d93842eafd3.png) diff --git a/bin/docker_backend/docker-compose.yaml b/bin/docker_backend/docker-compose.yaml index 695363b0..d87c6ae6 100644 --- a/bin/docker_backend/docker-compose.yaml +++ b/bin/docker_backend/docker-compose.yaml @@ -22,9 +22,9 @@ services: - roger-network environment: - REDIS_PASSWORD=$ROGERENV_REDISGRAPH_PASSWORD - entrypoint: /usr/local/bin/gunicorn --workers=2 --bind=0.0.0.0:8081 --name=tranql --timeout=600 tranql.api:app + entrypoint: /usr/local/bin/gunicorn --workers=2 --bind=0.0.0.0:8001 --name=tranql --timeout=600 tranql.api:app ports: - - 8081:8081 + - 8001:8001 volumes: - ./tranql-schema.yaml:/tranql/tranql/conf/schema.yaml ################################################################################# diff --git a/bin/roger b/bin/roger index 55fd4697..4626df88 100755 --- a/bin/roger +++ b/bin/roger @@ -10,7 +10,7 @@ export PYTHONPATH=$ROGER_HOME:$ROGER_HOME/../kgx export DB_NAME=test roger () { - python $ROGER_HOME/roger/core.py $* + python $ROGER_HOME/dags/roger/core.py $* } kgx () { diff --git a/cli.py b/cli.py index 7b6b609f..be77525a 100644 --- a/cli.py +++ b/cli.py @@ -4,12 +4,15 @@ from dug_helpers.dug_utils import DugUtil, get_topmed_files, get_dbgap_files, get_sparc_files, get_anvil_files, get_nida_files import sys import argparse +import os +import time log = get_logger() if __name__ == "__main__": - + start = time.time() + log.info(f"Start TIME:{start}") parser = argparse.ArgumentParser(description='Roger common cli tool.') """ Common CLI. """ parser.add_argument('-d', '--data-root', help="Root of data hierarchy", default=None) @@ -102,4 +105,8 @@ if args.validate_concepts: DugUtil.validate_indexed_concepts(config=config) + end = time.time() + time_elapsed = end - start + log.info(f"Completion TIME:{time_elapsed}") + sys.exit (0) diff --git a/dags/dug_helpers/dug_utils.py b/dags/dug_helpers/dug_utils.py index 85b40d8f..e55fdb34 100644 --- a/dags/dug_helpers/dug_utils.py +++ b/dags/dug_helpers/dug_utils.py @@ -381,10 +381,17 @@ def _search_elements(self, curie, search_term): query=search_term )) ids_dict = [] - for element_type in response: - all_elements_ids = [e['id'] for e in - reduce(lambda x, y: x + y['elements'], response[element_type], [])] - ids_dict += all_elements_ids + if 'total_items' in response: + if response['total_items'] == 0: + log.error(f"No search elements returned for variable search: {self.variables_index}.") + log.error(f"Concept id : {curie}, Search term: {search_term}") + raise Exception(f"Validation error - Did not find {curie} for" + f"Search term: {search_term}") + else: + for element_type in response: + all_elements_ids = [e['id'] for e in + reduce(lambda x, y: x + y['elements'], response[element_type], [])] + ids_dict += all_elements_ids return ids_dict def crawl_concepts(self, concepts, data_set_name): @@ -511,18 +518,24 @@ def validate_indexed_concepts(self, elements, concepts): searched_element_ids = self._search_elements(curie, search_term) - present = bool(len([x for x in sample_elements[curie] if x in searched_element_ids])) - if not present: - log.error(f"Did not find expected variable {element.id} in search result.") + if curie not in sample_elements: + log.error(f"Did not find Curie id {curie} in Elements.") log.error(f"Concept id : {concept.id}, Search term: {search_term}") raise Exception(f"Validation 
error - Did not find {element.id} for" f" Concept id : {concept.id}, Search term: {search_term}") + else: + present = bool(len([x for x in sample_elements[curie] if x in searched_element_ids])) + if not present: + log.error(f"Did not find expected variable {element.id} in search result.") + log.error(f"Concept id : {concept.id}, Search term: {search_term}") + raise Exception(f"Validation error - Did not find {element.id} for" + f" Concept id : {concept.id}, Search term: {search_term}") def clear_index(self, index_id): - exists = self.search_obj.es.indices.exists(index_id) + exists = self.search_obj.es.indices.exists(index=index_id) if exists: log.info(f"Deleting index {index_id}") - response = self.event_loop.run_until_complete(self.search_obj.es.indices.delete(index_id)) + response = self.event_loop.run_until_complete(self.search_obj.es.indices.delete(index=index_id)) log.info(f"Cleared Elastic : {response}") log.info("Re-initializing the indicies") self.index_obj.init_indices() diff --git a/dags/roger/config/__init__.py b/dags/roger/config/__init__.py index 6a6d757b..403b25b1 100644 --- a/dags/roger/config/__init__.py +++ b/dags/roger/config/__init__.py @@ -144,6 +144,8 @@ class ElasticsearchConfig(DictLike): username: str = "elastic" password: str = "" nboost_host: str = "" + scheme: str = "http" + ca_path: str = "" @@ -174,6 +176,8 @@ def to_dug_conf(self) -> DugConfig: elastic_host=self.elasticsearch.host, elastic_password=self.elasticsearch.password, elastic_username=self.elasticsearch.username, + elastic_scheme=self.elasticsearch.scheme, + elastic_ca_path=self.elasticsearch.ca_path, redis_host=self.redisgraph.host, redis_password=self.redisgraph.password, redis_port=self.redisgraph.port, diff --git a/dags/roger/config/config.yaml b/dags/roger/config/config.yaml index 92503030..ae370235 100644 --- a/dags/roger/config/config.yaml +++ b/dags/roger/config/config.yaml @@ -92,6 +92,8 @@ elasticsearch: username: elastic password: "" nboost_host: "" + scheme: "http" + ca_path: "" validation: queries: @@ -154,4 +156,4 @@ lakefs_config: enabled: false access_key_id: "" secret_access_key: "" - host: "" \ No newline at end of file + host: "" diff --git a/dags/roger/dev-config.yaml b/dags/roger/config/dev-config.yaml similarity index 98% rename from dags/roger/dev-config.yaml rename to dags/roger/config/dev-config.yaml index 67fd6d5e..91fc0af3 100644 --- a/dags/roger/dev-config.yaml +++ b/dags/roger/config/dev-config.yaml @@ -13,7 +13,7 @@ data_root: "/Users/schreepc/Projects/helxplatform/roger/roger/test/data" dug_data_root: dug_helpers/dug_data/topmed_data base_data_uri: https://stars.renci.org/var/kgx_data/trapi-1.0/ kgx: - biolink_model_version: 1.5.0 + biolink_model_version: test #https://github.com/RedisGraph/redisgraph-bulk-loader/blob/master/redisgraph_bulk_loader/bulk_insert.py#L43 bulk_loader: @@ -115,4 +115,4 @@ validation: - var: "[N-]=[N+]=[N-]" - var: "[Ag+]" - var: "[Zn+2]" - - var: "[C-]#[O+]" \ No newline at end of file + - var: "[C-]#[O+]" diff --git a/dags/roger/core/bulkload.py b/dags/roger/core/bulkload.py index 772b7cf9..1eca92db 100644 --- a/dags/roger/core/bulkload.py +++ b/dags/roger/core/bulkload.py @@ -341,20 +341,26 @@ def insert (self): args = [] if len(nodes) > 0: bulk_path_root = storage.bulk_path('nodes') + os.path.sep - nodes_with_type = [ - f"{ x.replace(bulk_path_root, '').split('.')[0].replace('~', ':')} {x}" - for x in nodes] + nodes_with_type = [] + for x in nodes: + """ + These lines prep nodes bulk load by: + 1) appending to labels 'biolink.' 
+ 2) combine labels to create a multilabel redis node i.e. "biolink.OrganismalEntity:biolink.SubjectOfInvestigation" + """ + file_name_type_part = x.replace(bulk_path_root, '').split('.')[0].split('~')[1] + all_labels = "biolink." + file_name_type_part + ":" + ":".join([f'biolink.{v.lstrip("biolink:")}' for v in self.biolink.toolkit.get_ancestors("biolink:" + file_name_type_part, reflexive=False, formatted=True )] ) + nodes_with_type.append(f"{all_labels} {x}") args.extend(("-N " + " -N ".join(nodes_with_type)).split()) if len(edges) > 0: bulk_path_root = storage.bulk_path('edges') + os.path.sep - edges_with_type = [ - f"{x.replace(bulk_path_root, '').strip(os.path.sep).split('.')[0].replace('~', ':')} {x}" - for x in edges] + edges_with_type = [f"biolink.{x.replace(bulk_path_root, '').strip(os.path.sep).split('.')[0].split('~')[1]} {x}" + for x in edges] + # Edge label now no longer has 'biolink:' args.extend(("-R " + " -R ".join(edges_with_type)).split()) args.extend([f"--separator={self.separator}"]) - args.extend([f"--host={redisgraph['host']}"]) - args.extend([f"--port={redisgraph['port']}"]) - args.extend([f"--password={redisgraph['password']}"]) + log.debug(f"--redis-url=redis://:{redisgraph['password']}@{redisgraph['host']}:{redisgraph['port']}") + args.extend([f"--redis-url=redis://:{redisgraph['password']}@{redisgraph['host']}:{redisgraph['port']}"]) args.extend(['--enforce-schema']) args.extend([f"{redisgraph['graph']}"]) """ standalone_mode=False tells click not to sys.exit() """ diff --git a/dags/roger/core/redis_graph.py b/dags/roger/core/redis_graph.py index 6710d3fc..ca65ddce 100644 --- a/dags/roger/core/redis_graph.py +++ b/dags/roger/core/redis_graph.py @@ -1,51 +1,44 @@ -"Graph abstraction layer over redisgraph python module" - import copy import redis -from redisgraph import Node, Edge, Graph +# from redisgraph import Node, Edge, Graph +# https://redis-py.readthedocs.io/en/v4.5.1/redismodules.html#redisgraph-commands +from redis.commands.graph.node import Node +from redis.commands.graph.edge import Edge + from roger.logger import get_logger logger = get_logger () - class RedisGraph: - """ Graph abstraction over RedisGraph - - A thin wrapper but provides us some options. - """ - - def __init__(self, host='localhost', port=6379, graph='default', - password=''): + """ Graph abstraction over RedisGraph. A thin wrapper but provides us some options. """ + + def __init__(self, host='localhost', port=6379, graph='default', password=''): """ Construct a connection to Redis Graph. """ self.r = redis.Redis(host=host, port=port, password=password) - self.redis_graph = Graph(graph, self.r) + self.redis_graph = self.r.graph(graph) def add_node (self, identifier=None, label=None, properties=None): """ Add a node with the given label and properties. """ - logger.debug ( - f"--adding node id:{identifier} label:{label} prop:{properties}") + logger.debug (f"--adding node id:{identifier} label:{label} prop:{properties}") if identifier and properties: properties['id'] = identifier - node = Node(node_id=identifier, alias=identifier, - label=label, properties=properties) + node = Node(node_id=identifier, alias=identifier, label=label, properties=properties) self.redis_graph.add_node(node) return node def get_edge (self, start, end, predicate=None): - "Get an edge from the graph with the specified start and end ids" + """ Get an edge from the graph with the specified start and end identifiers. 
""" result = None for edge in self.redis_graph.edges: if edge.src_node.id == start and edge.dest_node.id == end: result = edge break return result - + def add_edge (self, start, predicate, end, properties={}): - "Add edge with given predicate and properties btw start and end nodes" - logger.debug ( - f"--adding edge start:{start} pred:{predicate} " - f"end:{end} prop:{properties}") + """ Add an edge with the given predicate and properties between start and end nodes. """ + logger.debug (f"--adding edge start:{start} pred:{predicate} end:{end} prop:{properties}") if isinstance(start, str) and isinstance(end, str): start = Node(node_id = start, label='thing') end = Node(node_id = end, label='thing') @@ -56,13 +49,11 @@ def add_edge (self, start, predicate, end, properties={}): return edge def has_node (self, identifier): - "Does the graph have a node with this ID" return identifier in self.redis_graph.nodes def get_node (self, identifier, properties=None): - "Retrieve the node with the given id" return self.redis_graph.nodes[identifier] - + def commit (self): """ Commit modifications to the graph. """ self.redis_graph.commit() @@ -70,13 +61,13 @@ def commit (self): def query (self, query): """ Query and return result set. """ result = self.redis_graph.query(query) - result.pretty_print() + print(result) return result - + def delete (self): """ Delete the named graph. """ self.redis_graph.delete() - + def test (): rg = RedisGraph () p = { 'a' : 4, @@ -91,10 +82,10 @@ def test (): if last is not None: rg.add_edge (node, 'link', last) last = node - rg.commit () - rg.query ("MATCH (obj:yeah)-[:link]->(j:yeah) RETURN obj.a, obj.b, obj.x") - rg.query ("MATCH (a) RETURN a") + rg.commit () + rg.query ("""MATCH (obj:yeah)-[:link]->(j:yeah) RETURN obj.a, obj.b, obj.x""") + rg.query ("""MATCH (a) RETURN a""") rg.delete () # rg.query ("""MATCH (a { id : 'chemical_substance' }) RETURN a""") -#test () +#test () \ No newline at end of file diff --git a/dags/test_metadata.yaml b/dags/test_metadata.yaml new file mode 100644 index 00000000..54d508c4 --- /dev/null +++ b/dags/test_metadata.yaml @@ -0,0 +1,124 @@ +# This is a file that lists the data to be used for testing purposes +# It contains a reduced set of the metadata.yaml file +kgx: + versions: + - files: + - biolink-v1.0.json + - ctd-v1.0.json + - gtopdb-v1.0.json + - hetio-v1.0.json + - hgnc-v1.0.json + - hmdb-v1.0.json + - kegg-v1.0.json + - mychem-v1.0.json + - ontological-hierarchy-v1.0.json + - panther-v1.0.json + - foodb-v1.0.json + - pharos-v1.0.json + - intact-v1.0.json + - human-goa-v1.0.json + - uberongraph-v1.0.json + - viral-proteome-v1.0.json + version: v1.0 + name: baseline-graph + format: json + - files: + - biolink-v2.0.json + - ctd-v2.0.json + - gtopdb-v2.0.json + - hetio-v2.0.json + - hgnc-v2.0.json + - hmdb-v2.0.json + - kegg-v2.0.json + - mychem-v2.0.json + - ontological-hierarchy-v2.0.json + - panther-v2.0.json + - foodb-v2.0.json + - pharos-v2.0.json + - intact-v2.0.json + - human-goa-v2.0.json + - uberongraph-v2.0.json + - viral-proteome-v2.0.json + version: v2.0 + name: baseline-graph + format: json + - files: + - heal/sparc/curation-export-processed.json + version: v2.0 + name: sparc-kgx + format: json + - files: + - Biolink_edges_v3.0.jsonl + - Biolink_nodes_v3.0.jsonl + - CTD_edges_v3.0.jsonl + - CTD_nodes_v3.0.jsonl + - DrugCentral_edges_v3.0.jsonl + - DrugCentral_nodes_v3.0.jsonl + - GtoPdb_edges_v3.0.jsonl + - GtoPdb_nodes_v3.0.jsonl + - Hetio_edges_v3.0.jsonl + - Hetio_nodes_v3.0.jsonl + - HGNC_edges_v3.0.jsonl + - 
HGNC_nodes_v3.0.jsonl + - HMDB_edges_v3.0.jsonl + - HMDB_nodes_v3.0.jsonl + - HumanGOA_edges_v3.0.jsonl + - HumanGOA_nodes_v3.0.jsonl + - IntAct_edges_v3.0.jsonl + - IntAct_nodes_v3.0.jsonl + - OntologicalHierarchy_edges_v3.0.jsonl + - OntologicalHierarchy_nodes_v3.0.jsonl + - PANTHER_edges_v3.0.jsonl + - PANTHER_nodes_v3.0.jsonl + - PHAROS_edges_v3.0.jsonl + - PHAROS_nodes_v3.0.jsonl + - UberGraph_edges_v3.0.jsonl + - UberGraph_nodes_v3.0.jsonl + version: v3.0 + name: baseline-graph + format: jsonl + - version: test + files: + - hgnc_nodes.jsonl + - hgnc_edges.jsonl + name: test + - version: v3.0 + name: cde-graph + format: jsonl + files: + - cde/annotated_edges_v3.0.jsonl + - cde/annotated_nodes_v3.0.jsonl +dug_inputs: + versions: + - name: bdc + version: v1.0 + files: + s3: + - "bdc/v1.0/bdc_dbgap_data_dicts.tar.gz" + stars: + - "bdc_dbgap_data_dicts.tar.gz" + format: dbGaP + - name: nida + version: v1.0 + files: + s3: + - "nida/v1.0/nida-12studies.tar.gz" + stars: + - "nida-12studies.tar.gz" + format: nida + - name: sparc + version: v1.0 + files: + s3: + - "sparc/v1.0/sparc-dbgap-xml-formatted.tar.gz" + stars: + - "sparc-dbgap-xml-formatted.tar.gz" + format: sparc + - name: anvil + version: v1.0 + files: + s3: + - "bdc/v1.0/anvil_dbgap_data_dicts.tar.gz" + stars: + - "anvil_dbgap_data_dicts.tar.gz" + format: anvil \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml index 15e4b88f..7c698ed6 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -52,18 +52,20 @@ x-airflow-common: AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' AIRFLOW__CORE__LOAD_EXAMPLES: 'false' - ROGER_DUG__INPUTS_DATA__SETS: topmed,bdc-dbGaP + ROGER_DUG__INPUTS_DATA__SETS: "$ROGER_DUG__INPUTS_DATA__SETS" ROGER_ELASTICSEARCH_HOST: "$ELASTIC_API_HOST" ROGER_ELASTICSEARCH_PASSWORD: "$ELASTIC_PASSWORD" ROGER_ELASTICSEARCH_NBOOST__HOST: "$NBOOST_API_HOST" ROGER_REDISGRAPH_HOST: "$REDIS_HOST" ROGER_REDISGRAPH_PASSWORD: "$REDIS_PASSWORD" ROGER_KGX_DATASET__VERSION: "v3.0" + ROGER_DATA_DIR: "/opt/airflow/share/data" volumes: - ./dags:/opt/airflow/dags - ./logs:/opt/airflow/logs - ./plugins:/opt/airflow/plugins - user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}" + - ./data:/opt/airflow/share/data + user: root depends_on: redis: condition: service_healthy @@ -79,24 +81,14 @@ services: POSTGRES_DB: airflow volumes: - postgres-db-volume:/var/lib/postgresql/data + - ${DATA_DIR}/elastic:/elastic + - ${DATA_DIR}/redis:/redis healthcheck: test: ["CMD", "pg_isready", "-U", "airflow"] interval: 5s retries: 5 restart: always - redis: - image: redislabs/redisgraph:2.4.1 - command: "redis-server --requirepass $REDIS_PASSWORD --loadmodule /usr/lib/redis/modules/redisgraph.so" - ports: - - $REDIS_PORT:$REDIS_PORT - healthcheck: - test: ["CMD", "redis-cli", "ping"] - interval: 5s - timeout: 30s - retries: 50 - restart: always - airflow-webserver: <<: *airflow-common command: webserver @@ -141,17 +133,33 @@ services: retries: 5 restart: always + redis: + # image: redislabs/redisgraph:2.10.9 #Alternative Image + user: root + image: 'redis/redis-stack:6.2.4-v2' + command: "redis-server --requirepass $REDIS_PASSWORD --loadmodule /opt/redis-stack/lib/redisgraph.so" + environment: + - REDIS_ARGS=--requirepass $REDIS_PASSWORD + volumes: + - $DATA_DIR/redis:/data # FIX RDB Error on local + ports: + - $REDIS_PORT:$REDIS_PORT + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 30s + retries: 50 + restart: always + dug: - image: cschreep/dug:develop + image: 
containers.renci.org/helxplatform/dug:latest depends_on: - elasticsearch - redis - - nboost restart: always environment: ELASTIC_API_HOST: "$ELASTIC_API_HOST" ELASTIC_PASSWORD: "$ELASTIC_PASSWORD" - NBOOST_API_HOST: "$NBOOST_API_HOST" REDIS_HOST: "$REDIS_HOST" REDIS_PASSWORD: "$REDIS_PASSWORD" FLASK_ENV: "development" @@ -159,35 +167,32 @@ services: entrypoint: [ "gunicorn", "--workers=$API_WORKERS", "--name=dug", "--bind=0.0.0.0:$API_PORT", "--timeout=$API_TIMEOUT", - "--log-level=DEBUG", "--enable-stdio-inheritance", "--reload", "dug.api:app" ] + "--log-level=DEBUG", "-k", "uvicorn.workers.UvicornWorker", "--reload", "dug.server:APP"] ports: - $API_PORT:$API_PORT elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:7.6.1 + user: root + image: docker.elastic.co/elasticsearch/elasticsearch:8.5.2 environment: - ELASTIC_PASSWORD=$ELASTIC_PASSWORD - discovery.type=single-node - xpack.security.enabled=true + - ingest.geoip.downloader.enabled=false volumes: - $DATA_DIR/elastic:/usr/share/elasticsearch/data ports: - '9200:9200' - '9300:9300' - nboost: - image: koursaros/nboost:0.3.9-pt - ports: - - '8000:8000' - tranql: - image: helxplatform/tranql-app:develop-0.0.56 + image: containers.renci.org/helxplatform/tranql:rti-merge ports: - - '8081:8081' + - '8001:8001' entrypoint: [ "gunicorn", "--workers=4", - "--bind=0.0.0.0:8081", + "--bind=0.0.0.0:8001", "--timeout=300", "--access-logfile=$TRANQL_ACCESS_LOG", "--error-logfile=$TRANQL_ERROR_LOG", @@ -198,6 +203,5 @@ services: - REDIS_PASSWORD=$REDIS_PASSWORD volumes: - ./tranql-schema.yaml:/tranql/tranql/conf/schema.yaml - volumes: - postgres-db-volume: + postgres-db-volume: \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index c964fce7..54918df6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,12 @@ -apache-airflow==2.5.0 boto3==1.18.23 botocore==1.21.23 #black==21.10b0 +elasticsearch==8.5.2 flatten-dict -redisgraph==2.4.1 -redisgraph-bulk-loader==0.9.5 +redisgraph-bulk-loader==0.12.3 pytest PyYAML -git+https://github.com/helxplatform/dug@2.11.2 -elasticsearch==7.11.0 +git+https://github.com/helxplatform/dug@2.12.0 orjson kg-utils==0.0.6 bmt==1.1.0 diff --git a/roger-cli-steps.md b/roger-cli-steps.md new file mode 100644 index 00000000..8e132746 --- /dev/null +++ b/roger-cli-steps.md @@ -0,0 +1,27 @@ +# Deployment with Roger CLI + +## QUICK Local Set Up + +This is list steps to produce a local deployment of Roger. This set up does NOT use airflow and instead only uses the Roger CLI via **Makefile** commands. + +### Prerequsite Steps + +- Set up Roger dependencies by ensuring that the `.env` has all the correct information. 
+- Run the following docker compose commands + - `docker compose up tranql -d`: starts up tranql which is the API handlerfor redis graph in the `graph` stage + - `docker compose up redis -d`: starts up redis which will be used via redis graph for the `graph` stage + - `docker compose up dug -d`: starts up dug API to work as the API handler for elastic search in the `index` stage + - `docker compose up elasticsearch -d`: starts up elastic search for the `index` stage + +### Roger CLI Steps + +1) `python3 -m venv ~/.environments/roger` +2) `source ~/.environments/roger/bin/activate` +3) `pip install -r requirements.txt` +4) `export PYTHONPATH=$PWD/dags` +5) Change the elasticsearch and redisgraph `host` values to localhost in `dags/roger/config/config.yaml` +6) Get the S3 Bucket credentials (access_key, bucket, host, secret_key) and export them as environment variables with ROGER_S3_ in the front of the value like: `ROGER_S3_ACCESS__KEY=XXXXKEYXXXX` +7) `cd bin/` and here either run `make all` OR separate the commands into three steps: + 1) `make annotate`: executes the CLI related commands found in `bin/dug_annotate/Makefile` + 2) `make graph`: executes the CLI related commands found in `bin/roger_graph_build/Makefile` + 3) `make index`: executes the CLI related commands found in `bin/dug_index/Makefile` diff --git a/tests/test_redis_query.cypher b/tests/test_redis_query.cypher new file mode 100644 index 00000000..509df1fa --- /dev/null +++ b/tests/test_redis_query.cypher @@ -0,0 +1,5 @@ +MATCH (c{id:'HP:0032316'}) return c + +MATCH (disease:`Disease` {`id`: 'MONDO:0004979'}) WITH disease MATCH (disease)-[e1_disease_phenotypic_feature]-(phenotypic_feature:`PhenotypicFeature` {}) +WITH disease AS disease, phenotypic_feature AS phenotypic_feature, collect(e1_disease_phenotypic_feature) AS e1_disease_phenotypic_feature +RETURN disease,phenotypic_feature,e1_disease_phenotypic_feature,labels(disease) AS type__disease,labels(phenotypic_feature) AS type__phenotypic_feature,[edge in e1_disease_phenotypic_feature | type(edge)] AS type__e1_disease_phenotypic_feature,[edge in e1_disease_phenotypic_feature | [startNode(edge).id, endNode(edge).id]] AS id_pairs__e1_disease_phenotypic_feature \ No newline at end of file From fee790d4299f05b5d4ab0a9a50c257f4b0f9cd1f Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 1 Nov 2023 12:29:22 -0400 Subject: [PATCH 04/63] adding v5.0 --- dags/metadata.yaml | 12 ++++++++++++ dags/roger/config/config.yaml | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/dags/metadata.yaml b/dags/metadata.yaml index 47fedcb5..0cedb6a0 100644 --- a/dags/metadata.yaml +++ b/dags/metadata.yaml @@ -97,6 +97,18 @@ kgx: files: - cde/annotated_edges_v4.0.jsonl - cde/annotated_nodes_v4.0.jsonl + - version: v5.0 + name: baseline-graph + format: jsonl + files: + - baseline-5.0/edges_v5.0.jsonl + - baseline-5.0/nodes_v5.0.jsonl + - version: v5.0 + name: cde-graph + format: jsonl + files: + - cde/annotated_edges_v5.0.jsonl + - cde/annotated_nodes_v5.0.jsonl dug_inputs: versions: - name: bdc diff --git a/dags/roger/config/config.yaml b/dags/roger/config/config.yaml index ae370235..30eddcd1 100644 --- a/dags/roger/config/config.yaml +++ b/dags/roger/config/config.yaml @@ -16,7 +16,7 @@ annotation_base_data_uri: https://stars.renci.org/var/dug/ kgx: biolink_model_version: v3.1.2 - dataset_version: v4.0 + dataset_version: v5.0 merge_db_id: 1 merge_db_temp_dir: workspace data_sets: From e15a37305d40adb0c428f70258f8b2dcd6d80b83 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Mon, 6 
Nov 2023 15:37:43 -0500 Subject: [PATCH 05/63] cde-links branch --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 54918df6..504042af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ flatten-dict redisgraph-bulk-loader==0.12.3 pytest PyYAML -git+https://github.com/helxplatform/dug@2.12.0 +git+https://github.com/helxplatform/dug@cde-links-2 orjson kg-utils==0.0.6 bmt==1.1.0 From 6aac2fd81e40dadfdd3c4ef5493ebbf177ca2969 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Mon, 6 Nov 2023 16:53:19 -0500 Subject: [PATCH 06/63] pin linkml --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 504042af..17a59d90 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,4 @@ git+https://github.com/helxplatform/dug@cde-links-2 orjson kg-utils==0.0.6 bmt==1.1.0 +linkml-runtime==1.6.0 \ No newline at end of file From 487b2c1ab24a5323dcc33e7528a724431e9dc69e Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Tue, 7 Nov 2023 13:15:16 -0500 Subject: [PATCH 07/63] Update config.yaml collection_action to action --- dags/roger/config/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/roger/config/config.yaml b/dags/roger/config/config.yaml index 30eddcd1..49bc927b 100644 --- a/dags/roger/config/config.yaml +++ b/dags/roger/config/config.yaml @@ -85,7 +85,7 @@ indexing: desc: "summary" collection_name: "cde_category" collection_id: "cde_category" - collection_action: "files" + action: "files" elasticsearch: host: elasticsearch From bb61ab9596e3af267cc1ffd190ef9bef60a0ef43 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 7 Nov 2023 19:55:44 -0500 Subject: [PATCH 08/63] pop total items before result --- dags/dug_helpers/dug_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dags/dug_helpers/dug_utils.py b/dags/dug_helpers/dug_utils.py index e55fdb34..f90c591c 100644 --- a/dags/dug_helpers/dug_utils.py +++ b/dags/dug_helpers/dug_utils.py @@ -388,6 +388,7 @@ def _search_elements(self, curie, search_term): raise Exception(f"Validation error - Did not find {curie} for" f"Search term: {search_term}") else: + del response['total_items'] for element_type in response: all_elements_ids = [e['id'] for e in reduce(lambda x, y: x + y['elements'], response[element_type], [])] From 48650b82f4343e5cd04938aee7fc9b8748b7acb1 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 8 Nov 2023 15:31:22 -0500 Subject: [PATCH 09/63] print extracted elements --- dags/dug_helpers/dug_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dags/dug_helpers/dug_utils.py b/dags/dug_helpers/dug_utils.py index f90c591c..4d417e04 100644 --- a/dags/dug_helpers/dug_utils.py +++ b/dags/dug_helpers/dug_utils.py @@ -435,12 +435,15 @@ def crawl_concepts(self, concepts, data_set_name): casting_config = query['casting_config'] tranql_source = query['tranql_source'] dug_element_type = query['output_dug_type'] - extracted_dug_elements += crawler.expand_to_dug_element( + new_elements = crawler.expand_to_dug_element( concept=concept, casting_config=casting_config, dug_element_type=dug_element_type, tranql_source=tranql_source ) + log.debug("extracted:") + log.debug(str(list([el.get_searchable_dict() for el in new_elements]))) + extracted_dug_elements += new_elements concept.clean() percent_complete = int((counter / total) * 100) if percent_complete % 10 == 0: From 
3cfe3f5f7545a6915c20749db7e641153c6d0ce4 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Thu, 21 Dec 2023 15:40:41 -0500 Subject: [PATCH 10/63] Update requirements.txt --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 17a59d90..f2f2d16b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,8 +6,8 @@ flatten-dict redisgraph-bulk-loader==0.12.3 pytest PyYAML -git+https://github.com/helxplatform/dug@cde-links-2 +git+https://github.com/helxplatform/dug@patch-nrslv-resp orjson kg-utils==0.0.6 bmt==1.1.0 -linkml-runtime==1.6.0 \ No newline at end of file +linkml-runtime==1.6.0 From 21889ed9c0bbdb8db13da74b13ce5e95dea3c2e7 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Thu, 4 Jan 2024 13:30:21 -0500 Subject: [PATCH 11/63] Keep edge provenance (#94) * Update kgx.py * Update kgx.py * Update kgx.py can't delete edge keys while looping over them. * just collect then update * Update requirements.txt (#93) --- dags/roger/models/kgx.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dags/roger/models/kgx.py b/dags/roger/models/kgx.py index 62a57a61..a8976a00 100644 --- a/dags/roger/models/kgx.py +++ b/dags/roger/models/kgx.py @@ -478,6 +478,13 @@ def merge(self): edges['subject'] + edges['predicate'] + edges['object'] + edges.get("biolink:primary_knowledge_source", "")) + keys_to_del = set() + for key in edges: + if key.startswith('biolink:'): + keys_to_del.add(key) + for k in keys_to_del: + edges[k.replace('biolink:', '')] = edges[k] + del edges[k] stream.write(json.dumps(edges).decode('utf-8') + '\n') write_merge_metric['edges_writing_time'] = time.time() - start_edge_jsonl From 2eb136ad917472bd7e4711a68ce9e90fa64da63f Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Thu, 18 Apr 2024 13:04:27 -0400 Subject: [PATCH 12/63] Pipeline parameterize restructure (#95) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' 
* adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * Parameterized annotate tasks with input_data_path and output_data_path * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. * Parameterized annotate tasks with input_data_path and output_data_path (#85) * adding lakefs changes to roger-2.0 * point avalon to vg1 branch * change avalon dep * update airflow * fix avalon tag typo * update jenkins to tag version on main branch only * update jenkins to tag version * update jenkins to tag version * psycopg2 installation * add cncf k8s req * use airflow non-slim * simplified for testing * simplified for testing * change dag name * Erroneous parameter passed, should not be None * adding pre-exec * adding pre-exec * adding pre-exec * typo preexec * typo preexec * fix context * get files from repo * get files from repo * get files from repo * get files from repo * First shot at moving pipeline into base class and implementing. Anvil pipeline not complete * Syntax fix, docker image version bump to airflow 2.7.2-python3.11 * update storage dir * update remove dir code * update remove dir code * remote path to * * fix input dir for annotators * fix input dir for annotators * fix input dir for annotators * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * adding branch info on lakefs config * callback push to branch * back to relative import * reformat temp branch name based on unique task id * add logging * add logging * convert posix path to str for avalon * add extra / to root path * New dag created using DugPipeline subclasses * EmptyOperator imported from wrong place * import and syntax fixes * utterly silly syntax error * Added anvil to default input data sets for testing purposes * adding / to local path * commit meta task args empty string * add merge logic * add merge logic * upstream task dir pull for downstream task * Switched from subdag to taskgroup because latest Airflow depricated subdag * Added BACPAC pipeline object * Temporarily ignoring configuration variable for enabled datasets for testing * Passed dag in to create task group to see if it helps dag errors * Fixed silly syntax error * adding input / output dir params for make kgx * Trying different syntax to make taskgroups work. * adding input / output dir params for make kgx * Parsing, syntax, pylint fixes * adding input / output dir params for make kgx * Added pipeline name to task group name to ensure uniqueness * oops, moved something out of scope. Fixed * Filled out pipeline with methods from dug_utils. 
Needs data path changes * Finished implementing input_data_path and output_data_path handling, pylint cleanup * Update requirements.txt * adding toggle to avoid sending config obj * adding toggle to avoid sending config obj * disable to string for test * control pipelines for testing * add self to anvil get files * add log stream to make it available * typo fix * correcting branch id * adding source repo * adding source repo * patch name-resolver response * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * dug pipeline edit * recurisvely find recursively * recurisvely find recursively * setup output path for crawling * all task functions should have input and output params * adding annotation as upstream for validate index * revamp create task , and task wrapper * add validate concepts index task * adding concept validation * add index_variables task as dependecy for validate concepts * add index_variables task as dependecy for validate concepts * await client exist * await client exist * concepts not getting picked up for indexing * concepts not getting picked up for indexing * fix search elements * converting annotation output to json * json format annotation outputs * adding support for json format elements and concepts read * json back to dug objects * fixing index valriables with json objects * indetation and new line for better change detection :? * indetation and new line for better change detection * treat dictionary concepts as dictionary * read concepts json as a dict * concepts files are actually file paths * debug message * make output jsonable * clear up dir after commit , and delete unmerged branch even if no changes * don`t clear indexes, parallel dataset processing will be taxed * memory leak? * memory leak? * memory leak? 
* dumping pickles to debug locally * find out why concepts are being added to every other element * find out why concepts are being added to every other element * pointless shuffle 🤷‍♂️ * revert back in time * back to sanitize dug * output just json for annotation * adding jsonpickle * jsonpickle 🥒 * unpickle for index * unpickle for validate index * crawling fixes * crawling fixes * crawling validation fixes * fix index concepts * fix makekgx * adding other bdc pipelines * adding pipeline paramters to be able to configure per instance * fix * add input dataset for pipelines * Adding README to document how to create data set-specific pipelines * catchup on base.py * Added dbgap and nida pipelines * fix import errors * annotator modules added by passing config val (#90) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 --------- Co-authored-by: YaphetKG * Add heal parsers (#96) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix 
globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Add heal parsers (#97) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Radx pipeline (#99) * point to large download * fix schema path * debug bulk input dir * fix schema read * fix schema read * fix schema read * commenting steup dir for test * adding logs * fix path stuff * add commented stuff back in * testing radx parser * adding parser * skip indexing vars with no id * adding indexes as part of bulk loader paramters * fix id index cli arg * fix local cli * dug latest --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Michael T. 
Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> --- .env | 6 +- .gitignore | 2 + Dockerfile | 10 +- Jenkinsfile | 32 +- dags/annotate.py | 103 -- dags/annotate_and_index.py | 44 + dags/dug_helpers/dug_utils.py | 201 ++-- dags/index_dag.py | 28 - dags/knowledge_graph_build.py | 102 ++ dags/roger/config/__init__.py | 42 +- dags/roger/config/config.yaml | 17 +- dags/roger/config/dev-config.yaml | 2 +- dags/roger/core/base.py | 36 +- dags/roger/core/bulkload.py | 71 +- dags/roger/core/storage.py | 200 ++-- dags/roger/models/kgx.py | 55 +- dags/roger/pipelines/README.md | 99 ++ dags/roger/pipelines/__init__.py | 28 + dags/roger/pipelines/anvil.py | 24 + dags/roger/pipelines/bacpac.py | 8 + dags/roger/pipelines/base.py | 964 ++++++++++++++++++ dags/roger/pipelines/bdc.py | 19 + dags/roger/pipelines/crdc.py | 19 + dags/roger/pipelines/ctn.py | 10 + dags/roger/pipelines/db_gap.py | 10 + .../roger/pipelines/heal_research_programs.py | 16 + dags/roger/pipelines/heal_studies.py | 16 + dags/roger/pipelines/kfdrc.py | 19 + dags/roger/pipelines/nida.py | 18 + dags/roger/pipelines/radx.py | 8 + dags/roger/pipelines/sparc.py | 17 + dags/roger/pipelines/topmed.py | 41 + dags/roger/tasks.py | 393 ++++++- dags/tranql_translate.py | 43 - requirements.txt | 5 +- 35 files changed, 2238 insertions(+), 470 deletions(-) delete mode 100755 dags/annotate.py create mode 100644 dags/annotate_and_index.py delete mode 100755 dags/index_dag.py create mode 100644 dags/knowledge_graph_build.py create mode 100644 dags/roger/pipelines/README.md create mode 100644 dags/roger/pipelines/__init__.py create mode 100644 dags/roger/pipelines/anvil.py create mode 100644 dags/roger/pipelines/bacpac.py create mode 100644 dags/roger/pipelines/base.py create mode 100644 dags/roger/pipelines/bdc.py create mode 100644 dags/roger/pipelines/crdc.py create mode 100644 dags/roger/pipelines/ctn.py create mode 100644 dags/roger/pipelines/db_gap.py create mode 100644 dags/roger/pipelines/heal_research_programs.py create mode 100644 dags/roger/pipelines/heal_studies.py create mode 100644 dags/roger/pipelines/kfdrc.py create mode 100644 dags/roger/pipelines/nida.py create mode 100644 dags/roger/pipelines/radx.py create mode 100644 dags/roger/pipelines/sparc.py create mode 100644 dags/roger/pipelines/topmed.py delete mode 100755 dags/tranql_translate.py diff --git a/.env b/.env index 9e2ba0e3..2b42e8d7 100644 --- a/.env +++ b/.env @@ -9,9 +9,9 @@ DATA_DIR=./local_storage DUG_LOG_LEVEL=INFO -ELASTIC_PASSWORD=12345 -ELASTIC_API_HOST=elasticsearch -ELASTIC_USERNAME=elastic +ELASTICSEARCH_PASSWORD=12345 +ELASTICSEARCH_HOST=elasticsearch +ELASTICSEARCH_USERNAME=elastic NBOOST_API_HOST=nboost diff --git a/.gitignore b/.gitignore index 8ca8ea91..6c46fe7f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ # Git ignore bioler plate from https://github.com/github/gitignore/blob/master/Python.gitignore +.secret-env +.vscode/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/Dockerfile b/Dockerfile index 5556709c..49c1fd26 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,9 @@ -FROM apache/airflow:2.5.0-python3.10 +FROM apache/airflow:2.7.2-python3.11 + USER root RUN apt-get update && \ - apt-get install -y git gcc python3-dev nano vim + apt-get install -y git nano vim COPY requirements.txt requirements.txt USER airflow -# dependency resolution taking hours eventually failing, -# @TODO fix click lib dependency -RUN pip install -r requirements.txt && \ - pip uninstall -y elasticsearch-dsl +RUN pip 
install -r requirements.txt RUN rm -f requirements.txt diff --git a/Jenkinsfile b/Jenkinsfile index 3372b9e0..a68ba92b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -70,31 +70,15 @@ spec: steps { script { container(name: 'kaniko', shell: '/busybox/sh') { - kaniko.buildAndPush("./Dockerfile", ["$IMAGE_NAME:$TAG1", "$IMAGE_NAME:$TAG2", "$IMAGE_NAME:$TAG3", "$IMAGE_NAME:$TAG4"]) + if (env.BRANCH_NAME == "main") { + // Tag with latest and version iff when pushed to master + kaniko.buildAndPush("./Dockerfile", ["$IMAGE_NAME:$TAG1", "$IMAGE_NAME:$TAG2", "$IMAGE_NAME:$TAG3", "$IMAGE_NAME:$TAG4"]) + } else { + kaniko.buildAndPush("./Dockerfile", ["$IMAGE_NAME:$TAG1", "$IMAGE_NAME:$TAG2"]) + } } } - } - // post { - // always { - // archiveArtifacts artifacts: 'image.tar', onlyIfSuccessful: true - // } - // } - } - // stage('Publish') { - // steps { - // script { - // container(name: 'crane', shell: '/busybox/sh') { - // def imageTagsPushAlways = ["$IMAGE_NAME:$TAG1", "$IMAGE_NAME:$TAG2"] - // def imageTagsPushForDevelopBranch = ["$IMAGE_NAME:$TAG3"] - // def imageTagsPushForMasterBranch = ["$IMAGE_NAME:$TAG3", "$IMAGE_NAME:$TAG4"] - // image.publish( - // imageTagsPushAlways, - // imageTagsPushForDevelopBranch, - // imageTagsPushForMasterBranch - // ) - // } - // } - // } - // } + } + } } } diff --git a/dags/annotate.py b/dags/annotate.py deleted file mode 100755 index e0d4a2e4..00000000 --- a/dags/annotate.py +++ /dev/null @@ -1,103 +0,0 @@ -import os - -from airflow.models import DAG -from airflow.operators.empty import EmptyOperator - -from dug_helpers.dug_utils import ( - DugUtil, - get_topmed_files, - get_dbgap_files, - get_nida_files, - get_sparc_files, - get_anvil_files, - get_cancer_data_commons_files, - get_kids_first_files, - get_sprint_files, - get_bacpac_files, - get_heal_study_files, - get_heal_research_program_files - ) -from roger.tasks import default_args, create_python_task - -DAG_ID = 'annotate_dug' - -""" Build the workflow's tasks and DAG. """ -with DAG( - dag_id=DAG_ID, - default_args=default_args, - schedule_interval=None -) as dag: - - """Build workflow tasks.""" - intro = EmptyOperator(task_id='Intro', dag=dag) - - # Unzip and get files, avoid this because - # 1. it takes a bit of time making the dag itself, webserver hangs - # 2. Every task in this dag would still need to execute this part making it redundant - # 3. tasks like intro would fail because they don't have the data dir mounted. 
- - make_kg_tagged = create_python_task(dag, "make_tagged_kgx", DugUtil.make_kg_tagged) - - dummy_stepover = EmptyOperator(task_id="continue") - - #intro >> run_printlog - envspec = os.getenv("ROGER_DUG__INPUTS_DATA__SETS","topmed") - data_sets = envspec.split(",") - - clear_annotation_items = create_python_task(dag, "clear_annotation_files", DugUtil.clear_annotation_cached) - - for i, data_set in enumerate(data_sets): - annotate_files = None - if data_set.startswith("bdc"): - prepare_files = create_python_task(dag, "get_dbgap_data", get_dbgap_files) - annotate_files = create_python_task(dag, "annotate_db_gap_files", DugUtil.annotate_db_gap_files) - elif data_set.startswith("nida"): - prepare_files = create_python_task(dag, "get_nida_files", get_nida_files) - annotate_files = create_python_task(dag, "annotate_nida_files", DugUtil.annotate_nida_files) - elif data_set.startswith("sparc"): - prepare_files = create_python_task(dag, "get_sparc_files", get_sparc_files) - annotate_files = create_python_task(dag, "annotate_sparc_files", DugUtil.annotate_sparc_files) - elif data_set.startswith("topmed"): - prepare_files = create_python_task(dag, "get_topmed_data", get_topmed_files) - annotate_files = create_python_task(dag, "annotate_topmed_files", DugUtil.annotate_topmed_files) - elif data_set.startswith("anvil"): - prepare_files = create_python_task(dag, "get_anvil_data", get_anvil_files) - annotate_files = create_python_task(dag, "annotate_anvil_files", DugUtil.annotate_anvil_files) - elif data_set.startswith("crdc"): - prepare_files = create_python_task(dag, "get_cancer_commons_files", get_cancer_data_commons_files) - annotate_files = create_python_task(dag, "annotate_cancer_commons_files", - DugUtil.annotate_cancer_commons_files) - elif data_set.startswith("kfdrc"): - prepare_files = create_python_task(dag, "get_kids_first_files", get_kids_first_files) - annotate_files = create_python_task(dag, "annotate_kids_first_files", - DugUtil.annotate_kids_first_files) - elif data_set.startswith("sprint"): - prepare_files = create_python_task(dag, "get_sprint_files", get_sprint_files) - annotate_files = create_python_task(dag, "annotate_sprint_files", - DugUtil.annotate_sprint_files) - elif data_set.startswith("bacpac"): - prepare_files = create_python_task(dag, "get_bacpac_files", get_bacpac_files) - annotate_files = create_python_task(dag, "annotate_bacpac_files", - DugUtil.annotate_bacpac_files) - - elif data_set.startswith("heal-studies"): - prepare_files = create_python_task(dag, "get_heal_study_files", get_heal_study_files) - annotate_files = create_python_task(dag, "annotate_heal_study_files", - DugUtil.annotate_heal_study_files) - # elif data_set.startswith("heal-mds-imports"): - # prepare_files = create_python_task(dag, "get_heal_mds_imports", get_heal_study_files) - - elif data_set.startswith("heal-research-programs"): - prepare_files = create_python_task(dag, "get_heal_research_program_files", get_heal_research_program_files) - annotate_files = create_python_task(dag, "annotate_heal_research_program_files", - DugUtil.annotate_heal_research_program_files) - - - intro >> prepare_files - prepare_files >> clear_annotation_items - - if annotate_files: - clear_annotation_items >> annotate_files - annotate_files >> dummy_stepover - - dummy_stepover >> make_kg_tagged diff --git a/dags/annotate_and_index.py b/dags/annotate_and_index.py new file mode 100644 index 00000000..884cd149 --- /dev/null +++ b/dags/annotate_and_index.py @@ -0,0 +1,44 @@ +"""DAG which performs Dug annotate and index operations + 
+This DAG differes slightly from prior versions of the same functionality in +Roger not only in that the annotation and indexing happen in the same DAG, but +also those tasks are broken out into sub-DAGs organized by dataset. Each dataset +has a subdag for all tasks. +""" + +import os + +from airflow.models import DAG +from airflow.operators.empty import EmptyOperator +from roger.tasks import default_args, create_pipeline_taskgroup + +env_enabled_datasets = os.getenv( + "ROGER_DUG__INPUTS_DATA__SETS", "topmed,anvil").split(",") + +with DAG( + dag_id='annotate_and_index', + default_args=default_args, + schedule_interval=None +) as dag: + init = EmptyOperator(task_id="init", dag=dag) + finish = EmptyOperator(task_id="finish", dag=dag) + + from roger import pipelines + from roger.config import config + envspec = os.getenv("ROGER_DUG__INPUTS_DATA__SETS","topmed:v2.0") + data_sets = envspec.split(",") + pipeline_names = {x.split(':')[0]: x.split(':')[1] for x in data_sets} + for pipeline_class in pipelines.get_pipeline_classes(pipeline_names): + # Only use pipeline classes that are in the enabled datasets list and + # that have a properly defined pipeline_name attribute + + # TODO + # Overriding environment variable just to see if this is working. + # name = getattr(pipeline_class, 'pipeline_name', '*not defined*') + # if not name in env_enabled_datasets: + # continue + + # Do the thing to add the pipeline's subdag to the dag in the right way + # . . . + + init >> create_pipeline_taskgroup(dag, pipeline_class, config) >> finish diff --git a/dags/dug_helpers/dug_utils.py b/dags/dug_helpers/dug_utils.py index 4d417e04..52db9624 100644 --- a/dags/dug_helpers/dug_utils.py +++ b/dags/dug_helpers/dug_utils.py @@ -11,8 +11,9 @@ from typing import Union, List import requests -from dug.core import get_parser, get_plugin_manager, DugConcept -from dug.core.annotate import DugAnnotator, ConceptExpander +from dug.core import get_parser, get_annotator, get_plugin_manager, DugConcept +from dug.core.annotators._base import Annotator +from dug.core.concept_expander import ConceptExpander from dug.core.crawler import Crawler from dug.core.factory import DugFactory from dug.core.parsers import Parser, DugElement @@ -44,7 +45,7 @@ def __init__(self, config: RogerConfig, to_string=True): self.string_handler = logging.StreamHandler(self.log_stream) log.addHandler(self.string_handler) - self.annotator: DugAnnotator = self.factory.build_annotator() + self.annotator_name: str = config.annotation.annotator_type self.tranqlizer: ConceptExpander = self.factory.build_tranqlizer() @@ -85,7 +86,8 @@ def __exit__(self, exc_type, exc_val, exc_tb): log.error(f"{exc_val} {exc_val} {exc_tb}") log.exception("Got an exception") - def annotate_files(self, parser_name, parsable_files): + def annotate_files(self, parser_name, parsable_files, + output_data_path=None): """ Annotates a Data element file using a Dug parser. :param parser_name: Name of Dug parser to use. 
@@ -94,14 +96,16 @@ def annotate_files(self, parser_name, parsable_files): """ dug_plugin_manager = get_plugin_manager() parser: Parser = get_parser(dug_plugin_manager.hook, parser_name) - output_base_path = storage.dug_annotation_path('') + annotator: Annotator = get_annotator(dug_plugin_manager.hook, annotator_name=self.annotator_name, config=self.config.to_dug_conf()) + if not output_data_path: + output_data_path = storage.dug_annotation_path('') log.info("Parsing files") for parse_file in parsable_files: log.debug("Creating Dug Crawler object") crawler = Crawler( crawl_file=parse_file, parser=parser, - annotator=self.annotator, + annotator=annotator, tranqlizer='', tranql_queries=[], http_session=self.cached_session @@ -109,7 +113,7 @@ def annotate_files(self, parser_name, parsable_files): # configure output space. current_file_name = '.'.join(os.path.basename(parse_file).split('.')[:-1]) - elements_file_path = os.path.join(output_base_path, current_file_name) + elements_file_path = os.path.join(output_data_path, current_file_name) elements_file_name = 'elements.pickle' concepts_file_name = 'concepts.pickle' @@ -566,141 +570,224 @@ def clear_annotation_cached(config=None, to_string=False): dug.cached_session.cache.clear() @staticmethod - def annotate_db_gap_files(config=None, to_string=False, files=None): + def annotate_db_gap_files(config=None, to_string=False, input_data_path=None, output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_dd_xml_objects() + if not input_data_path: + files = storage.dug_dd_xml_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "DbGaP" dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_anvil_files(config=None, to_string=False, files=None): + def annotate_anvil_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_anvil_objects() + if not input_data_path: + files = storage.dug_anvil_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "Anvil" dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_cancer_commons_files(config=None, to_string=False, files=None): + def annotate_cancer_commons_files(config=None, to_string=False, + input_data_path=None, + output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_crdc_objects() + if not input_data_path: + files = storage.dug_crdc_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "crdc" dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_kids_first_files(config=None, to_string=False, files=None): + def annotate_kids_first_files(config=None, to_string=False, + input_data_path=None, 
output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_kfdrc_objects() + if not input_data_path: + files = storage.dug_kfdrc_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "kfdrc" dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_nida_files(config=None, to_string=False, files=None): + def annotate_nida_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_nida_objects() + if not input_data_path: + files = storage.dug_nida_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "NIDA" dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_sparc_files(config=None, to_string=False, files=None): + def annotate_sparc_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_sparc_objects() + if not input_data_path: + files = storage.dug_sparc_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "SciCrunch" dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_sprint_files(config=None, to_string=False, files=None): + def annotate_sprint_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_sprint_objects() + if not input_data_path: + files = storage.dug_sprint_objects( + input_data_path=input_data_path) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "SPRINT" dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_topmed_files(config=None, to_string=False, files=None): + def annotate_topmed_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_topmed_objects() + if not input_data_path: + files = storage.dug_topmed_objects( + input_data_path=None) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "TOPMedTag" log.info(files) dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_bacpac_files(config=None, to_string=False, files=None): + def annotate_bacpac_files(config=None, to_string=False, + input_data_path=None, 
output_data_path=None): + + log.info(f"Input data path is: {input_data_path}") with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_bacpac_objects() + files = storage.dug_bacpac_objects( + input_data_path=input_data_path) + parser_name = "BACPAC" log.info(files) dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_heal_study_files(config=None, to_string=False, files=None): + def annotate_heal_study_files(config=None, to_string=False, + input_data_path=None, output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_heal_study_objects() + if not input_data_path: + files = storage.dug_heal_study_objects( + input_data_path=None) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "heal-studies" log.info(files) dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def annotate_heal_research_program_files(config=None, to_string=False, files=None): + def annotate_heal_research_program_files(config=None, to_string=False, + input_data_path=None, + output_data_path=None): with Dug(config, to_string=to_string) as dug: - if files is None: - files = storage.dug_heal_research_program_objects() - + if not input_data_path: + files = storage.dug_heal_research_program_objects( + input_data_path=None) + else: + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) parser_name = "heal-research" log.info(files) dug.annotate_files(parser_name=parser_name, - parsable_files=files) + parsable_files=files, + output_data_path=output_data_path) output_log = dug.log_stream.getvalue() if to_string else '' return output_log @staticmethod - def make_kg_tagged(config=None, to_string=False): + def make_kg_tagged(config=None, to_string=False, input_data_path=None, output_data_path=None): with Dug(config, to_string=to_string) as dug: - output_base_path = storage.dug_kgx_path("") - storage.clear_dir(output_base_path) + output_base_path = output_data_path + if not output_data_path: + output_base_path = storage.dug_kgx_path("") + storage.clear_dir(output_base_path) log.info("Starting building KGX files") - elements_files = storage.dug_elements_objects() + if not input_data_path: + elements_files = storage.dug_elements_objects() + else: + import glob + glob_pattern = str(input_data_path / "**" / 'elements.pickle') + elements_files = glob.glob(glob_pattern, recursive=True) + log.info(f"making kgx files for the following pickles: {elements_files}") for file in elements_files: elements = storage.read_object(file) if "topmed_" in file: @@ -930,5 +1017,3 @@ def get_heal_study_files(config: RogerConfig, to_string=False) -> List[str]: def get_heal_research_program_files(config: RogerConfig, to_string=False) -> List[str]: return get_versioned_files(config, "heal-research", "heal-research-programs", data_store=config.dug_inputs.data_source, unzip=True) - - diff --git a/dags/index_dag.py b/dags/index_dag.py deleted file mode 100755 index 6ddc4b02..00000000 --- a/dags/index_dag.py +++ /dev/null @@ -1,28 +0,0 @@ -from airflow.models import DAG -from airflow.operators.empty import EmptyOperator - -from roger.tasks import get_executor_config, 
default_args, create_python_task -from dug_helpers.dug_utils import DugUtil - -""" Build the workflow's tasks and DAG. """ -with DAG( - dag_id='index_dug', - default_args=default_args, - schedule_interval=None -) as dag: - - """ Build the workflow tasks. """ - intro = EmptyOperator(task_id='Intro') - index_variables = create_python_task (dag, "IndexVariables", DugUtil.index_variables) - validate_index_variables = create_python_task(dag,"ValidateIndexVariables", DugUtil.validate_indexed_variables) - crawl_tags = create_python_task(dag, "CrawlConcepts", DugUtil.crawl_tranql) - index_concepts = create_python_task(dag, "IndexConcepts", DugUtil.index_concepts) - dummy_stepover = EmptyOperator(task_id="continue") - index_extracted_dug_elements = create_python_task(dag, "IndexExtractedElements", DugUtil.index_extracted_elements) - validate_index_concepts = create_python_task(dag, "ValidateIndexConcepts", DugUtil.validate_indexed_concepts) - finish = EmptyOperator(task_id='Finish') - """ Build the DAG. """ - intro >> index_variables >> validate_index_variables >> finish - intro >> crawl_tags >> index_concepts >> dummy_stepover - intro >> crawl_tags >> index_extracted_dug_elements >> dummy_stepover - dummy_stepover >> validate_index_concepts >> finish \ No newline at end of file diff --git a/dags/knowledge_graph_build.py b/dags/knowledge_graph_build.py new file mode 100644 index 00000000..de28aa78 --- /dev/null +++ b/dags/knowledge_graph_build.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +# + +""" +An Airflow workflow for the Roger Translator KGX data pipeline. +""" + +from airflow.models import DAG +from airflow.operators.empty import EmptyOperator +import roger +from roger.tasks import default_args, create_python_task +from roger.config import config + +""" Build the workflow's tasks and DAG. """ +with DAG( + dag_id='knowledge_graph_build', + default_args=default_args, + schedule_interval=None +) as dag: + + """ Build the workflow tasks. """ + intro = EmptyOperator(task_id='Intro') + + # Merge nodes needs inputs from two sources + # 1. baseline and/or CDE KGX files from LakeFS (External repo) + # 2. 
Infer which local kgx files are needed based on dug_inputs and grab them from the current repo + + # build the annotate and index pipeline output locations + #lakefs://yk-heal/main/annotate_and_index/crdc_dataset_pipeline_task_group.make_kgx_crdc/ + working_repo = config.lakefs_config.repo + branch = config.lakefs_config.branch + kgx_repos = config.kgx.data_sets + input_repos = [{ + 'name': repo.split(':')[0], + 'branch': repo.split(':')[1], + 'path': '*' + } for repo in kgx_repos] + + # Figure out a way to extract paths + get_path_on_lakefs = lambda d: f"annotate_and_index/{d}_dataset_pipeline_task_group.make_kgx_{d}/" + + + for dataset in config.dug_inputs.data_sets: + dataset_name = dataset.split(":")[0] + # add datasets from the other pipeline + input_repos.append( + { + 'name': working_repo, + 'branch': branch, + 'path': get_path_on_lakefs(dataset_name) + } + ) + + merge_nodes = create_python_task (dag, name="MergeNodes", + a_callable=roger.merge_nodes, + external_repos=input_repos + ) + + # The rest of these guys can just operate on the local lakefs repo/branch + # we need to add input dir and output dir similar to what we did for dug tasks + + create_nodes_schema = create_python_task(dag, + name="CreateNodesSchema", + a_callable=roger.create_nodes_schema + ) + create_edges_schema = create_python_task(dag, + name="CreateEdgesSchema", + a_callable=roger.create_edges_schema) + + create_bulk_load_nodes = create_python_task(dag, + name="CreateBulkLoadNodes", + a_callable=roger.create_bulk_nodes) + create_bulk_load_edges = create_python_task(dag, + name="CreateBulkLoadEdges", + a_callable=roger.create_bulk_edges) + bulk_load = create_python_task(dag, + name="BulkLoad", + a_callable=roger.bulk_load, + no_output_files=True) + check_tranql = create_python_task(dag, + name="CheckTranql", + a_callable=roger.check_tranql, + no_output_files=True) + validate = create_python_task(dag, + name="Validate", + a_callable=roger.validate, + no_output_files=True) + + + """ Build the DAG. """ + merge_nodes.set_upstream(intro) + create_nodes_schema.set_upstream(merge_nodes) + create_edges_schema.set_upstream(merge_nodes) + create_bulk_load_nodes.set_upstream(create_nodes_schema) + create_bulk_load_nodes.set_upstream(merge_nodes) + create_bulk_load_edges.set_upstream(create_edges_schema) + create_bulk_load_edges.set_upstream(merge_nodes) + bulk_load.set_upstream(create_bulk_load_nodes) + bulk_load.set_upstream(create_bulk_load_edges) + validate.set_upstream(bulk_load) + check_tranql.set_upstream(bulk_load) + diff --git a/dags/roger/config/__init__.py b/dags/roger/config/__init__.py index 403b25b1..ac9eb23a 100644 --- a/dags/roger/config/__init__.py +++ b/dags/roger/config/__init__.py @@ -26,6 +26,21 @@ def __post_init__(self): self.port = int(self.port) +@dataclass +class LakefsConfig(DictLike): + host: str + access_key_id: str + secret_access_key: str + branch: str + repo: str + enabled: bool = False + + def __post_init__(self): + if isinstance(self.enabled, str): + self.enabled = self.enabled.lower() == "true" + + + @dataclass class LoggingConfig(DictLike): level: str = "DEBUG" @@ -35,10 +50,8 @@ class LoggingConfig(DictLike): @dataclass class KgxConfig(DictLike): biolink_model_version: str = "1.5.0" - dataset_version: str = "v1.0" - merge_db_id: int = 1 merge_db_temp_dir: str = "workspace" - data_sets: List = field(default_factory=lambda: ['baseline-graph']) + data_sets: List = field(default_factory=lambda: ['baseline-graph:v5.0']) def __post_init__(self): # Convert strings to list. 
In cases where this is passed as env variable with a single value @@ -77,7 +90,18 @@ class BulkLoaderConfig(DictLike): @dataclass class AnnotationConfig(DictLike): - annotator: str = "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" + annotator_type: str = "monarch" + annotator_args: dict = field( + default_factory=lambda: { + "monarch": { + "url": "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" + }, + "sapbert": { + "classification_url": "https://med-nemo.apps.renci.org/annotate/", + "annotator_url": "https://babel-sapbert.apps.renci.org/annotate/", + }, + } + ) normalizer: str = "https://nodenormalization-sri.renci.org/get_normalized_nodes?curie=" synonym_service: str = "https://onto.renci.org/synonyms/" ontology_metadata: str = "https://api.monarchinitiative.org/api/bioentity/" @@ -116,7 +140,7 @@ class IndexingConfig(DictLike): "anat_to_disease": ["anatomical_entity", "disease"], "anat_to_pheno": ["anatomical_entity", "phenotypic_feature"], }) - tranql_endpoint: str = "http://tranql:8081/tranql/query?dynamic_id_resolution=true&asynchronous=false" + tranql_endpoint: str = "http://tranql-service/tranql/query?dynamic_id_resolution=true&asynchronous=false" # by default skips node to element queries node_to_element_queries: dict = field(default_factory=lambda: {}) element_mapping: str = "" @@ -170,6 +194,7 @@ def __init__(self, **kwargs): self.annotation_base_data_uri: str = kwargs.pop("annotation_base_data_uri", "") self.validation = kwargs.pop("validation") self.dag_run = kwargs.pop('dag_run', None) + self.lakefs_config = LakefsConfig(**kwargs.pop("lakefs_config")) def to_dug_conf(self) -> DugConfig: return DugConfig( @@ -183,9 +208,8 @@ def to_dug_conf(self) -> DugConfig: redis_port=self.redisgraph.port, nboost_host=self.elasticsearch.nboost_host, preprocessor=self.annotation.preprocessor, - annotator={ - 'url': self.annotation.annotator, - }, + annotator_type=self.annotation.annotator_type, + annotator_args=self.annotation.annotator_args, normalizer={ 'url': self.annotation.normalizer, }, @@ -328,7 +352,7 @@ def update(new_value: Dict): def __str__(self): flat = flatten(Config.__instance__) for k in flat: - if 'PASSWORD' in k or 'password' in k: + if 'PASSWORD' in k or 'password' in k or 'key' in k.lower(): flat[k] = '******' flat = unflatten(flat) result = json.dumps(flat) diff --git a/dags/roger/config/config.yaml b/dags/roger/config/config.yaml index 49bc927b..e9402ce4 100644 --- a/dags/roger/config/config.yaml +++ b/dags/roger/config/config.yaml @@ -16,11 +16,9 @@ annotation_base_data_uri: https://stars.renci.org/var/dug/ kgx: biolink_model_version: v3.1.2 - dataset_version: v5.0 - merge_db_id: 1 merge_db_temp_dir: workspace data_sets: - - baseline-graph + - baseline-graph:v5.0 dug_inputs: data_source: s3 @@ -44,10 +42,17 @@ bulk_loader: annotation: clear_http_cache: false - annotator: "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" + annotator_type: monarch + annotator_args: + monarch: + url: "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" + sapbert: + classification_url: 
"https://med-nemo.apps.renci.org/annotate/" + annotator_url: "https://babel-sapbert.apps.renci.org/annotate/" normalizer: "https://nodenormalization-dev.apps.renci.org/get_normalized_nodes?conflate=false&description=true&curie=" synonym_service: "https://name-resolution-sri.renci.org/reverse_lookup" ontology_metadata: "https://api.monarchinitiative.org/api/bioentity/" + preprocessor: debreviator: BMI: "body mass index" @@ -72,7 +77,7 @@ indexing: "small_molecule_to_disease": ["small_molecule", "disease"] "chemical_mixture_to_disease": ["chemical_mixture", "disease"] "phen_to_anat": ["phenotypic_feature", "anatomical_entity"] - tranql_endpoint: "http://tranql:8081/tranql/query?dynamic_id_resolution=true&asynchronous=false" + tranql_endpoint: "http://tranql-service/tranql/query?dynamic_id_resolution=true&asynchronous=false" node_to_element_queries: enabled: false cde: @@ -157,3 +162,5 @@ lakefs_config: access_key_id: "" secret_access_key: "" host: "" + branch: "" + repo: "" diff --git a/dags/roger/config/dev-config.yaml b/dags/roger/config/dev-config.yaml index 91fc0af3..bece11a8 100644 --- a/dags/roger/config/dev-config.yaml +++ b/dags/roger/config/dev-config.yaml @@ -54,7 +54,7 @@ indexing: "phen_to_anat": ["phenotypic_feature", "anatomical_entity"] "anat_to_disease": ["anatomical_entity", "disease"] "anat_to_pheno": ["anatomical_entity", "phenotypic_feature"] - tranql_endpoint: "http://tranql:8081/tranql/query?dynamic_id_resolution=true&asynchronous=false" + tranql_endpoint: "http://tranql-service/tranql/query?dynamic_id_resolution=true&asynchronous=false" elasticsearch: host: elasticsearch diff --git a/dags/roger/core/base.py b/dags/roger/core/base.py index 6696f81d..7ba9409a 100644 --- a/dags/roger/core/base.py +++ b/dags/roger/core/base.py @@ -73,62 +73,68 @@ def create_schema(to_string=False, config=None): output = (o1 + o2 ) if to_string else None return output -def create_edges_schema(to_string=False, config=None): +def create_edges_schema(to_string=False, config=None, input_data_path=None, output_data_path=None): "Create edges schema on KGX object" output = None with Roger(to_string, config=config) as roger: - roger.kgx.create_edges_schema() + roger.kgx.create_edges_schema( + input_data_path=input_data_path, + output_data_path=output_data_path + ) output = roger.log_stream.getvalue() if to_string else None return output -def create_nodes_schema(to_string=False, config=None): +def create_nodes_schema(to_string=False, config=None, input_data_path=None, output_data_path=None): "Create nodes schema on KGX object" output = None with Roger(to_string, config=config) as roger: - roger.kgx.create_nodes_schema() + roger.kgx.create_nodes_schema(input_data_path=input_data_path, + output_data_path=output_data_path) output = roger.log_stream.getvalue() if to_string else None return output -def merge_nodes(to_string=False, config=None): +def merge_nodes(to_string=False, config=None, input_data_path=None, output_data_path=None): "Run KGX merge" output = None with Roger (to_string, config=config) as roger: - roger.kgx.merge() + roger.kgx.merge(input_path=input_data_path, output_path=output_data_path) output = roger.log_stream.getvalue () if to_string else None return output -def create_bulk_load(to_string=False, config=None): +def create_bulk_load(to_string=False, config=None, input_data_path=None, output_data_path=None): "Generate bulk load files" o1 = create_bulk_nodes(to_string=to_string, config=config) o2 = create_bulk_edges(to_string=to_string, config=config) output = (o1 + o2) if to_string 
else None return output -def create_bulk_nodes(to_string=False, config=None): +def create_bulk_nodes(to_string=False, config=None, input_data_path=None, output_data_path=None): "Generate bulk node CSV file" output = None with Roger(to_string, config=config) as roger: - roger.bulk.create_nodes_csv_file() + log.info("input path: %s", input_data_path) + log.info("output path: %s", output_data_path) + roger.bulk.create_nodes_csv_file(input_data_path, output_data_path) output = roger.log_stream.getvalue() if to_string else None return output -def create_bulk_edges(to_string=False, config=None): +def create_bulk_edges(to_string=False, config=None, input_data_path=None, output_data_path=None): "Create bulk edges CSV file" output = None with Roger(to_string, config=config) as roger: - roger.bulk.create_edges_csv_file() + roger.bulk.create_edges_csv_file(input_data_path, output_data_path) output = roger.log_stream.getvalue() if to_string else None return output -def bulk_load(to_string=False, config=None): +def bulk_load(to_string=False, config=None, input_data_path=None, output_data_path=None): "Run bulk load insert process" output = None with Roger (to_string, config=config) as roger: - roger.bulk.insert() + roger.bulk.insert(input_data_path=input_data_path) output = roger.log_stream.getvalue () if to_string else None return output -def validate (to_string=False, config=None): +def validate (to_string=False, config=None, input_data_path=None, output_data_path=None): "Run bulk validate process" output = None with Roger (to_string, config=config) as roger: @@ -136,7 +142,7 @@ def validate (to_string=False, config=None): output = roger.log_stream.getvalue () if to_string else None return output -def check_tranql(to_string=False, config=None): +def check_tranql(to_string=False, config=None, input_data_path=None, output_data_path=None): "Tranql server smoke check" output = None with Roger(to_string, config=config) as roger: diff --git a/dags/roger/core/bulkload.py b/dags/roger/core/bulkload.py index 1eca92db..0f0fecee 100644 --- a/dags/roger/core/bulkload.py +++ b/dags/roger/core/bulkload.py @@ -33,31 +33,24 @@ def __init__(self, biolink, config=None): self.separator =(chr(separator) if isinstance(separator, int) else separator) - def tables_up_to_date (self): - return storage.is_up_to_date ( - source=[ - storage.schema_path (f"{SchemaType.PREDICATE.value}-schema.json"), - storage.schema_path (f"{SchemaType.PREDICATE.value}-schema.json") - ] + storage.merged_objects (), - targets=glob.glob (storage.bulk_path ("nodes/**.csv")) + \ - glob.glob (storage.bulk_path ("edges/**.csv"))) - - def create_nodes_csv_file(self): - if self.tables_up_to_date (): - log.info ("up to date.") - return + def create (self): + """Used in the CLI on args.create_bulk""" + self.create_nodes_csv_file() + self.create_edges_csv_file() + + def create_nodes_csv_file(self, input_data_path=None, output_data_path=None): # clear out previous data - bulk_path = storage.bulk_path("nodes") + bulk_path = storage.bulk_path("nodes", output_data_path) if os.path.exists(bulk_path): shutil.rmtree(bulk_path) - categories_schema = storage.read_schema (SchemaType.CATEGORY) + categories_schema = storage.read_schema (SchemaType.CATEGORY, input_data_path) state = defaultdict(lambda: None) log.info(f"processing nodes") """ Write node data for bulk load. 
""" categories = defaultdict(lambda: []) category_error_nodes = set() - merged_nodes_file = storage.merge_path("nodes.jsonl") + merged_nodes_file = storage.merged_objects('nodes', input_data_path) counter = 1 for node in storage.json_line_iter(merged_nodes_file): if not node['category']: @@ -74,7 +67,7 @@ def create_nodes_csv_file(self): f"Showing first 10: {list(category_error_nodes)[:10]}.") # flush every 100K if counter % 100_000 == 0: - self.write_bulk(storage.bulk_path("nodes"), + self.write_bulk(storage.bulk_path("nodes", output_data_path), categories, categories_schema, state=state, is_relation=False) # reset variables. @@ -83,22 +76,19 @@ def create_nodes_csv_file(self): counter += 1 # write back if any thing left. if len(categories): - self.write_bulk(storage.bulk_path("nodes"), + self.write_bulk(storage.bulk_path("nodes", output_data_path), categories, categories_schema, state=state, is_relation=False) - def create_edges_csv_file(self): + def create_edges_csv_file(self, input_data_path=None, output_data_path=None): """ Write predicate data for bulk load. """ - if self.tables_up_to_date (): - log.info ("up to date.") - return # Clear out previous data - bulk_path = storage.bulk_path("edges") + bulk_path = storage.bulk_path("edges", output_data_path) if os.path.exists(bulk_path): shutil.rmtree(bulk_path) - predicates_schema = storage.read_schema(SchemaType.PREDICATE) + predicates_schema = storage.read_schema(SchemaType.PREDICATE, input_data_path) predicates = defaultdict(lambda: []) - edges_file = storage.merge_path('edges.jsonl') + edges_file = storage.merged_objects('edges', input_data_path) counter = 1 state = {} for edge in storage.json_line_iter(edges_file): @@ -106,14 +96,14 @@ def create_edges_csv_file(self): # write out every 100K , to avoid large predicate dict. 
if counter % 100_000 == 0: self.write_bulk( - storage.bulk_path("edges"),predicates, predicates_schema, + storage.bulk_path("edges", output_data_path),predicates, predicates_schema, state=state, is_relation=True) predicates = defaultdict(lambda : []) counter += 1 # if there are some items left (if loop ended before counter reached the # specified value) if len(predicates): - self.write_bulk(storage.bulk_path("edges"), predicates, + self.write_bulk(storage.bulk_path("edges", output_data_path), predicates, predicates_schema,state=state, is_relation=True) @staticmethod @@ -316,17 +306,10 @@ def write_bulk(self, bulk_path, obj_map, schema, state={}, state['processed_id'] = processed_objects_id state['called_times'] = called_x_times - def insert (self): - - redisgraph = { - 'host': os.getenv('REDIS_HOST'), - 'port': os.getenv('REDIS_PORT', '6379'), - 'password': os.getenv('REDIS_PASSWORD'), - 'graph': os.getenv('REDIS_GRAPH'), - } + def insert (self, input_data_path=None): redisgraph = self.config.redisgraph - nodes = sorted(glob.glob (storage.bulk_path ("nodes/**.csv*"))) - edges = sorted(glob.glob (storage.bulk_path ("edges/**.csv*"))) + nodes = sorted(glob.glob (storage.bulk_path ("**/nodes/**.csv*", input_data_path), recursive=True)) + edges = sorted(glob.glob (storage.bulk_path ("**/edges/**.csv*", input_data_path), recursive=True)) graph = redisgraph['graph'] log.info(f"bulk loading \n nodes: {nodes} \n edges: {edges}") @@ -340,8 +323,9 @@ def insert (self): log.info ("bulk loading graph: %s", str(graph)) args = [] if len(nodes) > 0: - bulk_path_root = storage.bulk_path('nodes') + os.path.sep + bulk_path_root = glob.glob(storage.bulk_path('**/nodes', path=input_data_path), recursive=True)[0] + os.path.sep nodes_with_type = [] + collect_labels = set() for x in nodes: """ These lines prep nodes bulk load by: @@ -350,24 +334,29 @@ def insert (self): """ file_name_type_part = x.replace(bulk_path_root, '').split('.')[0].split('~')[1] all_labels = "biolink." + file_name_type_part + ":" + ":".join([f'biolink.{v.lstrip("biolink:")}' for v in self.biolink.toolkit.get_ancestors("biolink:" + file_name_type_part, reflexive=False, formatted=True )] ) + collect_labels.add("biolink." 
+ file_name_type_part) + for v in self.biolink.toolkit.get_ancestors("biolink:" + file_name_type_part, reflexive=False, + formatted=True): + collect_labels.add(f'biolink.{v.lstrip("biolink:")}') nodes_with_type.append(f"{all_labels} {x}") args.extend(("-N " + " -N ".join(nodes_with_type)).split()) if len(edges) > 0: - bulk_path_root = storage.bulk_path('edges') + os.path.sep + bulk_path_root = glob.glob(storage.bulk_path('**/edges', path=input_data_path), recursive=True)[0] + os.path.sep edges_with_type = [f"biolink.{x.replace(bulk_path_root, '').strip(os.path.sep).split('.')[0].split('~')[1]} {x}" for x in edges] # Edge label now no longer has 'biolink:' args.extend(("-R " + " -R ".join(edges_with_type)).split()) args.extend([f"--separator={self.separator}"]) - log.debug(f"--redis-url=redis://:{redisgraph['password']}@{redisgraph['host']}:{redisgraph['port']}") args.extend([f"--redis-url=redis://:{redisgraph['password']}@{redisgraph['host']}:{redisgraph['port']}"]) args.extend(['--enforce-schema']) + for lbl in collect_labels: + args.extend([f'-i `{lbl}`:id', f'-f {lbl}:name', f'-f {lbl}:synonyms']) args.extend([f"{redisgraph['graph']}"]) """ standalone_mode=False tells click not to sys.exit() """ log.debug(f"Calling bulk_insert with extended args: {args}") try: bulk_insert(args, standalone_mode=False) - self.add_indexes() + # self.add_indexes() except Exception as e: log.error(f"Unexpected {e.__class__.__name__}: {e}") raise diff --git a/dags/roger/core/storage.py b/dags/roger/core/storage.py index 48034b07..c1a3e808 100644 --- a/dags/roger/core/storage.py +++ b/dags/roger/core/storage.py @@ -85,7 +85,7 @@ def read_object(path, key=None): elif path.endswith(".pickle"): with open(file=path, mode="rb") as stream: obj = pickle.load(stream) - elif path.endswith(".jsonl"): + elif path.endswith(".jsonl") or path.endswith('.txt'): obj = read_data(path) return obj @@ -117,11 +117,11 @@ def write_object (obj, path, key=None): yaml.dump (obj, outfile) elif path.endswith (".json"): with open (path, "w", encoding='utf-8') as stream: - stream.write(str(json.dumps (obj).decode('utf-8'))) + stream.write(str(json.dumps (obj, option=json.OPT_INDENT_2).decode('utf-8'))) elif path.endswith(".pickle"): with open (path, "wb") as stream: pickle.dump(obj, file=stream) - elif path.endswith(".jsonl"): + elif path.endswith(".jsonl") or path.endswith('.txt'): with open (path, "w", encoding="utf-8") as stream: stream.write(obj) else: @@ -152,30 +152,50 @@ def kgx_path(name): :path name: Name of the KGX object. """ return str(ROGER_DATA_DIR / "kgx" / name) -def kgx_objects(format_="json"): +def kgx_objects(format_="json", path=None): """ A list of KGX objects. """ kgx_pattern = kgx_path(f"**.{format_}") - return sorted(glob.glob (kgx_pattern)) + if path: + kgx_pattern = f"{path}/**/*.{format_}" + return sorted(glob.glob (kgx_pattern, recursive=True)) -def merge_path(name): +def merge_path(name, path: Path=None): """ Form a merged KGX object path. :path name: Name of the merged KGX object. """ - return str(ROGER_DATA_DIR / 'merge' / name) - -def merged_objects(): + if path is None: + # create output dir + if not os.path.exists(ROGER_DATA_DIR / 'merge'): + os.makedirs(ROGER_DATA_DIR / 'merge') + return str(ROGER_DATA_DIR / 'merge' / name) + return str(path.joinpath(name)) + +def merged_objects(file_type, path=None): """ A list of merged KGX objects. 
""" - merged_pattern = merge_path("**.json") - return sorted(glob.glob (merged_pattern)) + if not path: + merged_pattern = merge_path(f"**/{file_type}.jsonl") + else: + merged_pattern = merge_path(f"**/{file_type}.jsonl", path=path) + # this thing should always return one edges or nodes file (based on file_type) + try: + return sorted(glob.glob(merged_pattern, recursive=True))[0] + except IndexError: + raise ValueError(f"Could not find merged KGX of type {file_type} in {merged_pattern}") -def schema_path(name): + +def schema_path(name, path=None): """ Path to a schema object. :param name: Name of the object to get a path for. """ - return str(ROGER_DATA_DIR / 'schema' / name) + if not path: + return str(ROGER_DATA_DIR / 'schema' / name) + return str (path / 'schema' / name) -def bulk_path(name): +def bulk_path(name, path=None): """ Path to a bulk load object. :param name: Name of the object. """ - return str(ROGER_DATA_DIR / 'bulk' / name) + if not path: + return str(ROGER_DATA_DIR / 'bulk' / name) + else: + return str(path / name) def metrics_path(name): """ @@ -194,10 +214,14 @@ def dug_annotation_path(name): def dug_expanded_concepts_path(name): return str(ROGER_DATA_DIR / 'dug' / 'expanded_concepts' / name) -def dug_expanded_concept_objects(): - file_pattern = dug_expanded_concepts_path( - os.path.join('*','expanded_concepts.pickle')) - return sorted(glob.glob(file_pattern)) +def dug_expanded_concept_objects(data_path=None, format="pickle"): + "Return a list of files containing expaneded concept objects" + if data_path: + file_pattern = os.path.join(data_path, '**', f'expanded_concepts.{format}') + else: + file_pattern = dug_expanded_concepts_path( + os.path.join('*',f'expanded_concepts.{format}')) + return sorted(glob.glob(file_pattern, recursive=True)) def dug_extracted_elements_objects(): file_pattern = dug_expanded_concepts_path( @@ -212,17 +236,25 @@ def dug_kgx_objects(): dug_kgx_pattern = dug_kgx_path("**.json") return sorted(glob.glob(dug_kgx_pattern)) -def dug_concepts_objects(): +def dug_concepts_objects(data_path, format="pickle"): """ A list of dug annotation Objects. """ - concepts_file_path = dug_annotation_path( - os.path.join('*','concepts.pickle')) - return sorted(glob.glob(concepts_file_path)) + if not data_path: + concepts_file_path = dug_annotation_path( + os.path.join('*',f'concepts.{format}')) + else: + concepts_file_path = os.path.join( + data_path, '**', f'concepts.{format}') + return sorted(glob.glob(concepts_file_path, recursive=True)) -def dug_elements_objects(): +def dug_elements_objects(data_path=None, format='pickle'): """ A list of dug annotation Objects. 
""" - concepts_file_path = dug_annotation_path( - os.path.join('*', 'elements.pickle')) - return sorted(glob.glob(concepts_file_path)) + if not data_path: + concepts_file_pattern = dug_annotation_path( + os.path.join('*', f'elements.{format}')) + else: + concepts_file_pattern = os.path.join( + data_path, '**', f'elements.{format}') + return sorted(glob.glob(concepts_file_pattern, recursive=True)) def dug_input_files_path(name) -> pathlib.Path: path = ROGER_DATA_DIR / "dug" / "input_files" / name @@ -233,22 +265,13 @@ def dug_input_files_path(name) -> pathlib.Path: log.info(f"Input file path: {path} already exists") return path -def dug_topmed_path(name): - """ Topmed source files""" - return dug_input_files_path('topmed') / name - -def dug_topmed_objects(): - topmed_file_pattern = str(dug_topmed_path("topmed_*.csv")) +def dug_topmed_objects(input_data_path=None): + "Return list of TOPMed source files" + if not input_data_path: + input_data_path = str(dug_input_files_path('topmed')) + topmed_file_pattern = os.path.join(input_data_path, "topmed_*.csv") return sorted(glob.glob(topmed_file_pattern)) -def dug_nida_path(name): - """ NIDA source files""" - return dug_input_files_path('nida') / name - -def dug_sparc_path(name): - """ NIDA source files""" - return dug_input_files_path('sparc') / name - def dug_anvil_path(): """Anvil source files""" return dug_input_files_path('anvil') @@ -281,62 +304,78 @@ def dug_kfdrc_path(): """Anvil source files""" return dug_input_files_path('kfdrc') -def dug_nida_objects(): - nida_file_pattern = str(dug_nida_path("NIDA-*.xml")) +def dug_nida_objects(input_data_path=None): + "Return list of NIDA source files" + if not input_data_path: + input_data_path = str(dug_input_files_path('nida')) + nida_file_pattern = os.path.join(input_data_path, "NIDA-*.xml") return sorted(glob.glob(nida_file_pattern)) -def dug_sparc_objects(): - file_pattern = str(dug_sparc_path("scicrunch/*.xml")) +def dug_sparc_objects(input_data_path=None): + if not input_data_path: + input_data_path = str(dug_input_files_path('sparc')) + file_pattern = os.path.join(input_data_path, "scicrunch/*.xml") return sorted(glob.glob(file_pattern)) -def dug_anvil_objects(): - file_path = dug_anvil_path() +def dug_anvil_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_anvil_path() files = get_files_recursive( lambda file_name: ( not file_name.startswith('GapExchange_') and file_name.endswith('.xml')), - file_path) + input_data_path) return sorted([str(f) for f in files]) -def dug_sprint_objects(): - file_path = dug_sprint_path() +def dug_sprint_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_sprint_path() files = get_files_recursive( - lambda file_name: file_name.endswith('.xml'), file_path) + lambda file_name: file_name.endswith('.xml'), input_data_path) return sorted([str(f) for f in files]) -def dug_bacpac_objects(): - file_path = dug_bacpac_path() +def dug_bacpac_objects(input_data_path=None): + "Return list of BACPAC source files" + if not input_data_path: + input_data_path = dug_bacpac_path() files = get_files_recursive( - lambda file_name: file_name.endswith('.xml'), file_path) + lambda file_name: file_name.endswith('.xml'), input_data_path) return sorted([str(f) for f in files]) -def dug_crdc_objects(): - file_path = dug_crdc_path() +def dug_crdc_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_crdc_path() files = get_files_recursive( lambda file_name: ( not file_name.startswith('GapExchange_') and 
file_name.endswith('.xml')), - file_path) + input_data_path) return sorted([str(f) for f in files]) -def dug_heal_study_objects(): - file_path = dug_heal_study_path() - files = get_files_recursive(lambda file_name : file_name.endswith('.xml'), file_path) +def dug_heal_study_objects(input_data_path=None): + "Return list of HEAL study source files" + if not input_data_path: + input_data_path = dug_heal_study_path() + files = get_files_recursive(lambda file_name : file_name.endswith('.xml'), + input_data_path) return sorted([str(f) for f in files]) -def dug_heal_research_program_objects(): - file_path = dug_heal_research_program_path() - files = get_files_recursive(lambda file_name : file_name.endswith('.xml'), file_path) +def dug_heal_research_program_objects(input_data_path=None): + "Return list of HEAL research program source files" + if not input_data_path: + input_data_path = dug_heal_research_program_path() + files = get_files_recursive(lambda file_name : file_name.endswith('.xml'), + input_data_path) return sorted([str(f) for f in files]) - -def dug_kfdrc_objects(): - file_path = dug_kfdrc_path() +def dug_kfdrc_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_kfdrc_path() files = get_files_recursive( lambda file_name: ( not file_name.startswith('GapExchange_') and file_name.endswith('.xml')), - file_path) + input_data_path) return sorted([str(f) for f in files]) @@ -356,23 +395,26 @@ def get_files_recursive(file_name_filter, current_dir): file_paths += [child] return file_paths -def dug_dd_xml_objects(): - file_path = dug_dd_xml_path() +def dug_dd_xml_objects(input_data_path=None): + if not input_data_path: + input_data_path = dug_dd_xml_path() files = get_files_recursive( lambda file_name: ( not file_name.startswith('._') and file_name.endswith('.xml')), - file_path) + input_data_path) return sorted([str(f) for f in files]) def copy_file_to_dir(file_location, dir_name): return shutil.copy(file_location, dir_name) -def read_schema (schema_type: SchemaType): +def read_schema (schema_type: SchemaType, path=None): """ Read a schema object. :param schema_type: Schema type of the object to read. """ - path = schema_path (f"{schema_type.value}-schema.json") - return read_object (path) + if path is not None: + path = path / '**' + location = glob.glob(schema_path (f"{schema_type.value}-schema.json", path=path), recursive=True)[0] + return read_object (location) def get_uri (path, key): """ Build a URI. @@ -392,17 +434,7 @@ def read_relative_object (path): def trunc(text, limit): return ('..' + text[-limit-2:]) if len(text) > limit else text -def is_up_to_date (source, targets): - target_time_list = [ - os.stat (f).st_mtime for f in targets if os.path.exists(f)] - if len(target_time_list) == 0: - log.debug (f"no targets found") - return False - source = [ os.stat (f).st_mtime for f in source if os.path.exists (f) ] - if len(source) == 0: - log.debug ("no source found. 
up to date") - return True - return max(source) < min(target_time_list) + def json_line_iter(jsonl_file_path): f = open(file=jsonl_file_path, mode='r', encoding='utf-8') diff --git a/dags/roger/models/kgx.py b/dags/roger/models/kgx.py index a8976a00..e80e65db 100644 --- a/dags/roger/models/kgx.py +++ b/dags/roger/models/kgx.py @@ -8,9 +8,8 @@ from collections import defaultdict from xxhash import xxh64_hexdigest import orjson as json -import redis import ntpath -from kg_utils.merging import GraphMerger, MemoryGraphMerger, DiskGraphMerger +from kg_utils.merging import DiskGraphMerger from kg_utils.constants import * from roger.config import get_default_config @@ -43,18 +42,11 @@ def __init__(self, biolink=None, config=None): self.merger = DiskGraphMerger(temp_directory=self.temp_directory, chunk_size=5_000_000) self.biolink_version = self.config.kgx.biolink_model_version - self.merge_db_id = self.config.kgx.merge_db_id - self.merge_db_name = f'db{self.merge_db_id}' log.debug(f"Trying to get biolink version : {self.biolink_version}") if biolink is None: self.biolink = BiolinkModel(self.biolink_version) else: self.biolink = biolink - self.redis_conn = redis.Redis( - host=self.config.redisgraph.host, - port=self.config.redisgraph.port, - password=self.config.redisgraph.password, - db=self.merge_db_id) self.enable_metrics = self.config.get('enable_metrics', False) def get_kgx_json_format(self, files: list, dataset_version: str): @@ -301,7 +293,7 @@ def fetch_dug_kgx(self): log.info("Done copying dug KGX files.") return all_kgx_files - def create_nodes_schema(self): + def create_nodes_schema(self, input_data_path=None, output_data_path=None): """ Extracts schema for nodes based on biolink leaf types :return: @@ -309,7 +301,7 @@ def create_nodes_schema(self): category_schemas = defaultdict(lambda: None) category_error_nodes = set() - merged_nodes_file = storage.merge_path("nodes.jsonl") + merged_nodes_file = storage.merged_objects("nodes", input_data_path) log.info(f"Processing : {merged_nodes_file}") counter = 0 for node in storage.json_line_iter(merged_nodes_file): @@ -356,15 +348,15 @@ def create_nodes_schema(self): f"These will be treated as {BiolinkModel.root_type}.") # Write node schemas. - self.write_schema(category_schemas, SchemaType.CATEGORY) + self.write_schema(category_schemas, SchemaType.CATEGORY, output_path=output_data_path) - def create_edges_schema(self): + def create_edges_schema(self, input_data_path=None, output_data_path=None): """ Create unified schema for all edges in an edges jsonl file. :return: """ predicate_schemas = defaultdict(lambda: None) - merged_edges_file = storage.merge_path("edges.jsonl") + merged_edges_file = storage.merged_objects("edges", input_data_path) """ Infer predicate schemas. """ for edge in storage.json_line_iter(merged_edges_file): predicate = edge['predicate'] @@ -378,7 +370,7 @@ def create_edges_schema(self): previous_type = predicate_schemas[predicate][k] predicate_schemas[predicate][k] = compare_types( previous_type, current_type) - self.write_schema(predicate_schemas, SchemaType.PREDICATE) + self.write_schema(predicate_schemas, SchemaType.PREDICATE, output_path=output_data_path) def create_schema (self): """Determine the schema of each type of object. @@ -403,31 +395,40 @@ def schema_up_to_date (self): f"{SchemaType.PREDICATE.value}-schema.json") ]) - def write_schema(self, schema, schema_type: SchemaType): + def write_schema(self, schema, schema_type: SchemaType ,output_path=None): """ Output the schema file. 
:param schema: Schema to get keys from. :param schema_type: Type of schema to write. """ - file_name = storage.schema_path (f"{schema_type.value}-schema.json") + file_name = storage.schema_path (f"{schema_type.value}-schema.json", output_path) log.info("writing schema: %s", file_name) dictionary = { k : v for k, v in schema.items () } storage.write_object (dictionary, file_name) - def merge(self): + def merge(self, input_path=None, output_path=None): """ This version uses the disk merging from the kg_utils module """ - data_set_version = self.config.get('kgx', {}).get('dataset_version') + metrics = {} start = time.time() - json_format_files = storage.kgx_objects("json") - jsonl_format_files = storage.kgx_objects("jsonl") + + log.info(f"Input path = {input_path}, Output path = {output_path}") + + if input_path: + json_format_files = storage.kgx_objects("json", input_path) + jsonl_format_files = storage.kgx_objects("jsonl", input_path) + else: + json_format_files = storage.kgx_objects("json") + jsonl_format_files = storage.kgx_objects("jsonl") # Create lists of the nodes and edges files in both json and jsonl # formats jsonl_node_files = {file for file in jsonl_format_files - if "node" in file} + if "node" in file.split('/')[-1]} jsonl_edge_files = {file for file in jsonl_format_files - if "edge" in file} + if "edge" in file.split('/')[-1]} + log.info(f"Jsonl edge files : {jsonl_edge_files}") + log.info(f"Jsonl node files : {jsonl_node_files}") # Create all the needed iterators and sets thereof jsonl_node_iterators = [storage.jsonl_iter(file_name) @@ -449,13 +450,16 @@ def merge(self): self.merger.merge_nodes(node_iterators) merged_nodes = self.merger.get_merged_nodes_jsonl() + self.merger.merge_edges(edge_iterators) merged_edges = self.merger.get_merged_edges_jsonl() write_merge_metric = {} t = time.time() start_nodes_jsonl = time.time() - nodes_file_path = storage.merge_path("nodes.jsonl") + + + nodes_file_path = storage.merge_path("nodes.jsonl", output_path) # stream out nodes to nodes.jsonl file with open(nodes_file_path, 'w') as stream: @@ -468,7 +472,7 @@ def merge(self): start_edge_jsonl = time.time() # stream out edges to edges.jsonl file - edges_file_path = storage.merge_path("edges.jsonl") + edges_file_path = storage.merge_path("edges.jsonl", output_path) with open(edges_file_path, 'w') as stream: for edges in merged_edges: edges = json.loads(edges) @@ -496,3 +500,4 @@ def merge(self): if self.enable_metrics: metricsfile_path = storage.metrics_path('merge_metrics.yaml') storage.write_object(metrics, metricsfile_path) + diff --git a/dags/roger/pipelines/README.md b/dags/roger/pipelines/README.md new file mode 100644 index 00000000..e77e6a29 --- /dev/null +++ b/dags/roger/pipelines/README.md @@ -0,0 +1,99 @@ +# Building custom Dug data pipelines + +The pipelines submodule is where data pipelines can be defined for specific data +sets with specific, custom behaviors for each one. In previous versions of the +code, customizations for each pipeline were spread across several modules. With +this instantiation, the customizations for each data set pipeline are +consolidated into a single overridden subclass of the DataPipeline class. + +## What the base pipeline does + +The function `roger.tasks.create_pipeline_taskgroup`, when called with the given +data pipeline class, will emit an Airflow task group with the following +structure. 
If Airflow is not being used, another executor should use a similarly
+structured set of calls and dependencies to ensure that the task pipeline
+executes fully and in order.
+
+```mermaid
+graph TD;
+    annotate-->index_variables;
+    annotate-->validate_index_variables;
+    index_variables-->validate_index_variables;
+    annotate-->make_kg_tagged;
+    annotate-->crawl_tranql;
+    annotate-->index_concepts;
+    crawl_tranql-->validate_index_concepts;
+    index_concepts-->validate_index_concepts;
+    annotate-->validate_index_concepts;
+```
+The pipeline steps are briefly described below.
+
+### annotate
+
+By default, `annotate` will call the `get_objects` method to collect a list of
+parsable files. For each of these files, a Dug Crawler object will be created
+which will apply the parser returned by the pipeline class's `get_parser_name`
+method. (This by default will return `parser_name` if it's defined, or will fall
+back to `pipeline_name`.) The results will be written to `elements.json` and
+`concepts.json` as appropriate.
+
+### index_variables
+
+This will load the `elements.json` files from `annotate` and pass them to the
+indexer built from a DugFactory object. (This is sending them to ElasticSearch
+for indexing under the hood.)
+
+### make_kg_tagged
+
+All `elements.json` files will be loaded, and based on the annotations, a
+Translator-compliant knowledge graph will be written to a `_kgx.json` file.
+
+### index_concepts
+
+The `concepts.json` files are read and submitted to ElasticSearch using the
+indexer object derived from the embedded DugFactory object.
+
+### validate_index_concepts
+
+Concepts from `concepts.json` are double-checked to ensure that the ES indexing
+process actually worked.
+
+## Defining a basic pipeline, with no customizations
+
+Simple pipelines, such as that for the BACPAC dataset, need very little
+customization. All pipelines must define a `pipeline_name`, which will be used
+as the default value for a number of other parameters if they are not
+defined. In the case of BACPAC, a difference in case means that both the
+`pipeline_name` and the `parser_name` need to be defined.
+
+```python
+from roger.pipelines import DugPipeline
+
+class BacPacPipeline(DugPipeline):
+    "Pipeline for BACPAC data set"
+    pipeline_name = "bacpac"
+    parser_name = "BACPAC"
+```
+
+This is the full extent of the code needed to adapt the DugPipeline object to
+BACPAC. Other data sets have more specific customizations that need more custom
+code or variables defined.
+
+## More extensive customization
+
+Because the base pipeline (defined in `roger/pipelines/base.py:DugPipeline`) is
+inherited as a subclass for customizing, effectively any part of the pipeline
+that isn't part of Dug proper can be overridden. Here are some common
+customizations that are expected to be necessary for many parts of the process:
+
+### get_objects
+
+The `get_objects` method by default looks in the `input_data_path` that is
+passed to it, and if that is None, loads the default from the `ROGER_DATA_DIR`
+environment variable. By default, it reads all files with the `.xml` extension
+recursively anywhere in that directory or its subdirectories.
+
+One example customization is the anvil data pipeline, which additionally
+excludes any file that starts with 'GapExchange_'. Any overridden method should
+accept an optional `input_data_path` parameter and return a list of files,
+sorted in the order that they should be processed.
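For illustration, here is a minimal sketch of such an override. It closely follows the `dags/roger/pipelines/anvil.py` module added in this change, except that it looks up the files directory through the base class's `get_files_dir()` accessor rather than assuming a `files_dir` attribute has been set:

```python
from roger.core import storage
from roger.pipelines import DugPipeline


class AnvilPipeline(DugPipeline):
    "Pipeline for the AnVIL data set"
    pipeline_name = 'anvil'
    parser_name = 'Anvil'

    def get_objects(self, input_data_path=None):
        # Fall back to this data set's default input directory when the task
        # does not pass an explicit path.
        if not input_data_path:
            input_data_path = storage.dug_input_files_path(
                self.get_files_dir())
        # Keep only .xml files and skip the GapExchange_ release metadata.
        files = storage.get_files_recursive(
            lambda file_name: (
                not file_name.startswith('GapExchange_')
                and file_name.endswith('.xml')),
            input_data_path)
        # Return plain string paths in a stable, sorted order.
        return sorted([str(f) for f in files])
```

Other data sets can reuse this shape, swapping in their own filename predicate while keeping the optional `input_data_path` parameter and the sorted list of string paths as the return value.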
diff --git a/dags/roger/pipelines/__init__.py b/dags/roger/pipelines/__init__.py new file mode 100644 index 00000000..d6664b8e --- /dev/null +++ b/dags/roger/pipelines/__init__.py @@ -0,0 +1,28 @@ +"Modules for individual datasets" + +import pkgutil +from pathlib import Path +import importlib + +from .base import DugPipeline + +def get_pipeline_classes(pipeline_names_dict): + """Return a list of all defined pipeline classes + """ + + base_path = Path(__file__).resolve().parent + + for (_, mod_name, _) in pkgutil.iter_modules([base_path]): + if mod_name == 'base': + continue + + # No need to actuall get the module symbol, once it's imported, it will + # show up below in __subclasses__. + importlib.import_module(f"{__name__}.{mod_name}") + pipeline_list = [] + + for subclass in DugPipeline.__subclasses__(): + if getattr(subclass, 'pipeline_name') and getattr(subclass, 'pipeline_name') in pipeline_names_dict.keys(): + subclass.input_version = pipeline_names_dict[getattr(subclass, 'pipeline_name')] + pipeline_list.append(subclass) + return pipeline_list diff --git a/dags/roger/pipelines/anvil.py b/dags/roger/pipelines/anvil.py new file mode 100644 index 00000000..baa82c05 --- /dev/null +++ b/dags/roger/pipelines/anvil.py @@ -0,0 +1,24 @@ +"Pipeline for anvil data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class AnvilPipeline(DugPipeline): + "Pipeline for Anvil data set" + pipeline_name = 'anvil' + parser_name = 'Anvil' + + def get_objects(self, input_data_path=None): + """Retrieve anvil objects + + This code is imported from roger.core.storage.dug_anvil_objects + """ + if not input_data_path: + input_data_path = storage.dug_input_files_path( + self.files_dir) + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/bacpac.py b/dags/roger/pipelines/bacpac.py new file mode 100644 index 00000000..495ba3b9 --- /dev/null +++ b/dags/roger/pipelines/bacpac.py @@ -0,0 +1,8 @@ +"Pipeline for BACPAC data" + +from roger.pipelines import DugPipeline + +class BacPacPipeline(DugPipeline): + "Pipeline for BACPAC data set" + pipeline_name = "bacpac" + parser_name = "BACPAC" diff --git a/dags/roger/pipelines/base.py b/dags/roger/pipelines/base.py new file mode 100644 index 00000000..2a1b605e --- /dev/null +++ b/dags/roger/pipelines/base.py @@ -0,0 +1,964 @@ +"Base class for implementing a dataset annotate, crawl, and index pipeline" + +import os +import asyncio +from io import StringIO +import logging +import re +import hashlib +import traceback +from functools import reduce +from pathlib import Path +import tarfile +from typing import Union +import jsonpickle + +import requests + +from dug.core import get_parser, get_annotator, get_plugin_manager, DugConcept +from dug.core.concept_expander import ConceptExpander +from dug.core.crawler import Crawler +from dug.core.factory import DugFactory +from dug.core.parsers import Parser, DugElement +from dug.core.annotators import Annotator +from dug.core.async_search import Search +from dug.core.index import Index + +from roger.config import RogerConfig +from roger.core import storage +from roger.models.biolink import BiolinkModel +from roger.logger import get_logger + +from utils.s3_utils import S3Utils + +log = get_logger() + +class PipelineException(Exception): + "Exception raised from DugPipeline and related classes" + +def make_edge(subj, + obj, + 
predicate='biolink:related_to', + predicate_label='related to', + relation='biolink:related_to', + relation_label='related to' + ): + """Create an edge between two nodes. + + :param subj: The identifier of the subject. + :param pred: The predicate linking the subject and object. + :param obj: The object of the relation. + :param predicate: Biolink compatible edge type. + :param predicate_label: Edge label. + :param relation: Ontological edge type. + :param relation_label: Ontological edge type label. + :returns: Returns and edge. + """ + edge_id = hashlib.md5( + f'{subj}{predicate}{obj}'.encode('utf-8')).hexdigest() + return { + "subject": subj, + "predicate": predicate, + "predicate_label": predicate_label, + "id": edge_id, + "relation": relation, + "relation_label": relation_label, + "object": obj, + "provided_by": "renci.bdc.semanticsearch.annotator" + } + +class FileFetcher: + """A basic remote file fetcher class + """ + + def __init__( + self, + remote_host: str, + remote_dir: Union[str, Path], + local_dir: Union[str, Path] = "." + ): + self.remote_host = remote_host + if isinstance(remote_dir, str): + self.remote_dir = remote_dir.rstrip("/") + else: + self.remote_dir = str(remote_dir.as_posix()) + self.local_dir = Path(local_dir).resolve() + + def __call__(self, remote_file_path: Union[str, Path]) -> Path: + remote_path = self.remote_dir + "/" + remote_file_path + local_path = self.local_dir / remote_file_path + url = f"{self.remote_host}{remote_path}" + log.debug("Fetching %s", url) + try: + response = requests.get(url, allow_redirects=True, timeout=60) + except Exception as e: + log.error("Unexpected %s: %s", e.__class__.__name__, str(e)) + raise RuntimeError(f"Unable to fetch {url}") from e + + log.debug("Response: %d", response.status_code) + if response.status_code != 200: + log.debug("Unable to fetch %s: %d", url, response.status_code) + raise RuntimeError(f"Unable to fetch {url}") + + with local_path.open('wb') as file_obj: + file_obj.write(response.content) + return local_path + +class DugPipeline(): + "Base class for dataset pipelines" + + pipeline_name = None + unzip_source = True + input_version = "" + + def __init__(self, config: RogerConfig, to_string=False): + "Set instance variables and check to make sure we're overriden" + if not self.pipeline_name: + raise PipelineException( + "Subclass must at least define pipeline_name as class var") + self.config = config + self.bl_toolkit = BiolinkModel() + dug_conf = config.to_dug_conf() + self.element_mapping = config.indexing.element_mapping + self.factory = DugFactory(dug_conf) + self.cached_session = self.factory.build_http_session() + self.event_loop = asyncio.new_event_loop() + self.log_stream = StringIO() + if to_string: + self.string_handler = logging.StreamHandler(self.log_stream) + log.addHandler(self.string_handler) + self.s3_utils = S3Utils(self.config.s3_config) + + self.tranqlizer: ConceptExpander = self.factory.build_tranqlizer() + + graph_name = self.config["redisgraph"]["graph"] + source = f"redis:{graph_name}" + self.tranql_queries: dict = self.factory.build_tranql_queries(source) + self.node_to_element_queries: list = ( + self.factory.build_element_extraction_parameters(source)) + + indexing_config = config.indexing + self.variables_index = indexing_config.get('variables_index') + self.concepts_index = indexing_config.get('concepts_index') + self.kg_index = indexing_config.get('kg_index') + + self.search_obj: Search = self.factory.build_search_obj([ + self.variables_index, + self.concepts_index, + 
self.kg_index, + ]) + self.index_obj: Index = self.factory.build_indexer_obj([ + self.variables_index, + self.concepts_index, + self.kg_index, + + ]) + + def __enter__(self): + self.event_loop = asyncio.new_event_loop() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # close elastic search connection + self.event_loop.run_until_complete(self.search_obj.es.close()) + # close async loop + if self.event_loop.is_running() and not self.event_loop.is_closed(): + self.event_loop.close() + if exc_type or exc_val or exc_tb: + traceback.print_exc() + log.error("%s %s %s", exc_val, exc_val, exc_tb) + log.exception("Got an exception") + + def get_data_format(self): + """Access method for data_format parameter + + Defaults to pipeline_name unless self.data_format is set. This method + can also be overriden + """ + return getattr(self, 'data_format', self.pipeline_name) + + def get_files_dir(self): + """Access method for files_dir parameter + + Defaults to pipeline_name unless self.files_dir is set. This method can + also be overriden. + """ + return getattr(self, 'files_dir', self.pipeline_name) + + def get_parser_name(self): + """Access method for parser_name + + Defaults to pipeline_name unless self.parser_name is set. This method + can also be overriden. + """ + return getattr(self, 'parser_name', self.pipeline_name) + + def get_annotator_name(self): + """ + Access method for annotator_name + Defaults to annotator_monarch unless specified using annotation.annotator_type in the configuration file. + """ + return self.config.annotation.annotator_type + + + def get_parser(self): + dug_plugin_manager = get_plugin_manager() + parser: Parser = get_parser(dug_plugin_manager.hook, + self.get_parser_name()) + return parser + + def get_annotator(self): + dug_plugin_manager = get_plugin_manager() + annotator: Annotator = get_annotator( + dug_plugin_manager.hook, + self.get_annotator_name(), + self.config.to_dug_conf() + ) + return annotator + + def annotate_files(self, parsable_files, output_data_path=None): + """ + Annotates a Data element file using a Dug parser. + :param parser_name: Name of Dug parser to use. + :param parsable_files: Files to parse. + :return: None. + """ + if not output_data_path: + output_data_path = storage.dug_annotation_path('') + log.info("Parsing files") + for _, parse_file in enumerate(parsable_files): + log.debug("Creating Dug Crawler object on parse_file %s at %d of %d", + parse_file, _ , len(parsable_files)) + parser = self.get_parser() + annotator = self.get_annotator() + crawler = Crawler( + crawl_file=parse_file, + parser=parser, + annotator=annotator, + tranqlizer='', + tranql_queries=[], + http_session=self.cached_session + ) + + # configure output space. + current_file_name = '.'.join( + os.path.basename(parse_file).split('.')[:-1]) + elements_file_path = os.path.join( + output_data_path, current_file_name) + elements_file = os.path.join(elements_file_path, 'elements.txt') + concepts_file = os.path.join(elements_file_path, 'concepts.txt') + # This is a file that the crawler will later populate. We start here + # by creating an empty elements file. + # This also creates output dir if it doesn't exist. + # elements_json = os.path.join(elements_file_path, + # 'element_file.json') + # log.debug("Creating empty file: %s", elements_json) + # storage.write_object({}, elements_json) + + # Use the specified parser to parse the parse_file into elements. 
+ log.debug("Parser is %s", str(parser)) + elements = parser(parse_file) + log.debug("Parsed elements: %s", str(elements)) + + # This inserts the list of elements into the crawler where + # annotate_elements expects to find it. Maybe in some future version + # of Dug this could be a parameter instead of an attribute? + crawler.elements = elements + + # @TODO propose for Dug to make this a crawler class init param(??) + crawler.crawlspace = elements_file_path + log.debug("Crawler annotator: %s", str(crawler.annotator)) + crawler.annotate_elements() + + # Extract out the concepts gotten out of annotation + # Extract out the elements + non_expanded_concepts = crawler.concepts + # The elements object will have been modified by annotate_elements, + # so we want to make sure to catch those modifications. + elements = crawler.elements + + # Write pickles of objects to file + log.info("Parsed and annotated: %s", parse_file) + + storage.write_object(jsonpickle.encode(elements, indent=2), elements_file) + log.info("Serialized annotated elements to : %s", elements_file) + + storage.write_object(jsonpickle.encode(non_expanded_concepts, indent=2), concepts_file) + log.info("Serialized annotated concepts to : %s", concepts_file) + + def convert_to_kgx_json(self, elements, written_nodes=None): + """ + Given an annotated and normalized set of study variables, + generate a KGX compliant graph given the normalized annotations. + Write that grpah to a graph database. + See BioLink Model for category descriptions. + https://biolink.github.io/biolink-model/notes.html + """ + if written_nodes is None: + written_nodes = set() + graph = { + "nodes": [], + "edges": [] + } + edges = graph['edges'] + nodes = graph['nodes'] + + for _, element in enumerate(elements): + # DugElement means a variable (Study variable...) + if not isinstance(element, DugElement): + continue + study_id = element.collection_id + if study_id not in written_nodes: + nodes.append({ + "id": study_id, + "category": ["biolink:Study"], + "name": study_id + }) + written_nodes.add(study_id) + + # connect the study and the variable. + edges.append(make_edge( + subj=element.id, + relation_label='part of', + relation='BFO:0000050', + obj=study_id, + predicate='biolink:part_of', + predicate_label='part of')) + edges.append(make_edge( + subj=study_id, + relation_label='has part', + relation="BFO:0000051", + obj=element.id, + predicate='biolink:has_part', + predicate_label='has part')) + + # a node for the variable. Should be BL compatible + variable_node = { + "id": element.id, + "name": element.name, + "category": ["biolink:StudyVariable"], + # bulk loader parsing issue + "description": ( + element.description.replace("'", '`').replace('\n', ' ')) + } + if element.id not in written_nodes: + nodes.append(variable_node) + written_nodes.add(element.id) + + for identifier, metadata in element.concepts.items(): + identifier_object = metadata.identifiers.get(identifier) + # This logic is treating DBGap files. + # First item in current DBGap xml files is a topmed tag, + # This is treated as a DugConcept Object. But since its not + # a concept we get from annotation (?) its never added to + # variable.concepts.items (Where variable is a DugElement obj) + # The following logic is trying to extract types, and for the + # aformentioned topmed tag it adds + # `biolink:InfomrmationContentEntity` + # Maybe a better solution could be adding types on + # DugConcept objects + # More specifically Biolink compatible types (?) 
+ # + if identifier_object: + category = identifier_object.types + elif identifier.startswith("TOPMED.TAG:"): + category = ["biolink:InformationContentEntity"] + else: + continue + if identifier not in written_nodes: + if isinstance(category, str): + bl_element = self.bl_toolkit.toolkit.get_element( + category) + category = [bl_element.class_uri or bl_element.slot_uri] + nodes.append({ + "id": identifier, + "category": category, + "name": metadata.name + }) + written_nodes.add(identifier) + # related to edge + edges.append(make_edge( + subj=element.id, + obj=identifier + )) + # related to edge + edges.append(make_edge( + subj=identifier, + obj=element.id)) + return graph + + def make_tagged_kg(self, elements): + """ Make a Translator standard knowledge graph representing + tagged study variables. + :param variables: The variables to model. + :param tags: The tags characterizing the variables. + :returns: dict with nodes and edges modeling a Translator/Biolink KG. + """ + graph = { + "nodes": [], + "edges": [] + } + edges = graph['edges'] + nodes = graph['nodes'] + + # Create graph elements to model tags and their + # links to identifiers gathered by semantic tagging + tag_map = {} + # @TODO extract this into config or maybe dug ?? + topmed_tag_concept_type = "TOPMed Phenotype Concept" + nodes_written = set() + for tag in elements: + if not (isinstance(tag, DugConcept) + and tag.type == topmed_tag_concept_type): + continue + tag_id = tag.id + tag_map[tag_id] = tag + nodes.append({ + "id": tag_id, + "name": tag.name, + "description": tag.description.replace("'", "`"), + "category": ["biolink:InformationContentEntity"] + }) + + # Link ontology identifiers we've found for this tag via nlp. + for identifier, metadata in tag.identifiers.items(): + if isinstance(metadata.types, str): + bl_element = self.bl_toolkit.toolkit.get_element( + metadata.types) + category = [bl_element.class_uri or bl_element.slot_uri] + else: + category = metadata.types + synonyms = metadata.synonyms if metadata.synonyms else [] + nodes.append({ + "id": identifier, + "name": metadata.label, + "category": category, + "synonyms": synonyms + }) + nodes_written.add(identifier) + edges.append(make_edge( + subj=tag_id, + obj=identifier)) + edges.append(make_edge( + subj=identifier, + obj=tag_id)) + + concepts_graph = self.convert_to_kgx_json(elements, + written_nodes=nodes_written) + graph['nodes'] += concepts_graph['nodes'] + graph['edges'] += concepts_graph['edges'] + + return graph + + def index_elements(self, elements_file): + "Submit elements_file to ElasticSearch for indexing " + log.info("Indexing %s...", str(elements_file)) + elements =jsonpickle.decode(storage.read_object(elements_file)) + count = 0 + total = len(elements) + # Index Annotated Elements + log.info("found %d from elements files.", len(elements)) + for element in elements: + count += 1 + # Only index DugElements as concepts will be + # indexed differently in next step + if not isinstance(element, DugConcept): + # override data-type with mapping values + if element.type.lower() in self.element_mapping: + element.type = self.element_mapping[element.type.lower()] + if not element.id: + # no id no indexing + continue + # Use the Dug Index object to submit the element to ES + self.index_obj.index_element( + element, index=self.variables_index) + percent_complete = (count / total) * 100 + if percent_complete % 10 == 0: + log.info("%d %%", percent_complete) + log.info("Done indexing %s.", elements_file) + + def validate_indexed_element_file(self, elements_file): 
+ "After submitting elements for indexing, verify that they're available" + elements = [x for x in jsonpickle.decode(storage.read_object(elements_file)) + if not isinstance(x, DugConcept)] + # Pick ~ 10 % + sample_size = int(len(elements) * 0.1) + + # random.choices(elements, k=sample_size) + test_elements = elements[:sample_size] + log.info("Picked %d from %s for validation.", len(test_elements), + elements_file) + for element in test_elements: + # Pick a concept + concepts = [element.concepts[curie] for curie in element.concepts + if element.concepts[curie].name] + + if len(concepts): + # Pick the first concept + concept = concepts[0] + curie = concept.id + search_term = re.sub(r'[^a-zA-Z0-9_\ ]+', '', concept.name) + log.debug("Searching for Concept: %s and Search term: %s", + str(curie), search_term) + all_elements_ids = self._search_elements(curie, search_term) + present = element.id in all_elements_ids + if not present: + log.error("Did not find expected variable %s in search " + "result.", str(element.id)) + log.error("Concept id : %s, Search term: %s", + str(concept.id), search_term) + raise PipelineException( + f"Validation exception - did not find variable " + f"{element.id} from {str(elements_file)}" + f"when searching variable index with Concept ID : " + f"{concept.id} using Search Term : {search_term} ") + else: + log.info( + "%s has no concepts annotated. Skipping validation for it.", + str(element.id)) + + def _search_elements(self, curie, search_term): + "Asynchronously call a search on the curie and search term" + response = self.event_loop.run_until_complete(self.search_obj.search_vars_unscored( + concept=curie, + query=search_term + )) + ids_dict = [] + if 'total_items' in response: + if response['total_items'] == 0: + log.error(f"No search elements returned for variable search: {self.variables_index}.") + log.error(f"Concept id : {curie}, Search term: {search_term}") + raise Exception(f"Validation error - Did not find {curie} for" + f"Search term: {search_term}") + else: + del response['total_items'] + for element_type in response: + all_elements_ids = [e['id'] for e in + reduce(lambda x, y: x + y['elements'], response[element_type], [])] + ids_dict += all_elements_ids + return ids_dict + + def crawl_concepts(self, concepts, data_set_name, output_path=None): + """Adds tranql KG to Concepts + + Terms grabbed from KG are also added as search terms + :param concepts: + :param data_set_name: + :return: + """ + # TODO crawl dir seems to be storaing crawling info to avoid re-crawling, but is that consting us much? , it was when tranql was slow, but + # might right to consider getting rid of it. 
+ crawl_dir = storage.dug_crawl_path('crawl_output') + output_file_name = os.path.join(data_set_name, + 'expanded_concepts.txt') + extracted_dug_elements_file_name = os.path.join(data_set_name, + 'extracted_graph_elements.txt') + if not output_path: + output_file = storage.dug_expanded_concepts_path(output_file_name) + extracted_output_file = storage.dug_expanded_concepts_path( + extracted_dug_elements_file_name + ) + else: + output_file = os.path.join(output_path, output_file_name) + extracted_output_file = os.path.join( output_path, extracted_dug_elements_file_name) + + Path(crawl_dir).mkdir(parents=True, exist_ok=True) + extracted_dug_elements = [] + log.debug("Creating Dug Crawler object") + crawler = Crawler( + crawl_file="", + parser=None, + annotator=None, + tranqlizer=self.tranqlizer, + tranql_queries=self.tranql_queries, + http_session=self.cached_session, + ) + crawler.crawlspace = crawl_dir + counter = 0 + total = len(concepts) + for concept in concepts.values(): + counter += 1 + try: + crawler.expand_concept(concept) + concept.set_search_terms() + concept.set_optional_terms() + except Exception as e: + log.error(concept) + raise e + for query in self.node_to_element_queries: + log.info(query) + casting_config = query['casting_config'] + tranql_source = query['tranql_source'] + dug_element_type = query['output_dug_type'] + extracted_dug_elements += crawler.expand_to_dug_element( + concept=concept, + casting_config=casting_config, + dug_element_type=dug_element_type, + tranql_source=tranql_source + ) + concept.clean() + percent_complete = int((counter / total) * 100) + if percent_complete % 10 == 0: + log.info("%d%%", percent_complete) + log.info("Crawling %s done", data_set_name) + storage.write_object(obj=jsonpickle.encode(concepts, indent=2), path=output_file) + log.info ("Concepts serialized to %s", output_file) + storage.write_object(obj=jsonpickle.encode(extracted_dug_elements, indent=2), + path=extracted_output_file) + log.info("Extracted elements serialized to %s", extracted_output_file) + + def _index_concepts(self, concepts): + "Submit concepts to ElasticSearch for indexing" + log.info("Indexing Concepts") + total = len(concepts) + count = 0 + for concept_id, concept in concepts.items(): + count += 1 + self.index_obj.index_concept(concept, index=self.concepts_index) + # Index knowledge graph answers for each concept + for kg_answer_id, kg_answer in concept.kg_answers.items(): + self.index_obj.index_kg_answer( + concept_id=concept_id, + kg_answer=kg_answer, + index=self.kg_index, + id_suffix=kg_answer_id + ) + percent_complete = int((count / total) * 100) + if percent_complete % 10 == 0: + log.info("%s %%", percent_complete) + log.info("Done Indexing concepts") + + def _validate_indexed_concepts(self, elements, concepts): + """ + Validates linked concepts are searchable + :param elements: Annotated dug elements + :param concepts: Crawled (expanded) concepts + :return: + """ + # 1 . Find concepts with KG <= 10% of all concepts, + # <= because we might have no results for some concepts from tranql + sample_concepts = {key: value for key, value + in concepts.items() if value.kg_answers} + if len(concepts) == 0: + log.info("No Concepts found.") + return + log.info("Found only %d Concepts with Knowledge graph out of %d. %d%%", + len(sample_concepts), len(concepts), + (len(sample_concepts) / len(concepts)) * 100) + # 2. 
pick elements that have concepts in the sample concepts set + sample_elements = {} + for element in elements: + if isinstance(element, DugConcept): + continue + for concept in element.concepts: + # add elements that have kg + if concept in sample_concepts: + sample_elements[concept] = sample_elements.get( + concept, set()) + sample_elements[concept].add(element.id) + + # Time for some validation + for curie in concepts: + concept = concepts[curie] + if not concept.kg_answers: + continue + search_terms = [] + for key in concept.kg_answers: + kg_object = concept.kg_answers[key] + search_terms += kg_object.get_node_names() + search_terms += kg_object.get_node_synonyms() + # reduce(lambda x,y: x + y, [[node.get("name")] + # + node.get("synonyms", []) + # for node in concept.kg_answers[ + # "knowledge_graph"]["nodes"]], []) + # validation here is that for any of these nodes we should get back + # the variable. + # make unique + search_terms_cap = 10 + search_terms = list(set(search_terms))[:search_terms_cap] + log.debug("Using %d Search terms for concept %s", len(search_terms), + str(curie)) + for search_term in search_terms: + # avoids elastic failure due to some reserved characters + # 'search_phase_execution_exception', + # 'token_mgr_error: Lexical error ... + search_term = re.sub(r'[^a-zA-Z0-9_\ ]+', '', search_term) + + searched_element_ids = self._search_elements(curie, search_term) + + if curie not in sample_elements: + log.error("Did not find Curie id %s in Elements.", + str(curie)) + log.error("Concept id : %s, Search term: %s", + str(concept.id), search_term) + raise PipelineException( + f"Validation error - Did not find {curie} for " + f"Concept id : {concept.id}, " + f"Search term: {search_term}") + + present = bool([x for x in sample_elements[curie] + if x in searched_element_ids]) + if not present: + log.error("Did not find expected variable %s " + "in search result.", + str(curie)) + log.error("Concept id : %s, Search term: %s", + str(concept.id), search_term) + raise PipelineException( + f"Validation error - Did not find {curie} for" + f" Concept id : {concept.id}, " + f"Search term: {search_term}") + + def clear_index(self, index_id): + "Delete the index specified by index_id from ES" + exists = self.event_loop.run_until_complete(self.search_obj.es.indices.exists(index=index_id)) + if exists: + log.info("Deleting index %s", str(index_id)) + response = self.event_loop.run_until_complete( + self.search_obj.es.indices.delete(index=index_id)) + log.info("Cleared Elastic : %s", str(response)) + log.info("Re-initializing the indicies") + self.index_obj.init_indices() + + def clear_variables_index(self): + "Delete the variables index from ES" + self.clear_index(self.variables_index) + + def clear_kg_index(self): + "Delete the KG index from ES" + self.clear_index(self.kg_index) + + def clear_concepts_index(self): + "Delete the concepts index from ES" + self.clear_index(self.concepts_index) + + #### + # Methods above this are directly from what used to be + # dug_helpers.dug_utils.Dug. Methods below are consolidated from what used + # to be dug_helpers.dug_utils.DugUtil. These are intented to be the "top + # level" interface to Roger, which Airflow DAGs or other orchestrators can + # call directly. 
+ + def _fetch_s3_file(self, filename, output_dir): + "Fetch a file from s3 to output_dir" + log.info("Fetching %s", filename) + output_name = filename.split('/')[-1] + output_path = output_dir / output_name + self.s3_utils.get( + str(filename), + str(output_path), + ) + if self.unzip_source: + log.info("Unzipping %s", str(output_path)) + with tarfile.open(str(output_path)) as tar: + tar.extractall(path=output_dir) + return output_path + + def _fetch_remote_file(self, filename, output_dir, current_version): + "Fetch a file from a location using FileFetcher" + log.info("Fetching %s", filename) + # fetch from stars + remote_host = self.config.annotation_base_data_uri + fetch = FileFetcher( + remote_host=remote_host, + remote_dir=current_version, + local_dir=output_dir) + output_path = fetch(filename) + if self.unzip_source: + log.info("Unzipping %s", str(output_path)) + with tarfile.open(str(output_path)) as tar: + tar.extractall(path=output_dir) + return output_path + + def get_versioned_files(self): + """ Fetches a dug input data files to input file directory + """ + meta_data = storage.read_relative_object("../../metadata.yaml") + output_dir: Path = storage.dug_input_files_path( + self.get_files_dir()) + data_store = self.config.dug_inputs.data_source + + # clear dir + storage.clear_dir(output_dir) + data_sets = self.config.dug_inputs.data_sets + log.info("dataset: %s", data_sets) + pulled_files = [] + for data_set in data_sets: + data_set_name, current_version = data_set.split(':') + for item in meta_data["dug_inputs"]["versions"]: + if (item["version"] == current_version and + item["name"] == data_set_name and + item["format"] == self.get_data_format()): + if data_store == "s3": + for filename in item["files"]["s3"]: + pulled_files.append( + self._fetch_s3_file(filename, output_dir)) + else: + for filename in item["files"]["stars"]: + pulled_files.append( + self.fetch_remote_file(filename, output_dir, + current_version)) + return [str(filename) for filename in pulled_files] + + def get_objects(self, input_data_path=None): + """Retrieve initial source objects for parsing + + This is a default method that will be overridden by subclasses + frequently, it is expected. + """ + if not input_data_path: + input_data_path = storage.dug_input_files_path( + self.get_files_dir()) + files = storage.get_files_recursive( + lambda file_name: file_name.endswith('.xml'), + input_data_path) + return sorted([str(f) for f in files]) + + def annotate(self, to_string=False, files=None, input_data_path=None, + output_data_path=None): + "Annotate files with the appropriate parsers and crawlers" + if files is None: + files = self.get_objects(input_data_path=input_data_path) + self.annotate_files(parsable_files=files, + output_data_path=output_data_path) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def index_variables(self, to_string=False, element_object_files=None, + input_data_path=None, output_data_path=None): + """Index variables from element object files for pipeline + + if element_object_files is specified, only those files are + indexed. Otherwise, if the input_data_path is supplied, elements files + under that path are indexed. If neither is supplied, the default + directory is searched for index files and those are indexed. 
+ """ + # self.clear_variables_index() + if element_object_files is None: + element_object_files = storage.dug_elements_objects(input_data_path,format='txt') + for file_ in element_object_files: + self.index_elements(file_) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def validate_indexed_variables(self, to_string=None, + element_object_files=None, + input_data_path=None, + output_data_path=None): + "Validate output from index variables task for pipeline" + if not element_object_files: + element_object_files = storage.dug_elements_objects(input_data_path, format='txt') + for file_ in element_object_files: + log.info("Validating %s", str(file_)) + self.validate_indexed_element_file(file_) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def validate_indexed_concepts(self, config=None, to_string=None, input_data_path=None, output_data_path=None): + """ + Entry for validate concepts + """ + get_data_set_name = lambda file: os.path.split(os.path.dirname(file))[-1] + expanded_concepts_files_dict = { + get_data_set_name(file): file for file in storage.dug_expanded_concept_objects(data_path=input_data_path, format='txt') + } + annotated_elements_files_dict = { + get_data_set_name(file): file for file in storage.dug_elements_objects(data_path=input_data_path, format='txt') + } + try: + assert len(expanded_concepts_files_dict) == len(annotated_elements_files_dict) + except: + log.error("Files Annotated Elements files and Expanded concepts files, should be pairs") + if len(expanded_concepts_files_dict) > len(annotated_elements_files_dict): + log.error("Some Annotated Elements files (from load_and_annotate task) are missing") + else: + log.error("Some Expanded Concepts files (from crawl task) are missing") + log.error(f"Annotated Datasets : {list(annotated_elements_files_dict.keys())}") + log.error(f"Expanded Concepts Datasets: {list(expanded_concepts_files_dict.keys())}") + exit(-1) + for data_set_name in annotated_elements_files_dict: + log.debug(f"Reading concepts and elements for dataset {data_set_name}") + elements_file_path = annotated_elements_files_dict[data_set_name] + concepts_file_path = expanded_concepts_files_dict[data_set_name] + dug_elements = jsonpickle.decode(storage.read_object(elements_file_path)) + dug_concepts = jsonpickle.decode(storage.read_object(concepts_file_path)) + log.debug(f"Read {len(dug_elements)} elements, and {len(dug_concepts)} Concepts") + log.info(f"Validating {data_set_name}") + self._validate_indexed_concepts(elements=dug_elements, concepts=dug_concepts) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def make_kg_tagged(self, to_string=False, elements_files=None, + input_data_path=None, output_data_path=None): + "Create tagged knowledge graphs from elements" + if not output_data_path: + output_data_path = storage.dug_kgx_path("") + storage.clear_dir(output_data_path) + log.info("Starting building KGX files") + + if not elements_files: + elements_files = storage.dug_elements_objects(input_data_path, format='txt') + log.info(f"found {len(elements_files)} files : {elements_files}") + for file_ in elements_files: + elements = jsonpickle.decode(storage.read_object(file_)) + if "topmed_" in file_: + kg = self.make_tagged_kg(elements) + else: + kg = self.convert_to_kgx_json(elements) + dug_base_file_name = file_.split(os.path.sep)[-2] + output_file_path = os.path.join(output_data_path, + dug_base_file_name + '_kgx.json') + storage.write_object(kg, 
output_file_path) + log.info("Wrote %d and %d edges, to %s", len(kg['nodes']), + len(kg['edges']), output_file_path) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def crawl_tranql(self, to_string=False, concept_files=None, + input_data_path=None, output_data_path=None): + "Perform the tranql crawl" + if not concept_files: + concept_files = storage.dug_concepts_objects(input_data_path, format='txt') + + if output_data_path: + crawl_dir = os.path.join(output_data_path, 'crawl_output') + expanded_concepts_dir = os.path.join(output_data_path, + 'expanded_concepts') + else: + crawl_dir = storage.dug_crawl_path('crawl_output') + expanded_concepts_dir = storage.dug_expanded_concepts_path("") + log.info("Clearing crawl output dir %s", crawl_dir) + storage.clear_dir(crawl_dir) + + log.info("Clearing expanded concepts dir: %s", expanded_concepts_dir) + storage.clear_dir(expanded_concepts_dir) + + log.info("Crawling Dug Concepts, found %d file(s).", + len(concept_files)) + for file_ in concept_files: + objects = storage.read_object(file_) + objects = objects or {} + if not objects: + log.info(f'no concepts in {file_}') + data_set = jsonpickle.decode(objects) + original_variables_dataset_name = os.path.split( + os.path.dirname(file_))[-1] + self.crawl_concepts(concepts=data_set, + data_set_name=original_variables_dataset_name, output_path= output_data_path) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + + def index_concepts(self, to_string=False, + input_data_path=None, output_data_path=None): + "Index concepts from expanded concept files" + # These are concepts that have knowledge graphs from tranql + # clear out concepts and kg indicies from previous runs + # self.clear_concepts_index() + # self.clear_kg_index() + expanded_concepts_files = storage.dug_expanded_concept_objects( + input_data_path, format="txt") + for file_ in expanded_concepts_files: + concepts = jsonpickle.decode(storage.read_object(file_)) + self._index_concepts(concepts=concepts) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log diff --git a/dags/roger/pipelines/bdc.py b/dags/roger/pipelines/bdc.py new file mode 100644 index 00000000..bc30cf44 --- /dev/null +++ b/dags/roger/pipelines/bdc.py @@ -0,0 +1,19 @@ +"Pipeline for BDC-dbGap data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class bdcPipeline(DugPipeline): + "Pipeline for BDC-dbGap data set" + pipeline_name = "bdc" + parser_name = "dbgap" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_dd_xml_path() + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('._') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/crdc.py b/dags/roger/pipelines/crdc.py new file mode 100644 index 00000000..2143cf7b --- /dev/null +++ b/dags/roger/pipelines/crdc.py @@ -0,0 +1,19 @@ +"Pipeline for Cancer Commons data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class CRDCPipeline(DugPipeline): + "Pipeline for Cancer Commons data set" + pipeline_name = "crdc" + parser_name = "crdc" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_crdc_path() + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + 
return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/ctn.py b/dags/roger/pipelines/ctn.py new file mode 100644 index 00000000..25918062 --- /dev/null +++ b/dags/roger/pipelines/ctn.py @@ -0,0 +1,10 @@ +"Pipeline for Clinical trials network data" + +from roger.pipelines import DugPipeline + +class CTNPipeline(DugPipeline): + "Pipeline for Clinical trials nework data set" + pipeline_name = "ctn" + parser_name = "ctn" + + diff --git a/dags/roger/pipelines/db_gap.py b/dags/roger/pipelines/db_gap.py new file mode 100644 index 00000000..7c1db504 --- /dev/null +++ b/dags/roger/pipelines/db_gap.py @@ -0,0 +1,10 @@ +"Dug pipeline for dbGaP data set" + +from roger.pipelines import DugPipeline + +class dbGaPPipeline(DugPipeline): + "Pipeline for the dbGaP data set" + + pipeline_name = 'dbGaP' + parser_name = 'DbGaP' + files_dir = 'db_gap' diff --git a/dags/roger/pipelines/heal_research_programs.py b/dags/roger/pipelines/heal_research_programs.py new file mode 100644 index 00000000..d95a4497 --- /dev/null +++ b/dags/roger/pipelines/heal_research_programs.py @@ -0,0 +1,16 @@ +"Pipeline for Heal-studies data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class HealResearchProgramPipeline(DugPipeline): + "Pipeline for Heal-research-programs data set" + pipeline_name = "heal-research-programs" + parser_name = "heal-research" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_heal_research_program_path() + files = storage.get_files_recursive(lambda file_name: file_name.endswith('.xml'), + input_data_path) + return sorted([str(f) for f in files]) \ No newline at end of file diff --git a/dags/roger/pipelines/heal_studies.py b/dags/roger/pipelines/heal_studies.py new file mode 100644 index 00000000..cf78c042 --- /dev/null +++ b/dags/roger/pipelines/heal_studies.py @@ -0,0 +1,16 @@ +"Pipeline for Heal-studies data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class HealStudiesPipeline(DugPipeline): + "Pipeline for Heal-studies data set" + pipeline_name = "heal-studies" + parser_name = "heal-studies" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_heal_study_path() + files = storage.get_files_recursive(lambda file_name: file_name.endswith('.xml'), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/kfdrc.py b/dags/roger/pipelines/kfdrc.py new file mode 100644 index 00000000..bcb0b7ac --- /dev/null +++ b/dags/roger/pipelines/kfdrc.py @@ -0,0 +1,19 @@ +"Pipeline for KDFRC data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class kfdrcPipeline(DugPipeline): + "Pipeline for KDFRC data set" + pipeline_name = "kfdrc" + parser_name = "kfdrc" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_kfdrc_path() + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/nida.py b/dags/roger/pipelines/nida.py new file mode 100644 index 00000000..b2e841bd --- /dev/null +++ b/dags/roger/pipelines/nida.py @@ -0,0 +1,18 @@ +"NIDA data set pipeline definition" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class NIDAPipeline(DugPipeline): + "NIDA data pipeline" + + pipeline_name = 'nida' + parser_name = 
'NIDA' + + def get_objects(self, input_data_path=None): + "Return list of NIDA source files" + if not input_data_path: + input_data_path = storage.dug_input_files_path( + self.get_files_dir()) + files = sorted(storage.get_files_recursive(lambda x: 'NIDA-' in x , input_data_path)) + return files diff --git a/dags/roger/pipelines/radx.py b/dags/roger/pipelines/radx.py new file mode 100644 index 00000000..32491fb0 --- /dev/null +++ b/dags/roger/pipelines/radx.py @@ -0,0 +1,8 @@ +"Pipeline for BACPAC data" + +from roger.pipelines import DugPipeline + +class RadxPipeline(DugPipeline): + "Pipeline for BACPAC data set" + pipeline_name = "radx" + parser_name = "radx" diff --git a/dags/roger/pipelines/sparc.py b/dags/roger/pipelines/sparc.py new file mode 100644 index 00000000..d1c9c950 --- /dev/null +++ b/dags/roger/pipelines/sparc.py @@ -0,0 +1,17 @@ +"Pipeline for Sparc data" + +from roger.pipelines import DugPipeline +from roger.core import storage + +class SparcPipeline(DugPipeline): + "Pipeline for Sparc data set" + pipeline_name = "sparc" + parser_name = "SciCrunch" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_heal_study_path() + files = storage.get_files_recursive( + lambda x: True, input_data_path + ) + return sorted([str(f) for f in files]) diff --git a/dags/roger/pipelines/topmed.py b/dags/roger/pipelines/topmed.py new file mode 100644 index 00000000..90b3e515 --- /dev/null +++ b/dags/roger/pipelines/topmed.py @@ -0,0 +1,41 @@ +"Pipeline for Topmed data" + +from roger.pipelines import DugPipeline +from roger.pipelines.base import log, os +import jsonpickle +from roger.core import storage +from roger.logger import logger +class TopmedPipeline(DugPipeline): + "Pipeline for Topmed data set" + pipeline_name = "topmed" + parser_name = "TOPMedTag" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = str(storage.dug_input_files_path('topmed')) + files =storage.get_files_recursive( + lambda file_name: file_name.endswith('.csv'), + input_data_path) + return sorted([str(x) for x in files]) + + def make_kg_tagged(self, to_string=False, elements_files=None, + input_data_path=None, output_data_path=None): + "Create tagged knowledge graphs from elements" + log.info("Override base.make_kg_tagged called") + if not output_data_path: + output_data_path = storage.dug_kgx_path("") + storage.clear_dir(output_data_path) + if not elements_files: + elements_files = storage.dug_elements_objects(input_data_path, format='txt') + for file_ in elements_files: + elements = jsonpickle.decode(storage.read_object(file_)) + kg = self.make_tagged_kg(elements) + dug_base_file_name = file_.split(os.path.sep)[-2] + output_file_path = os.path.join(output_data_path, + dug_base_file_name + '_kgx.json') + storage.write_object(kg, output_file_path) + log.info("Wrote %d and %d edges, to %s", len(kg['nodes']), + len(kg['edges']), output_file_path) + output_log = self.log_stream.getvalue() if to_string else '' + return output_log + diff --git a/dags/roger/tasks.py b/dags/roger/tasks.py index c2367e32..f5451140 100755 --- a/dags/roger/tasks.py +++ b/dags/roger/tasks.py @@ -1,10 +1,28 @@ +"Tasks and methods related to Airflow implementations of Roger" + import os from airflow.operators.python import PythonOperator +from airflow.operators.empty import EmptyOperator +from airflow.utils.task_group import TaskGroup from airflow.utils.dates import days_ago +from airflow.models import DAG +from airflow.models.dag import DagContext +from 
airflow.models.taskinstance import TaskInstance +from typing import Union +from pathlib import Path +import glob +import shutil -from roger.config import config +from roger.config import config, RogerConfig from roger.logger import get_logger +from roger.pipelines.base import DugPipeline +from avalon.mainoperations import put_files, LakeFsWrapper, get_files +from lakefs_sdk.configuration import Configuration +from lakefs_sdk.models.merge import Merge +from functools import partial + +logger = get_logger() default_args = { 'owner': 'RENCI', @@ -21,21 +39,35 @@ def task_wrapper(python_callable, **kwargs): """ # get dag config provided dag_run = kwargs.get('dag_run') - dag_conf = {} - logger = get_logger() - if dag_run: - dag_conf = dag_run.conf - # remove this since to send every other argument to the python callable. - del kwargs['dag_run'] + pass_conf = kwargs.get('pass_conf', True) + if config.lakefs_config.enabled: + # get input path + input_data_path = generate_dir_name_from_task_instance(kwargs['ti'], + roger_config=config, + suffix='input') + # get output path from task id run id dag id combo + output_data_path = generate_dir_name_from_task_instance(kwargs['ti'], + roger_config=config, + suffix='output') + else: + input_data_path, output_data_path = None, None + # cast it to a path object + func_args = { + 'input_data_path': input_data_path, + 'output_data_path': output_data_path, + 'to_string': kwargs.get('to_string') + } + logger.info(f"Task function args: {func_args}") # overrides values config.dag_run = dag_run - return python_callable(to_string=False, config=config) - + if pass_conf: + return python_callable(config=config, **func_args) + return python_callable(**func_args) def get_executor_config(data_path='/opt/airflow/share/data'): """ Get an executor configuration. :param annotations: Annotations to attach to the executor. - :returns: Returns a KubernetesExecutor if K8s is configured and None otherwise. + :returns: Returns a KubernetesExecutor if K8s configured, None otherwise. 
""" env_var_prefix = config.OS_VAR_PREFIX # based on environment set on scheduler pod, make secrets for worker pod @@ -70,28 +102,343 @@ def get_executor_config(data_path='/opt/airflow/share/data'): } return k8s_executor_config +def init_lakefs_client(config: RogerConfig) -> LakeFsWrapper: + configuration = Configuration() + configuration.username = config.lakefs_config.access_key_id + configuration.password = config.lakefs_config.secret_access_key + configuration.host = config.lakefs_config.host + the_lake = LakeFsWrapper(configuration=configuration) + return the_lake + + +def pagination_helper(page_fetcher, **kwargs): + """Helper function to iterate over paginated results""" + while True: + resp = page_fetcher(**kwargs) + yield from resp.results + if not resp.pagination.has_more: + break + kwargs['after'] = resp.pagination.next_offset + + +def avalon_commit_callback(context: DagContext, **kwargs): + client: LakeFsWrapper = init_lakefs_client(config=config) + # now files have been processed, + # this part should + # get the out path of the task + local_path = str(generate_dir_name_from_task_instance(context['ti'], + roger_config=config, + suffix='output')).rstrip('/') + '/' + task_id = context['ti'].task_id + dag_id = context['ti'].dag_id + run_id = context['ti'].run_id + # run id looks like 2023-10-18T17:35:14.890186+00:00 + # normalized to 2023_10_18T17_35_14_890186_00_00 + # since lakefs branch id must consist of letters, digits, underscores and dashes, + # and cannot start with a dash + run_id_normalized = run_id.replace('-','_').replace(':','_').replace('+','_').replace('.','_') + dag_id_normalized = dag_id.replace('-','_').replace(':','_').replace('+','_').replace('.','_') + task_id_normalized = task_id.replace('-','_').replace(':','_').replace('+','_').replace('.','_') + temp_branch_name = f'{dag_id_normalized}_{task_id_normalized}_{run_id_normalized}' + # remote path to upload the files to. + remote_path = f'{dag_id}/{task_id}/' + + # merge destination branch + branch = config.lakefs_config.branch + repo = config.lakefs_config.repo + # This part pushes to a temp branch on the repo + + # now we have the output path lets do some pushing but where ? + # right now lets stick to using one repo , + + # issue Vladmir pointed out if uploads to a single lakefs branch have not + # been finalized with commit, + # this would cause dirty commits if parallel tasks target the same branch. + + # solution: Lakefs team suggested we commit to a different temp branch per + # task, and merge that branch. + # this callback function will do that for now. + + # 1. put files into a temp branch. + # 2. make sure a commit happens. + # 3. merge that branch to master branch. 
+ logger.info("Pushing local path %s to %s@%s in %s dir", + local_path, repo, temp_branch_name, remote_path) + put_files( + local_path=local_path, + remote_path=remote_path, + task_name=task_id, + task_args=[""], + pipeline_id=dag_id, + task_docker_image="docker-image", + s3storage=False, + lake_fs_client=client, + branch=temp_branch_name, + repo=repo, + # @TODO figure out how to pass real commit id here + commit_id=branch + ) + + # see what changes are going to be pushed from this branch to main branch + for diff in pagination_helper(client._client.refs_api.diff_refs, + repository=repo, left_ref=branch, + right_ref=temp_branch_name): + logger.info("Diff: " + str(diff)) + + try: + # merging temp branch to working branch + # the current working branch wins incase of conflicts + merge = Merge(**{"strategy": "source-wins"}) + client._client.refs_api.merge_into_branch(repository=repo, + source_ref=temp_branch_name, + destination_branch=branch, + merge=merge + ) + + logger.info(f"merged branch {temp_branch_name} into {branch}") + except Exception as e: + # remove temp + logger.error(e) + # delete temp branch + finally: + client._client.branches_api.delete_branch( + repository=repo, + branch=temp_branch_name + ) + logger.info(f"deleted temp branch {temp_branch_name}") + logger.info(f"deleting local dir {local_path}") + files_to_clean = glob.glob(local_path + '**', recursive=True) + [local_path] + + clean_up(context, **kwargs) + +def clean_up(context: DagContext, **kwargs): + input_dir = str(generate_dir_name_from_task_instance(context['ti'], + roger_config=config, + suffix='output')).rstrip('/') + '/' + output_dir = str(generate_dir_name_from_task_instance(context['ti'], + roger_config=config, + suffix='input')).rstrip('/') + '/' + files_to_clean = glob.glob(input_dir + '**', recursive=True) + [input_dir] + files_to_clean += glob.glob(output_dir + '**', recursive=True) + [output_dir] + for f in files_to_clean: + if os.path.exists(f): + shutil.rmtree(f) + +def generate_dir_name_from_task_instance(task_instance: TaskInstance, + roger_config: RogerConfig, suffix:str): + # if lakefs is not enabled just return none so methods default to using + # local dir structure. + if not roger_config.lakefs_config.enabled: + return None + root_data_dir = os.getenv("ROGER_DATA_DIR").rstrip('/') + task_id = task_instance.task_id + dag_id = task_instance.dag_id + run_id = task_instance.run_id + try_number = task_instance._try_number + return Path( + f"{root_data_dir}/{dag_id}_{task_id}_{run_id}_{try_number}_{suffix}") + +def setup_input_data(context, exec_conf): + logger.info(""" + - Figures out the task name and id, + - find its data dependencies + - clean up and create in and out dir + - put dependency data in input dir + - if for some reason data was not found raise an exception + """) + # Serves as a location where files the task will work on are placed. + # computed as ROGER_DATA_DIR + /current task instance name_input_dir + + input_dir = str(generate_dir_name_from_task_instance( + context['ti'], roger_config=config, suffix="input")) + # Clear up files from previous run etc... + + # create input dir + os.makedirs(input_dir, exist_ok=True) + + # Download files from lakefs and store them in this new input_path + client = init_lakefs_client(config=config) + repos = exec_conf['repos'] + # if no external repo is provided we assume to get the upstream task dataset. 
+ if not repos or len(repos) == 0: + # merge destination branch + branch = config.lakefs_config.branch + repo = config.lakefs_config.repo + task_instance: TaskInstance = context['ti'] + # get upstream ids + upstream_ids = task_instance.task.upstream_task_ids + dag_id = task_instance.dag_id + # calculate remote dirs using dag_id + upstreams + repos = [{ + 'repo': repo, + 'branch': branch, + 'path': f'{dag_id}/{upstream_id}' + } for upstream_id in upstream_ids] -def create_python_task (dag, name, a_callable, func_kwargs=None): + # input_repo = exec_conf['input_repo'] + # input_branch = exec_conf['input_branch'] + # If input repo is provided use that as source of files + for repo in repos: + if not repo.get('path'): + # get all if path is not specified + repo['path'] = '*' + logger.info(f"repos : {repos}") + for r in repos: + logger.info("downloading %s from %s@%s to %s", + r['path'], r['repo'], r['branch'], input_dir) + # create path to download to ... + if not os.path.exists(input_dir + f'/{r["repo"]}'): + os.mkdir(input_dir + f'/{r["repo"]}') + get_files( + local_path=input_dir + f'/{r["repo"]}', + remote_path=r['path'], + branch=r['branch'], + repo=r['repo'], + changes_only=False, + lake_fs_client=client + ) + + +def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos = {}, pass_conf=True, no_output_files=False): """ Create a python task. :param func_kwargs: additional arguments for callable. :param dag: dag to add task to. :param name: The name of the task. :param a_callable: The code to run in this task. """ + # these are actual arguments passed down to the task function op_kwargs = { - "python_callable": a_callable, - "to_string": True - } + "python_callable": a_callable, + "to_string": True, + "pass_conf": pass_conf + } + # update / override some of the args passed to the task function by default if func_kwargs is None: - func_kwargs = dict() + func_kwargs = {} op_kwargs.update(func_kwargs) - return PythonOperator( - task_id=name, - python_callable=task_wrapper, - op_kwargs=op_kwargs, - executor_config=get_executor_config(), - dag=dag, - provide_context=True - ) + # Python operator arguments , by default for non-lakefs config this is all we need. + python_operator_args = { + "task_id": name, + "python_callable":task_wrapper, + "executor_config" : get_executor_config(), + "dag": dag, + "provide_context" : True + } + + # if we have lakefs... + if config.lakefs_config.enabled: + + # repo and branch for pre-execution , to download input objects + pre_exec_conf = { + 'repos': [] + } + if external_repos: + # if the task is a root task , beginning of the dag... + # and we want to pull data from a different repo. 
+ pre_exec_conf = { + 'repos': [{ + 'repo': r['name'], + 'branch': r['branch'], + 'path': r.get('path', '*') + } for r in external_repos] + } + + pre_exec = partial(setup_input_data, exec_conf=pre_exec_conf) + # add pre_exec partial function as an argument to python executor conf + python_operator_args['pre_execute'] = pre_exec + python_operator_args['on_failure_callback'] = partial(clean_up, kwargs=op_kwargs) + # if the task has output files, we will add a commit callback + if not no_output_files: + python_operator_args['on_success_callback'] = partial(avalon_commit_callback, kwargs=op_kwargs) + + # add kwargs + python_operator_args["op_kwargs"] = op_kwargs + + return PythonOperator(**python_operator_args) + +def create_pipeline_taskgroup( + dag, + pipeline_class: type, + configparam: RogerConfig, + **kwargs): + """Emit an Airflow dag pipeline for the specified pipeline_class + + Extra kwargs are passed to the pipeline class init call. + """ + name = pipeline_class.pipeline_name + input_dataset_version = pipeline_class.input_version + + with TaskGroup(group_id=f"{name}_dataset_pipeline_task_group") as tg: + with pipeline_class(config=configparam, **kwargs) as pipeline: + pipeline: DugPipeline + annotate_task = create_python_task( + dag, + f"annotate_{name}_files", + pipeline.annotate, + external_repos=[{ + 'name': getattr(pipeline_class, 'pipeline_name'), + 'branch': input_dataset_version + }], + pass_conf=False) + + index_variables_task = create_python_task( + dag, + f"index_{name}_variables", + pipeline.index_variables, + pass_conf=False, + # declare that this task will not generate files. + no_output_files=True) + index_variables_task.set_upstream(annotate_task) + + validate_index_variables_task = create_python_task( + dag, + f"validate_{name}_index_variables", + pipeline.validate_indexed_variables, + pass_conf=False, + # declare that this task will not generate files. + no_output_files=True + ) + validate_index_variables_task.set_upstream([annotate_task, index_variables_task]) + + make_kgx_task = create_python_task( + dag, + f"make_kgx_{name}", + pipeline.make_kg_tagged, + pass_conf=False) + make_kgx_task.set_upstream(annotate_task) + + crawl_task = create_python_task( + dag, + f"crawl_{name}", + pipeline.crawl_tranql, + pass_conf=False) + crawl_task.set_upstream(annotate_task) + + index_concepts_task = create_python_task( + dag, + f"index_{name}_concepts", + pipeline.index_concepts, + pass_conf=False, + # declare that this task will not generate files. + no_output_files=True) + index_concepts_task.set_upstream(crawl_task) + + validate_index_concepts_task = create_python_task( + dag, + f"validate_{name}_index_concepts", + pipeline.validate_indexed_concepts, + pass_conf=False, + # declare that this task will not generate files. + no_output_files=True + ) + validate_index_concepts_task.set_upstream([crawl_task, index_concepts_task, annotate_task]) + + + complete_task = EmptyOperator(task_id=f"complete_{name}") + complete_task.set_upstream( + (make_kgx_task, + validate_index_variables_task, validate_index_concepts_task)) + + return tg diff --git a/dags/tranql_translate.py b/dags/tranql_translate.py deleted file mode 100755 index 53394ba2..00000000 --- a/dags/tranql_translate.py +++ /dev/null @@ -1,43 +0,0 @@ -# -*- coding: utf-8 -*- -# - -""" -An Airflow workflow for the Roger Translator KGX data pipeline. 
-""" - -from airflow.models import DAG -from airflow.operators.empty import EmptyOperator -import roger -from roger.tasks import get_executor_config, default_args, create_python_task - -""" Build the workflow's tasks and DAG. """ -with DAG( - dag_id='tranql_translate', - default_args=default_args, - schedule_interval=None, - concurrency=16, -) as dag: - - """ Build the workflow tasks. """ - intro = EmptyOperator(task_id='Intro') - get_kgx = create_python_task (dag, "GetSource", roger.get_kgx) - create_nodes_schema = create_python_task(dag, "CreateNodesSchema", - roger.create_nodes_schema) - create_edges_schema = create_python_task(dag, "CreateEdgesSchema", - roger.create_edges_schema) - continue_task_bulk_load = EmptyOperator(task_id="continueBulkCreate") - continue_task_validate = EmptyOperator(task_id="continueValidation") - merge_nodes = create_python_task (dag, "MergeNodes", roger.merge_nodes) - create_bulk_load_nodes = create_python_task(dag, "CreateBulkLoadNodes", - roger.create_bulk_nodes) - create_bulk_load_edges = create_python_task(dag, "CreateBulkLoadEdges", - roger.create_bulk_edges) - bulk_load = create_python_task(dag, "BulkLoad", roger.bulk_load) - check_tranql = create_python_task(dag, "CheckTranql", - roger.check_tranql) - validate = create_python_task(dag, "Validate", roger.validate) - finish = EmptyOperator(task_id='Finish') - - """ Build the DAG. """ - intro >> get_kgx >> merge_nodes >> [create_nodes_schema, create_edges_schema ] >> continue_task_bulk_load >> \ - [create_bulk_load_nodes, create_bulk_load_edges] >> bulk_load >> continue_task_validate >>[validate, check_tranql ] >> finish diff --git a/requirements.txt b/requirements.txt index 9868ea75..d7eb73f8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,14 @@ boto3==1.18.23 botocore==1.21.23 -#black==21.10b0 elasticsearch==8.5.2 flatten-dict +jsonpickle redisgraph-bulk-loader==0.12.3 pytest PyYAML -git+https://github.com/helxplatform/dug@2.12.0 +git+https://github.com/helxplatform/dug@2.13.1 orjson kg-utils==0.0.6 bmt==1.1.0 +git+https://github.com/helxplatform/avalon.git@VG3 linkml-runtime==1.6.0 From f6711947b5c1bd4fee14481517da69c7bed91749 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Thu, 18 Apr 2024 13:32:57 -0400 Subject: [PATCH 13/63] pin avalon --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d7eb73f8..d1b1f68f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,5 +10,5 @@ git+https://github.com/helxplatform/dug@2.13.1 orjson kg-utils==0.0.6 bmt==1.1.0 -git+https://github.com/helxplatform/avalon.git@VG3 +git+https://github.com/helxplatform/avalon.git@v1.0.1 linkml-runtime==1.6.0 From 473816a912045a25ccf4b123f967147e9d38773a Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Thu, 6 Jun 2024 16:01:38 -0400 Subject: [PATCH 14/63] deleted jenkins and added workflows --- .github/workflows/build-push-dev-image.yml | 27 +++++++ .github/workflows/build-push-release.yml | 25 +++++++ .github/workflows/code-checks.yml | 38 ++++++++++ .github/workflows/trivy-pr-scan.yml | 23 ++++++ Jenkinsfile | 84 ---------------------- 5 files changed, 113 insertions(+), 84 deletions(-) create mode 100644 .github/workflows/build-push-dev-image.yml create mode 100644 .github/workflows/build-push-release.yml create mode 100644 .github/workflows/code-checks.yml create mode 100644 .github/workflows/trivy-pr-scan.yml delete mode 100644 Jenkinsfile diff --git a/.github/workflows/build-push-dev-image.yml 
b/.github/workflows/build-push-dev-image.yml new file mode 100644 index 00000000..f18a05c6 --- /dev/null +++ b/.github/workflows/build-push-dev-image.yml @@ -0,0 +1,27 @@ +# Workflow responsible for the +# development release processes. +# +name: Build-Push-Dev-Image +on: + push: + branches: + - develop + paths-ignore: + - README.md + - .old_cicd/* + - .github/* + - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks + # Do not build another image on a pull request. + # Any push to develop will trigger a new build however. + pull_request: + branches-ignore: + - '*' + +jobs: + build-push-dev-image: + uses: helxplatform/helx-github-actions/.github/workflows/build-push-dev-image.yml@main + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/build-push-release.yml b/.github/workflows/build-push-release.yml new file mode 100644 index 00000000..942221a3 --- /dev/null +++ b/.github/workflows/build-push-release.yml @@ -0,0 +1,25 @@ +# Workflow responsible for the +# major release processes. +# + +name: Build-Push-Release +on: + push: + branches: + - master + - main + paths-ignore: + - README.md + - .old_cicd/* + - .github/* + - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks + tags-ignore: + - '*' +jobs: + build-push-release: + uses: helxplatform/helx-github-actions/.github/workflows/build-push-release.yml@main + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml new file mode 100644 index 00000000..46f09d3c --- /dev/null +++ b/.github/workflows/code-checks.yml @@ -0,0 +1,38 @@ +# Workflow responsible for core acceptance testing. +# Tests Currently Run: +# - flake8-linter +# - PYTest +# - Bandit +# For PR Vulnerability Scanning a separate workflow will run. +# The build-push-dev-image and build-push-release workflows +# handle the develop and release image storage respectively. 
+# +# + +name: Code-Checks +on: + # push: + # branches-ignore: + # - master + # - main + # - develop + pull_request: + branches: + - develop + - master + - main + types: [ opened, synchronize ] + paths-ignore: + - README.md + - .old_cicd/* + # - .github/* + # - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks + +jobs: + build-push-release: + uses: helxplatform/helx-github-actions/.github/workflows/code-checks.yml@main + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml new file mode 100644 index 00000000..6097214a --- /dev/null +++ b/.github/workflows/trivy-pr-scan.yml @@ -0,0 +1,23 @@ + +name: trivy-pr-scan +on: + pull_request: + branches: + - develop + - master + - main + types: [ opened, synchronize ] + paths-ignore: + - README.md + - .old_cicd/* + - .github/* + - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks + +jobs: + trivy-pr-scan: + uses: helxplatform/helx-github-actions/.github/workflows/trivy-pr-scan.yml@main + secrets: inherit \ No newline at end of file diff --git a/Jenkinsfile b/Jenkinsfile deleted file mode 100644 index a68ba92b..00000000 --- a/Jenkinsfile +++ /dev/null @@ -1,84 +0,0 @@ -library 'pipeline-utils@master' - -pipeline { - agent { - kubernetes { - label 'kaniko-build-agent' - yaml ''' -kind: Pod -metadata: - name: kaniko -spec: - containers: - - name: jnlp - workingDir: /home/jenkins/agent - - name: kaniko - workingDir: /home/jenkins/agent - image: gcr.io/kaniko-project/executor:debug - imagePullPolicy: Always - resources: - requests: - cpu: "512m" - memory: "1024Mi" - ephemeral-storage: "4Gi" - limits: - cpu: "1024m" - memory: "2048Mi" - ephemeral-storage: "8Gi" - command: - - /busybox/cat - tty: true - volumeMounts: - - name: jenkins-docker-cfg - mountPath: /kaniko/.docker - volumes: - - name: jenkins-docker-cfg - projected: - sources: - - secret: - name: rencibuild-imagepull-secret - items: - - key: .dockerconfigjson - path: config.json -''' - } - } - environment { - PATH = "/busybox:/kaniko:/ko-app/:$PATH" - DOCKERHUB_CREDS = credentials("${env.CONTAINERS_REGISTRY_CREDS_ID_STR}") - REGISTRY = "${env.REGISTRY}" - REG_OWNER="helxplatform" - REG_APP="roger" - COMMIT_HASH="${sh(script:"git rev-parse --short HEAD", returnStdout: true).trim()}" - VERSION_FILE="./dags/_version.py" - VERSION="${sh(script:'awk \'{ print $3 }\' ./dags/_version.py | xargs', returnStdout: true).trim()}" - IMAGE_NAME="${REGISTRY}/${REG_OWNER}/${REG_APP}" - TAG1="$BRANCH_NAME" - TAG2="$COMMIT_HASH" - TAG3="$VERSION" - TAG4="latest" - } - stages { - stage('Test') { - steps { - sh ''' - echo "Test stage" - ''' - } - } - stage('Build') { - steps { - script { - container(name: 'kaniko', shell: '/busybox/sh') { - if (env.BRANCH_NAME == "main") { - // Tag with latest and version iff when pushed to master - kaniko.buildAndPush("./Dockerfile", ["$IMAGE_NAME:$TAG1", "$IMAGE_NAME:$TAG2", "$IMAGE_NAME:$TAG3", "$IMAGE_NAME:$TAG4"]) - } else { - kaniko.buildAndPush("./Dockerfile", ["$IMAGE_NAME:$TAG1", "$IMAGE_NAME:$TAG2"]) - } - } - } - } - } - } -} From 98257d9e1a8b163d4ed5d6880a8c8b96610f710e Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Fri, 7 Jun 2024 13:51:21 -0400 Subject: [PATCH 15/63] unlinked helx-actions --- .github/workflows/build-push-dev-image.yml | 63 ++++++++++- .github/workflows/build-push-release.yml | 110 ++++++++++++++++++- .github/workflows/code-checks.yml | 120 ++++++++++++++++++++- .github/workflows/trivy-pr-scan.yml | 50 ++++++++- 
4 files changed, 333 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-push-dev-image.yml b/.github/workflows/build-push-dev-image.yml index f18a05c6..13f8cfb7 100644 --- a/.github/workflows/build-push-dev-image.yml +++ b/.github/workflows/build-push-dev-image.yml @@ -23,5 +23,64 @@ on: jobs: build-push-dev-image: - uses: helxplatform/helx-github-actions/.github/workflows/build-push-dev-image.yml@main - secrets: inherit \ No newline at end of file + runs-on: ubuntu-latest + steps: + + - name: Checkout Code + uses: actions/checkout@v3 + with: + ref: ${{ github.head_ref }} + # fetch-depth: 0 means, get all branches and commits + fetch-depth: 0 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + # Docker Buildx is important to caching in the Build And Push Container + # step + # https://github.com/marketplace/actions/build-and-push-docker-images + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + network=host + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Login to Container Registry + uses: docker/login-action@v3 + with: + registry: containers.renci.org + username: ${{ secrets.CONTAINERHUB_USERNAME }} + password: ${{ secrets.CONTAINERHUB_TOKEN }} + logout: true + + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Push Container + uses: docker/build-push-action@v5 + with: + context: . + push: true + # Push to renci-registry and dockerhub here. + # cache comes from dockerhub. 
+ tags: | + ${{ github.repository }}:develop + ${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + containers.renci.org/${{ github.repository }}:develop + containers.renci.org/${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache-dev + cache-to: type=registry,ref=${{ github.repository }}:buildcache-dev,mode=max \ No newline at end of file diff --git a/.github/workflows/build-push-release.yml b/.github/workflows/build-push-release.yml index 942221a3..07b22d21 100644 --- a/.github/workflows/build-push-release.yml +++ b/.github/workflows/build-push-release.yml @@ -21,5 +21,111 @@ on: - '*' jobs: build-push-release: - uses: helxplatform/helx-github-actions/.github/workflows/build-push-release.yml@main - secrets: inherit \ No newline at end of file + runs-on: ubuntu-latest + steps: + - name: Checkout Code + uses: actions/checkout@v3 + with: + ref: ${{ github.head_ref }} + fetch-depth: 0 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + # https://github.com/marketplace/actions/git-semantic-version + - name: Semver Check + uses: paulhatch/semantic-version@v5.0.3 + id: version + with: + # The prefix to use to identify tags + tag_prefix: "v" + # A string which, if present in a git commit, indicates that a change represents a + # major (breaking) change, supports regular expressions wrapped with '/' + major_pattern: "/breaking:|major:/" + # A string which indicates the flags used by the `major_pattern` regular expression. Supported flags: idgs + major_regexp_flags: "ig" + # Same as above except indicating a minor change, supports regular expressions wrapped with '/' + minor_pattern: "/feat:|feature:|minor:/" + # A string which indicates the flags used by the `minor_pattern` regular expression. Supported flags: idgs + minor_regexp_flags: "ig" + # A string to determine the format of the version output + # version_format: "${major}.${minor}.${patch}-prerelease${increment}" + version_format: "${major}.${minor}.${patch}" + search_commit_body: false + + # Docker Buildx is important to caching in the Build And Push Container + # step + # https://github.com/marketplace/actions/build-and-push-docker-images + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + network=host + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Login to Container Registry + uses: docker/login-action@v3 + with: + registry: containers.renci.org + username: ${{ secrets.CONTAINERHUB_USERNAME }} + password: ${{ secrets.CONTAINERHUB_TOKEN }} + logout: true + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Push Container + uses: docker/build-push-action@v5 + with: + push: true + # Push to renci-registry and dockerhub here. + # cache comes from dockerhub. 
+ tags: | + containers.renci.org/${{ github.repository }}:v${{ steps.version.outputs.version }} + containers.renci.org/${{ github.repository }}:latest + containers.renci.org/${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + ${{ github.repository }}:v${{ steps.version.outputs.version }} + ${{ github.repository }}:latest + ${{ github.repository }}:${{ steps.vars.outputs.short_sha }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache-release + cache-to: type=registry,ref=${{ github.repository }}:buildcache-release,mode=max + +#==========================TAG & RELEASE W/ NOTES ========================= + + # Note: GITHUB_TOKEN is autogenerated feature of github app + # which is auto-enabled when using github actions. + # https://docs.github.com/en/actions/security-guides/automatic-token-authentication + # https://docs.github.com/en/rest/git/tags?apiVersion=2022-11-28#create-a-tag-object + # https://docs.github.com/en/rest/git/refs?apiVersion=2022-11-28#create-a-reference + # This creates a "lightweight" ref tag. + - name: Create Tag for Release + run: | + curl \ + -s --fail -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/${{ github.repository }}/git/refs \ + -d '{"ref":"refs/tags/v${{ steps.version.outputs.version }}","sha":"${{ github.sha }}"}' + +# https://cli.github.com/manual/gh_release_create + - name: Create Release + env: + RELEASE_VERSION: ${{ steps.version.outputs.version }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh release create ${{ env.RELEASE_VERSION }} \ + -t "${{ env.RELEASE_VERSION }}" \ + --generate-notes \ + --latest \ No newline at end of file diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 46f09d3c..401c24cc 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -33,6 +33,120 @@ on: - .githooks jobs: - build-push-release: - uses: helxplatform/helx-github-actions/.github/workflows/code-checks.yml@main - secrets: inherit \ No newline at end of file +############################## flake8-linter ############################## + flake8-linter: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12' + + # Currently actions/setup-python supports caching + # but the cache is not as robust as cache action. + # Here we cache the entire python env which speeds subsequent builds up alot. (alot being scientific term) + # Ref: https://blog.allenai.org/python-caching-in-github-actions-e9452698e98d + - uses: actions/cache@v3 + name: Cache Python + with: + path: ${{ env.pythonLocation }} + key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('pyproject.toml') }} + + - name: Install Requirements + run: | + pip install -r requirements.txt + + - name: Lint with flake8 + run: | + pip install flake8 + flake8 --ignore=E,W src + # We continue on error here until the code is clean + # flake8 --ignore=E,W --exit-zero . 
+ continue-on-error: true + +################################### PYTEST ################################### + pytest: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12' + + - name: Install Requirements + run: | + pip install -r requirements.txt + pip install coverage + pip install . + + - name: Test with pytest + run: | + make test + +############################ Bandit ################################ + bandit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12' + + - name: Install Requirements + run: | + pip install -r requirements.txt + pip install bandit + pip install . + + # Only report high security issues + - name: Test with Bandit + run: | + bandit -r src -n3 -lll + +############################## test-image-build ############################## + test-image-build: + runs-on: ubuntu-latest + # if: ${{ github.actor == 'dependabot[bot]' }} + steps: + - uses: actions/checkout@v3 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Parse Github Reference Name + id: branch + run: | + REF=${{ github.ref_name }} + echo "GHR=${REF%/*}" >> $GITHUB_OUTPUT + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Container + uses: docker/build-push-action@v5 + with: + context: . + push: true + tags: | + ${{ github.repository }}:test_${{ steps.branch.outputs.GHR }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache + cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max \ No newline at end of file diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml index 6097214a..1e7bc060 100644 --- a/.github/workflows/trivy-pr-scan.yml +++ b/.github/workflows/trivy-pr-scan.yml @@ -1,4 +1,3 @@ - name: trivy-pr-scan on: pull_request: @@ -19,5 +18,50 @@ on: jobs: trivy-pr-scan: - uses: helxplatform/helx-github-actions/.github/workflows/trivy-pr-scan.yml@main - secrets: inherit \ No newline at end of file + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + network=host + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Container + uses: docker/build-push-action@v5 + with: + context: . 
+ push: false + load: true + tags: ${{ github.repository }}:vuln-test + cache-from: type=registry,ref=${{ github.repository }}:buildcache + cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max + + # We will not be concerned with Medium and Low vulnerabilities + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + with: + image-ref: '${{ github.repository }}:vuln-test' + format: 'sarif' + severity: 'CRITICAL,HIGH' + ignore-unfixed: true + output: 'trivy-results.sarif' + exit-code: '1' + # Scan results should be viewable in GitHub Security Dashboard + # We still fail the job if results are found, so below will always run + # unless manually canceled. + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + if: '!cancelled()' + with: + sarif_file: 'trivy-results.sarif' \ No newline at end of file From d7b1f7e99670228e59b2b6c750cf340defcf5c29 Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Tue, 11 Jun 2024 13:45:19 -0400 Subject: [PATCH 16/63] testing paths --- .github/workflows/code-checks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 401c24cc..372c8f76 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -80,7 +80,7 @@ jobs: run: | pip install -r requirements.txt pip install coverage - pip install . + pip install ./dags - name: Test with pytest run: | @@ -100,7 +100,7 @@ jobs: run: | pip install -r requirements.txt pip install bandit - pip install . + pip install ./dags/roger # Only report high security issues - name: Test with Bandit From b0d8c923df8798efcb532058d528fe5b96ecb101 Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Tue, 11 Jun 2024 13:53:40 -0400 Subject: [PATCH 17/63] testing again --- .github/workflows/code-checks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 372c8f76..126cca81 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -80,7 +80,7 @@ jobs: run: | pip install -r requirements.txt pip install coverage - pip install ./dags + # pip install ./dags - name: Test with pytest run: | @@ -100,7 +100,7 @@ jobs: run: | pip install -r requirements.txt pip install bandit - pip install ./dags/roger + # pip install ./dags/roger # Only report high security issues - name: Test with Bandit From 9bee0884bb646a3b97d0ff405c10c97d6bdbdf45 Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Tue, 11 Jun 2024 14:28:34 -0400 Subject: [PATCH 18/63] d --- .github/workflows/code-checks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 126cca81..ec18da7a 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -80,7 +80,7 @@ jobs: run: | pip install -r requirements.txt pip install coverage - # pip install ./dags + pip install /dags - name: Test with pytest run: | @@ -100,7 +100,7 @@ jobs: run: | pip install -r requirements.txt pip install bandit - # pip install ./dags/roger + pip install . 
# Only report high security issues - name: Test with Bandit From 2b44c2280b46f94fa5f05037625cb3b5e8253dc9 Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Tue, 11 Jun 2024 14:31:23 -0400 Subject: [PATCH 19/63] tests --- .github/workflows/code-checks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index ec18da7a..35f526b4 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -80,7 +80,7 @@ jobs: run: | pip install -r requirements.txt pip install coverage - pip install /dags + pip install ./tests - name: Test with pytest run: | @@ -100,7 +100,7 @@ jobs: run: | pip install -r requirements.txt pip install bandit - pip install . + pip install ./tests # Only report high security issues - name: Test with Bandit From 7a8cc6dfec7f71589711e116f6b6f7e7c79f0677 Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Wed, 12 Jun 2024 21:56:01 -0400 Subject: [PATCH 20/63] commented out pytest --- .github/workflows/code-checks.yml | 40 +++++++++++++++---------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 35f526b4..1d175672 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -61,30 +61,30 @@ jobs: - name: Lint with flake8 run: | pip install flake8 - flake8 --ignore=E,W src + flake8 --ignore=E,W dag # We continue on error here until the code is clean # flake8 --ignore=E,W --exit-zero . continue-on-error: true ################################### PYTEST ################################### - pytest: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.12' - - - name: Install Requirements - run: | - pip install -r requirements.txt - pip install coverage - pip install ./tests - - - name: Test with pytest - run: | - make test + # pytest: + # runs-on: ubuntu-latest + # steps: + # - uses: actions/checkout@v3 + # - name: Set up Python + # uses: actions/setup-python@v4 + # with: + # python-version: '3.12' + + # - name: Install Requirements + # run: | + # pip install -r requirements.txt + # pip install coverage + # pip install ./tests + + # - name: Test with pytest + # run: | + # make test ############################ Bandit ################################ bandit: @@ -100,7 +100,7 @@ jobs: run: | pip install -r requirements.txt pip install bandit - pip install ./tests + pip install ./dags # Only report high security issues - name: Test with Bandit From 94253c934cf7ed7912f7d195b09705e802345996 Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Wed, 12 Jun 2024 21:58:57 -0400 Subject: [PATCH 21/63] try again for bandit --- .github/workflows/code-checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 1d175672..f22839a9 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -100,7 +100,7 @@ jobs: run: | pip install -r requirements.txt pip install bandit - pip install ./dags + pip install /dags # Only report high security issues - name: Test with Bandit From 7f37f916f466d36c0eea45670ccd8b648ddbb20d Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Wed, 12 Jun 2024 22:21:30 -0400 Subject: [PATCH 22/63] commented out bandit --- .github/workflows/code-checks.yml | 187 +++++++++++++++--------------- 1 file changed, 93 
insertions(+), 94 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index f22839a9..a2e53021 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -4,7 +4,7 @@ # - PYTest # - Bandit # For PR Vulnerability Scanning a separate workflow will run. -# The build-push-dev-image and build-push-release workflows +# The build-push-dev-image and build-push-release workflows # handle the develop and release image storage respectively. # # @@ -20,8 +20,8 @@ on: branches: - develop - master - - main - types: [ opened, synchronize ] + - main + types: [opened, synchronize] paths-ignore: - README.md - .old_cicd/* @@ -33,40 +33,40 @@ on: - .githooks jobs: -############################## flake8-linter ############################## + ############################## flake8-linter ############################## flake8-linter: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.12' - - # Currently actions/setup-python supports caching - # but the cache is not as robust as cache action. - # Here we cache the entire python env which speeds subsequent builds up alot. (alot being scientific term) - # Ref: https://blog.allenai.org/python-caching-in-github-actions-e9452698e98d - - uses: actions/cache@v3 - name: Cache Python - with: - path: ${{ env.pythonLocation }} - key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('pyproject.toml') }} - - - name: Install Requirements - run: | - pip install -r requirements.txt - - - name: Lint with flake8 - run: | - pip install flake8 - flake8 --ignore=E,W dag - # We continue on error here until the code is clean - # flake8 --ignore=E,W --exit-zero . - continue-on-error: true - -################################### PYTEST ################################### + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.12" + + # Currently actions/setup-python supports caching + # but the cache is not as robust as cache action. + # Here we cache the entire python env which speeds subsequent builds up alot. (alot being scientific term) + # Ref: https://blog.allenai.org/python-caching-in-github-actions-e9452698e98d + - uses: actions/cache@v3 + name: Cache Python + with: + path: ${{ env.pythonLocation }} + key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('pyproject.toml') }} + + - name: Install Requirements + run: | + pip install -r requirements.txt + + - name: Lint with flake8 + run: | + pip install flake8 + flake8 --ignore=E,W dag + # We continue on error here until the code is clean + # flake8 --ignore=E,W --exit-zero . 
+ continue-on-error: true + + ################################### PYTEST ################################### # pytest: # runs-on: ubuntu-latest # steps: @@ -86,67 +86,66 @@ jobs: # run: | # make test -############################ Bandit ################################ - bandit: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.12' - - - name: Install Requirements - run: | - pip install -r requirements.txt - pip install bandit - pip install /dags - - # Only report high security issues - - name: Test with Bandit - run: | - bandit -r src -n3 -lll - -############################## test-image-build ############################## + ############################ Bandit ################################ + # bandit: + # runs-on: ubuntu-latest + # steps: + # - uses: actions/checkout@v3 + # - name: Set up Python + # uses: actions/setup-python@v4 + # with: + # python-version: "3.12" + + # - name: Install Requirements + # run: | + # pip install -r requirements.txt + # pip install bandit + # pip install ./dags + + # # Only report high security issues + # - name: Test with Bandit + # run: | + # bandit -r dags -n3 -lll + + ############################## test-image-build ############################## test-image-build: runs-on: ubuntu-latest # if: ${{ github.actor == 'dependabot[bot]' }} steps: - - uses: actions/checkout@v3 - - - name: Set short git commit SHA - id: vars - run: | - echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT - # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ - - - name: Confirm git commit SHA output - run: echo ${{ steps.vars.outputs.short_sha }} - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - logout: true - - - name: Parse Github Reference Name - id: branch - run: | - REF=${{ github.ref_name }} - echo "GHR=${REF%/*}" >> $GITHUB_OUTPUT - - # Notes on Cache: - # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache - - name: Build Container - uses: docker/build-push-action@v5 - with: - context: . - push: true - tags: | - ${{ github.repository }}:test_${{ steps.branch.outputs.GHR }} - cache-from: type=registry,ref=${{ github.repository }}:buildcache - cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max \ No newline at end of file + - uses: actions/checkout@v3 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Parse Github Reference Name + id: branch + run: | + REF=${{ github.ref_name }} + echo "GHR=${REF%/*}" >> $GITHUB_OUTPUT + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Container + uses: docker/build-push-action@v5 + with: + context: . 
+ push: true + tags: | + ${{ github.repository }}:test_${{ steps.branch.outputs.GHR }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache + cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max From e696854fd71cd06a44d1cf721379364f325be59a Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Thu, 13 Jun 2024 21:54:59 -0400 Subject: [PATCH 23/63] changed dag to dags --- .github/workflows/code-checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index a2e53021..d79d0f7a 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -61,7 +61,7 @@ jobs: - name: Lint with flake8 run: | pip install flake8 - flake8 --ignore=E,W dag + flake8 --ignore=E,W dags # We continue on error here until the code is clean # flake8 --ignore=E,W --exit-zero . continue-on-error: true From b90927495de5e79324d7cd17d1be9ac807f60716 Mon Sep 17 00:00:00 2001 From: Patrick Hachicho Date: Fri, 21 Jun 2024 11:51:36 -0400 Subject: [PATCH 24/63] Added fixes --- .github/workflows/code-checks.yml | 36 ++++++------------------------- 1 file changed, 7 insertions(+), 29 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index d79d0f7a..b7f3e6a5 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -11,11 +11,11 @@ name: Code-Checks on: - # push: - # branches-ignore: - # - master - # - main - # - develop + push: + branches-ignore: + - master + - main + - develop pull_request: branches: - develop @@ -25,8 +25,8 @@ on: paths-ignore: - README.md - .old_cicd/* - # - .github/* - # - .github/workflows/* + - .github/* + - .github/workflows/* - LICENSE - .gitignore - .dockerignore @@ -85,28 +85,6 @@ jobs: # - name: Test with pytest # run: | # make test - - ############################ Bandit ################################ - # bandit: - # runs-on: ubuntu-latest - # steps: - # - uses: actions/checkout@v3 - # - name: Set up Python - # uses: actions/setup-python@v4 - # with: - # python-version: "3.12" - - # - name: Install Requirements - # run: | - # pip install -r requirements.txt - # pip install bandit - # pip install ./dags - - # # Only report high security issues - # - name: Test with Bandit - # run: | - # bandit -r dags -n3 -lll - ############################## test-image-build ############################## test-image-build: runs-on: ubuntu-latest From 4532bdad1b1c89ab81c51ca9d05d9ce2e9d40583 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Wed, 7 Aug 2024 13:31:12 -0400 Subject: [PATCH 25/63] Bagel (#103) * bump dug version * adding bdc new pipelines * adding curesc * adding bagel config * add test parser * add score threshold * point to dug develop --- dags/roger/config/__init__.py | 15 ++++++++ dags/roger/config/config.yaml | 22 +++++++++--- dags/roger/pipelines/bdc_pipelines.py | 50 +++++++++++++++++++++++++++ requirements.txt | 2 +- 4 files changed, 83 insertions(+), 6 deletions(-) create mode 100644 dags/roger/pipelines/bdc_pipelines.py diff --git a/dags/roger/config/__init__.py b/dags/roger/config/__init__.py index ac9eb23a..8cb10247 100644 --- a/dags/roger/config/__init__.py +++ b/dags/roger/config/__init__.py @@ -99,6 +99,21 @@ class AnnotationConfig(DictLike): "sapbert": { "classification_url": "https://med-nemo.apps.renci.org/annotate/", "annotator_url": "https://babel-sapbert.apps.renci.org/annotate/", + "score_threshold": 0.8, + "bagel": { + 
"enabled": False, + "url": "https://bagel.apps.renci.org/group_synonyms_openai", + "prompt": "bagel/ask_classes", + "llm_args": { + "llm_model_name": "gpt-4o-2024-05-13", + "organization": "", + "access_key": "", + "llm_model_args": { + "top_p": 0, + "temperature": 0.1 + } + } + } }, } ) diff --git a/dags/roger/config/config.yaml b/dags/roger/config/config.yaml index e9402ce4..86c95adc 100644 --- a/dags/roger/config/config.yaml +++ b/dags/roger/config/config.yaml @@ -1,6 +1,6 @@ redisgraph: username: "" - password: "12345" + password: "weak" host: localhost graph: test port: 6379 @@ -42,13 +42,25 @@ bulk_loader: annotation: clear_http_cache: false - annotator_type: monarch + annotator_type: sapbert annotator_args: monarch: url: "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" sapbert: classification_url: "https://med-nemo.apps.renci.org/annotate/" - annotator_url: "https://babel-sapbert.apps.renci.org/annotate/" + annotator_url: "https://sap-qdrant.apps.renci.org/annotate/" + score_threshold: 0.5 + bagel: + enabled: false + url: "http://localhost:9099/group_synonyms_openai" + prompt: "bagel/ask_classes" + llm_args: + llm_model_name: "gpt-4o-2024-05-13" + organization: + access_key: + llm_model_args: + top_p: 0 + temperature: 0.1 normalizer: "https://nodenormalization-dev.apps.renci.org/get_normalized_nodes?conflate=false&description=true&curie=" synonym_service: "https://name-resolution-sri.renci.org/reverse_lookup" ontology_metadata: "https://api.monarchinitiative.org/api/bioentity/" @@ -93,9 +105,9 @@ indexing: action: "files" elasticsearch: - host: elasticsearch + host: localhost username: elastic - password: "" + password: "12345" nboost_host: "" scheme: "http" ca_path: "" diff --git a/dags/roger/pipelines/bdc_pipelines.py b/dags/roger/pipelines/bdc_pipelines.py new file mode 100644 index 00000000..bf483928 --- /dev/null +++ b/dags/roger/pipelines/bdc_pipelines.py @@ -0,0 +1,50 @@ +"Dug pipeline for dbGaP data set" + +from roger.pipelines import DugPipeline + +class BIOLINCCdbGaPPipeline(DugPipeline): + "Pipeline for the dbGaP data set" + pipeline_name = 'biolincc' + parser_name = 'biolincc' + + +class covid19dbGaPPipeline(DugPipeline): + "Pipeline for the dbGaP data set" + pipeline_name = 'covid19-dbgap' + parser_name = 'covid19' + +class dirDbGaPPipeline(DugPipeline): + pipeline_name = "dir-dbgap" + parser_name = "dir" + +class LungMapDbGaPPipeline(DugPipeline): + pipeline_name = "lungmap-dbgap" + parser_name = "lungmap" + +class nsrrDbGaPPipeline(DugPipeline): + pipeline_name = "nsrr-dbgap" + parser_name = "nsrr" + +class ParentDbGaPPipeline(DugPipeline): + pipeline_name = "parent-dbgap" + parser_name = "parent" + +class PCGCDbGaPPipeline(DugPipeline): + pipeline_name = "pcgc-dbgap" + parser_name = "pcgc" + +class RecoverDbGaPPipeline(DugPipeline): + pipeline_name = "recover-dbgap" + parser_name = "recover" + +class TopmedDBGaPPipeline(DugPipeline): + pipeline_name = "topmed-gen3-dbgap" + parser_name = "topmeddbgap" + +class CureSCPipeline(DugPipeline): + pipeline_name = "curesc-dbgap" + parser_name = "curesc" + +class SmallDataDbGap(DugPipeline): + pipeline_name = "small-data-dbgap" + parser_name = "topmeddbgap" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index d1b1f68f..44c9a395 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ jsonpickle redisgraph-bulk-loader==0.12.3 pytest PyYAML 
-git+https://github.com/helxplatform/dug@2.13.1 +git+https://github.com/helxplatform/dug@develop orjson kg-utils==0.0.6 bmt==1.1.0 From af41d8f60603efbd1e9bebe9879d2fab8ff88e71 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Wed, 7 Aug 2024 13:34:54 -0400 Subject: [PATCH 26/63] Dbgap programs (#104) * bump dug version * adding bdc new pipelines * adding curesc * fix up merge conflict --- dags/roger/pipelines/bdc_pipelines.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dags/roger/pipelines/bdc_pipelines.py b/dags/roger/pipelines/bdc_pipelines.py index bf483928..5a945641 100644 --- a/dags/roger/pipelines/bdc_pipelines.py +++ b/dags/roger/pipelines/bdc_pipelines.py @@ -45,6 +45,4 @@ class CureSCPipeline(DugPipeline): pipeline_name = "curesc-dbgap" parser_name = "curesc" -class SmallDataDbGap(DugPipeline): - pipeline_name = "small-data-dbgap" - parser_name = "topmeddbgap" \ No newline at end of file + From 99a6e7b8285231b1a1b8931de26edafbd5c17cfb Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Thu, 19 Sep 2024 10:06:39 -0400 Subject: [PATCH 27/63] adding bagel config parse to bool --- dags/roger/config/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dags/roger/config/__init__.py b/dags/roger/config/__init__.py index 8cb10247..71111f39 100644 --- a/dags/roger/config/__init__.py +++ b/dags/roger/config/__init__.py @@ -134,6 +134,9 @@ class AnnotationConfig(DictLike): "PATO", "CHEBI", "MONDO", "UBERON", "HP", "MESH", "UMLS" ]) + def __post_init__(self): + self.annotator_args["sapbert"]["bagel"]["enabled"] = self.annotator_args["sapbert"]["bagel"][ + "enabled"].lower() == "true" @dataclass From 05b115ec9afce05ca0b1edb80e66356a16f5dbf1 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Thu, 19 Sep 2024 14:50:13 -0400 Subject: [PATCH 28/63] Dev sync main (#106) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * release sync (#100) * Update _version.py (#86) * Update _version.py * Rti merge (#84) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. 
* Update requirements.txt * Update .gitignore * Update dug_utils.py Handle Error when curie not found in validate * Update __init__.py * Update config.yaml * Update dev-config.yaml * Update docker-compose.yaml * fixed docker-compose * adding back postgres volume to docker compose * env correction , docker compose updates --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent * adding v5.0 * cde-links branch * pin linkml * Update config.yaml collection_action to action * pop total items before result * print extracted elements * Update requirements.txt * Keep edge provenance (#94) * Update kgx.py * Update kgx.py * Update kgx.py can't delete edge keys while looping over them. * just collect then update * Update requirements.txt (#93) * Pipeline parameterize restructure (#95) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * Parameterized annotate tasks with input_data_path and output_data_path * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. * Parameterized annotate tasks with input_data_path and output_data_path (#85) * adding lakefs changes to roger-2.0 * point avalon to vg1 branch * change avalon dep * update airflow * fix avalon tag typo * update jenkins to tag version on main branch only * update jenkins to tag version * update jenkins to tag version * psycopg2 installation * add cncf k8s req * use airflow non-slim * simplified for testing * simplified for testing * change dag name * Erroneous parameter passed, should not be None * adding pre-exec * adding pre-exec * adding pre-exec * typo preexec * typo preexec * fix context * get files from repo * get files from repo * get files from repo * get files from repo * First shot at moving pipeline into base class and implementing. 
Anvil pipeline not complete * Syntax fix, docker image version bump to airflow 2.7.2-python3.11 * update storage dir * update remove dir code * update remove dir code * remote path to * * fix input dir for annotators * fix input dir for annotators * fix input dir for annotators * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * adding branch info on lakefs config * callback push to branch * back to relative import * reformat temp branch name based on unique task id * add logging * add logging * convert posix path to str for avalon * add extra / to root path * New dag created using DugPipeline subclasses * EmptyOperator imported from wrong place * import and syntax fixes * utterly silly syntax error * Added anvil to default input data sets for testing purposes * adding / to local path * commit meta task args empty string * add merge logic * add merge logic * upstream task dir pull for downstream task * Switched from subdag to taskgroup because latest Airflow depricated subdag * Added BACPAC pipeline object * Temporarily ignoring configuration variable for enabled datasets for testing * Passed dag in to create task group to see if it helps dag errors * Fixed silly syntax error * adding input / output dir params for make kgx * Trying different syntax to make taskgroups work. * adding input / output dir params for make kgx * Parsing, syntax, pylint fixes * adding input / output dir params for make kgx * Added pipeline name to task group name to ensure uniqueness * oops, moved something out of scope. Fixed * Filled out pipeline with methods from dug_utils. Needs data path changes * Finished implementing input_data_path and output_data_path handling, pylint cleanup * Update requirements.txt * adding toggle to avoid sending config obj * adding toggle to avoid sending config obj * disable to string for test * control pipelines for testing * add self to anvil get files * add log stream to make it available * typo fix * correcting branch id * adding source repo * adding source repo * patch name-resolver response * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * dug pipeline edit * recurisvely find recursively * recurisvely find recursively * setup output path for crawling * all task functions should have input and output params * adding annotation as upstream for validate index * revamp create task , and task wrapper * add validate concepts index task * adding concept validation * add index_variables task as dependecy for validate concepts * add index_variables task as dependecy for validate concepts * await client exist * await client exist * concepts not getting picked up for indexing * concepts not getting picked up for indexing * fix search elements * converting annotation output to json * json format annotation outputs * adding support for json format elements and concepts read * json back to dug objects * fixing index valriables with json objects * indetation and new line for better change detection :? 
* indetation and new line for better change detection * treat dictionary concepts as dictionary * read concepts json as a dict * concepts files are actually file paths * debug message * make output jsonable * clear up dir after commit , and delete unmerged branch even if no changes * don`t clear indexes, parallel dataset processing will be taxed * memory leak? * memory leak? * memory leak? * dumping pickles to debug locally * find out why concepts are being added to every other element * find out why concepts are being added to every other element * pointless shuffle 🤷‍♂️ * revert back in time * back to sanitize dug * output just json for annotation * adding jsonpickle * jsonpickle 🥒 * unpickle for index * unpickle for validate index * crawling fixes * crawling fixes * crawling validation fixes * fix index concepts * fix makekgx * adding other bdc pipelines * adding pipeline paramters to be able to configure per instance * fix * add input dataset for pipelines * Adding README to document how to create data set-specific pipelines * catchup on base.py * Added dbgap and nida pipelines * fix import errors * annotator modules added by passing config val (#90) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 --------- Co-authored-by: YaphetKG * Add heal parsers (#96) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph 
structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Add heal parsers (#97) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Radx pipeline (#99) * point to large download * fix schema path * debug bulk input dir * fix schema read * fix schema read * fix schema read * commenting steup dir for test * adding logs * fix path stuff * add commented stuff back in * testing radx parser * adding parser * skip indexing vars with no id * adding indexes as part of bulk loader paramters * fix id index cli arg * fix local cli * dug latest --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Michael T. 
Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * pin avalon --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * remove jenkins file * bump apache version * revert airflow version --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> --- Dockerfile | 2 +- dags/roger/config/config.yaml | 2 +- requirements.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 49c1fd26..7760c983 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ FROM apache/airflow:2.7.2-python3.11 USER root RUN apt-get update && \ - apt-get install -y git nano vim + apt-get install -y git nano vim gcc COPY requirements.txt requirements.txt USER airflow RUN pip install -r requirements.txt diff --git a/dags/roger/config/config.yaml b/dags/roger/config/config.yaml index 86c95adc..c407555f 100644 --- a/dags/roger/config/config.yaml +++ b/dags/roger/config/config.yaml @@ -49,7 +49,7 @@ annotation: sapbert: classification_url: "https://med-nemo.apps.renci.org/annotate/" annotator_url: "https://sap-qdrant.apps.renci.org/annotate/" - score_threshold: 0.5 + score_threshold: 0.8 bagel: enabled: false url: "http://localhost:9099/group_synonyms_openai" diff --git a/requirements.txt b/requirements.txt index 44c9a395..3a0ee223 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ jsonpickle redisgraph-bulk-loader==0.12.3 pytest PyYAML -git+https://github.com/helxplatform/dug@develop +git+https://github.com/helxplatform/dug@2.13.2 orjson kg-utils==0.0.6 bmt==1.1.0 From 20585846d170257bf5383ea12c95aacd5b39eb7a Mon Sep 17 00:00:00 2001 From: "Michael T. 
Bacon" Date: Tue, 8 Oct 2024 13:50:17 -0400 Subject: [PATCH 29/63] Updated docker image to 2.10.2 --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 7760c983..9d3ca383 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM apache/airflow:2.7.2-python3.11 +FROM apache/airflow:2.10.2-python3.11 USER root RUN apt-get update && \ From 6dec958663857cdbe4f9c46e4d7d16fc47455269 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 8 Oct 2024 14:55:46 -0400 Subject: [PATCH 30/63] fix kgx path error --- dags/roger/core/storage.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dags/roger/core/storage.py b/dags/roger/core/storage.py index c1a3e808..3e11ecc6 100644 --- a/dags/roger/core/storage.py +++ b/dags/roger/core/storage.py @@ -167,6 +167,8 @@ def merge_path(name, path: Path=None): if not os.path.exists(ROGER_DATA_DIR / 'merge'): os.makedirs(ROGER_DATA_DIR / 'merge') return str(ROGER_DATA_DIR / 'merge' / name) + if not os.path.exists(path): + os.makedirs(ROGER_DATA_DIR / 'merge') return str(path.joinpath(name)) def merged_objects(file_type, path=None): From d3f03ab1a5554d75731edb18832f568f468bca49 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 8 Oct 2024 16:42:53 -0400 Subject: [PATCH 31/63] fix kgx path error --- dags/roger/core/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/roger/core/storage.py b/dags/roger/core/storage.py index 3e11ecc6..4346cd92 100644 --- a/dags/roger/core/storage.py +++ b/dags/roger/core/storage.py @@ -168,7 +168,7 @@ def merge_path(name, path: Path=None): os.makedirs(ROGER_DATA_DIR / 'merge') return str(ROGER_DATA_DIR / 'merge' / name) if not os.path.exists(path): - os.makedirs(ROGER_DATA_DIR / 'merge') + os.makedirs(path) return str(path.joinpath(name)) def merged_objects(file_type, path=None): From 745cf79d5f05d7366fd856dbc63136469e7d972f Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Thu, 10 Oct 2024 17:52:59 -0400 Subject: [PATCH 32/63] Trying out the slim apache images to reduce vulnerability footprint --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 9d3ca383..63350c0f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM apache/airflow:2.10.2-python3.11 +FROM apache/airflow:slim-2.10.2-python3.11 USER root RUN apt-get update && \ From b86807e0fc24c70041f94c06cc7fc2113926514e Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Thu, 10 Oct 2024 18:00:28 -0400 Subject: [PATCH 33/63] Building general package update into dockerfile --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 63350c0f..5175d1e8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,7 @@ FROM apache/airflow:slim-2.10.2-python3.11 USER root RUN apt-get update && \ + apt-get upgrade -y && \ apt-get install -y git nano vim gcc COPY requirements.txt requirements.txt USER airflow From 5dbe502150523b50c4af67394b30ea0494308e90 Mon Sep 17 00:00:00 2001 From: "Michael T. 
Bacon" Date: Thu, 10 Oct 2024 18:09:42 -0400 Subject: [PATCH 34/63] Trying more rigorous dist-upgrade to try to get rid of vulerabilities that won't quit --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 5175d1e8..1933fcc0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ FROM apache/airflow:slim-2.10.2-python3.11 USER root RUN apt-get update && \ - apt-get upgrade -y && \ + apt-get dist-upgrade && \ apt-get install -y git nano vim gcc COPY requirements.txt requirements.txt USER airflow From 3f6e987a9949f4ecedfb1b21bfc0180264b97127 Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Thu, 10 Oct 2024 18:10:58 -0400 Subject: [PATCH 35/63] dist-upgrade also needs -y flag --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 1933fcc0..cda551c1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ FROM apache/airflow:slim-2.10.2-python3.11 USER root RUN apt-get update && \ - apt-get dist-upgrade && \ + apt-get dist-upgrade -y && \ apt-get install -y git nano vim gcc COPY requirements.txt requirements.txt USER airflow From a1673b69102be89642829ea570282de2a24b1490 Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Fri, 11 Oct 2024 12:22:29 -0400 Subject: [PATCH 36/63] Rolling back from slim image --- Dockerfile | 5 +++-- requirements.txt | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index cda551c1..ef772ddd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,9 @@ -FROM apache/airflow:slim-2.10.2-python3.11 +FROM apache/airflow:2.10.2-python3.11 USER root RUN apt-get update && \ - apt-get dist-upgrade -y && \ + apt-get upgrade -y && \ + apt-get uninstall libaom3 -y && \ apt-get install -y git nano vim gcc COPY requirements.txt requirements.txt USER airflow diff --git a/requirements.txt b/requirements.txt index 3a0ee223..651daec0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ elasticsearch==8.5.2 flatten-dict jsonpickle redisgraph-bulk-loader==0.12.3 +setuptools>=66 pytest PyYAML git+https://github.com/helxplatform/dug@2.13.2 From 0464da117b2b587d9ceac4e3e5041f944c073f8f Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Fri, 11 Oct 2024 12:44:36 -0400 Subject: [PATCH 37/63] Fixed apt-get syntax error --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index ef772ddd..d8722b54 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ FROM apache/airflow:2.10.2-python3.11 USER root RUN apt-get update && \ apt-get upgrade -y && \ - apt-get uninstall libaom3 -y && \ + apt-get remove libaom3 -y && \ apt-get install -y git nano vim gcc COPY requirements.txt requirements.txt USER airflow From 47df70d10d91a8ecbe52f648db35be3f31b8c378 Mon Sep 17 00:00:00 2001 From: "Michael T. 
Bacon" Date: Fri, 11 Oct 2024 15:00:46 -0400 Subject: [PATCH 38/63] Reverting develop to 2.7.2 for stability's sake, will revisit after upgrades finish --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index d8722b54..9cc3f819 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM apache/airflow:2.10.2-python3.11 +FROM apache/airflow:2.7.2-python3.11 USER root RUN apt-get update && \ From b721976c5ab507262aec57d4f7576bcdf03db780 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:23:21 -0400 Subject: [PATCH 39/63] Update Dockerfile --- Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 9cc3f819..bd4ef93d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,6 @@ FROM apache/airflow:2.7.2-python3.11 USER root RUN apt-get update && \ apt-get upgrade -y && \ - apt-get remove libaom3 -y && \ apt-get install -y git nano vim gcc COPY requirements.txt requirements.txt USER airflow From 6f505d8604167e2a33ffd4a01cb67ef8023c151b Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:25:08 -0400 Subject: [PATCH 40/63] Update Dockerfile --- Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index bd4ef93d..7760c983 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,6 @@ FROM apache/airflow:2.7.2-python3.11 USER root RUN apt-get update && \ - apt-get upgrade -y && \ apt-get install -y git nano vim gcc COPY requirements.txt requirements.txt USER airflow From be2b07d11cde611e5b325aa2cfa637a248d96954 Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Fri, 11 Oct 2024 15:43:38 -0400 Subject: [PATCH 41/63] Removed post-install package cleanup --- Dockerfile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9cc3f819..7760c983 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,8 +2,6 @@ FROM apache/airflow:2.7.2-python3.11 USER root RUN apt-get update && \ - apt-get upgrade -y && \ - apt-get remove libaom3 -y && \ apt-get install -y git nano vim gcc COPY requirements.txt requirements.txt USER airflow From 73812a1eb405f0d130f0516906958fdafac8ffc5 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Fri, 11 Oct 2024 21:36:29 -0400 Subject: [PATCH 42/63] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 651daec0..3397cb9e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ redisgraph-bulk-loader==0.12.3 setuptools>=66 pytest PyYAML -git+https://github.com/helxplatform/dug@2.13.2 +git+https://github.com/helxplatform/dug@patch-1-nemo-error orjson kg-utils==0.0.6 bmt==1.1.0 From 6d5194ccc1b1f3b5172d60a81e0f70a98454ea3a Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Mon, 21 Oct 2024 16:44:14 -0400 Subject: [PATCH 43/63] test merge issue --- dags/roger/tasks.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dags/roger/tasks.py b/dags/roger/tasks.py index f5451140..1f17f6ae 100755 --- a/dags/roger/tasks.py +++ b/dags/roger/tasks.py @@ -201,10 +201,10 @@ def avalon_commit_callback(context: DagContext, **kwargs): logger.error(e) # delete temp branch finally: - client._client.branches_api.delete_branch( - repository=repo, - branch=temp_branch_name - ) + # client._client.branches_api.delete_branch( + # repository=repo, + 
# branch=temp_branch_name + # ) logger.info(f"deleted temp branch {temp_branch_name}") logger.info(f"deleting local dir {local_path}") files_to_clean = glob.glob(local_path + '**', recursive=True) + [local_path] From 6db88f82879512c86ce335ec5946406d5abb7ad3 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Mon, 21 Oct 2024 17:48:44 -0400 Subject: [PATCH 44/63] fix merge issue --- dags/roger/tasks.py | 3 ++- requirements.txt | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dags/roger/tasks.py b/dags/roger/tasks.py index 1f17f6ae..8823d629 100755 --- a/dags/roger/tasks.py +++ b/dags/roger/tasks.py @@ -176,7 +176,8 @@ def avalon_commit_callback(context: DagContext, **kwargs): branch=temp_branch_name, repo=repo, # @TODO figure out how to pass real commit id here - commit_id=branch + commit_id=branch, + source_branch_name=branch ) # see what changes are going to be pushed from this branch to main branch diff --git a/requirements.txt b/requirements.txt index 3397cb9e..d89eb237 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,5 +11,5 @@ git+https://github.com/helxplatform/dug@patch-1-nemo-error orjson kg-utils==0.0.6 bmt==1.1.0 -git+https://github.com/helxplatform/avalon.git@v1.0.1 +git+https://github.com/helxplatform/avalon.git@develop linkml-runtime==1.6.0 From 5794dc16f61f4f1de424b3bfddb51e8618e96cb4 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Tue, 18 Feb 2025 11:42:37 -0500 Subject: [PATCH 45/63] modify radx pipeline (#111) * modify radx pipeline * pin to dug release --- dags/roger/pipelines/radx.py | 12 +++++++++++- requirements.txt | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/dags/roger/pipelines/radx.py b/dags/roger/pipelines/radx.py index 32491fb0..7ffae159 100644 --- a/dags/roger/pipelines/radx.py +++ b/dags/roger/pipelines/radx.py @@ -1,8 +1,18 @@ "Pipeline for BACPAC data" from roger.pipelines import DugPipeline +from roger.core import storage + class RadxPipeline(DugPipeline): - "Pipeline for BACPAC data set" + "Pipeline for Radx data set" pipeline_name = "radx" parser_name = "radx" + + def get_objects(self, input_data_path=None): + if not input_data_path: + input_data_path = storage.dug_kfdrc_path() + files = storage.get_files_recursive( + lambda file_name: file_name.endswith('.json'), + input_data_path) + return sorted([str(f) for f in files]) diff --git a/requirements.txt b/requirements.txt index 2c9dbbf4..4877244f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ redisgraph-bulk-loader==0.12.3 setuptools>=66 pytest PyYAML -git+https://github.com/helxplatform/dug@2.13.4 +git+https://github.com/helxplatform/dug@2.13.6 orjson kg-utils==0.0.6 bmt==1.1.0 From dbfb5b2973624eac7e2fd0aaa8e1950943b1ddfd Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Wed, 26 Feb 2025 10:43:39 -0500 Subject: [PATCH 46/63] Heal ingest 2 18 (#112) * rename repos for heal ingest * change lib for bulk loader * change lib for bulk loader * cde?? * cde?? * cde?? * cde?? * cde?? 
* final push * final push --- dags/roger/core/bulkload.py | 5 +-- dags/roger/core/storage.py | 11 ++++--- dags/roger/models/kgx.py | 4 +-- dags/roger/pipelines/base.py | 15 +++++++-- .../roger/pipelines/heal_research_programs.py | 2 +- dags/roger/pipelines/heal_studies.py | 2 +- requirements.txt | 31 ++++++++++--------- 7 files changed, 42 insertions(+), 28 deletions(-) diff --git a/dags/roger/core/bulkload.py b/dags/roger/core/bulkload.py index 0f0fecee..616846dc 100644 --- a/dags/roger/core/bulkload.py +++ b/dags/roger/core/bulkload.py @@ -10,7 +10,7 @@ import requests import redis -from redisgraph_bulk_loader.bulk_insert import bulk_insert +from falkordb_bulk_loader.bulk_insert import bulk_insert from roger.config import get_default_config as get_config from roger.logger import get_logger @@ -347,8 +347,9 @@ def insert (self, input_data_path=None): # Edge label now no longer has 'biolink:' args.extend(("-R " + " -R ".join(edges_with_type)).split()) args.extend([f"--separator={self.separator}"]) - args.extend([f"--redis-url=redis://:{redisgraph['password']}@{redisgraph['host']}:{redisgraph['port']}"]) + args.extend([f"--server-url=redis://:{redisgraph['password']}@{redisgraph['host']}:{redisgraph['port']}"]) args.extend(['--enforce-schema']) + args.extend(['-e']) for lbl in collect_labels: args.extend([f'-i `{lbl}`:id', f'-f {lbl}:name', f'-f {lbl}:synonyms']) args.extend([f"{redisgraph['graph']}"]) diff --git a/dags/roger/core/storage.py b/dags/roger/core/storage.py index e3c9697c..b4869758 100644 --- a/dags/roger/core/storage.py +++ b/dags/roger/core/storage.py @@ -226,10 +226,13 @@ def dug_expanded_concept_objects(data_path=None, format="pickle"): os.path.join('*',f'expanded_concepts.{format}')) return sorted(glob.glob(file_pattern, recursive=True)) -def dug_extracted_elements_objects(): - file_pattern = dug_expanded_concepts_path( - os.path.join('*', 'extracted_graph_elements.pickle')) - return sorted(glob.glob(file_pattern)) +def dug_extracted_elements_objects(data_path=None, format="txt"): + if data_path: + file_pattern = os.path.join(data_path, '**', f'extracted_graph_elements.{format}') + else: + file_pattern = dug_expanded_concepts_path( + os.path.join('*', f'extracted_graph_elements.{format}')) + return sorted(glob.glob(file_pattern, recursive=True)) def dug_crawl_path(name): return str(ROGER_DATA_DIR / 'dug' / 'crawl' / name) diff --git a/dags/roger/models/kgx.py b/dags/roger/models/kgx.py index e80e65db..e35ab07e 100644 --- a/dags/roger/models/kgx.py +++ b/dags/roger/models/kgx.py @@ -310,7 +310,7 @@ def create_nodes_schema(self, input_data_path=None, output_data_path=None): log.info(f"Processing node : {node} counter : {counter}") counter += 1 - if not node['category']: + if not node.get('category'): category_error_nodes.add(node['id']) node['category'] = [BiolinkModel.root_type] @@ -341,7 +341,7 @@ def create_nodes_schema(self, input_data_path=None, output_data_path=None): if len(category_error_nodes): - log.warn(f"some nodes didn't have category assigned. " + log.warning(f"some nodes didn't have category assigned. " f"KGX file has errors." f"Nodes {len(category_error_nodes)}." f"Showing first 10: {list(category_error_nodes)[:10]}." 
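A side note on the kgx.py hunk above: the change from node['category'] to node.get('category') matters because direct indexing raises KeyError when a node record has no "category" key at all, while .get() returns None, so missing and empty categories can both fall back to the Biolink root type. A minimal, self-contained sketch of that defaulting idea (not code from this repository; "biolink:NamedThing" is assumed here as a stand-in for BiolinkModel.root_type):

ROOT_TYPE = "biolink:NamedThing"  # assumption: stands in for BiolinkModel.root_type

def ensure_category(node: dict) -> dict:
    # .get() handles both a missing 'category' key and an empty list;
    # node['category'] would raise KeyError on nodes without the key.
    if not node.get("category"):
        node["category"] = [ROOT_TYPE]
    return node

print(ensure_category({"id": "X:1"}))                               # missing key -> defaulted
print(ensure_category({"id": "X:2", "category": []}))               # empty list  -> defaulted
print(ensure_category({"id": "X:3", "category": ["biolink:Gene"]})) # left unchanged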
diff --git a/dags/roger/pipelines/base.py b/dags/roger/pipelines/base.py index 2a1b605e..9852147c 100644 --- a/dags/roger/pipelines/base.py +++ b/dags/roger/pipelines/base.py @@ -953,12 +953,21 @@ def index_concepts(self, to_string=False, "Index concepts from expanded concept files" # These are concepts that have knowledge graphs from tranql # clear out concepts and kg indicies from previous runs - # self.clear_concepts_index() - # self.clear_kg_index() + self.clear_concepts_index() + self.clear_kg_index() expanded_concepts_files = storage.dug_expanded_concept_objects( input_data_path, format="txt") for file_ in expanded_concepts_files: - concepts = jsonpickle.decode(storage.read_object(file_)) + concepts = jsonpickle.decode(storage.read_object(file_)) self._index_concepts(concepts=concepts) + + if self.config.indexing.node_to_element_queries: + log.info("*******************") + + extracted_elements_files = storage.dug_extracted_elements_objects(data_path=input_data_path) + log.info(f"{extracted_elements_files}") + for file_ in extracted_elements_files: + log.info(f"reading file {file_}") + self.index_elements(file_) output_log = self.log_stream.getvalue() if to_string else '' return output_log diff --git a/dags/roger/pipelines/heal_research_programs.py b/dags/roger/pipelines/heal_research_programs.py index d95a4497..bfec3f83 100644 --- a/dags/roger/pipelines/heal_research_programs.py +++ b/dags/roger/pipelines/heal_research_programs.py @@ -5,7 +5,7 @@ class HealResearchProgramPipeline(DugPipeline): "Pipeline for Heal-research-programs data set" - pipeline_name = "heal-research-programs" + pipeline_name = "heal-mds-research-networks" parser_name = "heal-research" def get_objects(self, input_data_path=None): diff --git a/dags/roger/pipelines/heal_studies.py b/dags/roger/pipelines/heal_studies.py index cf78c042..a08e8115 100644 --- a/dags/roger/pipelines/heal_studies.py +++ b/dags/roger/pipelines/heal_studies.py @@ -5,7 +5,7 @@ class HealStudiesPipeline(DugPipeline): "Pipeline for Heal-studies data set" - pipeline_name = "heal-studies" + pipeline_name = "heal-mds-studies" parser_name = "heal-studies" def get_objects(self, input_data_path=None): diff --git a/requirements.txt b/requirements.txt index 4877244f..743a2b49 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,16 @@ -boto3==1.18.23 -botocore==1.21.23 -elasticsearch==8.5.2 -flatten-dict -jsonpickle -redisgraph-bulk-loader==0.12.3 -setuptools>=66 -pytest -PyYAML -git+https://github.com/helxplatform/dug@2.13.6 -orjson -kg-utils==0.0.6 -bmt==1.1.0 -git+https://github.com/helxplatform/avalon.git@v1.1.0 -linkml-runtime==1.6.0 +boto3==1.18.23 +botocore==1.21.23 +elasticsearch==8.5.2 +flatten-dict +jsonpickle +git+https://github.com/falkordb/falkordb-bulk-loader.git@v1.0.6 +#redisgraph-bulk-loader==0.12.3 +setuptools>=66 +pytest +PyYAML +git+https://github.com/helxplatform/dug@develop +orjson +kg-utils==0.0.6 +bmt==1.1.0 +git+https://github.com/helxplatform/avalon.git@v1.1.0 +linkml-runtime==1.6.0 From 06a4ed1a8090a3c735222909616c7e880d94c575 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 26 Feb 2025 12:04:30 -0500 Subject: [PATCH 47/63] test node type --- dags/roger/core/bulkload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/roger/core/bulkload.py b/dags/roger/core/bulkload.py index 616846dc..a8ddbc15 100644 --- a/dags/roger/core/bulkload.py +++ b/dags/roger/core/bulkload.py @@ -53,7 +53,7 @@ def create_nodes_csv_file(self, input_data_path=None, output_data_path=None): merged_nodes_file = 
storage.merged_objects('nodes', input_data_path) counter = 1 for node in storage.json_line_iter(merged_nodes_file): - if not node['category']: + if not node.get('category'): category_error_nodes.add(node['id']) node['category'] = [BiolinkModel.root_type] index = self.biolink.get_leaf_class(node['category']) From faf49756063c8f0539ae613d3ab0cae5de5a7b53 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Mon, 24 Mar 2025 11:08:40 -0400 Subject: [PATCH 48/63] Heal ingest 2 18 (#114) * rename repos for heal ingest * change lib for bulk loader * change lib for bulk loader * cde?? * cde?? * cde?? * cde?? * cde?? * final push * final push * add study url --- dags/roger/pipelines/base.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dags/roger/pipelines/base.py b/dags/roger/pipelines/base.py index 9852147c..eab9159c 100644 --- a/dags/roger/pipelines/base.py +++ b/dags/roger/pipelines/base.py @@ -308,11 +308,16 @@ def convert_to_kgx_json(self, elements, written_nodes=None): if not isinstance(element, DugElement): continue study_id = element.collection_id + study_link = element.collection_action + study_desc = element.collection_desc + if study_id not in written_nodes: nodes.append({ "id": study_id, "category": ["biolink:Study"], - "name": study_id + "name": study_id, + "url": study_link, + "description": study_desc }) written_nodes.add(study_id) From 048e15f6e231239479ff738b1684dc73efef7401 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Wed, 26 Mar 2025 10:48:36 -0400 Subject: [PATCH 49/63] Bitnami (#117) * checkpoint * pushing docker changes * Executor tests 2.10 (#115) * test out bash operator * task name * pushing * revert python executor , with no excutor config * docker file and requriements * everything latest --------- Co-authored-by: waTeim --- Dockerfile | 34 +++++++++++++++++++++++++++++----- dags/roger/tasks.py | 7 +++++-- requirements.txt | 30 ++++++++++++++---------------- 3 files changed, 48 insertions(+), 23 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7760c983..47d2c13f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,33 @@ -FROM apache/airflow:2.7.2-python3.11 +FROM bitnami/airflow:2.10.5-debian-12-r7 USER root -RUN apt-get update && \ - apt-get install -y git nano vim gcc +RUN apt-get update && apt-get install -y git nano vim gcc rustc cargo +#RUN useradd -u 1001 -ms /bin/bash airflow && chown -R airflow /home/airflow COPY requirements.txt requirements.txt -USER airflow -RUN pip install -r requirements.txt +RUN source /opt/bitnami/airflow/venv/bin/activate && CARGO_HOME=/tmp/.cargo && \ + pip install setuptools wheel && \ + pip install -r requirements.txt + RUN rm -f requirements.txt + +## Vul patches +## Python lib patches on airflow python env +RUN source /opt/bitnami/airflow/venv/bin/activate pip install --upgrade \ + flask-appbuilder==4.5.3 \ + cryptography==44.0.1 \ + werkzeug==3.0.6 \ + urllib3==2.2.2 +RUN source /opt/bitnami/airflow/venv/bin/activate pip uninstall -y \ + apache-airflow-providers-mysql==6.2.0 + +# Uninstall these from non airflow python env +RUN pip install --upgrade \ + flask-appbuilder==4.5.3 \ + cryptography==44.0.1 \ + werkzeug==3.0.6 \ + urllib3==2.2.2 +RUN apt-get autoremove -y vim +RUN apt-get autoremove -y binutils +RUN apt-get autoremove -y linux-libc-dev + +USER airflow diff --git a/dags/roger/tasks.py b/dags/roger/tasks.py index 678f7137..5fe5ff90 100755 --- a/dags/roger/tasks.py +++ b/dags/roger/tasks.py @@ -9,11 +9,13 @@ 
from airflow.models import DAG from airflow.models.dag import DagContext from airflow.models.taskinstance import TaskInstance +from airflow.operators.bash import BashOperator from typing import Union from pathlib import Path import glob import shutil + from roger.config import config, RogerConfig from roger.logger import get_logger from roger.pipelines.base import DugPipeline @@ -307,7 +309,8 @@ def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos = :param dag: dag to add task to. :param name: The name of the task. :param a_callable: The code to run in this task. - """ + """ + # these are actual arguments passed down to the task function op_kwargs = { "python_callable": a_callable, @@ -324,7 +327,7 @@ def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos = python_operator_args = { "task_id": name, "python_callable":task_wrapper, - "executor_config" : get_executor_config(), + # "executor_config" : get_executor_config(), "dag": dag, "provide_context" : True } diff --git a/requirements.txt b/requirements.txt index ef7b40e5..83d7c320 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,14 @@ -boto3==1.18.23 -botocore==1.21.23 -elasticsearch==8.5.2 -flatten-dict -jsonpickle -git+https://github.com/falkordb/falkordb-bulk-loader.git@v1.0.6 -#redisgraph-bulk-loader==0.12.3 -setuptools>=66 -pytest -PyYAML -git+https://github.com/helxplatform/dug@develop -orjson -kg-utils==0.0.6 -bmt==1.1.0 -git+https://github.com/helxplatform/avalon.git@v1.1.0 -linkml-runtime==1.6.0 \ No newline at end of file +elasticsearch==8.5.2 +flatten-dict +jsonpickle +git+https://github.com/falkordb/falkordb-bulk-loader.git@v1.0.6 +setuptools>=66 +pytest +PyYAML +git+https://github.com/helxplatform/dug@2.13.8 +orjson==3.9.15 +git+https://github.com/helxplatform/kg_utils.git@v0.0.10 +git+https://github.com/helxplatform/python-stringcase@1.2.1 +bmt==1.4.4 +git+https://github.com/helxplatform/avalon.git@v1.1.0 + From ddb3663f4f7290f598b6a11781267ec8eaa30cad Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Mon, 19 May 2025 14:15:16 -0400 Subject: [PATCH 50/63] remove clear index --- dags/roger/pipelines/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dags/roger/pipelines/base.py b/dags/roger/pipelines/base.py index eab9159c..24170f69 100644 --- a/dags/roger/pipelines/base.py +++ b/dags/roger/pipelines/base.py @@ -958,8 +958,8 @@ def index_concepts(self, to_string=False, "Index concepts from expanded concept files" # These are concepts that have knowledge graphs from tranql # clear out concepts and kg indicies from previous runs - self.clear_concepts_index() - self.clear_kg_index() + # self.clear_concepts_index() + # self.clear_kg_index() expanded_concepts_files = storage.dug_expanded_concept_objects( input_data_path, format="txt") for file_ in expanded_concepts_files: From 27cc8806fb3ed37ccf43a652133c343c2289596d Mon Sep 17 00:00:00 2001 From: ykale <14827177+yskale@users.noreply.github.com> Date: Mon, 19 May 2025 16:36:22 -0400 Subject: [PATCH 51/63] Update requirements.txt fixed h11 version --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 83d7c320..b47e4b5a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ git+https://github.com/helxplatform/kg_utils.git@v0.0.10 git+https://github.com/helxplatform/python-stringcase@1.2.1 bmt==1.4.4 git+https://github.com/helxplatform/avalon.git@v1.1.0 +h11>=0.16.0 From 
2cea20725c387331becbcfdc9d86eaac406cd450 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Tue, 27 May 2025 11:40:29 -0400 Subject: [PATCH 52/63] fix study name in graph (#121) --- dags/roger/pipelines/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dags/roger/pipelines/base.py b/dags/roger/pipelines/base.py index 24170f69..ba278deb 100644 --- a/dags/roger/pipelines/base.py +++ b/dags/roger/pipelines/base.py @@ -310,12 +310,13 @@ def convert_to_kgx_json(self, elements, written_nodes=None): study_id = element.collection_id study_link = element.collection_action study_desc = element.collection_desc + study_name = element.collection_name or element.collection_id if study_id not in written_nodes: nodes.append({ "id": study_id, "category": ["biolink:Study"], - "name": study_id, + "name": study_name, "url": study_link, "description": study_desc }) From 30c4e8d0e986223091713055bc87a5dd97d35353 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Wed, 28 May 2025 15:03:19 -0400 Subject: [PATCH 53/63] comment clear index. rolledback during merge --- dags/roger/pipelines/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dags/roger/pipelines/base.py b/dags/roger/pipelines/base.py index 09e8284b..d9a41493 100644 --- a/dags/roger/pipelines/base.py +++ b/dags/roger/pipelines/base.py @@ -960,8 +960,8 @@ def index_concepts(self, to_string=False, "Index concepts from expanded concept files" # These are concepts that have knowledge graphs from tranql # clear out concepts and kg indicies from previous runs - self.clear_concepts_index() - self.clear_kg_index() + # self.clear_concepts_index() + # self.clear_kg_index() expanded_concepts_files = storage.dug_expanded_concept_objects( input_data_path, format="txt") for file_ in expanded_concepts_files: From 3a8181156ae67c25d5f13f84355ae9b3b56f293e Mon Sep 17 00:00:00 2001 From: ykale <14827177+yskale@users.noreply.github.com> Date: Tue, 3 Jun 2025 12:02:48 -0400 Subject: [PATCH 54/63] Picsure data ingest test (#118) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test * changed the pipeline name * parser name updated * update pipeline name * update the get_object/files * changed the lakefs dataset name for testing * update the pipelines for new programs in BDC * init parser and crawler objects once and retry * fix logger -> log typo * remove clear index (#119) * update with develop (#124) * Sync develop with main (#116) * Update _version.py (#86) * Update _version.py * Rti merge (#84) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' 
separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent * adding v5.0 * cde-links branch * pin linkml * Update config.yaml collection_action to action * pop total items before result * print extracted elements * Update requirements.txt * Keep edge provenance (#94) * Update kgx.py can't delete edge keys while looping over them. * just collect then update * Update requirements.txt (#93) * Pipeline parameterize restructure (#95) * Add heal parsers (#96) * Add heal parsers (#97) * Radx pipeline (#99) --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * pin avalon * deleted jenkins and added workflows * unlinked helx-actions * testing paths * testing again * d * tests * commented out pytest * try again for bandit * commented out bandit * changed dag to dags * Added fixes * Bagel (#103) * bump dug version * adding bdc new pipelines * adding curesc * adding bagel config * add test parser * add score threshold * point to dug develop * Dbgap programs (#104) * bump dug version * adding bdc new pipelines * adding curesc * fix up merge conflict * adding bagel config parse to bool * Dev sync main (#106) * release sync (#100) * remove jenkins file * bump apache version * revert airflow version * Updated docker image to 2.10.2 * fix kgx path error * fix kgx path error * Trying out the slim apache images to reduce vulnerability footprint * Building general package update into dockerfile * Trying more rigorous dist-upgrade to try to get rid of vulerabilities that won't quit * dist-upgrade also needs -y flag * Rolling back from slim image * Fixed apt-get syntax error * Reverting develop to 2.7.2 for stability's sake, will revisit after upgrades finish * Update Dockerfile * Update Dockerfile * Removed post-install package cleanup * Update requirements.txt * test merge issue * fix merge issue * modify radx pipeline (#111) * Heal ingest 2 18 (#112) * test node type * Heal ingest 2 18 (#114) * Bitnami (#117) * Executor tests 2.10 (#115) --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> Co-authored-by: Patrick Hachicho Co-authored-by: Patrick hachicho <105758539+pchachicho@users.noreply.github.com> Co-authored-by: waTeim * fix study name in graph (#121) * comment clear index. rolledback during merge * Update requirements.txt with Dug V2.13.10 which have all the new changes from new_bdc_parser branch --------- Co-authored-by: YSK Co-authored-by: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. 
Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> Co-authored-by: Patrick Hachicho Co-authored-by: Patrick hachicho <105758539+pchachicho@users.noreply.github.com> Co-authored-by: waTeim --- dags/roger/pipelines/base.py | 36 +++++++++++++++++++-------- dags/roger/pipelines/bdc_pipelines.py | 28 ++++++++++++++------- dags/roger/pipelines/picsure_test.py | 26 +++++++++++++++++++ requirements.txt | 2 +- 4 files changed, 71 insertions(+), 21 deletions(-) create mode 100644 dags/roger/pipelines/picsure_test.py diff --git a/dags/roger/pipelines/base.py b/dags/roger/pipelines/base.py index d9a41493..e108443e 100644 --- a/dags/roger/pipelines/base.py +++ b/dags/roger/pipelines/base.py @@ -216,6 +216,24 @@ def get_annotator(self): ) return annotator + def init_annotator(self, max_retries=5, base_delay=1, max_delay=10): + attempt = 0 + while attempt < max_retries: + try: + log.info("Initializing annotator") + annotator = self.get_annotator() + return annotator # success + except Exception as e: + attempt += 1 + if attempt == max_retries: + log.error("Max retries reached when creating annotator. Failing with error: %s", e) + raise + delay = min(base_delay * (2 ** (attempt - 1)), max_delay) + delay += random.uniform(0, 1) # add jitter + log.warning("Error occurred: %s. Retrying in %.2f seconds...", e, delay) + time.sleep(delay) + + def annotate_files(self, parsable_files, output_data_path=None): """ Annotates a Data element file using a Dug parser. @@ -226,11 +244,14 @@ def annotate_files(self, parsable_files, output_data_path=None): if not output_data_path: output_data_path = storage.dug_annotation_path('') log.info("Parsing files") + log.info("Intializing parser") + parser = self.get_parser() + log.info("Done intializing parser") + annotator = self.init_annotator() + log.info("Done intializing annotator") for _, parse_file in enumerate(parsable_files): log.debug("Creating Dug Crawler object on parse_file %s at %d of %d", - parse_file, _ , len(parsable_files)) - parser = self.get_parser() - annotator = self.get_annotator() + parse_file, _ , len(parsable_files)) crawler = Crawler( crawl_file=parse_file, parser=parser, @@ -247,14 +268,7 @@ def annotate_files(self, parsable_files, output_data_path=None): output_data_path, current_file_name) elements_file = os.path.join(elements_file_path, 'elements.txt') concepts_file = os.path.join(elements_file_path, 'concepts.txt') - # This is a file that the crawler will later populate. We start here - # by creating an empty elements file. - # This also creates output dir if it doesn't exist. - # elements_json = os.path.join(elements_file_path, - # 'element_file.json') - # log.debug("Creating empty file: %s", elements_json) - # storage.write_object({}, elements_json) - + # Use the specified parser to parse the parse_file into elements. 
log.debug("Parser is %s", str(parser)) elements = parser(parse_file) diff --git a/dags/roger/pipelines/bdc_pipelines.py b/dags/roger/pipelines/bdc_pipelines.py index 5a945641..d4c6436d 100644 --- a/dags/roger/pipelines/bdc_pipelines.py +++ b/dags/roger/pipelines/bdc_pipelines.py @@ -4,29 +4,29 @@ class BIOLINCCdbGaPPipeline(DugPipeline): "Pipeline for the dbGaP data set" - pipeline_name = 'biolincc' + pipeline_name = 'bdc-biolincc' parser_name = 'biolincc' class covid19dbGaPPipeline(DugPipeline): "Pipeline for the dbGaP data set" - pipeline_name = 'covid19-dbgap' + pipeline_name = 'bdc-covid19' parser_name = 'covid19' class dirDbGaPPipeline(DugPipeline): - pipeline_name = "dir-dbgap" + pipeline_name = "bdc-dir" parser_name = "dir" class LungMapDbGaPPipeline(DugPipeline): - pipeline_name = "lungmap-dbgap" + pipeline_name = "bdc-lungmap" parser_name = "lungmap" class nsrrDbGaPPipeline(DugPipeline): - pipeline_name = "nsrr-dbgap" + pipeline_name = "bdc-nsrr" parser_name = "nsrr" class ParentDbGaPPipeline(DugPipeline): - pipeline_name = "parent-dbgap" + pipeline_name = "bdc-parent" parser_name = "parent" class PCGCDbGaPPipeline(DugPipeline): @@ -34,15 +34,25 @@ class PCGCDbGaPPipeline(DugPipeline): parser_name = "pcgc" class RecoverDbGaPPipeline(DugPipeline): - pipeline_name = "recover-dbgap" + pipeline_name = "bdc-recover" parser_name = "recover" class TopmedDBGaPPipeline(DugPipeline): - pipeline_name = "topmed-gen3-dbgap" + pipeline_name = "bdc-topmed" parser_name = "topmeddbgap" class CureSCPipeline(DugPipeline): - pipeline_name = "curesc-dbgap" + pipeline_name = "bdc-curesc" parser_name = "curesc" +class HeartFailurePipeline(DugPipeline): + pipeline_name = "bdc-heartfailure" + parser_name = "heartfailure" +class ImagingPipeline(DugPipeline): + pipeline_name = "bdc-imaging" + parser_name = "imaging" + +class RedsPipeline(DugPipeline): + pipeline_name = "bdc-reds" + parser_name = "reds" \ No newline at end of file diff --git a/dags/roger/pipelines/picsure_test.py b/dags/roger/pipelines/picsure_test.py new file mode 100644 index 00000000..bea4469f --- /dev/null +++ b/dags/roger/pipelines/picsure_test.py @@ -0,0 +1,26 @@ +from roger.pipelines import DugPipeline +from roger.core import storage +from roger.logger import logger + + +class PicSure(DugPipeline): + "Pipeline for BACPAC data set" + pipeline_name = "bdc-test6" #lakefs + parser_name = "dbgap" + + def get_objects(self, input_data_path=None): + """Retrieve anvil objects + + This code is imported from roger.core.storage.dug_anvil_objects + """ + if not input_data_path: + input_data_path = storage.dug_input_files_path( + self.files_dir) + files = storage.get_files_recursive( + lambda file_name: ( + not file_name.startswith('GapExchange_') + and file_name.endswith('.xml')), + input_data_path) + logger.info("**********") + logger.info(files) + return sorted([str(f) for f in files]) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 96cd3ef4..871f52c2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ git+https://github.com/falkordb/falkordb-bulk-loader.git@v1.0.6 setuptools>=66 pytest PyYAML -git+https://github.com/helxplatform/dug@2.13.8 +git+https://github.com/helxplatform/dug@2.13.10 orjson==3.9.15 git+https://github.com/helxplatform/kg_utils.git@v0.0.10 git+https://github.com/helxplatform/python-stringcase@1.2.1 From 01827386a0385351db0b5b9fce356a6fc8490cbd Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Tue, 3 Jun 2025 16:40:13 -0400 Subject: 
[PATCH 55/63] fix dug version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 871f52c2..c113288f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ git+https://github.com/falkordb/falkordb-bulk-loader.git@v1.0.6 setuptools>=66 pytest PyYAML -git+https://github.com/helxplatform/dug@2.13.10 +git+https://github.com/helxplatform/dug@v2.13.10 orjson==3.9.15 git+https://github.com/helxplatform/kg_utils.git@v0.0.10 git+https://github.com/helxplatform/python-stringcase@1.2.1 From 14d09f3e67b26565e2f2a61eebb9c937f76d2172 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Tue, 9 Sep 2025 11:46:56 -0400 Subject: [PATCH 56/63] Picsure data ingest test (#127) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test * changed the pipeline name * parser name updated * update pipeline name * update the get_object/files * changed the lakefs dataset name for testing * update the pipelines for new programs in BDC * init parser and crawler objects once and retry * fix logger -> log typo * remove clear index (#119) * update with develop (#124) * Sync develop with main (#116) * Update _version.py (#86) * Update _version.py * Rti merge (#84) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. * Update requirements.txt * Update .gitignore * Update dug_utils.py Handle Error when curie not found in validate * Update __init__.py * Update config.yaml * Update dev-config.yaml * Update docker-compose.yaml * fixed docker-compose * adding back postgres volume to docker compose * env correction , docker compose updates --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent * adding v5.0 * cde-links branch * pin linkml * Update config.yaml collection_action to action * pop total items before result * print extracted elements * Update requirements.txt * Keep edge provenance (#94) * Update kgx.py * Update kgx.py * Update kgx.py can't delete edge keys while looping over them. 
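The "can't delete edge keys while looping over them" note above refers to a general Python constraint: removing keys from a dict while iterating over it raises "RuntimeError: dictionary changed size during iteration". A minimal stand-alone sketch of the collect-then-update approach follows; the edge data and key names are invented for illustration and are not the actual kgx.py structures.

    # Drop selected keys from edge dicts without mutating a dict
    # while iterating over it: collect first, delete afterwards.
    edges = [
        {"subject": "A", "object": "B", "predicate": "biolink.related_to", "tmp_flag": 1},
        {"subject": "B", "object": "C", "predicate": "biolink.related_to"},
    ]
    keys_to_drop = {"tmp_flag"}  # hypothetical bookkeeping key

    for edge in edges:
        doomed = [k for k in edge if k in keys_to_drop]  # collect ...
        for k in doomed:                                 # ... then update
            del edge[k]

    print(edges)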
* just collect then update * Update requirements.txt (#93) * Pipeline parameterize restructure (#95) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * Parameterized annotate tasks with input_data_path and output_data_path * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. * Parameterized annotate tasks with input_data_path and output_data_path (#85) * adding lakefs changes to roger-2.0 * point avalon to vg1 branch * change avalon dep * update airflow * fix avalon tag typo * update jenkins to tag version on main branch only * update jenkins to tag version * update jenkins to tag version * psycopg2 installation * add cncf k8s req * use airflow non-slim * simplified for testing * simplified for testing * change dag name * Erroneous parameter passed, should not be None * adding pre-exec * adding pre-exec * adding pre-exec * typo preexec * typo preexec * fix context * get files from repo * get files from repo * get files from repo * get files from repo * First shot at moving pipeline into base class and implementing. 
Anvil pipeline not complete * Syntax fix, docker image version bump to airflow 2.7.2-python3.11 * update storage dir * update remove dir code * update remove dir code * remote path to * * fix input dir for annotators * fix input dir for annotators * fix input dir for annotators * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * adding branch info on lakefs config * callback push to branch * back to relative import * reformat temp branch name based on unique task id * add logging * add logging * convert posix path to str for avalon * add extra / to root path * New dag created using DugPipeline subclasses * EmptyOperator imported from wrong place * import and syntax fixes * utterly silly syntax error * Added anvil to default input data sets for testing purposes * adding / to local path * commit meta task args empty string * add merge logic * add merge logic * upstream task dir pull for downstream task * Switched from subdag to taskgroup because latest Airflow depricated subdag * Added BACPAC pipeline object * Temporarily ignoring configuration variable for enabled datasets for testing * Passed dag in to create task group to see if it helps dag errors * Fixed silly syntax error * adding input / output dir params for make kgx * Trying different syntax to make taskgroups work. * adding input / output dir params for make kgx * Parsing, syntax, pylint fixes * adding input / output dir params for make kgx * Added pipeline name to task group name to ensure uniqueness * oops, moved something out of scope. Fixed * Filled out pipeline with methods from dug_utils. Needs data path changes * Finished implementing input_data_path and output_data_path handling, pylint cleanup * Update requirements.txt * adding toggle to avoid sending config obj * adding toggle to avoid sending config obj * disable to string for test * control pipelines for testing * add self to anvil get files * add log stream to make it available * typo fix * correcting branch id * adding source repo * adding source repo * patch name-resolver response * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * dug pipeline edit * recurisvely find recursively * recurisvely find recursively * setup output path for crawling * all task functions should have input and output params * adding annotation as upstream for validate index * revamp create task , and task wrapper * add validate concepts index task * adding concept validation * add index_variables task as dependecy for validate concepts * add index_variables task as dependecy for validate concepts * await client exist * await client exist * concepts not getting picked up for indexing * concepts not getting picked up for indexing * fix search elements * converting annotation output to json * json format annotation outputs * adding support for json format elements and concepts read * json back to dug objects * fixing index valriables with json objects * indetation and new line for better change detection :? 
* indetation and new line for better change detection * treat dictionary concepts as dictionary * read concepts json as a dict * concepts files are actually file paths * debug message * make output jsonable * clear up dir after commit , and delete unmerged branch even if no changes * don`t clear indexes, parallel dataset processing will be taxed * memory leak? * memory leak? * memory leak? * dumping pickles to debug locally * find out why concepts are being added to every other element * find out why concepts are being added to every other element * pointless shuffle 🤷‍♂️ * revert back in time * back to sanitize dug * output just json for annotation * adding jsonpickle * jsonpickle 🥒 * unpickle for index * unpickle for validate index * crawling fixes * crawling fixes * crawling validation fixes * fix index concepts * fix makekgx * adding other bdc pipelines * adding pipeline paramters to be able to configure per instance * fix * add input dataset for pipelines * Adding README to document how to create data set-specific pipelines * catchup on base.py * Added dbgap and nida pipelines * fix import errors * annotator modules added by passing config val (#90) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 --------- Co-authored-by: YaphetKG * Add heal parsers (#96) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph 
structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Add heal parsers (#97) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Radx pipeline (#99) * point to large download * fix schema path * debug bulk input dir * fix schema read * fix schema read * fix schema read * commenting steup dir for test * adding logs * fix path stuff * add commented stuff back in * testing radx parser * adding parser * skip indexing vars with no id * adding indexes as part of bulk loader paramters * fix id index cli arg * fix local cli * dug latest --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Michael T. 
Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * pin avalon * deleted jenkins and added workflows * unlinked helx-actions * testing paths * testing again * d * tests * commented out pytest * try again for bandit * commented out bandit * changed dag to dags * Added fixes * Bagel (#103) * bump dug version * adding bdc new pipelines * adding curesc * adding bagel config * add test parser * add score threshold * point to dug develop * Dbgap programs (#104) * bump dug version * adding bdc new pipelines * adding curesc * fix up merge conflict * adding bagel config parse to bool * Dev sync main (#106) * release sync (#100) * Update _version.py (#86) * Update _version.py * Rti merge (#84) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. * Update requirements.txt * Update .gitignore * Update dug_utils.py Handle Error when curie not found in validate * Update __init__.py * Update config.yaml * Update dev-config.yaml * Update docker-compose.yaml * fixed docker-compose * adding back postgres volume to docker compose * env correction , docker compose updates --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent * adding v5.0 * cde-links branch * pin linkml * Update config.yaml collection_action to action * pop total items before result * print extracted elements * Update requirements.txt * Keep edge provenance (#94) * Update kgx.py * Update kgx.py * Update kgx.py can't delete edge keys while looping over them. * just collect then update * Update requirements.txt (#93) * Pipeline parameterize restructure (#95) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. 
* Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * Parameterized annotate tasks with input_data_path and output_data_path * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. * Parameterized annotate tasks with input_data_path and output_data_path (#85) * adding lakefs changes to roger-2.0 * point avalon to vg1 branch * change avalon dep * update airflow * fix avalon tag typo * update jenkins to tag version on main branch only * update jenkins to tag version * update jenkins to tag version * psycopg2 installation * add cncf k8s req * use airflow non-slim * simplified for testing * simplified for testing * change dag name * Erroneous parameter passed, should not be None * adding pre-exec * adding pre-exec * adding pre-exec * typo preexec * typo preexec * fix context * get files from repo * get files from repo * get files from repo * get files from repo * First shot at moving pipeline into base class and implementing. Anvil pipeline not complete * Syntax fix, docker image version bump to airflow 2.7.2-python3.11 * update storage dir * update remove dir code * update remove dir code * remote path to * * fix input dir for annotators * fix input dir for annotators * fix input dir for annotators * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * adding branch info on lakefs config * callback push to branch * back to relative import * reformat temp branch name based on unique task id * add logging * add logging * convert posix path to str for avalon * add extra / to root path * New dag created using DugPipeline subclasses * EmptyOperator imported from wrong place * import and syntax fixes * utterly silly syntax error * Added anvil to default input data sets for testing purposes * adding / to local path * commit meta task args empty string * add merge logic * add merge logic * upstream task dir pull for downstream task * Switched from subdag to taskgroup because latest Airflow depricated subdag * Added BACPAC pipeline object * Temporarily ignoring configuration variable for enabled datasets for testing * Passed dag in to create task group to see if it helps dag errors * Fixed silly syntax error * adding input / output dir params for make kgx * Trying different syntax to make taskgroups work. 
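The "Switched from subdag to taskgroup" and "Trying different syntax to make taskgroups work" items above refer to Airflow's TaskGroup API, which replaces the deprecated SubDAG mechanism for grouping tasks. A minimal, self-contained sketch of that wiring is shown below; the DAG id, group id and task ids are placeholders, and this is not the actual create_pipeline_taskgroup implementation.

    from datetime import datetime

    from airflow.models import DAG
    from airflow.operators.empty import EmptyOperator
    from airflow.utils.task_group import TaskGroup

    with DAG(dag_id="taskgroup_sketch", start_date=datetime(2024, 1, 1),
             schedule_interval=None) as dag:
        init = EmptyOperator(task_id="init")
        finish = EmptyOperator(task_id="finish")

        # One group per data set, mirroring the per-pipeline grouping
        # described above; "demo_set" is a placeholder name.
        with TaskGroup(group_id="demo_set_pipeline") as group:
            annotate = EmptyOperator(task_id="annotate")
            index = EmptyOperator(task_id="index")
            annotate >> index

        init >> group >> finish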
* adding input / output dir params for make kgx * Parsing, syntax, pylint fixes * adding input / output dir params for make kgx * Added pipeline name to task group name to ensure uniqueness * oops, moved something out of scope. Fixed * Filled out pipeline with methods from dug_utils. Needs data path changes * Finished implementing input_data_path and output_data_path handling, pylint cleanup * Update requirements.txt * adding toggle to avoid sending config obj * adding toggle to avoid sending config obj * disable to string for test * control pipelines for testing * add self to anvil get files * add log stream to make it available * typo fix * correcting branch id * adding source repo * adding source repo * patch name-resolver response * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * dug pipeline edit * recurisvely find recursively * recurisvely find recursively * setup output path for crawling * all task functions should have input and output params * adding annotation as upstream for validate index * revamp create task , and task wrapper * add validate concepts index task * adding concept validation * add index_variables task as dependecy for validate concepts * add index_variables task as dependecy for validate concepts * await client exist * await client exist * concepts not getting picked up for indexing * concepts not getting picked up for indexing * fix search elements * converting annotation output to json * json format annotation outputs * adding support for json format elements and concepts read * json back to dug objects * fixing index valriables with json objects * indetation and new line for better change detection :? * indetation and new line for better change detection * treat dictionary concepts as dictionary * read concepts json as a dict * concepts files are actually file paths * debug message * make output jsonable * clear up dir after commit , and delete unmerged branch even if no changes * don`t clear indexes, parallel dataset processing will be taxed * memory leak? * memory leak? * memory leak? 
* dumping pickles to debug locally * find out why concepts are being added to every other element * find out why concepts are being added to every other element * pointless shuffle 🤷‍♂️ * revert back in time * back to sanitize dug * output just json for annotation * adding jsonpickle * jsonpickle 🥒 * unpickle for index * unpickle for validate index * crawling fixes * crawling fixes * crawling validation fixes * fix index concepts * fix makekgx * adding other bdc pipelines * adding pipeline paramters to be able to configure per instance * fix * add input dataset for pipelines * Adding README to document how to create data set-specific pipelines * catchup on base.py * Added dbgap and nida pipelines * fix import errors * annotator modules added by passing config val (#90) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 --------- Co-authored-by: YaphetKG * Add heal parsers (#96) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix 
globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Add heal parsers (#97) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Radx pipeline (#99) * point to large download * fix schema path * debug bulk input dir * fix schema read * fix schema read * fix schema read * commenting steup dir for test * adding logs * fix path stuff * add commented stuff back in * testing radx parser * adding parser * skip indexing vars with no id * adding indexes as part of bulk loader paramters * fix id index cli arg * fix local cli * dug latest --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * pin avalon --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * remove jenkins file * bump apache version * revert airflow version --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. 
Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * Updated docker image to 2.10.2 * fix kgx path error * fix kgx path error * Trying out the slim apache images to reduce vulnerability footprint * Building general package update into dockerfile * Trying more rigorous dist-upgrade to try to get rid of vulerabilities that won't quit * dist-upgrade also needs -y flag * Rolling back from slim image * Fixed apt-get syntax error * Reverting develop to 2.7.2 for stability's sake, will revisit after upgrades finish * Update Dockerfile * Update Dockerfile * Removed post-install package cleanup * Update requirements.txt * test merge issue * fix merge issue * modify radx pipeline (#111) * modify radx pipeline * pin to dug release * Heal ingest 2 18 (#112) * rename repos for heal ingest * change lib for bulk loader * change lib for bulk loader * cde?? * cde?? * cde?? * cde?? * cde?? * final push * final push * test node type * Heal ingest 2 18 (#114) * rename repos for heal ingest * change lib for bulk loader * change lib for bulk loader * cde?? * cde?? * cde?? * cde?? * cde?? * final push * final push * add study url * Bitnami (#117) * checkpoint * pushing docker changes * Executor tests 2.10 (#115) * test out bash operator * task name * pushing * revert python executor , with no excutor config * docker file and requriements * everything latest --------- Co-authored-by: waTeim --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> Co-authored-by: Patrick Hachicho Co-authored-by: Patrick hachicho <105758539+pchachicho@users.noreply.github.com> Co-authored-by: waTeim * fix study name in graph (#121) * comment clear index. rolledback during merge --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> Co-authored-by: Patrick Hachicho Co-authored-by: Patrick hachicho <105758539+pchachicho@users.noreply.github.com> Co-authored-by: waTeim * Update requirements.txt with Dug V2.13.10 which have all the new changes from new_bdc_parser branch * fix req --------- Co-authored-by: YSK Co-authored-by: ykale <14827177+yskale@users.noreply.github.com> Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> Co-authored-by: Patrick Hachicho Co-authored-by: Patrick hachicho <105758539+pchachicho@users.noreply.github.com> Co-authored-by: waTeim --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c113288f..8b6ca0b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ git+https://github.com/falkordb/falkordb-bulk-loader.git@v1.0.6 setuptools>=66 pytest PyYAML -git+https://github.com/helxplatform/dug@v2.13.10 +git+https://github.com/helxplatform/dug@2.13.11 orjson==3.9.15 git+https://github.com/helxplatform/kg_utils.git@v0.0.10 git+https://github.com/helxplatform/python-stringcase@1.2.1 From 691753886b72d1185628b1b442cb49c168e047db Mon Sep 17 00:00:00 2001 From: "Vladimir G." 
<117946722+vladimir2217@users.noreply.github.com> Date: Tue, 9 Sep 2025 11:47:55 -0400 Subject: [PATCH 57/63] DUG-529 Added support to process diffs in annotate_and_index (#110) * added params * changed to do_nothing, download diff * pass dag params to setupinputdata * added more logging * dag_test * dag_test * dag_test * dag_test * dag_test * dag_test * Added setup_input_data support for diff * lets try to run it * fix params * fix params * fix params another attempt * reverted do_nothing placeholder and some debug logging --- dags/annotate_and_index.py | 41 +++++++++++++++++++++++++++++- dags/roger/pipelines/base.py | 3 +-- dags/roger/tasks.py | 49 ++++++++++++++++++++++++++---------- 3 files changed, 77 insertions(+), 16 deletions(-) diff --git a/dags/annotate_and_index.py b/dags/annotate_and_index.py index 884cd149..b82c019d 100644 --- a/dags/annotate_and_index.py +++ b/dags/annotate_and_index.py @@ -10,7 +10,8 @@ from airflow.models import DAG from airflow.operators.empty import EmptyOperator -from roger.tasks import default_args, create_pipeline_taskgroup +from airflow.operators.python import PythonOperator +from roger.tasks import default_args, create_pipeline_taskgroup, logger, create_python_task env_enabled_datasets = os.getenv( "ROGER_DUG__INPUTS_DATA__SETS", "topmed,anvil").split(",") @@ -18,11 +19,19 @@ with DAG( dag_id='annotate_and_index', default_args=default_args, + params= + { + "repository_id": None, + "branch_name": None, + "commitid_from": None, + "commitid_to": None + }, schedule_interval=None ) as dag: init = EmptyOperator(task_id="init", dag=dag) finish = EmptyOperator(task_id="finish", dag=dag) + from roger import pipelines from roger.config import config envspec = os.getenv("ROGER_DUG__INPUTS_DATA__SETS","topmed:v2.0") @@ -42,3 +51,33 @@ # . . . 
init >> create_pipeline_taskgroup(dag, pipeline_class, config) >> finish + + + + +with DAG( + dag_id='dag_test', + default_args=default_args, + params= + { + "repository_id": None, + "branch_name": None, + "commitid_from": None, + "commitid_to": None + }, + schedule_interval=None +) as dag: + + init = EmptyOperator(task_id="init", dag=dag) + finish = EmptyOperator(task_id="finish", dag=dag) + + def print_context(ds=None, **kwargs): + print(">>>All kwargs") + print(kwargs) + print(">>>All ds") + print(ds) + + + init >> create_python_task(dag, "get_from_lakefs", print_context) >> finish + + #run_this = PythonOperator(task_id="print_the_context", python_callable=print_context) \ No newline at end of file diff --git a/dags/roger/pipelines/base.py b/dags/roger/pipelines/base.py index e108443e..5c1df4cc 100644 --- a/dags/roger/pipelines/base.py +++ b/dags/roger/pipelines/base.py @@ -840,8 +840,7 @@ def annotate(self, to_string=False, files=None, input_data_path=None, "Annotate files with the appropriate parsers and crawlers" if files is None: files = self.get_objects(input_data_path=input_data_path) - self.annotate_files(parsable_files=files, - output_data_path=output_data_path) + self.annotate_files(parsable_files=files, output_data_path=output_data_path) output_log = self.log_stream.getvalue() if to_string else '' return output_log diff --git a/dags/roger/tasks.py b/dags/roger/tasks.py index 5fe5ff90..2749a523 100755 --- a/dags/roger/tasks.py +++ b/dags/roger/tasks.py @@ -250,6 +250,9 @@ def setup_input_data(context, exec_conf): - put dependency data in input dir - if for some reason data was not found raise an exception """) + logger.info(">>> context") + logger.info(context) + # Serves as a location where files the task will work on are placed. # computed as ROGER_DATA_DIR + /current task instance name_input_dir @@ -263,6 +266,17 @@ def setup_input_data(context, exec_conf): # Download files from lakefs and store them in this new input_path client = init_lakefs_client(config=config) repos = exec_conf['repos'] + dag_params = context["params"] + + if dag_params.get("repository_id"): + logger.info(">>> repository_id supplied. Overriding repo.") + repos=[{ + 'repo': dag_params.get("repository_id"), + 'branch': dag_params.get("branch_name"), + 'commitid_from': dag_params.get("commitid_from"), + 'commitid_to': dag_params.get("commitid_to") + }] + # if no external repo is provided we assume to get the upstream task dataset. if not repos or len(repos) == 0: # merge destination branch @@ -276,7 +290,9 @@ def setup_input_data(context, exec_conf): repos = [{ 'repo': repo, 'branch': branch, - 'path': f'{dag_id}/{upstream_id}' + 'path': f'{dag_id}/{upstream_id}', + 'commitid_from': None, + 'commitid_to': None } for upstream_id in upstream_ids] # input_repo = exec_conf['input_repo'] @@ -287,20 +303,27 @@ def setup_input_data(context, exec_conf): # get all if path is not specified repo['path'] = '*' logger.info(f"repos : {repos}") + + logger.info(">>> start of downloading data") for r in repos: - logger.info("downloading %s from %s@%s to %s", - r['path'], r['repo'], r['branch'], input_dir) # create path to download to ... 
if not os.path.exists(input_dir + f'/{r["repo"]}'): os.mkdir(input_dir + f'/{r["repo"]}') - get_files( - local_path=input_dir + f'/{r["repo"]}', - remote_path=r['path'], - branch=r['branch'], - repo=r['repo'], - changes_only=False, - lake_fs_client=client - ) + + if not dag_params.get("repository_id"): + logger.info("downloading %s from %s@%s to %s", r['path'], r['repo'], r['branch'], input_dir) + get_files( + local_path=input_dir + f'/{r["repo"]}', + remote_path=r['path'], + branch=r['branch'], + repo=r['repo'], + changes_only=r.get("commitid_from") is not None, + changes_from=r.get("commitid_from"), + changes_to=r.get("commitid_to"), + lake_fs_client=client + ) + logger.info(">>> end of downloading data") + def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos = {}, pass_conf=True, no_output_files=False): @@ -349,7 +372,7 @@ def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos = 'path': r.get('path', '*') } for r in external_repos] } - + pre_exec = partial(setup_input_data, exec_conf=pre_exec_conf) # add pre_exec partial function as an argument to python executor conf python_operator_args['pre_execute'] = pre_exec @@ -400,7 +423,7 @@ def create_pipeline_taskgroup( validate_index_variables_task = create_python_task( dag, f"validate_{name}_index_variables", - pipeline.validate_indexed_variables, + pipeline.validate_indexed_variables, pass_conf=False, # declare that this task will not generate files. no_output_files=True From 23ab7a109f76289898cc140b9badf297a1a37acd Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Fri, 19 Sep 2025 11:56:32 -0400 Subject: [PATCH 58/63] Picsure data ingest test (#128) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test * changed the pipeline name * parser name updated * update pipeline name * update the get_object/files * changed the lakefs dataset name for testing * update the pipelines for new programs in BDC * init parser and crawler objects once and retry * fix logger -> log typo * remove clear index (#119) * update with develop (#124) * Sync develop with main (#116) * Update _version.py (#86) * Update _version.py * Rti merge (#84) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' 
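The "init parser and crawler objects once and retry" item above corresponds to the init_annotator helper added to dags/roger/pipelines/base.py earlier in this series, which retries annotator construction with exponential backoff and jitter. A generic, stand-alone sketch of that retry pattern (not the Roger code itself; the factory callable is hypothetical) is:

    import random
    import time

    def retry_with_backoff(factory, max_retries=5, base_delay=1, max_delay=10):
        """Call factory() until it succeeds or max_retries is reached,
        doubling the delay each attempt (capped at max_delay) and adding
        jitter so concurrent workers do not retry in lockstep."""
        for attempt in range(1, max_retries + 1):
            try:
                return factory()
            except Exception as exc:
                if attempt == max_retries:
                    raise
                delay = min(base_delay * (2 ** (attempt - 1)), max_delay)
                delay += random.uniform(0, 1)  # jitter
                print(f"attempt {attempt} failed ({exc}); retrying in {delay:.2f}s")
                time.sleep(delay)

    # Hypothetical usage:
    # annotator = retry_with_backoff(lambda: build_annotator_client())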
* adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. * Update requirements.txt * Update .gitignore * Update dug_utils.py Handle Error when curie not found in validate * Update __init__.py * Update config.yaml * Update dev-config.yaml * Update docker-compose.yaml * fixed docker-compose * adding back postgres volume to docker compose * env correction , docker compose updates --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent * adding v5.0 * cde-links branch * pin linkml * Update config.yaml collection_action to action * pop total items before result * print extracted elements * Update requirements.txt * Keep edge provenance (#94) * Update kgx.py * Update kgx.py * Update kgx.py can't delete edge keys while looping over them. * just collect then update * Update requirements.txt (#93) * Pipeline parameterize restructure (#95) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * Parameterized annotate tasks with input_data_path and output_data_path * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. 
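Referring back to the DUG-529 patch earlier in this series, which adds repository_id, branch_name, commitid_from and commitid_to params to the annotate_and_index DAG: one way to supply those params is through the DAG-run conf, for example via Airflow's stable REST API. The sketch below is illustrative only; the URL, credentials, repository and commit ids are placeholders.

    import requests

    # Trigger annotate_and_index with an explicit lakeFS repo/commit range.
    resp = requests.post(
        "http://localhost:8080/api/v1/dags/annotate_and_index/dagRuns",
        auth=("airflow", "airflow"),            # placeholder credentials
        json={
            "conf": {
                "repository_id": "example-repo",  # placeholder repo
                "branch_name": "main",
                "commitid_from": "abc123",        # placeholder commits
                "commitid_to": "def456",
            }
        },
        timeout=30,
    )
    resp.raise_for_status()
    print(resp.json())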
* Parameterized annotate tasks with input_data_path and output_data_path (#85) * adding lakefs changes to roger-2.0 * point avalon to vg1 branch * change avalon dep * update airflow * fix avalon tag typo * update jenkins to tag version on main branch only * update jenkins to tag version * update jenkins to tag version * psycopg2 installation * add cncf k8s req * use airflow non-slim * simplified for testing * simplified for testing * change dag name * Erroneous parameter passed, should not be None * adding pre-exec * adding pre-exec * adding pre-exec * typo preexec * typo preexec * fix context * get files from repo * get files from repo * get files from repo * get files from repo * First shot at moving pipeline into base class and implementing. Anvil pipeline not complete * Syntax fix, docker image version bump to airflow 2.7.2-python3.11 * update storage dir * update remove dir code * update remove dir code * remote path to * * fix input dir for annotators * fix input dir for annotators * fix input dir for annotators * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * adding branch info on lakefs config * callback push to branch * back to relative import * reformat temp branch name based on unique task id * add logging * add logging * convert posix path to str for avalon * add extra / to root path * New dag created using DugPipeline subclasses * EmptyOperator imported from wrong place * import and syntax fixes * utterly silly syntax error * Added anvil to default input data sets for testing purposes * adding / to local path * commit meta task args empty string * add merge logic * add merge logic * upstream task dir pull for downstream task * Switched from subdag to taskgroup because latest Airflow depricated subdag * Added BACPAC pipeline object * Temporarily ignoring configuration variable for enabled datasets for testing * Passed dag in to create task group to see if it helps dag errors * Fixed silly syntax error * adding input / output dir params for make kgx * Trying different syntax to make taskgroups work. * adding input / output dir params for make kgx * Parsing, syntax, pylint fixes * adding input / output dir params for make kgx * Added pipeline name to task group name to ensure uniqueness * oops, moved something out of scope. Fixed * Filled out pipeline with methods from dug_utils. 
Needs data path changes * Finished implementing input_data_path and output_data_path handling, pylint cleanup * Update requirements.txt * adding toggle to avoid sending config obj * adding toggle to avoid sending config obj * disable to string for test * control pipelines for testing * add self to anvil get files * add log stream to make it available * typo fix * correcting branch id * adding source repo * adding source repo * patch name-resolver response * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * dug pipeline edit * recurisvely find recursively * recurisvely find recursively * setup output path for crawling * all task functions should have input and output params * adding annotation as upstream for validate index * revamp create task , and task wrapper * add validate concepts index task * adding concept validation * add index_variables task as dependecy for validate concepts * add index_variables task as dependecy for validate concepts * await client exist * await client exist * concepts not getting picked up for indexing * concepts not getting picked up for indexing * fix search elements * converting annotation output to json * json format annotation outputs * adding support for json format elements and concepts read * json back to dug objects * fixing index valriables with json objects * indetation and new line for better change detection :? * indetation and new line for better change detection * treat dictionary concepts as dictionary * read concepts json as a dict * concepts files are actually file paths * debug message * make output jsonable * clear up dir after commit , and delete unmerged branch even if no changes * don`t clear indexes, parallel dataset processing will be taxed * memory leak? * memory leak? * memory leak? 
* dumping pickles to debug locally * find out why concepts are being added to every other element * find out why concepts are being added to every other element * pointless shuffle 🤷‍♂️ * revert back in time * back to sanitize dug * output just json for annotation * adding jsonpickle * jsonpickle 🥒 * unpickle for index * unpickle for validate index * crawling fixes * crawling fixes * crawling validation fixes * fix index concepts * fix makekgx * adding other bdc pipelines * adding pipeline paramters to be able to configure per instance * fix * add input dataset for pipelines * Adding README to document how to create data set-specific pipelines * catchup on base.py * Added dbgap and nida pipelines * fix import errors * annotator modules added by passing config val (#90) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 --------- Co-authored-by: YaphetKG * Add heal parsers (#96) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix 
globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Add heal parsers (#97) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Radx pipeline (#99) * point to large download * fix schema path * debug bulk input dir * fix schema read * fix schema read * fix schema read * commenting steup dir for test * adding logs * fix path stuff * add commented stuff back in * testing radx parser * adding parser * skip indexing vars with no id * adding indexes as part of bulk loader paramters * fix id index cli arg * fix local cli * dug latest --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Michael T. 
Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * pin avalon * deleted jenkins and added workflows * unlinked helx-actions * testing paths * testing again * d * tests * commented out pytest * try again for bandit * commented out bandit * changed dag to dags * Added fixes * Bagel (#103) * bump dug version * adding bdc new pipelines * adding curesc * adding bagel config * add test parser * add score threshold * point to dug develop * Dbgap programs (#104) * bump dug version * adding bdc new pipelines * adding curesc * fix up merge conflict * adding bagel config parse to bool * Dev sync main (#106) * release sync (#100) * Update _version.py (#86) * Update _version.py * Rti merge (#84) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. * Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. * Update requirements.txt * Update .gitignore * Update dug_utils.py Handle Error when curie not found in validate * Update __init__.py * Update config.yaml * Update dev-config.yaml * Update docker-compose.yaml * fixed docker-compose * adding back postgres volume to docker compose * env correction , docker compose updates --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent * adding v5.0 * cde-links branch * pin linkml * Update config.yaml collection_action to action * pop total items before result * print extracted elements * Update requirements.txt * Keep edge provenance (#94) * Update kgx.py * Update kgx.py * Update kgx.py can't delete edge keys while looping over them. * just collect then update * Update requirements.txt (#93) * Pipeline parameterize restructure (#95) * roger cli preped for Merge Deploy * Update Makefile to work with python env * Update redisgraph-bulk-loader to fix issue with loading MODULE LIST * Revert "Update redisgraph-bulk-loader to fix issue with loading MODULE LIST" This reverts commit 7baf7efa725caac77e5501e948f545a0f4b20e3d. 
* Finalized dev deployment of dug inside Catapult Merge, deployment yamls, code changes and configurations * updated to reflect the Dug-Api updates to FastAPI * adding multi label redis by removing 'biolink:' on nodes, edges cannot be fixed after update so they need to be solved either by changing TranQl AND Plater or forking bulk-redisgraph to allow for colons to be added in the edges * Working multi label redis nodes w/ no biolink label * Latest code changes to deploy working Roger in Merge * biolink data move to '.' separator * updates to include new dug fixes, upgraded redis-bulk-loader and made changes to for biolink variables to specify it's domain with a 'biolink.' * adding test roger code * removed helm deployments * change docker owner * remove core.py * remove dup dev config * redis graph is not directly used removing cruft * remove print statement * remove logging files * update requriemtns * update requriemtns * add redis graph.py * fix import error for logger * adding es scheme and ca_path config * adding es scheme and ca_path config * Parameterized annotate tasks with input_data_path and output_data_path * adding debug code * removing debug * adding nodes args * adding biolink. * adding biolink. * Parameterized annotate tasks with input_data_path and output_data_path (#85) * adding lakefs changes to roger-2.0 * point avalon to vg1 branch * change avalon dep * update airflow * fix avalon tag typo * update jenkins to tag version on main branch only * update jenkins to tag version * update jenkins to tag version * psycopg2 installation * add cncf k8s req * use airflow non-slim * simplified for testing * simplified for testing * change dag name * Erroneous parameter passed, should not be None * adding pre-exec * adding pre-exec * adding pre-exec * typo preexec * typo preexec * fix context * get files from repo * get files from repo * get files from repo * get files from repo * First shot at moving pipeline into base class and implementing. Anvil pipeline not complete * Syntax fix, docker image version bump to airflow 2.7.2-python3.11 * update storage dir * update remove dir code * update remove dir code * remote path to * * fix input dir for annotators * fix input dir for annotators * fix input dir for annotators * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * kwargs to task * adding branch info on lakefs config * callback push to branch * back to relative import * reformat temp branch name based on unique task id * add logging * add logging * convert posix path to str for avalon * add extra / to root path * New dag created using DugPipeline subclasses * EmptyOperator imported from wrong place * import and syntax fixes * utterly silly syntax error * Added anvil to default input data sets for testing purposes * adding / to local path * commit meta task args empty string * add merge logic * add merge logic * upstream task dir pull for downstream task * Switched from subdag to taskgroup because latest Airflow depricated subdag * Added BACPAC pipeline object * Temporarily ignoring configuration variable for enabled datasets for testing * Passed dag in to create task group to see if it helps dag errors * Fixed silly syntax error * adding input / output dir params for make kgx * Trying different syntax to make taskgroups work. 
* adding input / output dir params for make kgx * Parsing, syntax, pylint fixes * adding input / output dir params for make kgx * Added pipeline name to task group name to ensure uniqueness * oops, moved something out of scope. Fixed * Filled out pipeline with methods from dug_utils. Needs data path changes * Finished implementing input_data_path and output_data_path handling, pylint cleanup * Update requirements.txt * adding toggle to avoid sending config obj * adding toggle to avoid sending config obj * disable to string for test * control pipelines for testing * add self to anvil get files * add log stream to make it available * typo fix * correcting branch id * adding source repo * adding source repo * patch name-resolver response * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * no pass input repo and branch , if not overriden to pre-exec * dug pipeline edit * recurisvely find recursively * recurisvely find recursively * setup output path for crawling * all task functions should have input and output params * adding annotation as upstream for validate index * revamp create task , and task wrapper * add validate concepts index task * adding concept validation * add index_variables task as dependecy for validate concepts * add index_variables task as dependecy for validate concepts * await client exist * await client exist * concepts not getting picked up for indexing * concepts not getting picked up for indexing * fix search elements * converting annotation output to json * json format annotation outputs * adding support for json format elements and concepts read * json back to dug objects * fixing index valriables with json objects * indetation and new line for better change detection :? * indetation and new line for better change detection * treat dictionary concepts as dictionary * read concepts json as a dict * concepts files are actually file paths * debug message * make output jsonable * clear up dir after commit , and delete unmerged branch even if no changes * don`t clear indexes, parallel dataset processing will be taxed * memory leak? * memory leak? * memory leak? 
* dumping pickles to debug locally * find out why concepts are being added to every other element * find out why concepts are being added to every other element * pointless shuffle 🤷‍♂️ * revert back in time * back to sanitize dug * output just json for annotation * adding jsonpickle * jsonpickle 🥒 * unpickle for index * unpickle for validate index * crawling fixes * crawling fixes * crawling validation fixes * fix index concepts * fix makekgx * adding other bdc pipelines * adding pipeline paramters to be able to configure per instance * fix * add input dataset for pipelines * Adding README to document how to create data set-specific pipelines * catchup on base.py * Added dbgap and nida pipelines * fix import errors * annotator modules added by passing config val (#90) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 --------- Co-authored-by: YaphetKG * Add heal parsers (#96) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix 
globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Add heal parsers (#97) * annotator modules added by passing config val * fix merge conflict * following same pattern as parsers , modify configs * fix to dug config method * fix old dug pipeline for backward compatiblity * correct default annotator type * reflective changes * typo extra quotes * annotator type not being picked up from config * remove annotate simple , log env value for lakefs enabled * testing lakefs off * add more logging * add more logging * post init for config to parse to boolean * put back task calls * revert some changes * adding new pipeline * lakefs io support for merge task * fix name * add io params for kg tasks * wire up i/o paths for merge * fix variable name * print files * few debug logs * few debug logs * treat path as path not str * few debug logs * some fixes * logging edge files * bug fix knowledge has edge * re-org graph structure * adding pathing for other tasks * pagenation logic fix for avalon * update lakefs client code * fix glob for get kgx files * fix up get merged objects * send down fake commit id for metadata * working on edges schema * bulk create nodes I/O * find schema file * bulk create edges I/O * bulk create edges I/O * bulk load io * no outputs for final tasks * add recursive glob * fix globbing * oops * delete dags * pin dug to latest release * cruft cleanup * re-org kgx config * add support for multiple initial repos * fix comma * create dir to download to * swap branch and repo * clean up dirs * fix up other pipeline 👌 * add remaining pipelines * adding ctn parser * change merge strategy * merge init fix * debug dir * fix topmed file read * fix topmed file read * return file names as strings * topmed kgx builder custom * topmed kgx builder custom * add skip * get files pattern recursive * version pin avalon * pin dug --------- Co-authored-by: braswent * Radx pipeline (#99) * point to large download * fix schema path * debug bulk input dir * fix schema read * fix schema read * fix schema read * commenting steup dir for test * adding logs * fix path stuff * add commented stuff back in * testing radx parser * adding parser * skip indexing vars with no id * adding indexes as part of bulk loader paramters * fix id index cli arg * fix local cli * dug latest --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * pin avalon --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * remove jenkins file * bump apache version * revert airflow version --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. 
Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> * Updated docker image to 2.10.2 * fix kgx path error * fix kgx path error * Trying out the slim apache images to reduce vulnerability footprint * Building general package update into dockerfile * Trying more rigorous dist-upgrade to try to get rid of vulerabilities that won't quit * dist-upgrade also needs -y flag * Rolling back from slim image * Fixed apt-get syntax error * Reverting develop to 2.7.2 for stability's sake, will revisit after upgrades finish * Update Dockerfile * Update Dockerfile * Removed post-install package cleanup * Update requirements.txt * test merge issue * fix merge issue * modify radx pipeline (#111) * modify radx pipeline * pin to dug release * Heal ingest 2 18 (#112) * rename repos for heal ingest * change lib for bulk loader * change lib for bulk loader * cde?? * cde?? * cde?? * cde?? * cde?? * final push * final push * test node type * Heal ingest 2 18 (#114) * rename repos for heal ingest * change lib for bulk loader * change lib for bulk loader * cde?? * cde?? * cde?? * cde?? * cde?? * final push * final push * add study url * Bitnami (#117) * checkpoint * pushing docker changes * Executor tests 2.10 (#115) * test out bash operator * task name * pushing * revert python executor , with no excutor config * docker file and requriements * everything latest --------- Co-authored-by: waTeim --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> Co-authored-by: Patrick Hachicho Co-authored-by: Patrick hachicho <105758539+pchachicho@users.noreply.github.com> Co-authored-by: waTeim * fix study name in graph (#121) * comment clear index. rolledback during merge --------- Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> Co-authored-by: Patrick Hachicho Co-authored-by: Patrick hachicho <105758539+pchachicho@users.noreply.github.com> Co-authored-by: waTeim * Update requirements.txt with Dug V2.13.10 which have all the new changes from new_bdc_parser branch * fix req --------- Co-authored-by: YSK Co-authored-by: ykale <14827177+yskale@users.noreply.github.com> Co-authored-by: Nathan Braswell Co-authored-by: esurface Co-authored-by: braswent Co-authored-by: Howard Lander Co-authored-by: Michael T. 
Bacon Co-authored-by: Michael T Bacon <110547969+mbacon-renci@users.noreply.github.com> Co-authored-by: Patrick Hachicho Co-authored-by: Patrick hachicho <105758539+pchachicho@users.noreply.github.com> Co-authored-by: waTeim From faccf8641e190ee6145e7d4fdd4d1c39da9beb3f Mon Sep 17 00:00:00 2001 From: Jeff Waller Date: Mon, 3 Nov 2025 16:11:02 -0500 Subject: [PATCH 59/63] Dug 547 (#129) * Add landing zone for 3rd party specializations; add falkordb dockerfile * pinning to a version --------- Co-authored-by: YaphetKG --- ext/containers/FalkorDB/Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 ext/containers/FalkorDB/Dockerfile diff --git a/ext/containers/FalkorDB/Dockerfile b/ext/containers/FalkorDB/Dockerfile new file mode 100644 index 00000000..e929a199 --- /dev/null +++ b/ext/containers/FalkorDB/Dockerfile @@ -0,0 +1,5 @@ +FROM falkordb/falkordb-server:v4.14.4-alpine +# Remove Go if present (apk-installed or tarball) +RUN apk del --no-network go 2>/dev/null || true \ + && rm -rf /usr/local/go /usr/lib/go /root/go /go 2>/dev/null || true +RUN apk add bash From c6ab5e07d74d531a2530971cf540ddb14ceb24be Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Wed, 12 Nov 2025 08:52:09 -0500 Subject: [PATCH 60/63] fix recover (#131) Co-authored-by: YSK --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8b6ca0b2..63275110 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ git+https://github.com/falkordb/falkordb-bulk-loader.git@v1.0.6 setuptools>=66 pytest PyYAML -git+https://github.com/helxplatform/dug@2.13.11 +git+https://github.com/helxplatform/dug@develop orjson==3.9.15 git+https://github.com/helxplatform/kg_utils.git@v0.0.10 git+https://github.com/helxplatform/python-stringcase@1.2.1 From 086ef6abfa62a741aa927df4c0c64f5188ab2076 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Fri, 14 Nov 2025 15:35:06 -0500 Subject: [PATCH 61/63] Airflow 3.1.1 (#130) * airflow-3.1.1 changes * fix * last docker file changes * patch libs * strip newline from node descriptions * add elastic container and redis containers to make dags load * fix workflow --- .env | 31 ++- .github/workflows/trivy-pr-scan.yml | 2 +- Dockerfile | 103 +++++++--- dags/annotate_and_index.py | 9 +- dags/knowledge_graph_build.py | 2 +- dags/roger/config/__init__.py | 8 +- dags/roger/core/bulkload.py | 3 + dags/roger/tasks.py | 209 ++++++++------------ docker-compose.yaml | 286 ++++++++++++---------------- requirements.txt | 12 +- 10 files changed, 314 insertions(+), 351 deletions(-) diff --git a/.env b/.env index 2b42e8d7..28994cc6 100644 --- a/.env +++ b/.env @@ -1,23 +1,14 @@ -AIRFLOW_UID=502 -AIRFLOW_GID=0 -API_WORKERS=4 -API_PORT=5551 -API_TIMEOUT=10 +KGX_DATA_SETS="bdc-studies-kgx:v1.0" +INPUT_DATA_SETS="heal-mds-studies:v1.0" -DATA_DIR=./local_storage +LAKEFS_ACCESS_KEY="" +LAKEFS_SECRET_KEY="" +LAKEFS_REPO="" +LAKEFS_BRANCH="" +LAKEFS_URL="https://lakefs.apps.renci.org" -DUG_LOG_LEVEL=INFO - -ELASTICSEARCH_PASSWORD=12345 -ELASTICSEARCH_HOST=elasticsearch -ELASTICSEARCH_USERNAME=elastic - -NBOOST_API_HOST=nboost - -REDIS_PASSWORD=weak -REDIS_HOST=merge-redis-master -REDIS_PORT=6379 -TRANQL_ACCESS_LOG=access.log -TRANQL_ERROR_LOG=error.log -ROGER_DUG__INPUTS_DATA__SETS=topmed:v1.0 \ No newline at end of file +BIOMEGATRON_URL="https://med-nemo.apps.renci.org/annotate" +SAPBERT_URL="https://sap-qdrant.apps.renci.org/annotate" 
+NODE_NORM_URL="https://nodenormalization-sri.renci.org/get_normalized_nodes?conflate=false&description=true&curie=" +NAME_RES_URL="https://name-resolution-sri.renci.org/reverse_lookup" \ No newline at end of file diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml index 1e7bc060..b8935164 100644 --- a/.github/workflows/trivy-pr-scan.yml +++ b/.github/workflows/trivy-pr-scan.yml @@ -61,7 +61,7 @@ jobs: # We still fail the job if results are found, so below will always run # unless manually canceled. - name: Upload Trivy scan results to GitHub Security tab - uses: github/codeql-action/upload-sarif@v2 + uses: github/codeql-action/upload-sarif@v3 if: '!cancelled()' with: sarif_file: 'trivy-results.sarif' \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 47d2c13f..7323534b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,33 +1,74 @@ -FROM bitnami/airflow:2.10.5-debian-12-r7 - -USER root -RUN apt-get update && apt-get install -y git nano vim gcc rustc cargo -#RUN useradd -u 1001 -ms /bin/bash airflow && chown -R airflow /home/airflow -COPY requirements.txt requirements.txt -RUN source /opt/bitnami/airflow/venv/bin/activate && CARGO_HOME=/tmp/.cargo && \ - pip install setuptools wheel && \ - pip install -r requirements.txt - -RUN rm -f requirements.txt - -## Vul patches -## Python lib patches on airflow python env -RUN source /opt/bitnami/airflow/venv/bin/activate pip install --upgrade \ - flask-appbuilder==4.5.3 \ - cryptography==44.0.1 \ - werkzeug==3.0.6 \ - urllib3==2.2.2 -RUN source /opt/bitnami/airflow/venv/bin/activate pip uninstall -y \ - apache-airflow-providers-mysql==6.2.0 - -# Uninstall these from non airflow python env -RUN pip install --upgrade \ - flask-appbuilder==4.5.3 \ - cryptography==44.0.1 \ - werkzeug==3.0.6 \ - urllib3==2.2.2 -RUN apt-get autoremove -y vim -RUN apt-get autoremove -y binutils -RUN apt-get autoremove -y linux-libc-dev +# Use a Debian-based image for better compatibility +FROM python:3.11.14-slim-trixie +# Set Airflow version and home directory +ARG AIRFLOW_VERSION=3.1.1 +ARG AIRFLOW_HOME=/opt/airflow + +# Environment variables +ENV AIRFLOW_HOME=${AIRFLOW_HOME} +ENV AIRFLOW__CORE__LOAD_EXAMPLES=False +ENV AIRFLOW__CORE__EXECUTOR=LocalExecutor +ENV AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres:5432/airflow +ENV PYTHONUNBUFFERED=1 + +# Create airflow user and directories +RUN useradd --uid 50000 --home-dir ${AIRFLOW_HOME} --create-home airflow && \ + mkdir -p ${AIRFLOW_HOME}/dags ${AIRFLOW_HOME}/logs ${AIRFLOW_HOME}/plugins ${AIRFLOW_HOME}/config + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + libpq-dev \ + libffi-dev \ + libssl-dev \ + curl \ + tini \ + tzdata \ + git \ + && rm -rf /var/lib/apt/lists/* + +# Upgrade pip tools +RUN pip install --no-cache-dir --upgrade pip setuptools wheel + +# Install Airflow (with PostgreSQL, Celery, Redis support) +RUN pip install --no-cache-dir \ + "apache-airflow[postgres,celery,redis,fab]==${AIRFLOW_VERSION}" \ + "apache-airflow-providers-cncf-kubernetes" \ + --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-3.11.txt" + +# Optional: install extra packages +RUN pip install --no-cache-dir psycopg2-binary redis + +COPY ./requirements.txt /tmp/requirements.txt + +RUN pip install -r /tmp/requirements.txt + +RUN rm /tmp/requirements.txt + + + +RUN apt-get purge -y --auto-remove \ + build-essential \ + libpq-dev 
\ + libffi-dev \ + libssl-dev \ + curl \ + git && \ + apt-get clean + +# Set ownership +RUN chown -R airflow:airflow ${AIRFLOW_HOME} + +# Switch to airflow user USER airflow +WORKDIR ${AIRFLOW_HOME} + +# Expose Airflow webserver port +EXPOSE 8080 + +# Use tini for signal handling +ENTRYPOINT ["/usr/bin/tini", "--"] + +# Default command +CMD ["airflow", "webserver"] diff --git a/dags/annotate_and_index.py b/dags/annotate_and_index.py index b82c019d..e4cdfd98 100644 --- a/dags/annotate_and_index.py +++ b/dags/annotate_and_index.py @@ -26,7 +26,7 @@ "commitid_from": None, "commitid_to": None }, - schedule_interval=None + # schedule_interval=None ) as dag: init = EmptyOperator(task_id="init", dag=dag) finish = EmptyOperator(task_id="finish", dag=dag) @@ -65,7 +65,7 @@ "commitid_from": None, "commitid_to": None }, - schedule_interval=None + # schedule_interval=None ) as dag: init = EmptyOperator(task_id="init", dag=dag) @@ -80,4 +80,7 @@ def print_context(ds=None, **kwargs): init >> create_python_task(dag, "get_from_lakefs", print_context) >> finish - #run_this = PythonOperator(task_id="print_the_context", python_callable=print_context) \ No newline at end of file + #run_this = PythonOperator(task_id="print_the_context", python_callable=print_context) + +if __name__ == "__main__": + dag.test() \ No newline at end of file diff --git a/dags/knowledge_graph_build.py b/dags/knowledge_graph_build.py index de28aa78..d94d2262 100644 --- a/dags/knowledge_graph_build.py +++ b/dags/knowledge_graph_build.py @@ -15,7 +15,7 @@ with DAG( dag_id='knowledge_graph_build', default_args=default_args, - schedule_interval=None + # schedule_interval=None ) as dag: """ Build the workflow tasks. """ diff --git a/dags/roger/config/__init__.py b/dags/roger/config/__init__.py index 71111f39..cc017fca 100644 --- a/dags/roger/config/__init__.py +++ b/dags/roger/config/__init__.py @@ -3,7 +3,7 @@ import warnings from dataclasses import dataclass, field from pathlib import Path -from typing import Dict, Optional, List +from typing import Dict, Optional, List, Union import yaml from dug.config import Config as DugConfig @@ -33,7 +33,7 @@ class LakefsConfig(DictLike): secret_access_key: str branch: str repo: str - enabled: bool = False + enabled: Union[bool, str] = False def __post_init__(self): if isinstance(self.enabled, str): @@ -135,8 +135,8 @@ class AnnotationConfig(DictLike): ]) def __post_init__(self): - self.annotator_args["sapbert"]["bagel"]["enabled"] = self.annotator_args["sapbert"]["bagel"][ - "enabled"].lower() == "true" + self.annotator_args["sapbert"]["bagel"]["enabled"] = str(self.annotator_args["sapbert"]["bagel"][ + "enabled"]).lower() == "true" @dataclass diff --git a/dags/roger/core/bulkload.py b/dags/roger/core/bulkload.py index a8ddbc15..17bd179f 100644 --- a/dags/roger/core/bulkload.py +++ b/dags/roger/core/bulkload.py @@ -53,6 +53,9 @@ def create_nodes_csv_file(self, input_data_path=None, output_data_path=None): merged_nodes_file = storage.merged_objects('nodes', input_data_path) counter = 1 for node in storage.json_line_iter(merged_nodes_file): + if node.get('description'): + node['description'] = node['description'].replace('\n', + ' ') if not node.get('category'): category_error_nodes.add(node['id']) node['category'] = [BiolinkModel.root_type] diff --git a/dags/roger/tasks.py b/dags/roger/tasks.py index 2749a523..cd828383 100755 --- a/dags/roger/tasks.py +++ b/dags/roger/tasks.py @@ -1,20 +1,21 @@ -"Tasks and methods related to Airflow implementations of Roger" +# Tasks and methods related to Airflow 
implementations of Roger import os - -from airflow.operators.python import PythonOperator -from airflow.operators.empty import EmptyOperator -from airflow.utils.task_group import TaskGroup -from airflow.utils.dates import days_ago -from airflow.models import DAG -from airflow.models.dag import DagContext -from airflow.models.taskinstance import TaskInstance -from airflow.operators.bash import BashOperator +from datetime import datetime +from functools import partial from typing import Union from pathlib import Path import glob import shutil +# Airflow 3.x - prefer provider imports and new public types +from airflow.providers.standard.operators.python import PythonOperator +from airflow.operators.empty import EmptyOperator +from airflow.utils.task_group import TaskGroup +from airflow.models import DAG +from airflow.models.taskinstance import TaskInstance +from airflow.providers.standard.operators.bash import BashOperator +from airflow.utils.context import Context # type: ignore from roger.config import config, RogerConfig from roger.logger import get_logger @@ -22,13 +23,12 @@ from avalon.mainoperations import put_files, LakeFsWrapper, get_files from lakefs_sdk.configuration import Configuration from lakefs_sdk.models.merge import Merge -from functools import partial logger = get_logger() default_args = { 'owner': 'RENCI', - 'start_date': days_ago(1) + 'start_date': datetime(2025, 1, 1) } @@ -44,13 +44,17 @@ def task_wrapper(python_callable, **kwargs): pass_conf = kwargs.get('pass_conf', True) if config.lakefs_config.enabled: # get input path - input_data_path = generate_dir_name_from_task_instance(kwargs['ti'], - roger_config=config, - suffix='input') + input_data_path = generate_dir_name_from_task_instance( + kwargs['ti'], + roger_config=config, + suffix='input' + ) # get output path from task id run id dag id combo - output_data_path = generate_dir_name_from_task_instance(kwargs['ti'], - roger_config=config, - suffix='output') + output_data_path = generate_dir_name_from_task_instance( + kwargs['ti'], + roger_config=config, + suffix='output' + ) else: input_data_path, output_data_path = None, None # cast it to a path object @@ -66,6 +70,7 @@ def task_wrapper(python_callable, **kwargs): return python_callable(config=config, **func_args) return python_callable(**func_args) + def get_executor_config(data_path='/opt/airflow/share/data'): """ Get an executor configuration. :param annotations: Annotations to attach to the executor. @@ -73,12 +78,11 @@ def get_executor_config(data_path='/opt/airflow/share/data'): """ env_var_prefix = config.OS_VAR_PREFIX # based on environment set on scheduler pod, make secrets for worker pod - # this ensures passwords don't leak as pod templates. 
secrets_map = [{ "secret_name_ref": "ELASTIC_SEARCH_PASSWORD_SECRET", "secret_key_ref": "ELASTIC_SEARCH_PASSWORD_SECRET_KEY", "env_var_name": f"{env_var_prefix}ELASTIC__SEARCH_PASSWORD" - },{ + }, { "secret_name_ref": "REDIS_PASSWORD_SECRET", "secret_key_ref": "REDIS_PASSWORD_SECRET_KEY", "env_var_name": f"{env_var_prefix}REDISGRAPH_PASSWORD" @@ -92,8 +96,8 @@ def get_executor_config(data_path='/opt/airflow/share/data'): "name": secret["env_var_name"], "valueFrom": { "secretKeyRef": { - "name": secret_name, - "key": secret_key_name + "name": secret_name, + "key": secret_key_name } }}) @@ -104,6 +108,7 @@ def get_executor_config(data_path='/opt/airflow/share/data'): } return k8s_executor_config + def init_lakefs_client(config: RogerConfig) -> LakeFsWrapper: configuration = Configuration() configuration.username = config.lakefs_config.access_key_id @@ -123,47 +128,28 @@ def pagination_helper(page_fetcher, **kwargs): kwargs['after'] = resp.pagination.next_offset -def avalon_commit_callback(context: DagContext, **kwargs): - client: LakeFsWrapper = init_lakefs_client(config=config) +def avalon_commit_callback(context: Context, **kwargs): + client: LakeFsWrapper = init_lakefs_client(config=config) # now files have been processed, # this part should # get the out path of the task - local_path = str(generate_dir_name_from_task_instance(context['ti'], - roger_config=config, - suffix='output')).rstrip('/') + '/' + local_path = str(generate_dir_name_from_task_instance( + context['ti'], + roger_config=config, + suffix='output')).rstrip('/') + '/' task_id = context['ti'].task_id dag_id = context['ti'].dag_id run_id = context['ti'].run_id - # run id looks like 2023-10-18T17:35:14.890186+00:00 - # normalized to 2023_10_18T17_35_14_890186_00_00 - # since lakefs branch id must consist of letters, digits, underscores and dashes, - # and cannot start with a dash - run_id_normalized = run_id.replace('-','_').replace(':','_').replace('+','_').replace('.','_') - dag_id_normalized = dag_id.replace('-','_').replace(':','_').replace('+','_').replace('.','_') - task_id_normalized = task_id.replace('-','_').replace(':','_').replace('+','_').replace('.','_') + # normalize run/dag/task ids for branch name + run_id_normalized = run_id.replace('-', '_').replace(':', '_').replace('+', '_').replace('.', '_') + dag_id_normalized = dag_id.replace('-', '_').replace(':', '_').replace('+', '_').replace('.', '_') + task_id_normalized = task_id.replace('-', '_').replace(':', '_').replace('+', '_').replace('.', '_') temp_branch_name = f'{dag_id_normalized}_{task_id_normalized}_{run_id_normalized}' - # remote path to upload the files to. remote_path = f'{dag_id}/{task_id}/' - # merge destination branch branch = config.lakefs_config.branch repo = config.lakefs_config.repo - # This part pushes to a temp branch on the repo - # now we have the output path lets do some pushing but where ? - # right now lets stick to using one repo , - - # issue Vladmir pointed out if uploads to a single lakefs branch have not - # been finalized with commit, - # this would cause dirty commits if parallel tasks target the same branch. - - # solution: Lakefs team suggested we commit to a different temp branch per - # task, and merge that branch. - # this callback function will do that for now. - - # 1. put files into a temp branch. - # 2. make sure a commit happens. - # 3. merge that branch to master branch. 
logger.info("Pushing local path %s to %s@%s in %s dir", local_path, repo, temp_branch_name, remote_path) put_files( @@ -182,27 +168,22 @@ def avalon_commit_callback(context: DagContext, **kwargs): source_branch_name=branch ) - # see what changes are going to be pushed from this branch to main branch for diff in pagination_helper(client._client.refs_api.diff_refs, repository=repo, left_ref=branch, right_ref=temp_branch_name): logger.info("Diff: " + str(diff)) - + try: - # merging temp branch to working branch - # the current working branch wins incase of conflicts merge = Merge(**{"strategy": "source-wins"}) client._client.refs_api.merge_into_branch(repository=repo, - source_ref=temp_branch_name, - destination_branch=branch, - merge=merge - ) + source_ref=temp_branch_name, + destination_branch=branch, + merge=merge + ) logger.info(f"merged branch {temp_branch_name} into {branch}") except Exception as e: - # remove temp logger.error(e) - # delete temp branch finally: client._client.branches_api.delete_branch( repository=repo, @@ -211,38 +192,43 @@ def avalon_commit_callback(context: DagContext, **kwargs): logger.info(f"deleted temp branch {temp_branch_name}") logger.info(f"deleting local dir {local_path}") - files_to_clean = glob.glob(local_path + '**', recursive=True) + [local_path] + # cleanup local dirs clean_up(context, **kwargs) -def clean_up(context: DagContext, **kwargs): - input_dir = str(generate_dir_name_from_task_instance(context['ti'], - roger_config=config, - suffix='output')).rstrip('/') + '/' - output_dir = str(generate_dir_name_from_task_instance(context['ti'], - roger_config=config, - suffix='input')).rstrip('/') + '/' + +def clean_up(context: Context, **kwargs): + input_dir = str(generate_dir_name_from_task_instance( + context['ti'], + roger_config=config, + suffix='output')).rstrip('/') + '/' + output_dir = str(generate_dir_name_from_task_instance( + context['ti'], + roger_config=config, + suffix='input')).rstrip('/') + '/' files_to_clean = glob.glob(input_dir + '**', recursive=True) + [input_dir] files_to_clean += glob.glob(output_dir + '**', recursive=True) + [output_dir] for f in files_to_clean: if os.path.exists(f): shutil.rmtree(f) + def generate_dir_name_from_task_instance(task_instance: TaskInstance, - roger_config: RogerConfig, suffix:str): + roger_config: RogerConfig, suffix: str): # if lakefs is not enabled just return none so methods default to using # local dir structure. if not roger_config.lakefs_config.enabled: return None - root_data_dir = os.getenv("ROGER_DATA_DIR").rstrip('/') + root_data_dir = os.getenv("ROGER_DATA_DIR").rstrip('/') task_id = task_instance.task_id dag_id = task_instance.dag_id run_id = task_instance.run_id - try_number = task_instance._try_number + try_number = task_instance.try_number return Path( f"{root_data_dir}/{dag_id}_{task_id}_{run_id}_{try_number}_{suffix}") -def setup_input_data(context, exec_conf): + +def setup_input_data(context: Context, exec_conf): logger.info(""" - Figures out the task name and id, - find its data dependencies @@ -253,40 +239,29 @@ def setup_input_data(context, exec_conf): logger.info(">>> context") logger.info(context) - # Serves as a location where files the task will work on are placed. - # computed as ROGER_DATA_DIR + /current task instance name_input_dir - input_dir = str(generate_dir_name_from_task_instance( context['ti'], roger_config=config, suffix="input")) - # Clear up files from previous run etc... 
- - # create input dir os.makedirs(input_dir, exist_ok=True) - # Download files from lakefs and store them in this new input_path client = init_lakefs_client(config=config) - repos = exec_conf['repos'] - dag_params = context["params"] + repos = exec_conf.get('repos', []) + dag_params = context.get("params", {}) if dag_params.get("repository_id"): logger.info(">>> repository_id supplied. Overriding repo.") - repos=[{ + repos = [{ 'repo': dag_params.get("repository_id"), 'branch': dag_params.get("branch_name"), 'commitid_from': dag_params.get("commitid_from"), 'commitid_to': dag_params.get("commitid_to") }] - # if no external repo is provided we assume to get the upstream task dataset. if not repos or len(repos) == 0: - # merge destination branch branch = config.lakefs_config.branch repo = config.lakefs_config.repo task_instance: TaskInstance = context['ti'] - # get upstream ids upstream_ids = task_instance.task.upstream_task_ids dag_id = task_instance.dag_id - # calculate remote dirs using dag_id + upstreams repos = [{ 'repo': repo, 'branch': branch, @@ -295,18 +270,13 @@ def setup_input_data(context, exec_conf): 'commitid_to': None } for upstream_id in upstream_ids] - # input_repo = exec_conf['input_repo'] - # input_branch = exec_conf['input_branch'] - # If input repo is provided use that as source of files - for repo in repos: - if not repo.get('path'): - # get all if path is not specified - repo['path'] = '*' + for r in repos: + if not r.get('path'): + r['path'] = '*' logger.info(f"repos : {repos}") logger.info(">>> start of downloading data") for r in repos: - # create path to download to ... if not os.path.exists(input_dir + f'/{r["repo"]}'): os.mkdir(input_dir + f'/{r["repo"]}') @@ -325,46 +295,39 @@ def setup_input_data(context, exec_conf): logger.info(">>> end of downloading data") - -def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos = {}, pass_conf=True, no_output_files=False): +def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos=None, pass_conf=True, no_output_files=False): """ Create a python task. :param func_kwargs: additional arguments for callable. :param dag: dag to add task to. :param name: The name of the task. :param a_callable: The code to run in this task. - """ - + """ + + if external_repos is None: + external_repos = {} + # these are actual arguments passed down to the task function op_kwargs = { "python_callable": a_callable, "to_string": True, "pass_conf": pass_conf } - # update / override some of the args passed to the task function by default if func_kwargs is None: func_kwargs = {} op_kwargs.update(func_kwargs) - - # Python operator arguments , by default for non-lakefs config this is all we need. python_operator_args = { - "task_id": name, - "python_callable":task_wrapper, - # "executor_config" : get_executor_config(), - "dag": dag, - "provide_context" : True + "task_id": name, + "python_callable": task_wrapper, + # executor_config example left commented; fill if needed + "dag": dag, } - # if we have lakefs... if config.lakefs_config.enabled: - - # repo and branch for pre-execution , to download input objects pre_exec_conf = { 'repos': [] } if external_repos: - # if the task is a root task , beginning of the dag... - # and we want to pull data from a different repo. 
pre_exec_conf = { 'repos': [{ 'repo': r['name'], @@ -374,18 +337,19 @@ def create_python_task(dag, name, a_callable, func_kwargs=None, external_repos = } pre_exec = partial(setup_input_data, exec_conf=pre_exec_conf) - # add pre_exec partial function as an argument to python executor conf + # pre_execute will be called with context -> partial keeps exec_conf fixed python_operator_args['pre_execute'] = pre_exec - python_operator_args['on_failure_callback'] = partial(clean_up, kwargs=op_kwargs) - # if the task has output files, we will add a commit callback + + # pass fixed kwargs into partials so resulting callback accepts (context,) + python_operator_args['on_failure_callback'] = partial(clean_up, **op_kwargs) if not no_output_files: - python_operator_args['on_success_callback'] = partial(avalon_commit_callback, kwargs=op_kwargs) - - # add kwargs + python_operator_args['on_success_callback'] = partial(avalon_commit_callback, **op_kwargs) + python_operator_args["op_kwargs"] = op_kwargs return PythonOperator(**python_operator_args) + def create_pipeline_taskgroup( dag, pipeline_class: type, @@ -416,7 +380,6 @@ def create_pipeline_taskgroup( f"index_{name}_variables", pipeline.index_variables, pass_conf=False, - # declare that this task will not generate files. no_output_files=True) index_variables_task.set_upstream(annotate_task) @@ -425,9 +388,8 @@ def create_pipeline_taskgroup( f"validate_{name}_index_variables", pipeline.validate_indexed_variables, pass_conf=False, - # declare that this task will not generate files. no_output_files=True - ) + ) validate_index_variables_task.set_upstream([annotate_task, index_variables_task]) make_kgx_task = create_python_task( @@ -441,7 +403,7 @@ def create_pipeline_taskgroup( dag, f"crawl_{name}", pipeline.crawl_tranql, - pass_conf=False) + pass_conf=False) crawl_task.set_upstream(annotate_task) index_concepts_task = create_python_task( @@ -449,7 +411,6 @@ def create_pipeline_taskgroup( f"index_{name}_concepts", pipeline.index_concepts, pass_conf=False, - # declare that this task will not generate files. no_output_files=True) index_concepts_task.set_upstream(crawl_task) @@ -458,12 +419,10 @@ def create_pipeline_taskgroup( f"validate_{name}_index_concepts", pipeline.validate_indexed_concepts, pass_conf=False, - # declare that this task will not generate files. no_output_files=True ) validate_index_concepts_task.set_upstream([crawl_task, index_concepts_task, annotate_task]) - complete_task = EmptyOperator(task_id=f"complete_{name}") complete_task.set_upstream( (make_kgx_task, diff --git a/docker-compose.yaml b/docker-compose.yaml index 7c698ed6..fd81f5f5 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,207 +1,167 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# +version: '3.8' -# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. -# -# WARNING: This configuration is for local development. Do not use it in a production deployment. -# -# This configuration supports basic configuration using environment variables or an .env file -# The following variables are supported: -# -# AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. -# Default: apache/airflow:master-python3.8 -# AIRFLOW_UID - User ID in Airflow containers -# Default: 50000 -# AIRFLOW_GID - Group ID in Airflow containers -# Default: 50000 -# _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account. -# Default: airflow -# _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account. -# Default: airflow -# -# Feel free to modify this file to suit your needs. ---- -version: '3' -x-airflow-common: - &airflow-common - build: - dockerfile: Dockerfile - context: . - environment: - &airflow-common-env - AIRFLOW__CORE__EXECUTOR: CeleryExecutor - AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow - AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow - AIRFLOW__CELERY__BROKER_URL: redis://:$REDIS_PASSWORD@redis:$REDIS_PORT/0 +x-airflow-common: &airflow-common + build: . + environment: &airflow_common_environment + AIRFLOW__CORE__EXECUTOR: LocalExecutor + AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres:5432/airflow AIRFLOW__CORE__FERNET_KEY: '' AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' AIRFLOW__CORE__LOAD_EXAMPLES: 'false' - - ROGER_DUG__INPUTS_DATA__SETS: "$ROGER_DUG__INPUTS_DATA__SETS" - ROGER_ELASTICSEARCH_HOST: "$ELASTIC_API_HOST" - ROGER_ELASTICSEARCH_PASSWORD: "$ELASTIC_PASSWORD" - ROGER_ELASTICSEARCH_NBOOST__HOST: "$NBOOST_API_HOST" - ROGER_REDISGRAPH_HOST: "$REDIS_HOST" - ROGER_REDISGRAPH_PASSWORD: "$REDIS_PASSWORD" - ROGER_KGX_DATASET__VERSION: "v3.0" - ROGER_DATA_DIR: "/opt/airflow/share/data" + AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session' + AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true' + AIRFLOW__CORE__SIMPLE_AUTH_MANAGER_USERS: 'admin:Admin' + ROGER_ELASTICSEARCH_HOST: "elasticsearch" + ROGER_ELASTICSEARCH_PASSWORD: "" + ROGER_ELASTICSEARCH_SCHEME: "http" + ROGER_ELASTICSEARCH_USERNAME: "elastic" + ROGER_REDISGRAPH_GRAPH: "test" + ROGER_REDISGRAPH_HOST: "redis-stack" + ROGER_REDISGRAPH_PASSWORD: "" + ROGER_REDISGRAPH_PORT: "6379" + ROGER_KGX_DATA__SETS: ${KGX_DATA_SETS} + ROGER_LAKEFS__CONFIG_ACCESS__KEY__ID: ${LAKEFS_ACCESS_KEY} + ROGER_LAKEFS__CONFIG_BRANCH: ${LAKEFS_BRANCH} + ROGER_LAKEFS__CONFIG_ENABLED: "true" + ROGER_LAKEFS__CONFIG_HOST: ${LAKEFS_URL} + ROGER_LAKEFS__CONFIG_REPO: ${LAKEFS_REPO} + ROGER_LAKEFS__CONFIG_SECRET__ACCESS__KEY: ${LAKEFS_SECRET_KEY} + ROGER_DUG__INPUTS_DATA__SETS: ${INPUT_DATA_SETS} + ROGER_ANNOTATION_ANNOTATOR__ARGS_SAPBERT_CLASSIFICATION__URL: ${BIOMEGATRON_URL} + ROGER_ANNOTATION_ANNOTATOR__ARGS_SAPBERT_ANNOTATOR__URL : ${SAPBERT_URL} + ROGER_ANNOTATION_NORMALIZER : ${NODE_NORM_URL} + ROGER_ANNOTATION_SYNONYM__SERVICE : ${NAME_RES_URL} volumes: - ./dags:/opt/airflow/dags - ./logs:/opt/airflow/logs - ./plugins:/opt/airflow/plugins - - ./data:/opt/airflow/share/data - user: root + - ./config:/opt/airflow/config depends_on: - redis: - condition: service_healthy postgres: condition: service_healthy + networks: + - airflow-network services: postgres: - image: postgres:13 + image: postgres:15-alpine environment: POSTGRES_USER: airflow 
POSTGRES_PASSWORD: airflow POSTGRES_DB: airflow volumes: - postgres-db-volume:/var/lib/postgresql/data - - ${DATA_DIR}/elastic:/elastic - - ${DATA_DIR}/redis:/redis healthcheck: test: ["CMD", "pg_isready", "-U", "airflow"] - interval: 5s + interval: 10s retries: 5 - restart: always + start_period: 5s + ports: + - "5432:5432" + networks: + - airflow-network - airflow-webserver: - <<: *airflow-common - command: webserver + # --- NEW: Elasticsearch Service --- + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:8.11.1 + environment: + - discovery.type=single-node + - xpack.security.enabled=false + - ES_JAVA_OPTS=-Xms512m -Xmx512m # Limit RAM usage for dev + volumes: + - elasticsearch-data:/usr/share/elasticsearch/data ports: - - 8080:8080 + - "9200:9200" # REST API + - "9300:9300" # Internal transport + networks: + - airflow-network + + # --- NEW: Redis Stack (includes RedisGraph/FalkorDB) --- + redis-stack: + image: redis/redis-stack:latest + volumes: + - redis-stack-data:/data + ports: + - "6379:6379" # Redis port + - "8001:8001" # RedisInsight UI healthcheck: - test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] + test: [ "CMD", "redis-cli", "ping" ] interval: 10s - timeout: 10s retries: 5 - restart: always - - airflow-scheduler: - <<: *airflow-common - command: scheduler - restart: always - - airflow-worker: - <<: *airflow-common - command: celery worker - restart: always + start_period: 5s + networks: + - airflow-network airflow-init: <<: *airflow-common - command: version - environment: - <<: *airflow-common-env - _AIRFLOW_DB_UPGRADE: 'true' - _AIRFLOW_WWW_USER_CREATE: 'true' - _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} - _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} + entrypoint: /bin/bash + command: + - -c + - | + mkdir -p /opt/airflow/logs /opt/airflow/dags /opt/airflow/plugins + airflow db migrate + depends_on: + postgres: + condition: service_healthy - flower: + airflow-webserver: <<: *airflow-common - command: celery flower + command: airflow api-server ports: - - 5555:5555 + - "8080:8080" healthcheck: - test: ["CMD", "curl", "--fail", "http://localhost:5555/"] - interval: 10s + test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] + interval: 30s timeout: 10s retries: 5 - restart: always + start_period: 30s + depends_on: + airflow-init: + condition: service_completed_successfully - redis: - # image: redislabs/redisgraph:2.10.9 #Alternative Image - user: root - image: 'redis/redis-stack:6.2.4-v2' - command: "redis-server --requirepass $REDIS_PASSWORD --loadmodule /opt/redis-stack/lib/redisgraph.so" - environment: - - REDIS_ARGS=--requirepass $REDIS_PASSWORD - volumes: - - $DATA_DIR/redis:/data # FIX RDB Error on local - ports: - - $REDIS_PORT:$REDIS_PORT + airflow-scheduler: + <<: *airflow-common + command: airflow scheduler healthcheck: - test: ["CMD", "redis-cli", "ping"] - interval: 5s - timeout: 30s - retries: 50 - restart: always + test: ["CMD", "airflow", "jobs", "check", "--job-type", "SchedulerJob", "--hostname", "$${HOSTNAME}"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s + depends_on: + airflow-init: + condition: service_completed_successfully - dug: - image: containers.renci.org/helxplatform/dug:latest + airflow-triggerer: + <<: *airflow-common + command: airflow triggerer + healthcheck: + test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s depends_on: - - 
elasticsearch - - redis - restart: always - environment: - ELASTIC_API_HOST: "$ELASTIC_API_HOST" - ELASTIC_PASSWORD: "$ELASTIC_PASSWORD" - REDIS_HOST: "$REDIS_HOST" - REDIS_PASSWORD: "$REDIS_PASSWORD" - FLASK_ENV: "development" - PYTHONUNBUFFERED: "TRUE" - entrypoint: [ "gunicorn", - "--workers=$API_WORKERS", "--name=dug", - "--bind=0.0.0.0:$API_PORT", "--timeout=$API_TIMEOUT", - "--log-level=DEBUG", "-k", "uvicorn.workers.UvicornWorker", "--reload", "dug.server:APP"] - ports: - - $API_PORT:$API_PORT + airflow-init: + condition: service_completed_successfully - elasticsearch: - user: root - image: docker.elastic.co/elasticsearch/elasticsearch:8.5.2 - environment: - - ELASTIC_PASSWORD=$ELASTIC_PASSWORD - - discovery.type=single-node - - xpack.security.enabled=true - - ingest.geoip.downloader.enabled=false - volumes: - - $DATA_DIR/elastic:/usr/share/elasticsearch/data - ports: - - '9200:9200' - - '9300:9300' + airflow-dag-processor: + <<: *airflow-common + command: airflow dag-processor + healthcheck: + test: [ "CMD", "airflow", "jobs", "check", "--job-type", "DagProcessorJob", "--hostname", "$${HOSTNAME}" ] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s + depends_on: + airflow-init: + condition: service_completed_successfully - tranql: - image: containers.renci.org/helxplatform/tranql:rti-merge - ports: - - '8001:8001' - entrypoint: [ - "gunicorn", - "--workers=4", - "--bind=0.0.0.0:8001", - "--timeout=300", - "--access-logfile=$TRANQL_ACCESS_LOG", - "--error-logfile=$TRANQL_ERROR_LOG", - "--log-level=debug", - "tranql.api:app", - ] - environment: - - REDIS_PASSWORD=$REDIS_PASSWORD - volumes: - - ./tranql-schema.yaml:/tranql/tranql/conf/schema.yaml volumes: - postgres-db-volume: \ No newline at end of file + postgres-db-volume: + elasticsearch-data: # <-- New volume for Elasticsearch + redis-stack-data: +networks: + airflow-network: + driver: bridge \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 63275110..c8c705fa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -elasticsearch==8.5.2 +elasticsearch>=8.5.2 flatten-dict jsonpickle git+https://github.com/falkordb/falkordb-bulk-loader.git@v1.0.6 @@ -6,9 +6,15 @@ setuptools>=66 pytest PyYAML git+https://github.com/helxplatform/dug@develop -orjson==3.9.15 +orjson>=3.9.15 git+https://github.com/helxplatform/kg_utils.git@v0.0.10 git+https://github.com/helxplatform/python-stringcase@1.2.1 bmt==1.4.4 -git+https://github.com/helxplatform/avalon.git@v1.1.0 +git+https://github.com/helxplatform/avalon.git@lakefs-1.71.0 h11>=0.16.0 +starlette>=0.49.1 +datetime +aiohttp +#--- patch +werkzeug==3.0.6 +cryptography>=44.0.1 \ No newline at end of file From e913111312103badd913d3b2425184b8995c7da3 Mon Sep 17 00:00:00 2001 From: Griffin Roupe <31631417+frostyfan109@users.noreply.github.com> Date: Wed, 7 Jan 2026 13:00:57 -0500 Subject: [PATCH 62/63] Update urllib version requirement to address vulnerability (#134) * pin min urllib version * upgrade packages to mitigate vulns * remove werkzeug constraint from requirements * add werkzeug constraint back, upgrade to 3.1.4 --- Dockerfile | 2 +- requirements.txt | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7323534b..47f32824 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ FROM python:3.11.14-slim-trixie # Set Airflow version and home directory -ARG AIRFLOW_VERSION=3.1.1 +ARG AIRFLOW_VERSION=3.1.5 ARG AIRFLOW_HOME=/opt/airflow # Environment variables diff --git 
a/requirements.txt b/requirements.txt index 44e6ffae..def8e43b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,9 @@ git+https://github.com/helxplatform/avalon.git@lakefs-1.71.0 h11>=0.16.0 starlette>=0.49.1 datetime -aiohttp #--- patch -werkzeug==3.0.6 -cryptography>=44.0.1 \ No newline at end of file +aiohttp>=3.13.3 +werkzeug==3.1.4 +cryptography>=44.0.1 +urllib3>=2.6.2 +marshmallow==3.26.2 # upgrade from 3.26.1 specified in airflow constraints file \ No newline at end of file From 61133245c83e22803d90477b9a6ff1cc3be7e5f0 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Fri, 16 Jan 2026 15:44:59 -0500 Subject: [PATCH 63/63] bump recent --- requirements.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 975229df..3e9ac3d6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,8 +16,9 @@ starlette>=0.49.1 datetime #--- patch aiohttp>=3.13.3 -werkzeug==3.1.4 +werkzeug==3.1.5 cryptography>=44.0.1 -urllib3>=2.6.2 +urllib3>=2.6.3 +jaraco.context==6.1.0 marshmallow==3.26.2 # upgrade from 3.26.1 specified in airflow constraints file
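
The dependency pins above (werkzeug, urllib3, aiohttp, cryptography, jaraco.context, marshmallow) can be sanity-checked against a built image with a short script such as the sketch below. This is an illustrative check, not part of the patch series: the file name check_pins.py is hypothetical, the expected versions simply mirror the requirements.txt entries as of the final patch, and exact (==) pins are treated as minimums for the comparison.

# check_pins.py -- minimal sketch: confirm the security-sensitive pins above resolved
# in the built Airflow image. Expected versions mirror requirements.txt as of
# [PATCH 63/63]; exact pins (==) are treated as minimums here for simplicity.
from importlib.metadata import PackageNotFoundError, version

EXPECTED = {
    "werkzeug": "3.1.5",        # == pin
    "urllib3": "2.6.3",         # >= pin
    "aiohttp": "3.13.3",        # >= pin
    "cryptography": "44.0.1",   # >= pin
    "jaraco.context": "6.1.0",  # == pin
    "marshmallow": "3.26.2",    # == pin, overrides the Airflow constraints file
}

def version_tuple(v: str) -> tuple:
    # Crude numeric comparison; adequate for the simple X.Y.Z pins listed above.
    return tuple(int(part) for part in v.split(".") if part.isdigit())

for package, expected in EXPECTED.items():
    try:
        installed = version(package)
    except PackageNotFoundError:
        print(f"{package}: not installed")
        continue
    ok = version_tuple(installed) >= version_tuple(expected)
    print(f"{package}: installed {installed}, expected >= {expected} -> {'ok' if ok else 'TOO OLD'}")

Running it inside the webserver or scheduler container (for example via docker compose exec into airflow-webserver or airflow-scheduler, the service names defined in the compose file above) ensures the check sees the same environment the DAGs use.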