23 changes: 21 additions & 2 deletions README.md
@@ -7,6 +7,24 @@

I'm assuming you have set up a working environment with your triplestore and any other systems you want.
Note this repo uses the uv package manager (see: https://docs.astral.sh/uv/).
Ensure all dependencies are installed by running `uv pip install -r requirements.txt`, or, if the project is managed through `pyproject.toml`, by keeping it up to date and running `uv sync`.
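A minimal sketch of the two install paths (assuming a `requirements.txt` or a uv-managed `pyproject.toml` at the repo root):

```bash
# install from a requirements file
uv pip install -r requirements.txt

# or sync a pyproject.toml / uv.lock managed project
uv sync
```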

#### Dependencies
The shell scripts `scripts/loadDirToTriplestore.sh` and `scripts/loadSitemapToTriplestore.sh` rely on the `jsonld` command-line tool for processing JSON-LD data. A common implementation is the Node.js `jsonld-cli` package.
You can install it via npm:
```bash
npm install -g jsonld-cli
```
For more information, visit: [https://www.npmjs.com/package/jsonld-cli](https://www.npmjs.com/package/jsonld-cli)
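To confirm the install, you can convert a JSON-LD file to N-Quads the same way the loader scripts do (the input path below is just a placeholder):

```bash
# should emit N-Quads on stdout
cat sampleJsonLdsNew/example.jsonld | jsonld format -q
```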

### Running Tests

Unit tests are located in the `tests/` directory. To run the tests, navigate to the root of the repository and execute the following command:

```bash
python -m unittest discover -s tests
```
This command will discover and run all tests within the `tests` directory. Make sure all project dependencies, including any test-specific dependencies like `reportlab` (which should be listed in `requirements.txt` or `pyproject.toml`), are installed using `uv`.
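To run a single module instead of the whole suite (the module name here is hypothetical — substitute one from `tests/`):

```bash
python -m unittest tests.test_etl_convert
```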

### Notebook prototype

@@ -41,7 +59,7 @@ We can run these:
```bash
./scripts/github_jsonld_sitemap.py --output output/jldnew-sitemap.xml https://github.com/bio-xyz/BioAgents sampleJsonLdsNew
```

To load out JSON-LD now, we can use the sitemap to pull the resources directly from GitHub.
To load our JSON-LD now, we can use the sitemap to pull the resources directly from GitHub. (Make sure you have `jsonld-cli` installed; see the "Dependencies" section under "Setup".)

```bash
./scripts/loadSitemapToTriplestore.sh ./output/jld-sitemap.xml http://homelab.lan:7878/store
@@ -96,6 +114,8 @@ Use the script `bamlTest.py` with OpenAI (set the key with something like):
```bash
export OPENAI_API_KEY="..."
```

`bamlTest.py` is a utility script for sending a markdown file to BAML functions (`ExtractIdea` or `ExtractAssertion`) and saving the JSON output. It can be used for manual testing or exploration of these BAML functions.
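A hypothetical invocation is sketched below; the flags are illustrative only, so check the script's argument parsing for the real interface:

```bash
# flags are assumptions, not the script's documented interface
python bamlTest.py --input notes.md --function ExtractIdea --output idea.json
```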

> Note: Since this is using [BAML](https://github.com/BoundaryML/baml) it's easy to
> modify [clients.baml](baml_src/clients.baml) and add in any client: Ollama for local,
> xAI, Google Gemini, etc. You will then need to modify the ``` client "openai/gpt-4o" ```
@@ -123,4 +143,3 @@ References:

* BioAgent repo: https://github.com/bio-xyz/plugin-bioagent
* DKG (OriginTrail): https://docs.origintrail.io/build-with-dkg/quickstart-test-drive-the-dkg-in-5-mins

20 changes: 11 additions & 9 deletions defs/etl_convert.py
@@ -1,5 +1,6 @@
import requests
import os
import logging
import tempfile
from urllib.parse import urlparse

logger = logging.getLogger(__name__)  # defined before the import guard below, which logs on failure

@@ -10,10 +11,11 @@
import html2text
import PyPDF2
except ImportError:
print("Required libraries not found. Please install them using:")
print("pip install html2text PyPDF2")
logger.exception("Failed to import html2text or PyPDF2. Please install them using: pip install html2text PyPDF2")
exit(1)


def is_url(path):
"""Check if the given path is a URL."""
try:
@@ -48,7 +50,7 @@ def download_file(url):
temp_file.close()
return temp_file_path
except Exception as e:
print(f"Error downloading file: {e}")
logger.exception(f"Error downloading file: {e}")
return None

def html_to_markdown(html_content):
@@ -79,7 +81,7 @@ def pdf_to_markdown(pdf_path):

return markdown_text
except Exception as e:
print(f"Error converting PDF to Markdown: {e}")
logger.exception(f"Error converting PDF to Markdown: {e}")
return None

@@ -112,7 +114,7 @@ def convert_to_markdown(source, is_local=False):

return html_to_markdown(html_content)
except Exception as e:
print(f"Error converting to Markdown: {e}")
logger.exception(f"Error converting to Markdown: {e}")
return None

@@ -128,11 +130,11 @@ def convert_document(url=None, local_file=None, output_file=None):
str: The Markdown content or None if conversion failed.
"""
if url and local_file:
print("Error: Please provide either a URL or a local file, not both.")
logger.error("Error: Please provide either a URL or a local file, not both.")
return None

if not url and not local_file:
print("Error: Please provide either a URL or a local file.")
logger.error("Error: Please provide either a URL or a local file.")
return None

# Convert the document
@@ -149,8 +151,8 @@
try:
with open(output_file, 'w', encoding='utf-8') as file:
file.write(markdown_content)
print(f"Markdown saved to {output_file}")
logger.info(f"Markdown saved to {output_file}")
except Exception as e:
print(f"Error saving Markdown to file: {e}")
logger.exception(f"Error saving Markdown to file: {e}")

return markdown_content
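For a quick end-to-end check of this module, a one-liner like the following can be run from the repo root (the URL and output path are illustrative, and it assumes `defs` is importable):

```bash
python -c "from defs.etl_convert import convert_document; convert_document(url='https://example.org', output_file='out.md')"
```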
7 changes: 4 additions & 3 deletions defs/etl_fetch.py
@@ -7,13 +7,14 @@

import asyncio
from crawl4ai import *
import logging

logger = logging.getLogger(__name__)

async def fetch_resources(source):
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
# url="https://www.waterqualitydata.us"
url="https://www.hydrosheds.org/hydrosheds-core-downloads",
url=source,
)

print(result.markdown)
logger.info(result.markdown)
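The coroutine can be exercised from the shell as below (assuming `defs` is importable from the repo root; the page markdown is emitted via `logging`, so set a log level to see it):

```bash
python -c "import asyncio, logging; logging.basicConfig(level=logging.INFO); from defs.etl_fetch import fetch_resources; asyncio.run(fetch_resources('https://www.hydrosheds.org/hydrosheds-core-downloads'))"
```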
28 changes: 21 additions & 7 deletions defs/etl_query.py
@@ -4,11 +4,13 @@
import lancedb
import polars as pl
import requests
import logging

logger = logging.getLogger(__name__)

def query_mode(source, sink, query, table):
"""Handle query mode operations"""
print(f"Query mode: Processing data from {source} to {sink}")
logger.info(f"Query mode: Processing data from {source} to {sink}")
# Add query-specific logic here

# Qlever params, not needed for oxigraph
@@ -30,13 +32,25 @@ def query_mode(source, sink, query, table):
response = requests.post(source, params=params, headers=headers, data=query)

# Load response into Polars DataFrame
# df = pl.read_csv(StringIO(response.text))
df = pl.read_csv(StringIO(response.text), truncate_ragged_lines=False)

# print(df)
try:
# df = pl.read_csv(StringIO(response.text))
df = pl.read_csv(StringIO(response.text), truncate_ragged_lines=False)
    except pl.exceptions.ComputeError as e:  # polars raises ComputeError for ragged CSV rows
        logger.exception(
            "Failed to parse CSV response from SPARQL query due to ragged lines. "
            "The number of columns is inconsistent across rows. "
            f"Polars error: {e}"
        )
        # Re-raise so the caller can decide how to handle the failure
        raise
except Exception as e:
logger.exception(f"An unexpected error occurred while parsing CSV response: {e}")
raise

# logger.info(df)

# Create or get LanceDB table and write data
print("Saving LanceDB table: ", table, "")
logger.info(f"Saving LanceDB table: {table}")
db = lancedb.connect("./stores/lancedb")
tbl = db.create_table(table, data=df, mode="overwrite")
print(tbl)
logger.info(tbl)
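As a manual sanity check, the same SPARQL-over-HTTP exchange can be reproduced with curl; the endpoint URL is an assumption based on the Oxigraph store used elsewhere in this README, so adjust it to your setup:

```bash
# POST a SPARQL query and request CSV, mirroring what query_mode() sends
curl -X POST 'http://homelab.lan:7878/query' \
  -H 'Content-Type: application/sparql-query' \
  -H 'Accept: text/csv' \
  --data 'SELECT * WHERE { ?s ?p ?o } LIMIT 10'
```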
60 changes: 55 additions & 5 deletions scripts/loadDirToTriplestore.sh
@@ -1,17 +1,67 @@
#!/bin/bash
set -e # Exit on error

# Check if jsonld command exists
if ! command -v jsonld &> /dev/null; then
echo "Error: jsonld command not found. Please install it." >&2
exit 1
fi

# Check if curl command exists
if ! command -v curl &> /dev/null; then
echo "Error: curl command not found. Please install it." >&2
exit 1
fi

# A wrapper script for loading RDF from a directory into a triplestore
# Usage
# ./loadDirToTriplestore.sh ./jsonld/depth_strict http://nas.lan:49153/blazegraph/namespace/kb/sparql

mc_cmd() {
find $1 -type f # kinda basic, might add a filter to it
find "$1" -type f # kinda basic, might add a filter to it
}
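# A stricter variant could filter to JSON-LD files only, e.g.:
#   find "$1" -type f -name '*.jsonld'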

# If you use this for ntriples, be sure to compute and/or add in a graph in the URL target
for i in $(mc_cmd $1); do
for i in $(mc_cmd "$1"); do
echo "-------------start-------------"
echo Next: $i
cat $i | jsonld format -q | curl -X POST -H 'Content-Type:text/x-nquads' --data-binary @- $2
echo "Processing: $i"

# Attempt to format the file with jsonld
# Capture stdout and stderr separately for jsonld
  # the `|| ...` fallback keeps `set -e` from aborting the loop when jsonld fails
  jsonld_output=$(cat "$i" | jsonld format -q 2> /tmp/jsonld_error.log) && jsonld_exit_code=0 || jsonld_exit_code=$?

if [ $jsonld_exit_code -ne 0 ]; then
echo "Error formatting $i with jsonld (exit code: $jsonld_exit_code). Skipping." >&2
# Print jsonld error log if it exists
if [ -s /tmp/jsonld_error.log ]; then
cat /tmp/jsonld_error.log >&2
fi
rm -f /tmp/jsonld_error.log # Clean up error log
echo "-------------done (failed formatting)--------------" >&2
continue
fi
rm -f /tmp/jsonld_error.log # Clean up error log if successful

# Attempt to upload the formatted data
# Capture stdout and stderr separately for curl
  # --fail makes curl exit non-zero on HTTP errors; the `|| ...` fallback keeps `set -e` from exiting
  curl_output=$(echo "$jsonld_output" | curl --fail -X POST -H 'Content-Type:text/x-nquads' --data-binary @- "$2" 2> /tmp/curl_error.log) && curl_exit_code=0 || curl_exit_code=$?

if [ $curl_exit_code -ne 0 ]; then
echo "Error uploading $i to $2 (exit code: $curl_exit_code). Skipping." >&2
# Print curl error log if it exists
if [ -s /tmp/curl_error.log ]; then
cat /tmp/curl_error.log >&2
fi
rm -f /tmp/curl_error.log # Clean up error log
echo "-------------done (failed upload)--------------" >&2
continue
fi
rm -f /tmp/curl_error.log # Clean up error log if successful

echo "Successfully processed and uploaded $i"
# Optionally print curl output if needed for success confirmation
# echo "Server response: $curl_output"
echo "-------------done--------------"
done
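For reference, a typical invocation against the Oxigraph store used earlier in the README (the directory path is illustrative):

```bash
./scripts/loadDirToTriplestore.sh ./output/jsonld http://homelab.lan:7878/store
```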
