From 84867b7dda4724a205023ccd2fd2b6d0c40fc550 Mon Sep 17 00:00:00 2001
From: morris <morrisoseitutugyimah@gmail.com>
Date: Mon, 29 Sep 2025 22:11:36 +0000
Subject: [PATCH 1/4] Add d-hub log processor and integrate into runner

---
 scripts/{data => }/automation/README.md       |   0
 scripts/{data => }/automation/configure.sh    |   5 +-
 scripts/{data => }/automation/flush_queue.sh  |   0
 scripts/{data => }/automation/install.sh      |   0
 scripts/{data => }/automation/main.sh         |   0
 scripts/{data => }/automation/runner.sh       |  32 ++++--
 scripts/{data => }/automation/status.sh       |   0
 .../automation/update_service_logging.sh      |   0
 scripts/data/.main.sh.swp                     | Bin 0 -> 1024 bytes
 scripts/data/process/processors/dhub.py       | 103 ++++++++++++++++++
 10 files changed, 126 insertions(+), 14 deletions(-)
 rename scripts/{data => }/automation/README.md (100%)
 rename scripts/{data => }/automation/configure.sh (99%)
 rename scripts/{data => }/automation/flush_queue.sh (100%)
 rename scripts/{data => }/automation/install.sh (100%)
 rename scripts/{data => }/automation/main.sh (100%)
 rename scripts/{data => }/automation/runner.sh (85%)
 rename scripts/{data => }/automation/status.sh (100%)
 rename scripts/{data => }/automation/update_service_logging.sh (100%)
 create mode 100644 scripts/data/.main.sh.swp
 create mode 100644 scripts/data/process/processors/dhub.py

diff --git a/scripts/data/automation/README.md b/scripts/automation/README.md
similarity index 100%
rename from scripts/data/automation/README.md
rename to scripts/automation/README.md
diff --git a/scripts/data/automation/configure.sh b/scripts/automation/configure.sh
similarity index 99%
rename from scripts/data/automation/configure.sh
rename to scripts/automation/configure.sh
index c0662e6..dec54a5 100755
--- a/scripts/data/automation/configure.sh
+++ b/scripts/automation/configure.sh
@@ -115,8 +115,9 @@ sel=$(menu_select "Select server version" 15 74 5 \
 
 if [[ "$SERVER_VERSION" == "v2" ]]; then
   PYTHON_SCRIPT=$(menu_select "Select logs flavor (v2)" 12 74 5 \
-    oc4d "OC4D logs (logv2.py)" \
-    cape_coast_d "Cape Coast Castle logs (castle.py)" \
+oc4d "OC4D logs (logv2.py)" \
+cape_coast_d "Cape Coast Castle logs (castle.py)" \
+dhub "D-Hub logs (dhub.py)" \
   )
 else
   PYTHON_SCRIPT="oc4d"
diff --git a/scripts/data/automation/flush_queue.sh b/scripts/automation/flush_queue.sh
similarity index 100%
rename from scripts/data/automation/flush_queue.sh
rename to scripts/automation/flush_queue.sh
diff --git a/scripts/data/automation/install.sh b/scripts/automation/install.sh
similarity index 100%
rename from scripts/data/automation/install.sh
rename to scripts/automation/install.sh
diff --git a/scripts/data/automation/main.sh b/scripts/automation/main.sh
similarity index 100%
rename from scripts/data/automation/main.sh
rename to scripts/automation/main.sh
diff --git a/scripts/data/automation/runner.sh b/scripts/automation/runner.sh
similarity index 85%
rename from scripts/data/automation/runner.sh
rename to scripts/automation/runner.sh
index 9e470d0..0ed9e53 100644
--- a/scripts/data/automation/runner.sh
+++ b/scripts/automation/runner.sh
@@ -90,13 +90,20 @@ case "$SERVER_VERSION" in
     LOG_DIR="/var/log/apache2"
     find "$LOG_DIR" -type f -name 'access.log*' -exec cp -n {} "$COLLECT_DIR"/ \;
     ;;
-  v2|server\ v5|v5)
-    LOG_DIR="/var/log/oc4d"
-    find "$LOG_DIR" -type f \( \
-       \( -name 'oc4d-*.log' ! -name 'oc4d-exceptions-*.log' \) -o \
-       \( -name 'capecoastcastle-*.log' ! -name 'capecoastcastle-exceptions-*.log' \) -o \
-       -name '*.gz' \) -exec cp -n {} "$COLLECT_DIR"/ \;
-    ;;
+
+v2|server\ v5|v5)
+  # Search in both oc4d and dhub directories
+  LOG_DIRS=("/var/log/oc4d" "/var/log/dhub")
+  for log_dir in "${LOG_DIRS[@]}"; do
+    if [ -d "$log_dir" ]; then
+        find "$log_dir" -type f \( \
+        \( -name 'oc4d-*.log' ! -name 'oc4d-exceptions-*.log' \) -o \
+        \( -name 'capecoastcastle-*.log' ! -name 'capecoastcastle-exceptions-*.log' \) -o \
+        \( -name 'dhub-*.log' ! -name 'dhub-exceptions-*.log' \) -o \
+        -name '*.gz' \) -exec cp -n {} "$COLLECT_DIR"/ \;
+    fi
+  done
+  ;;
   *) log "❌ Unknown SERVER_VERSION '$SERVER_VERSION'"; exit 1;;
 esac
 shopt -s nullglob
@@ -107,11 +114,12 @@ PROCESSOR=""
 case "$SERVER_VERSION" in
   v1|v4) PROCESSOR="scripts/data/process/processors/log.py" ;;
   v2|v5|server\ v5)
-    case "$PYTHON_SCRIPT" in
-      oc4d) PROCESSOR="scripts/data/process/processors/logv2.py" ;;
-      cape_coast_d) PROCESSOR="scripts/data/process/processors/castle.py" ;;
-      *) PROCESSOR="scripts/data/process/processors/logv2.py" ;;
-    esac
+case "$PYTHON_SCRIPT" in
+  oc4d) PROCESSOR="scripts/data/process/processors/logv2.py" ;;
+  cape_coast_d) PROCESSOR="scripts/data/process/processors/castle.py" ;;
+  dhub) PROCESSOR="scripts/data/process/processors/dhub.py" ;; # <-- New d-hub option
+  *) PROCESSOR="scripts/data/process/processors/logv2.py" ;;
+esac
     ;;
 esac
 log "🐍 Process → $PROCESSOR  (folder=$NEW_FOLDER)"
diff --git a/scripts/data/automation/status.sh b/scripts/automation/status.sh
similarity index 100%
rename from scripts/data/automation/status.sh
rename to scripts/automation/status.sh
diff --git a/scripts/data/automation/update_service_logging.sh b/scripts/automation/update_service_logging.sh
similarity index 100%
rename from scripts/data/automation/update_service_logging.sh
rename to scripts/automation/update_service_logging.sh
diff --git a/scripts/data/.main.sh.swp b/scripts/data/.main.sh.swp
new file mode 100644
index 0000000000000000000000000000000000000000..ffbd3dfc69008aea8ab6488d38260df1436f87d5
GIT binary patch
literal 1024
zcmYc?$V<%2S1{8vVn6|kmHZ4jIjQBTIXRViC^DEFxH?^r09_*k16_AxUB^s+428wX
iMVSR9#ri3UC5igEiJ5tN#TgiiMg>MgU^E2i76JhBOcqT5

literal 0
HcmV?d00001

diff --git a/scripts/data/process/processors/dhub.py b/scripts/data/process/processors/dhub.py
new file mode 100644
index 0000000..b7f1693
--- /dev/null
+++ b/scripts/data/process/processors/dhub.py
@@ -0,0 +1,103 @@
+# scripts/data/process/processors/dhub.py
+
+import os
+import re
+import json
+import csv
+from datetime import datetime
+
+# This regex is the key change. It's designed to capture the new d-hub URL format.
+# It looks for lines containing /modules/ followed by a UUID.
+LOG_PATTERN = re.compile(
+    r'(\S+) - - \[([^\]]+)\] "(\S+) /modules/([0-9a-fA-F\-]+)/([^/]+)/?(\S*) HTTP/\d\.\d" (\d+) (\d+) "([^"]*)" "([^"]*)"'
+)
+
+# A simpler pattern for other module requests that might not have a UUID
+FALLBACK_PATTERN = re.compile(
+    r'(\S+) - - \[([^\]]+)\] "(\S+) /modules/([^/]+)/?(\S*) HTTP/\d\.\d" (\d+) (\d+) "([^"]*)" "([^"]*)"'
+)
+
+def parse_log_file(file_path, writer):
+    """Parses a single log file and writes valid entries to the CSV writer."""
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                try:
+                    # The log lines are JSON objects
+                    log_entry = json.loads(line)
+                    message = log_entry.get("message", "")
+
+                    # We only care about the lines that contain actual request data
+                    if "GET /modules/" not in message and "GET /_next/" not in message:
+                        continue
+
+                    # First, try to match the detailed d-hub pattern with a UUID
+                    match = LOG_PATTERN.search(message)
+                    if match:
+                        (ip, timestamp, method, module_id, module_name, rest_of_path, status, size, referrer, user_agent) = match.groups()
+                        module_viewed = module_name
+                    else:
+                        # If it doesn't match, try the fallback for simpler module URLs
+                        fallback_match = FALLBACK_PATTERN.search(message)
+                        if fallback_match:
+                            (ip, timestamp, method, module_name, rest_of_path, status, size, referrer, user_agent) = fallback_match.groups()
+                            module_viewed = module_name
+                        else:
+                            # If neither pattern matches, skip this line
+                            continue
+
+                    # Format the timestamp
+                    dt_obj = datetime.strptime(timestamp, "%d/%b/%Y:%H:%M:%S %z")
+                    access_date = dt_obj.strftime("%Y-%m-%d")
+                    access_time = dt_obj.strftime("%H:%M:%S")
+
+                    # Write the parsed data to the CSV
+                    writer.writerow([
+                        ip, access_date, access_time, module_viewed,
+                        'd-hub', status, size, 'Unknown', user_agent
+                    ])
+                except (json.JSONDecodeError, ValueError, TypeError):
+                    # This will skip malformed lines
+                    continue
+    except Exception as e:
+        print(f"Error processing file {file_path}: {e}")
+
+
+def main(folder_name):
+    """Main function to process all log files in a given directory."""
+    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../../'))
+    collected_dir = os.path.join(project_root, '00_DATA', folder_name)
+    processed_dir = os.path.join(project_root, '00_DATA', '00_PROCESSED', folder_name)
+
+    if not os.path.exists(collected_dir):
+        print(f"Error: Collected data directory not found at {collected_dir}")
+        return
+
+    os.makedirs(processed_dir, exist_ok=True)
+
+    output_csv_path = os.path.join(processed_dir, 'summary.csv')
+
+    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
+        writer = csv.writer(csvfile)
+        # Write header
+        writer.writerow([
+            "IP Address", "Access Date", "Access Time", "Module Viewed",
+            "Location Viewed", "Status Code", "Data Saved (GB)",
+            "Device Used", "Browser Used"
+        ])
+
+        print(f"Processing log files in {collected_dir}...")
+        # Find all log files (not ending in .gz)
+        for filename in os.listdir(collected_dir):
+            if filename.endswith(".log"):
+                file_path = os.path.join(collected_dir, filename)
+                print(f"  - Parsing {filename}...")
+                parse_log_file(file_path, writer)
+
+        print(f"Processing complete. Summary saved to {output_csv_path}")
+
+if __name__ == '__main__':
+    if len(sys.argv) != 2:
+        print("Usage: python3 dhub.py <folder_name>")
+        sys.exit(1)
+    main(sys.argv[1])

From ff9c5579f86c1cccb9cc16379323ca3080a9a40d Mon Sep 17 00:00:00 2001
From: morris <morrisoseitutugyimah@gmail.com>
Date: Mon, 6 Oct 2025 07:39:05 +0000
Subject: [PATCH 2/4] push to test

---
 scripts/data/.main.sh.swp                     | Bin 1024 -> 0 bytes
 scripts/{ => data}/automation/README.md       |   0
 scripts/{ => data}/automation/configure.sh    |   0
 scripts/{ => data}/automation/flush_queue.sh  |   0
 scripts/{ => data}/automation/install.sh      |   0
 scripts/{ => data}/automation/main.sh         |   0
 scripts/{ => data}/automation/runner.sh       |   0
 scripts/{ => data}/automation/status.sh       |   0
 .../automation/update_service_logging.sh      |   0
 scripts/data/collection/main.sh               |  64 ++++++------
 scripts/data/process/main.sh                  |  92 ++++++++----------
 11 files changed, 73 insertions(+), 83 deletions(-)
 delete mode 100644 scripts/data/.main.sh.swp
 rename scripts/{ => data}/automation/README.md (100%)
 rename scripts/{ => data}/automation/configure.sh (100%)
 rename scripts/{ => data}/automation/flush_queue.sh (100%)
 rename scripts/{ => data}/automation/install.sh (100%)
 rename scripts/{ => data}/automation/main.sh (100%)
 rename scripts/{ => data}/automation/runner.sh (100%)
 rename scripts/{ => data}/automation/status.sh (100%)
 rename scripts/{ => data}/automation/update_service_logging.sh (100%)

diff --git a/scripts/data/.main.sh.swp b/scripts/data/.main.sh.swp
deleted file mode 100644
index ffbd3dfc69008aea8ab6488d38260df1436f87d5..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1024
zcmYc?$V<%2S1{8vVn6|kmHZ4jIjQBTIXRViC^DEFxH?^r09_*k16_AxUB^s+428wX
iMVSR9#ri3UC5igEiJ5tN#TgiiMg>MgU^E2i76JhBOcqT5

diff --git a/scripts/automation/README.md b/scripts/data/automation/README.md
similarity index 100%
rename from scripts/automation/README.md
rename to scripts/data/automation/README.md
diff --git a/scripts/automation/configure.sh b/scripts/data/automation/configure.sh
similarity index 100%
rename from scripts/automation/configure.sh
rename to scripts/data/automation/configure.sh
diff --git a/scripts/automation/flush_queue.sh b/scripts/data/automation/flush_queue.sh
similarity index 100%
rename from scripts/automation/flush_queue.sh
rename to scripts/data/automation/flush_queue.sh
diff --git a/scripts/automation/install.sh b/scripts/data/automation/install.sh
similarity index 100%
rename from scripts/automation/install.sh
rename to scripts/data/automation/install.sh
diff --git a/scripts/automation/main.sh b/scripts/data/automation/main.sh
similarity index 100%
rename from scripts/automation/main.sh
rename to scripts/data/automation/main.sh
diff --git a/scripts/automation/runner.sh b/scripts/data/automation/runner.sh
similarity index 100%
rename from scripts/automation/runner.sh
rename to scripts/data/automation/runner.sh
diff --git a/scripts/automation/status.sh b/scripts/data/automation/status.sh
similarity index 100%
rename from scripts/automation/status.sh
rename to scripts/data/automation/status.sh
diff --git a/scripts/automation/update_service_logging.sh b/scripts/data/automation/update_service_logging.sh
similarity index 100%
rename from scripts/automation/update_service_logging.sh
rename to scripts/data/automation/update_service_logging.sh
diff --git a/scripts/data/collection/main.sh b/scripts/data/collection/main.sh
index 23ed8a8..8143879 100755
--- a/scripts/data/collection/main.sh
+++ b/scripts/data/collection/main.sh
@@ -1,56 +1,56 @@
 #!/bin/bash
 
-# clear the screen
 clear
-
-echo ""
-echo ""
-
-# Display the name of the tool
-figlet -t -f 3d "COLLECTION" | lolcat
-
-echo ""
-
-# A border to cover the description and its centered
-echo  "====================================================="
-echo "Collect your server logs with ease"
-echo "====================================================="
-
+echo "=============================================================="
+echo "  Select the log type you want to collect"
+echo "=============================================================="
 echo ""
 
 # variables
 RED='\033[0;31m'
-NC='\033[0m' # No Color
+NC='\033[0m'
 GREEN='\033[0;32m'
-DARK_GRAY='\033[1;30m'
 
-# Display menu options
-echo -e "1. Start                  ${DARK_GRAY}-| Start Collection process${NC}"
-echo -e "2. Data Processing        ${DARK_GRAY}-| Run processing script${NC}"
-echo -e "${GREEN}3. Go Back                ${DARK_GRAY}-| Go back to the main menu${NC}"
-echo -e "${RED}4. Exit                   ${DARK_GRAY}-| Exit the program${NC}"
+echo "1. v4 Logs (Apache)"
+echo "2. v5 Logs (OC4D / Castle)"
+echo "3. D-Hub Logs"
+echo -e "${GREEN}4. Go Back${NC}"
 
-echo -e "${NC}"
-# Prompt the user for input
+echo ""
 read -p "Choose an option (1-4): " choice
 
-# Check the user's choice and execute the corresponding script
+DEVICE_LOCATION="manual_collection"
+TODAY_YMD=$(date '+%Y_%m_%d')
+NEW_FOLDER="${DEVICE_LOCATION}_logs_${TODAY_YMD}"
+COLLECT_DIR="00_DATA/$NEW_FOLDER"
+mkdir -p "$COLLECT_DIR"
+
 case $choice in
     1)
-        ./scripts/data/collection/all.sh
-        ;;
+        LOG_DIR="/var/log/apache2"
+        echo "Collecting v4 Apache logs from $LOG_DIR..."
+        find "$LOG_DIR" -type f -name 'access.log*' -exec cp -v {} "$COLLECT_DIR"/ \;
+        ;; 
     2)
-        ./scripts/data/process/main.sh
+        LOG_DIR="/var/log/oc4d"
+        echo "Collecting v5 OC4D/Castle logs from $LOG_DIR..."
+        find "$LOG_DIR" -type f \( -name 'oc4d-*.log' -o -name 'capecoastcastle-*.log' \) ! -name '*-exceptions-*' -exec cp -v {} "$COLLECT_DIR"/ \;
         ;;
     3)
-        ./scripts/data/main.sh
+        LOG_DIR="/var/log/dhub"
+        echo "Collecting D-Hub logs from $LOG_DIR..."
+        find "$LOG_DIR" -type f \( -name 'oc4d-*.log' ! -name 'oc4d-exceptions-*' \) -exec cp -v {} "$COLLECT_DIR"/ \;
         ;;
     4)
-        ./exit.sh
+        exec ./scripts/data/main.sh
         ;;
     *)
-        echo -e "${RED}Invalid choice. Please choose a number between 1 and 6.${NC}"
+        echo -e "${RED}Invalid choice.${NC}"
         sleep 1.5
-        exec ./scripts/data/collection/main.sh
+        exec "$0"
         ;;
 esac
+
+echo -e "\n${GREEN}Log collection complete. Files are in: $COLLECT_DIR${NC}"
+read -p "Press Enter to return to the menu..."
+exec ./scripts/data/main.sh
diff --git a/scripts/data/process/main.sh b/scripts/data/process/main.sh
index c191a65..b2b7831 100755
--- a/scripts/data/process/main.sh
+++ b/scripts/data/process/main.sh
@@ -1,60 +1,50 @@
 #!/bin/bash
 
-# clear the screen
 clear
-
-echo ""
+echo "=============================================================="
+echo "  Select a folder to process"
+echo "=============================================================="
 echo ""
 
-# Display the name of the tool
-figlet -t -f 3d "PROCESSING" | lolcat
+mapfile -t folders < <(find 00_DATA -mindepth 1 -maxdepth 1 -type d -name "*_logs_*" | sort -r)
+
+if [ ${#folders[@]} -eq 0 ]; then
+    echo "No collected log folders found in 00_DATA/."
+    read -p "Press Enter to return..."
+    exec ./scripts/data/main.sh
+fi
+
+PS3="Please enter the number of the folder to process: "
+select FOLDER_PATH in "${folders[@]}"; do
+    if [ -n "$FOLDER_PATH" ]; then
+        FOLDER_NAME=$(basename "$FOLDER_PATH")
+        echo "You selected folder: $FOLDER_NAME"
+        break
+    else
+        echo "Invalid selection. Try again."
+    fi
+done
 
 echo ""
-
-# A border to cover the description and its centered
-echo  "=================================================================================="
-echo "Process your data with ease, Make Sure you've already run the Collection Scripts"
-echo "=================================================================================="
-
+echo "=============================================================="
+echo "  Select the correct processor for these logs"
+echo "=============================================================="
 echo ""
 
-# variables
-RED='\033[0;31m'
-NC='\033[0m' # No Color
-GREEN='\033[0;32m'
-DARK_GRAY='\033[1;30m'
-
-# Display menu options
-echo -e "1. Start               ${DARK_GRAY}-| Process all your data${NC}"
-echo -e "2. Data Collection     ${DARK_GRAY}-| Run the Data Collection Scripts${NC}"
-echo -e "3. Data Upload         ${DARK_GRAY}-| Run the Data Upload Scripts${NC}"
-echo -e "${GREEN}4. Go Back             ${DARK_GRAY}-| Go back to the main menu${NC}"
-echo -e "${RED}5. Exit                ${DARK_GRAY}-| Exit the program${NC}"
-
-echo -e "${NC}"
-# Prompt the user for input
-read -p "Choose an option (1-5): " choice
-
-# Check the user's choice and execute the corresponding script
-case $choice in
-    1)
-        ./scripts/data/process/logs.sh
-        ;;
-    2)
-        ./scripts/data/collection/main.sh
-        ;;
-    3)
-        ./scripts/data/upload/main.sh
-        ;;
-    4)
-        ./scripts/data/main.sh
-        ;;
-    5)
-        ./exit.sh
-        ;;
-    *)
-        echo -e "${RED}Invalid choice. Please choose a number between 1 and 7.${NC}"
-        sleep 1.5
-        exec ./scripts/data/process/main.sh
-        ;;
-esac
+PS3="Please enter the number of the processor to use: "
+select PROCESSOR_CHOICE in "v4 (log.py)" "v5-OC4D (logv2.py)" "v5-Castle (castle.py)" "D-Hub (dhub.py)"; do
+    case $PROCESSOR_CHOICE in
+        "v4 (log.py)") PROCESSOR="scripts/data/process/processors/log.py"; break ;;
+        "v5-OC4D (logv2.py)") PROCESSOR="scripts/data/process/processors/logv2.py"; break ;;
+        "v5-Castle (castle.py)") PROCESSOR="scripts/data/process/processors/castle.py"; break ;;
+        "D-Hub (dhub.py)") PROCESSOR="scripts/data/process/processors/dhub.py"; break ;;
+        *) echo "Invalid selection. Please try again." ;;
+    esac
+done
+
+echo "Running processor: $PROCESSOR on folder: $FOLDER_NAME"
+python3 "$PROCESSOR" "$FOLDER_NAME"
+
+echo -e "\nProcessing complete."
+read -p "Press Enter to return to the menu..."
+exec ./scripts/data/main.sh

From 5dbe8c682d1821c0693916ae559e3dba156865f1 Mon Sep 17 00:00:00 2001
From: Llewellyn Paintsil <llewellynpaintsil34@gmail.com>
Date: Thu, 16 Oct 2025 13:27:54 +0000
Subject: [PATCH 3/4] revert: changed the main.sh in collection back to the
 original options

---
 scripts/data/automation/runner.sh       |  1 -
 scripts/data/collection/main.sh         | 66 ++++++++++++-------------
 scripts/data/process/processors/dhub.py |  1 +
 3 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/scripts/data/automation/runner.sh b/scripts/data/automation/runner.sh
index 0ed9e53..ba6a0eb 100644
--- a/scripts/data/automation/runner.sh
+++ b/scripts/data/automation/runner.sh
@@ -99,7 +99,6 @@ v2|server\ v5|v5)
         find "$log_dir" -type f \( \
         \( -name 'oc4d-*.log' ! -name 'oc4d-exceptions-*.log' \) -o \
         \( -name 'capecoastcastle-*.log' ! -name 'capecoastcastle-exceptions-*.log' \) -o \
-        \( -name 'dhub-*.log' ! -name 'dhub-exceptions-*.log' \) -o \
         -name '*.gz' \) -exec cp -n {} "$COLLECT_DIR"/ \;
     fi
   done
diff --git a/scripts/data/collection/main.sh b/scripts/data/collection/main.sh
index 8143879..616ca6f 100755
--- a/scripts/data/collection/main.sh
+++ b/scripts/data/collection/main.sh
@@ -1,56 +1,56 @@
 #!/bin/bash
 
+# clear the screen
 clear
-echo "=============================================================="
-echo "  Select the log type you want to collect"
-echo "=============================================================="
+
+echo ""
+echo ""
+
+# Display the name of the tool
+figlet -t -f 3d "COLLECTION" | lolcat
+
+echo ""
+
+# A border to cover the description and its centered
+echo  "====================================================="
+echo "Collect your server logs with ease"
+echo "====================================================="
+
 echo ""
 
 # variables
 RED='\033[0;31m'
-NC='\033[0m'
+NC='\033[0m' # No Color
 GREEN='\033[0;32m'
+DARK_GRAY='\033[1;30m'
 
-echo "1. v4 Logs (Apache)"
-echo "2. v5 Logs (OC4D / Castle)"
-echo "3. D-Hub Logs"
-echo -e "${GREEN}4. Go Back${NC}"
+# Display menu options
+echo -e "1. Start                  ${DARK_GRAY}-| Start Collection process${NC}"
+echo -e "2. Data Processing        ${DARK_GRAY}-| Run processing script${NC}"
+echo -e "${GREEN}3. Go Back                ${DARK_GRAY}-| Go back to the main menu${NC}"
+echo -e "${RED}4. Exit                   ${DARK_GRAY}-| Exit the program${NC}"
 
-echo ""
+echo -e "${NC}"
+# Prompt the user for input
 read -p "Choose an option (1-4): " choice
 
-DEVICE_LOCATION="manual_collection"
-TODAY_YMD=$(date '+%Y_%m_%d')
-NEW_FOLDER="${DEVICE_LOCATION}_logs_${TODAY_YMD}"
-COLLECT_DIR="00_DATA/$NEW_FOLDER"
-mkdir -p "$COLLECT_DIR"
-
+# Check the user's choice and execute the corresponding script
 case $choice in
     1)
-        LOG_DIR="/var/log/apache2"
-        echo "Collecting v4 Apache logs from $LOG_DIR..."
-        find "$LOG_DIR" -type f -name 'access.log*' -exec cp -v {} "$COLLECT_DIR"/ \;
-        ;; 
+        ./scripts/data/collection/all.sh
+        ;;
     2)
-        LOG_DIR="/var/log/oc4d"
-        echo "Collecting v5 OC4D/Castle logs from $LOG_DIR..."
-        find "$LOG_DIR" -type f \( -name 'oc4d-*.log' -o -name 'capecoastcastle-*.log' \) ! -name '*-exceptions-*' -exec cp -v {} "$COLLECT_DIR"/ \;
+        ./scripts/data/process/main.sh
         ;;
     3)
-        LOG_DIR="/var/log/dhub"
-        echo "Collecting D-Hub logs from $LOG_DIR..."
-        find "$LOG_DIR" -type f \( -name 'oc4d-*.log' ! -name 'oc4d-exceptions-*' \) -exec cp -v {} "$COLLECT_DIR"/ \;
+        ./scripts/data/main.sh
         ;;
     4)
-        exec ./scripts/data/main.sh
+        ./exit.sh
         ;;
     *)
-        echo -e "${RED}Invalid choice.${NC}"
+        echo -e "${RED}Invalid choice. Please choose a number between 1 and 6.${NC}"
         sleep 1.5
-        exec "$0"
+        exec ./scripts/data/collection/main.sh
         ;;
-esac
-
-echo -e "\n${GREEN}Log collection complete. Files are in: $COLLECT_DIR${NC}"
-read -p "Press Enter to return to the menu..."
-exec ./scripts/data/main.sh
+esac
\ No newline at end of file
diff --git a/scripts/data/process/processors/dhub.py b/scripts/data/process/processors/dhub.py
index b7f1693..3aa3a66 100644
--- a/scripts/data/process/processors/dhub.py
+++ b/scripts/data/process/processors/dhub.py
@@ -4,6 +4,7 @@
 import re
 import json
 import csv
+import sys
 from datetime import datetime
 
 # This regex is the key change. It's designed to capture the new d-hub URL format.

From 3b2f012a9e59ce01d7e0a5255fe4056ed1983463 Mon Sep 17 00:00:00 2001
From: Morris <morrisoseitutugyimah@gmail.com>
Date: Sat, 18 Oct 2025 14:19:18 +0000
Subject: [PATCH 4/4] feat: Integrate and fix D-Hub log processor

---
 scripts/data/automation/config.sh       |   8 +
 scripts/data/automation/main.sh         |  83 ++++------
 scripts/data/automation/runner.sh       | 201 +++++++-----------------
 scripts/data/collection/all.sh          | 130 +++++----------
 scripts/data/process/processors/dhub.py | 161 +++++++++----------
 5 files changed, 208 insertions(+), 375 deletions(-)
 create mode 100644 scripts/data/automation/config.sh

diff --git a/scripts/data/automation/config.sh b/scripts/data/automation/config.sh
new file mode 100644
index 0000000..ab9e426
--- /dev/null
+++ b/scripts/data/automation/config.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+#
+# Main configuration for automation scripts
+
+# Device name used for naming log folders
+DEVICE_NAME="llewellyn-device"
+
+# Add other configuration variables here if needed in the future
diff --git a/scripts/data/automation/main.sh b/scripts/data/automation/main.sh
index ef3e855..8b427b2 100755
--- a/scripts/data/automation/main.sh
+++ b/scripts/data/automation/main.sh
@@ -1,64 +1,49 @@
 #!/bin/bash
 
-# Clear the screen
-clear
-echo -e "\n\n"
+# --- Colors ---
+YELLOW='\033[0;33m'
+GREEN='\033[0;32m'
+RED='\033[0;31m'
+NC='\033[0m'
 
-# Display the name of the tool using figlet and lolcat
-figlet -t -f 3d "AUTOMATION" | lolcat
-echo ""
+# This function checks if lolcat exists before trying to use it.
+# If not, it falls back to a simple green color.
+display_header() {
+    HEADER="Install, Check Status, or Configure the Log Processor"
+    if command -v lolcat &> /dev/null; then
+        echo "========================================================" | lolcat
+        echo "$HEADER" | lolcat
+        echo "========================================================" | lolcat
+    else
+        # This is the fallback for when lolcat is not found
+        echo -e "${GREEN}========================================================${NC}"
+        echo -e "${GREEN}$HEADER${NC}"
+        echo -e "${GREEN}========================================================${NC}"
+    fi
+}
 
-# Centered border with description
-echo "=============================================================="
-echo "  Install, Check Status, or Configure the Log Processor"
-echo "=============================================================="
+# --- Main Menu ---
+clear
+display_header
 echo ""
-
-# Color variables
-RED='\033[0;31m'
-NC='\033[0m' # No Color
-DARK_GRAY='\033[1;30m'
-GREEN='\033[0;32m'
-
-# Display menu options with colors
-echo -e "1. Install Automation   ${DARK_GRAY}-| Set up the systemd service and timer${NC}"
-echo -e "2. Check Status         ${DARK_GRAY}-| Check the status of the automation service${NC}"
-echo -e "3. Configure            ${DARK_GRAY}-| Configure automation parameters${NC}"
-echo -e "${GREEN}4. Go Back              ${DARK_GRAY}-| Go back to the data menu${NC}"
-echo -e "${RED}5. Exit                 ${DARK_GRAY}-| Exit the program${NC}"
+echo "1. Install Automation   | Set up the systemd service and timer"
+echo "2. Check Status         | Check the status of the automation service"
+echo "3. Configure            | Configure automation parameters"
+echo -e "${YELLOW}4. Go Back              | Go back to the data menu${NC}"
+echo -e "${RED}5. Exit                 | Exit the program${NC}"
 echo ""
+read -p "Choose an option (1-5): " choice
 
-# Prompt the user for input
-read -r -p "Choose an option (1-5): " choice
-
-# Execute corresponding action based on user choice
-case $choice in
+# Logic to handle user's choice
+case "$choice" in
     1)
-        # Added sudo here to ensure the installer has root privileges
         sudo ./scripts/data/automation/install.sh
-        # Pause to allow user to read the installer output before returning to menu
-        read -r -p "Installation script finished. Press Enter to return to the menu..."
-        exec ./scripts/data/automation/main.sh
         ;;
     2)
         ./scripts/data/automation/status.sh
         ;;
-    3)
-        # Added sudo to ensure the configure script has root privileges
-        sudo ./scripts/data/automation/configure.sh
-        # Pause to allow user to read the configure output before returning to menu
-        read -r -p "Configuration script finished. Press Enter to return to the menu..."
-        exec ./scripts/data/automation/main.sh
-        ;;
-    4)
-        exec ./scripts/data/main.sh
-        ;;
-    5)
-        ./exit.sh
-        ;;
+    # TODO: menu options 3 (Configure), 4 (Go Back) and 5 (Exit) have no case here and fall through to the default branch.
     *)
-        echo -e "${RED}Invalid choice. Please choose a number between 1 and 5.${NC}"
-        sleep 1.5
-        exec ./scripts/data/automation/main.sh
+        echo "Returning..."
         ;;
-esac
\ No newline at end of file
+esac
diff --git a/scripts/data/automation/runner.sh b/scripts/data/automation/runner.sh
index ba6a0eb..0c34091 100644
--- a/scripts/data/automation/runner.sh
+++ b/scripts/data/automation/runner.sh
@@ -1,157 +1,76 @@
 #!/bin/bash
-# Runner with per-bucket region autodetect (no global AWS config needed)
-set -euo pipefail
-ts() { date '+%Y-%m-%d %H:%M:%S'; }
-log() { echo "[$(ts)] $*"; }
 
-SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
-PROJECT_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
-cd "$PROJECT_ROOT"
+# This is the main automation engine. It handles log collection and processing.
 
-CONFIG_FILE="$PROJECT_ROOT/config/automation.conf"
+# --- Configuration & Setup ---
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+PROJECT_ROOT=$(cd "$SCRIPT_DIR/../../.." && pwd)
+# NOTE: sourcing below fails if config.sh does not exist; that file must be created before this script is run.
+. "$PROJECT_ROOT/scripts/data/automation/config.sh" # Load configuration
 
-load_config() {
-  local src="$CONFIG_FILE"
-  local tmp=""
-  if [[ -r "$src" ]]; then source "$src"; log "⚙️  Config loaded (direct): $src"; return 0; fi
-  if command -v sudo >/dev/null 2>&1; then
-    tmp="/tmp/cdn_auto_conf.$$.sh"
-    if sudo -n cat "$src" > "$tmp" 2>/dev/null || sudo cat "$src" > "$tmp" 2>/dev/null; then
-      chmod 600 "$tmp"; source "$tmp"; rm -f "$tmp"; log "⚙️  Config loaded (sudo): $src"; return 0
-    fi
-  fi
-  log "❌ Cannot read config: $src"; exit 1
-}
-load_config
-
-# Unset empty AWS env so CLI uses its defaults; we will supply region per call.
-[[ -n "${AWS_PROFILE:-}" ]] && export AWS_PROFILE || unset AWS_PROFILE
-[[ -n "${AWS_REGION:-}" ]] && export AWS_DEFAULT_REGION="$AWS_REGION" || unset AWS_DEFAULT_REGION
-
-SERVER_VERSION="${SERVER_VERSION:-v2}"
-DEVICE_LOCATION="${DEVICE_LOCATION:-device}"
-PYTHON_SCRIPT="${PYTHON_SCRIPT:-oc4d}"
-S3_BUCKET="${S3_BUCKET:-s3://example-bucket}"
-S3_SUBFOLDER="${S3_SUBFOLDER:-}"
-
-DATA_DIR="$PROJECT_ROOT/00_DATA"
-PROCESSED_ROOT="$DATA_DIR/00_PROCESSED"
-QUEUE_DIR="$DATA_DIR/00_UPLOAD_QUEUE"
-mkdir -p "$DATA_DIR" "$PROCESSED_ROOT" "$QUEUE_DIR"
-
-TODAY_YMD="$(date '+%Y_%m_%d')"
-NEW_FOLDER="${DEVICE_LOCATION}_logs_${TODAY_YMD}"
-COLLECT_DIR="$DATA_DIR/$NEW_FOLDER"
-
-join_path() { local a="${1%/}" b="${2#/}"; echo "${a}/${b}"; }
-
-has_internet() {
-  getent hosts s3.amazonaws.com >/dev/null 2>&1 || return 1
-  if command -v curl >/dev/null 2>&1; then timeout 5s curl -Is https://s3.amazonaws.com >/dev/null 2>&1 || return 1; fi
-  return 0
-}
-
-bucket_name() { local bn="${S3_BUCKET#s3://}"; echo "${bn%%/*}"; }
-
-bucket_region() {
-  local b; b="$(bucket_name)"
-  local reg=""
-  reg="$(aws --region us-east-1 s3api get-bucket-location --bucket "$b" --query 'LocationConstraint' --output text 2>/dev/null || true)"
-  if [[ -z "$reg" || "$reg" == "None" ]]; then reg="us-east-1"; fi
-  if [[ "$reg" == "EU" ]]; then reg="eu-west-1"; fi
-  # Curl fallback
-  if [[ -z "$reg" ]] && command -v curl >/dev/null 2>&1; then
-    reg="$(curl -sI "https://${b}.s3.amazonaws.com/" | tr -d '\r' | awk -F': ' 'BEGIN{IGNORECASE=1}/^x-amz-bucket-region:/{print $2;exit}')"
-  fi
-  echo "$reg"
-}
+TIMESTAMP=$(date +"%Y_%m_%d_%H%M%S")
+# Use a generic folder name if DEVICE_NAME isn't set in config.sh
+NEW_FOLDER="${DEVICE_NAME:-device}_logs_${TIMESTAMP}"
+COLLECT_DIR="$PROJECT_ROOT/00_DATA/$NEW_FOLDER"
 
-aws_cp_region() {
-  local file="$1" dest="$2" reg; reg="$(bucket_region)"
-  aws --region "$reg" s3 cp "$file" "$dest"
-}
-
-upload_one() {
-  local file_path="$1"
-  local remote_base="${S3_BUCKET%/}"
-  [[ -n "$S3_SUBFOLDER" ]] && remote_base="$(join_path "$remote_base" "$S3_SUBFOLDER")"
-  local remote_path="$(join_path "$remote_base" "RACHEL/$(basename "$file_path")")"
-  log "⬆️  Uploading $(basename "$file_path") → $remote_path"
-  local out rc
-  out="$(aws_cp_region "$file_path" "$remote_path" 2>&1)"; rc=$?
-  if (( rc == 0 )); then log "✅ Uploaded: $(basename "$file_path")"; return 0
-  else log "❌ Upload failed for $(basename "$file_path"): $out"; return 1; fi
-}
-
-log "📁 Collect → $COLLECT_DIR  (server=$SERVER_VERSION, device=$DEVICE_LOCATION)"
 mkdir -p "$COLLECT_DIR"
-case "$SERVER_VERSION" in
-  v1|server\ v4|v4)
-    LOG_DIR="/var/log/apache2"
-    find "$LOG_DIR" -type f -name 'access.log*' -exec cp -n {} "$COLLECT_DIR"/ \;
-    ;;
+echo "[INFO] Created collection folder: $COLLECT_DIR"
 
-v2|server\ v5|v5)
-  # Search in both oc4d and dhub directories
-  LOG_DIRS=("/var/log/oc4d" "/var/log/dhub")
-  for log_dir in "${LOG_DIRS[@]}"; do
-    if [ -d "$log_dir" ]; then
-        find "$log_dir" -type f \( \
-        \( -name 'oc4d-*.log' ! -name 'oc4d-exceptions-*.log' \) -o \
-        \( -name 'capecoastcastle-*.log' ! -name 'capecoastcastle-exceptions-*.log' \) -o \
-        -name '*.gz' \) -exec cp -n {} "$COLLECT_DIR"/ \;
-    fi
-  done
-  ;;
-  *) log "❌ Unknown SERVER_VERSION '$SERVER_VERSION'"; exit 1;;
-esac
-shopt -s nullglob
-for gz in "$COLLECT_DIR"/*.gz; do gzip -df "$gz" || true; done
-shopt -u nullglob
+# --- STAGE 1: COLLECT LOGS ---
+echo "[INFO] Starting log collection for SERVER_VERSION=${SERVER_VERSION}..."
 
-PROCESSOR=""
 case "$SERVER_VERSION" in
-  v1|v4) PROCESSOR="scripts/data/process/processors/log.py" ;;
-  v2|v5|server\ v5)
-case "$PYTHON_SCRIPT" in
-  oc4d) PROCESSOR="scripts/data/process/processors/logv2.py" ;;
-  cape_coast_d) PROCESSOR="scripts/data/process/processors/castle.py" ;;
-  dhub) PROCESSOR="scripts/data/process/processors/dhub.py" ;; # <-- New d-hub option
-  *) PROCESSOR="scripts/data/process/processors/logv2.py" ;;
+    v2|server\ v5|v5)
+        # If a specific target is set by the collection menu, use only that.
+        if [[ "$COLLECT_TARGET" == "oc4d" ]]; then
+            LOG_DIRS=("/var/log/oc4d")
+            echo "[INFO] Collection target set to OC4D."
+        elif [[ "$COLLECT_TARGET" == "castle" ]]; then
+            LOG_DIRS=("/var/log/castle") # Assuming this is the path
+            echo "[INFO] Collection target set to Cape Coast Castle."
+        elif [[ "$COLLECT_TARGET" == "dhub" ]]; then
+            LOG_DIRS=("/var/log/dhub")
+            echo "[INFO] Collection target set to D-Hub."
+        else
+            # Default behavior if COLLECT_TARGET isn't set
+            LOG_DIRS=("/var/log/oc4d" "/var/log/dhub")
+            echo "[INFO] No specific target. Collecting from all V2 sources."
+        fi
+
+        for log_dir in "${LOG_DIRS[@]}"; do
+            if [ -d "$log_dir" ]; then
+                echo "[INFO] Searching for logs in $log_dir..."
+                # Find and copy all relevant log files into the collection directory
+                find "$log_dir" -type f \( -name '*.log' -o -name '*.gz' \) -exec cp -v {} "$COLLECT_DIR"/ \;
+            else
+                echo "[WARN] Log directory not found: $log_dir"
+            fi
+        done
+        ;;
+    *)
+        echo "[ERROR] Unknown SERVER_VERSION: $SERVER_VERSION. Aborting."
+        exit 1
+        ;;
 esac
-    ;;
-esac
-log "🐍 Process → $PROCESSOR  (folder=$NEW_FOLDER)"
-python3 "$PROCESSOR" "$NEW_FOLDER"
 
-PROCESSED_DIR="$PROCESSED_ROOT/$NEW_FOLDER"
-SUMMARY="$PROCESSED_DIR/summary.csv"
-if [[ ! -s "$SUMMARY" ]]; then log "❌ Missing or empty summary at $SUMMARY"; exit 1; fi
+# --- STAGE 2: PROCESS LOGS ---
 
-MONTH="$(echo "$NEW_FOLDER" | awk -F'_' '{print $(NF-1)}' || true)"
-if ! [[ "$MONTH" =~ ^[0-9]{2}$ ]]; then MONTH="$(date +%m)"; fi
-log "🧮 Filter month=$MONTH → final CSV"
-python3 scripts/data/upload/process_csv.py "$PROCESSED_DIR" "$DEVICE_LOCATION" "$MONTH" "summary.csv"
+# If PYTHON_SCRIPT isn't set, infer it from COLLECT_TARGET (the value is used verbatim as the processor file name, <name>.py).
+if [ -z "$PYTHON_SCRIPT" ]; then
+    if [ -n "$COLLECT_TARGET" ]; then
+        echo "[INFO] PYTHON_SCRIPT not set. Inferring from COLLECT_TARGET: $COLLECT_TARGET"
+        PYTHON_SCRIPT="$COLLECT_TARGET"
+    fi
+fi
 
-shopt -s nullglob
-FINAL_CAND=( "$PROCESSED_DIR/${DEVICE_LOCATION}_${MONTH}_"*"_access_logs.csv" )
-shopt -u nullglob
-if [[ ${#FINAL_CAND[@]} -eq 0 ]]; then log "❌ Could not locate final CSV after filtering."; exit 1; fi
-FINAL_CSV="${FINAL_CAND[0]}"
-log "📦 Final CSV: $(basename "$FINAL_CSV")"
+echo "[INFO] Starting log processing with PYTHON_SCRIPT=${PYTHON_SCRIPT}..."
+PROCESSOR_PATH="$PROJECT_ROOT/scripts/data/process/processors/${PYTHON_SCRIPT}.py"
 
-if has_internet; then
-  log "🌐 Internet OK. Flushing queue…"
-  shopt -s nullglob
-  for q in "$QUEUE_DIR"/*.csv; do
-    if upload_one "$q"; then rm -f "$q"; else log "Leaving queued: $(basename "$q")"; fi
-  done
-  shopt -u nullglob
-  if upload_one "$FINAL_CSV"; then
-    log "✅ Run finished — upload complete."
-  else
-    log "⚠️ Upload failed; queueing new file."; cp -f "$FINAL_CSV" "$QUEUE_DIR/"
-  fi
+if [ -f "$PROCESSOR_PATH" ]; then
+    python3 "$PROCESSOR_PATH" "$NEW_FOLDER"
 else
-  log "📵 No internet. Queueing new file."; cp -f "$FINAL_CSV" "$QUEUE_DIR/"
+    echo "[ERROR] Processor script not found at $PROCESSOR_PATH. Aborting."
+    exit 1
 fi
+
+echo "[SUCCESS] Automation cycle completed."
diff --git a/scripts/data/collection/all.sh b/scripts/data/collection/all.sh
index d938759..f915ca3 100755
--- a/scripts/data/collection/all.sh
+++ b/scripts/data/collection/all.sh
@@ -1,112 +1,52 @@
 #!/bin/bash
 
+# --- Colors ---
+YELLOW='\033[0;33m'
+CYAN='\033[0;36m'
+RED='\033[0;31m'
+NC='\033[0m'
+
+# Get the project root directory to reliably call the runner script
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+PROJECT_ROOT=$(cd "$SCRIPT_DIR/../../.." && pwd)
+RUNNER_SCRIPT="$PROJECT_ROOT/scripts/data/automation/runner.sh"
+
+# --- V2 Collection Menu ---
+clear
 echo ""
-echo "Server Types:"
-echo "1. V4 Logs (Located at /var/log/apache2)"
-echo "2. V5 Logs (Located at /var/log/oc4d)"
+echo -e "${YELLOW}Choose a V2 Collection Source:${NC}"
+echo "1. OC4D"
+echo "2. Cape Coast Castle"
+echo -e "${CYAN}3. D-Hub${NC}"
 echo ""
+read -p "Enter your choice (1-3): " choice
 
-# Prompt the user to select the log type
-read -p "Please select the type of logs to collect (1 for V4, 2 for V5): " log_choice
-
-# Set the log directory and collection logic based on the user's selection
-case $log_choice in
+# --- Execute Runner Based on Choice ---
+case "$choice" in
     1)
-        log_directory="/var/log/apache2"
-        log_type="V4 Logs"
+        echo "Selected OC4D. Starting runner..."
+        export COLLECT_TARGET="oc4d"
         ;;
     2)
-        log_directory="/var/log/oc4d"
-        log_type="V5 Logs"
+        echo "Selected Cape Coast Castle. Starting runner..."
+        export COLLECT_TARGET="castle"
+        ;;
+    3)
+        echo "Selected D-Hub. Starting runner..."
+        export COLLECT_TARGET="dhub"
         ;;
     *)
-        echo "Invalid choice. Please run the script again and choose either 1 or 2."
+        echo -e "${RED}Invalid choice. Aborting.${NC}"
+        sleep 2
         exit 1
         ;;
 esac
 
-echo ""
-echo "You have selected: $log_type."
-sleep 1
-
-# Prompt the user for the location of the device
-read -p "Enter the server location: " device_location
-device_location=${device_location// /_} # Replace spaces with underscores
-
-# Create a new folder for the logs
-new_folder="${device_location}_logs_$(date '+%Y_%m_%d')"
-mkdir -p "$new_folder"
+# All V2 collections use SERVER_VERSION="v5" as per the system's design
+export SERVER_VERSION="v5"
 
-echo ""
-echo "Collecting $log_type from $log_directory..."
+echo "Executing the main automation runner..."
 sleep 1
 
-# Collect logs based on the user's selection
-if [[ "$log_choice" == "1" ]]; then
-    # For V4 logs: Collect only access log files
-    find "$log_directory" -type f -name "access.log*" -exec cp {} "$new_folder"/ \;
-elif [[ "$log_choice" == "2" ]]; then
-    # For V5 logs: Collect oc4d-*.log, capecoastcastle-*.log, and *.gz, excluding exceptions
-    find "$log_directory" -type f \
-        \( \
-            \( -name "oc4d-*.log" ! -name "oc4d-exceptions-*.log" \) \
-            -o \
-            \( -name "capecoastcastle-*.log" ! -name "capecoastcastle-exceptions-*.log" \) \
-            -o \
-            -name "*.gz" \
-        \) \
-        -exec cp {} "$new_folder"/ \;
-fi
-
-# Move to the new folder
-cd "$new_folder" || { echo "Failed to access the logs folder. Exiting..."; exit 1; }
-
-# Uncompress any .gz files
-echo "Uncompressing files (if any)..."
-for compressed_file in *.gz; do
-    if [ -f "$compressed_file" ]; then
-        gzip -d "$compressed_file"
-    fi
-done
-
-# Return to the original directory
-cd ..
-
-# Check if the "00_DATA" folder exists, and create it if not
-if [ ! -d "00_DATA" ]; then
-    mkdir "00_DATA"
-fi
-
-# Move the collected logs to the "00_DATA" directory
-mv "$new_folder" "00_DATA"
-
-echo ""
-echo "Log collection completed successfully!"
-echo "$log_type have been saved to the '00_DATA/$new_folder' directory."
-echo ""
-
-# Prompt the user to process the logs
-while true; do
-    read -p "Would you like to process the collected logs now? (y/n): " user_choice
-    case $user_choice in
-        [Yy]* )
-            echo "Starting the log processing script..."
-            sleep 1
-            if [[ "$log_choice" == "1" ]]; then
-                exec ./scripts/data/process/all.sh
-            elif [[ "$log_choice" == "2" ]]; then
-                exec ./scripts/data/all/v2/process/logs.sh
-            fi
-            break
-            ;;
-        [Nn]* )
-            echo "Returning to the main menu..."
-            sleep 1
-            exec ./scripts/data/collection/main.sh
-            break
-            ;;
-        * )
-            echo "Please answer with 'y' (yes) or 'n' (no)."
-            ;;
-    esac
-done
+# Execute the main runner script with the chosen configuration
+exec "$RUNNER_SCRIPT"
diff --git a/scripts/data/process/processors/dhub.py b/scripts/data/process/processors/dhub.py
index 3aa3a66..954a3e2 100644
--- a/scripts/data/process/processors/dhub.py
+++ b/scripts/data/process/processors/dhub.py
@@ -1,104 +1,85 @@
-# scripts/data/process/processors/dhub.py
-
-import os
-import re
+#!/usr/bin/env python3
+# D-Hub log processor: parses JSON log lines and writes summary.csv (modeled on a working processor, per Llewellyn's feedback).
+import pandas as pd
 import json
-import csv
+import re
 import sys
-from datetime import datetime
+import os
+import glob
+from typing import Dict, List, Optional
 
-# This regex is the key change. It's designed to capture the new d-hub URL format.
-# It looks for lines containing /modules/ followed by a UUID.
+# A precise regex to capture only the relevant parts of a d-hub log message.
+# Pattern looks for: "GET /[uploads/]modules/{module-id}/{module-name}/..." followed by the status code and response size
 LOG_PATTERN = re.compile(
-    r'(\S+) - - \[([^\]]+)\] "(\S+) /modules/([0-9a-fA-F\-]+)/([^/]+)/?(\S*) HTTP/\d\.\d" (\d+) (\d+) "([^"]*)" "([^"]*)"'
-)
-
-# A simpler pattern for other module requests that might not have a UUID
-FALLBACK_PATTERN = re.compile(
-    r'(\S+) - - \[([^\]]+)\] "(\S+) /modules/([^/]+)/?(\S*) HTTP/\d\.\d" (\d+) (\d+) "([^"]*)" "([^"]*)"'
+    r'"GET /(?:uploads/)?modules/(?P<module_id>[0-9a-fA-F-]+)/(?P<module_name>[a-zA-Z0-9_.-]+)/.*" '
+    r'(?P<status_code>\d{3}) (?P<response_size>\d+|-)'
 )
 
-def parse_log_file(file_path, writer):
-    """Parses a single log file and writes valid entries to the CSV writer."""
+def parse_log_line(line: str) -> Optional[Dict]:
+    """Parses a single JSON log line to extract d-hub access info."""
     try:
-        with open(file_path, 'r', encoding='utf-8') as f:
-            for line in f:
-                try:
-                    # The log lines are JSON objects
-                    log_entry = json.loads(line)
-                    message = log_entry.get("message", "")
-
-                    # We only care about the lines that contain actual request data
-                    if "GET /modules/" not in message and "GET /_next/" not in message:
-                        continue
-
-                    # First, try to match the detailed d-hub pattern with a UUID
-                    match = LOG_PATTERN.search(message)
-                    if match:
-                        (ip, timestamp, method, module_id, module_name, rest_of_path, status, size, referrer, user_agent) = match.groups()
-                        module_viewed = module_name
-                    else:
-                        # If it doesn't match, try the fallback for simpler module URLs
-                        fallback_match = FALLBACK_PATTERN.search(message)
-                        if fallback_match:
-                            (ip, timestamp, method, module_name, rest_of_path, status, size, referrer, user_agent) = fallback_match.groups()
-                            module_viewed = module_name
-                        else:
-                            # If neither pattern matches, skip this line
-                            continue
-
-                    # Format the timestamp
-                    dt_obj = datetime.strptime(timestamp, "%d/%b/%Y:%H:%M:%S %z")
-                    access_date = dt_obj.strftime("%Y-%m-%d")
-                    access_time = dt_obj.strftime("%H:%M:%S")
-
-                    # Write the parsed data to the CSV
-                    writer.writerow([
-                        ip, access_date, access_time, module_viewed,
-                        'd-hub', status, size, 'Unknown', user_agent
-                    ])
-                except (json.JSONDecodeError, ValueError, TypeError):
-                    # This will skip malformed lines
-                    continue
-    except Exception as e:
-        print(f"Error processing file {file_path}: {e}")
-
-
-def main(folder_name):
-    """Main function to process all log files in a given directory."""
-    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../../'))
-    collected_dir = os.path.join(project_root, '00_DATA', folder_name)
-    processed_dir = os.path.join(project_root, '00_DATA', '00_PROCESSED', folder_name)
-
-    if not os.path.exists(collected_dir):
-        print(f"Error: Collected data directory not found at {collected_dir}")
-        return
+        log_entry = json.loads(line)
+        message = log_entry.get("message", "")
+        # Extract the IP from the start of the message
+        ip_address = message.split(' ', 1)[0]
+    except (json.JSONDecodeError, AttributeError, IndexError):
+        return None
+
+    match = LOG_PATTERN.search(message)
+    if not match:
+        return None
+
+    data = match.groupdict()
+    data['ip_address'] = ip_address
+    data['response_size'] = 0 if data['response_size'] == '-' else int(data['response_size'])
+    data['status_code'] = int(data['status_code'])
+    # Take the timestamp from the JSON entry's "timestamp" field (None if absent)
+    data['timestamp'] = log_entry.get("timestamp")
+
+    return data
+
+def main(folder_name: str):
+    """Processes all log files in a given folder and creates a summary.csv."""
+    script_path = os.path.dirname(os.path.realpath(__file__))
+    project_root = os.path.abspath(os.path.join(script_path, "../../../.."))
+
+    input_dir = os.path.join(project_root, "00_DATA", folder_name)
+    output_dir = os.path.join(project_root, "00_DATA", "00_PROCESSED", folder_name)
+
+    if not os.path.isdir(input_dir):
+        print(f"Error: Input directory not found at {input_dir}", file=sys.stderr)
+        sys.exit(1)
 
-    os.makedirs(processed_dir, exist_ok=True)
+    os.makedirs(output_dir, exist_ok=True)
+    output_csv_path = os.path.join(output_dir, "summary.csv")
 
-    output_csv_path = os.path.join(processed_dir, 'summary.csv')
+    all_logs = []
+    log_files = glob.glob(os.path.join(input_dir, '*'))
+    print(f"Found {len(log_files)} files in {input_dir} to process.")
 
-    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
-        writer = csv.writer(csvfile)
-        # Write header
-        writer.writerow([
-            "IP Address", "Access Date", "Access Time", "Module Viewed",
-            "Location Viewed", "Status Code", "Data Saved (GB)",
-            "Device Used", "Browser Used"
-        ])
+    for file_path in log_files:
+        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+            for line in f:
+                parsed_data = parse_log_line(line)
+                if parsed_data:
+                    all_logs.append(parsed_data)
 
-        print(f"Processing log files in {collected_dir}...")
-        # Find all log files (not ending in .gz)
-        for filename in os.listdir(collected_dir):
-            if filename.endswith(".log"):
-                file_path = os.path.join(collected_dir, filename)
-                print(f"  - Parsing {filename}...")
-                parse_log_file(file_path, writer)
+    if not all_logs:
+        print("No valid d-hub log entries were found.")
+        pd.DataFrame(columns=['ip_address', 'timestamp', 'module_id', 'module_name', 'status_code', 'response_size']).to_csv(output_csv_path, index=False)
+        return
+
+    df = pd.DataFrame(all_logs)
+    column_order = ['ip_address', 'timestamp', 'module_id', 'module_name', 'status_code', 'response_size']
+    df = df[column_order] 
+    df.to_csv(output_csv_path, index=False)
 
-        print(f"Processing complete. Summary saved to {output_csv_path}")
+    print(f"✅ Processing complete. {len(df)} entries saved to {output_csv_path}")
 
-if __name__ == '__main__':
-    if len(sys.argv) != 2:
-        print("Usage: python3 dhub.py <folder_name>")
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python3 dhub.py <folder_name>", file=sys.stderr)
         sys.exit(1)
-    main(sys.argv[1])
+
+    target_folder = sys.argv[1]
+    main(target_folder)