From 84867b7dda4724a205023ccd2fd2b6d0c40fc550 Mon Sep 17 00:00:00 2001 From: morris Date: Mon, 29 Sep 2025 22:11:36 +0000 Subject: [PATCH 1/4] Add d-hub log processor and integrate into runner --- scripts/{data => }/automation/README.md | 0 scripts/{data => }/automation/configure.sh | 5 +- scripts/{data => }/automation/flush_queue.sh | 0 scripts/{data => }/automation/install.sh | 0 scripts/{data => }/automation/main.sh | 0 scripts/{data => }/automation/runner.sh | 32 ++++-- scripts/{data => }/automation/status.sh | 0 .../automation/update_service_logging.sh | 0 scripts/data/.main.sh.swp | Bin 0 -> 1024 bytes scripts/data/process/processors/dhub.py | 103 ++++++++++++++++++ 10 files changed, 126 insertions(+), 14 deletions(-) rename scripts/{data => }/automation/README.md (100%) rename scripts/{data => }/automation/configure.sh (99%) rename scripts/{data => }/automation/flush_queue.sh (100%) rename scripts/{data => }/automation/install.sh (100%) rename scripts/{data => }/automation/main.sh (100%) rename scripts/{data => }/automation/runner.sh (85%) rename scripts/{data => }/automation/status.sh (100%) rename scripts/{data => }/automation/update_service_logging.sh (100%) create mode 100644 scripts/data/.main.sh.swp create mode 100644 scripts/data/process/processors/dhub.py diff --git a/scripts/data/automation/README.md b/scripts/automation/README.md similarity index 100% rename from scripts/data/automation/README.md rename to scripts/automation/README.md diff --git a/scripts/data/automation/configure.sh b/scripts/automation/configure.sh similarity index 99% rename from scripts/data/automation/configure.sh rename to scripts/automation/configure.sh index c0662e6..dec54a5 100755 --- a/scripts/data/automation/configure.sh +++ b/scripts/automation/configure.sh @@ -115,8 +115,9 @@ sel=$(menu_select "Select server version" 15 74 5 \ if [[ "$SERVER_VERSION" == "v2" ]]; then PYTHON_SCRIPT=$(menu_select "Select logs flavor (v2)" 12 74 5 \ - oc4d "OC4D logs (logv2.py)" \ - 
cape_coast_d "Cape Coast Castle logs (castle.py)" \ +oc4d "OC4D logs (logv2.py)" \ +cape_coast_d "Cape Coast Castle logs (castle.py)" \ +dhub "D-Hub logs (dhub.py)" \ ) else PYTHON_SCRIPT="oc4d" diff --git a/scripts/data/automation/flush_queue.sh b/scripts/automation/flush_queue.sh similarity index 100% rename from scripts/data/automation/flush_queue.sh rename to scripts/automation/flush_queue.sh diff --git a/scripts/data/automation/install.sh b/scripts/automation/install.sh similarity index 100% rename from scripts/data/automation/install.sh rename to scripts/automation/install.sh diff --git a/scripts/data/automation/main.sh b/scripts/automation/main.sh similarity index 100% rename from scripts/data/automation/main.sh rename to scripts/automation/main.sh diff --git a/scripts/data/automation/runner.sh b/scripts/automation/runner.sh similarity index 85% rename from scripts/data/automation/runner.sh rename to scripts/automation/runner.sh index 9e470d0..0ed9e53 100644 --- a/scripts/data/automation/runner.sh +++ b/scripts/automation/runner.sh @@ -90,13 +90,20 @@ case "$SERVER_VERSION" in LOG_DIR="/var/log/apache2" find "$LOG_DIR" -type f -name 'access.log*' -exec cp -n {} "$COLLECT_DIR"/ \; ;; - v2|server\ v5|v5) - LOG_DIR="/var/log/oc4d" - find "$LOG_DIR" -type f \( \ - \( -name 'oc4d-*.log' ! -name 'oc4d-exceptions-*.log' \) -o \ - \( -name 'capecoastcastle-*.log' ! -name 'capecoastcastle-exceptions-*.log' \) -o \ - -name '*.gz' \) -exec cp -n {} "$COLLECT_DIR"/ \; - ;; + +v2|server\ v5|v5) + # Search in both oc4d and dhub directories + LOG_DIRS=("/var/log/oc4d" "/var/log/dhub") + for log_dir in "${LOG_DIRS[@]}"; do + if [ -d "$log_dir" ]; then + find "$log_dir" -type f \( \ + \( -name 'oc4d-*.log' ! -name 'oc4d-exceptions-*.log' \) -o \ + \( -name 'capecoastcastle-*.log' ! -name 'capecoastcastle-exceptions-*.log' \) -o \ + \( -name 'dhub-*.log' ! 
-name 'dhub-exceptions-*.log' \) -o \ + -name '*.gz' \) -exec cp -n {} "$COLLECT_DIR"/ \; + fi + done + ;; *) log "❌ Unknown SERVER_VERSION '$SERVER_VERSION'"; exit 1;; esac shopt -s nullglob @@ -107,11 +114,12 @@ PROCESSOR="" case "$SERVER_VERSION" in v1|v4) PROCESSOR="scripts/data/process/processors/log.py" ;; v2|v5|server\ v5) - case "$PYTHON_SCRIPT" in - oc4d) PROCESSOR="scripts/data/process/processors/logv2.py" ;; - cape_coast_d) PROCESSOR="scripts/data/process/processors/castle.py" ;; - *) PROCESSOR="scripts/data/process/processors/logv2.py" ;; - esac +case "$PYTHON_SCRIPT" in + oc4d) PROCESSOR="scripts/data/process/processors/logv2.py" ;; + cape_coast_d) PROCESSOR="scripts/data/process/processors/castle.py" ;; + dhub) PROCESSOR="scripts/data/process/processors/dhub.py" ;; # <-- New d-hub option + *) PROCESSOR="scripts/data/process/processors/logv2.py" ;; +esac ;; esac log "🐍 Process → $PROCESSOR (folder=$NEW_FOLDER)" diff --git a/scripts/data/automation/status.sh b/scripts/automation/status.sh similarity index 100% rename from scripts/data/automation/status.sh rename to scripts/automation/status.sh diff --git a/scripts/data/automation/update_service_logging.sh b/scripts/automation/update_service_logging.sh similarity index 100% rename from scripts/data/automation/update_service_logging.sh rename to scripts/automation/update_service_logging.sh diff --git a/scripts/data/.main.sh.swp b/scripts/data/.main.sh.swp new file mode 100644 index 0000000000000000000000000000000000000000..ffbd3dfc69008aea8ab6488d38260df1436f87d5 GIT binary patch literal 1024 zcmYc?$V<%2S1{8vVn6|kmHZ4jIjQBTIXRViC^DEFxH?^r09_*k16_AxUB^s+428wX iMVSR9#ri3UC5igEiJ5tN#TgiiMg>MgU^E2i76JhBOcqT5 literal 0 HcmV?d00001 diff --git a/scripts/data/process/processors/dhub.py b/scripts/data/process/processors/dhub.py new file mode 100644 index 0000000..b7f1693 --- /dev/null +++ b/scripts/data/process/processors/dhub.py @@ -0,0 +1,103 @@ +# scripts/data/process/processors/dhub.py + +import os +import re 
+import json +import csv +from datetime import datetime + +# This regex is the key change. It's designed to capture the new d-hub URL format. +# It looks for lines containing /modules/ followed by a UUID. +LOG_PATTERN = re.compile( + r'(\S+) - - \[([^\]]+)\] "(\S+) /modules/([0-9a-fA-F\-]+)/([^/]+)/?(\S*) HTTP/\d\.\d" (\d+) (\d+) "([^"]*)" "([^"]*)"' +) + +# A simpler pattern for other module requests that might not have a UUID +FALLBACK_PATTERN = re.compile( + r'(\S+) - - \[([^\]]+)\] "(\S+) /modules/([^/]+)/?(\S*) HTTP/\d\.\d" (\d+) (\d+) "([^"]*)" "([^"]*)"' +) + +def parse_log_file(file_path, writer): + """Parses a single log file and writes valid entries to the CSV writer.""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + for line in f: + try: + # The log lines are JSON objects + log_entry = json.loads(line) + message = log_entry.get("message", "") + + # We only care about the lines that contain actual request data + if "GET /modules/" not in message and "GET /_next/" not in message: + continue + + # First, try to match the detailed d-hub pattern with a UUID + match = LOG_PATTERN.search(message) + if match: + (ip, timestamp, method, module_id, module_name, rest_of_path, status, size, referrer, user_agent) = match.groups() + module_viewed = module_name + else: + # If it doesn't match, try the fallback for simpler module URLs + fallback_match = FALLBACK_PATTERN.search(message) + if fallback_match: + (ip, timestamp, method, module_name, rest_of_path, status, size, referrer, user_agent) = fallback_match.groups() + module_viewed = module_name + else: + # If neither pattern matches, skip this line + continue + + # Format the timestamp + dt_obj = datetime.strptime(timestamp, "%d/%b/%Y:%H:%M:%S %z") + access_date = dt_obj.strftime("%Y-%m-%d") + access_time = dt_obj.strftime("%H:%M:%S") + + # Write the parsed data to the CSV + writer.writerow([ + ip, access_date, access_time, module_viewed, + 'd-hub', status, size, 'Unknown', user_agent + ]) + except 
(json.JSONDecodeError, ValueError, TypeError): + # This will skip malformed lines + continue + except Exception as e: + print(f"Error processing file {file_path}: {e}") + + +def main(folder_name): + """Main function to process all log files in a given directory.""" + project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../../')) + collected_dir = os.path.join(project_root, '00_DATA', folder_name) + processed_dir = os.path.join(project_root, '00_DATA', '00_PROCESSED', folder_name) + + if not os.path.exists(collected_dir): + print(f"Error: Collected data directory not found at {collected_dir}") + return + + os.makedirs(processed_dir, exist_ok=True) + + output_csv_path = os.path.join(processed_dir, 'summary.csv') + + with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile: + writer = csv.writer(csvfile) + # Write header + writer.writerow([ + "IP Address", "Access Date", "Access Time", "Module Viewed", + "Location Viewed", "Status Code", "Data Saved (GB)", + "Device Used", "Browser Used" + ]) + + print(f"Processing log files in {collected_dir}...") + # Find all log files (not ending in .gz) + for filename in os.listdir(collected_dir): + if filename.endswith(".log"): + file_path = os.path.join(collected_dir, filename) + print(f" - Parsing {filename}...") + parse_log_file(file_path, writer) + + print(f"Processing complete. 
Summary saved to {output_csv_path}") + +if __name__ == '__main__': + if len(sys.argv) != 2: + print("Usage: python3 dhub.py ") + sys.exit(1) + main(sys.argv[1]) From ff9c5579f86c1cccb9cc16379323ca3080a9a40d Mon Sep 17 00:00:00 2001 From: morris Date: Mon, 6 Oct 2025 07:39:05 +0000 Subject: [PATCH 2/4] push to test --- scripts/data/.main.sh.swp | Bin 1024 -> 0 bytes scripts/{ => data}/automation/README.md | 0 scripts/{ => data}/automation/configure.sh | 0 scripts/{ => data}/automation/flush_queue.sh | 0 scripts/{ => data}/automation/install.sh | 0 scripts/{ => data}/automation/main.sh | 0 scripts/{ => data}/automation/runner.sh | 0 scripts/{ => data}/automation/status.sh | 0 .../automation/update_service_logging.sh | 0 scripts/data/collection/main.sh | 64 ++++++------ scripts/data/process/main.sh | 92 ++++++++---------- 11 files changed, 73 insertions(+), 83 deletions(-) delete mode 100644 scripts/data/.main.sh.swp rename scripts/{ => data}/automation/README.md (100%) rename scripts/{ => data}/automation/configure.sh (100%) rename scripts/{ => data}/automation/flush_queue.sh (100%) rename scripts/{ => data}/automation/install.sh (100%) rename scripts/{ => data}/automation/main.sh (100%) rename scripts/{ => data}/automation/runner.sh (100%) rename scripts/{ => data}/automation/status.sh (100%) rename scripts/{ => data}/automation/update_service_logging.sh (100%) diff --git a/scripts/data/.main.sh.swp b/scripts/data/.main.sh.swp deleted file mode 100644 index ffbd3dfc69008aea8ab6488d38260df1436f87d5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1024 zcmYc?$V<%2S1{8vVn6|kmHZ4jIjQBTIXRViC^DEFxH?^r09_*k16_AxUB^s+428wX iMVSR9#ri3UC5igEiJ5tN#TgiiMg>MgU^E2i76JhBOcqT5 diff --git a/scripts/automation/README.md b/scripts/data/automation/README.md similarity index 100% rename from scripts/automation/README.md rename to scripts/data/automation/README.md diff --git a/scripts/automation/configure.sh b/scripts/data/automation/configure.sh 
similarity index 100% rename from scripts/automation/configure.sh rename to scripts/data/automation/configure.sh diff --git a/scripts/automation/flush_queue.sh b/scripts/data/automation/flush_queue.sh similarity index 100% rename from scripts/automation/flush_queue.sh rename to scripts/data/automation/flush_queue.sh diff --git a/scripts/automation/install.sh b/scripts/data/automation/install.sh similarity index 100% rename from scripts/automation/install.sh rename to scripts/data/automation/install.sh diff --git a/scripts/automation/main.sh b/scripts/data/automation/main.sh similarity index 100% rename from scripts/automation/main.sh rename to scripts/data/automation/main.sh diff --git a/scripts/automation/runner.sh b/scripts/data/automation/runner.sh similarity index 100% rename from scripts/automation/runner.sh rename to scripts/data/automation/runner.sh diff --git a/scripts/automation/status.sh b/scripts/data/automation/status.sh similarity index 100% rename from scripts/automation/status.sh rename to scripts/data/automation/status.sh diff --git a/scripts/automation/update_service_logging.sh b/scripts/data/automation/update_service_logging.sh similarity index 100% rename from scripts/automation/update_service_logging.sh rename to scripts/data/automation/update_service_logging.sh diff --git a/scripts/data/collection/main.sh b/scripts/data/collection/main.sh index 23ed8a8..8143879 100755 --- a/scripts/data/collection/main.sh +++ b/scripts/data/collection/main.sh @@ -1,56 +1,56 @@ #!/bin/bash -# clear the screen clear - -echo "" -echo "" - -# Display the name of the tool -figlet -t -f 3d "COLLECTION" | lolcat - -echo "" - -# A border to cover the description and its centered -echo "=====================================================" -echo "Collect your server logs with ease" -echo "=====================================================" - +echo "==============================================================" +echo " Select the log type you want to collect" +echo 
"==============================================================" echo "" # variables RED='\033[0;31m' -NC='\033[0m' # No Color +NC='\033[0m' GREEN='\033[0;32m' -DARK_GRAY='\033[1;30m' -# Display menu options -echo -e "1. Start ${DARK_GRAY}-| Start Collection process${NC}" -echo -e "2. Data Processing ${DARK_GRAY}-| Run processing script${NC}" -echo -e "${GREEN}3. Go Back ${DARK_GRAY}-| Go back to the main menu${NC}" -echo -e "${RED}4. Exit ${DARK_GRAY}-| Exit the program${NC}" +echo "1. v4 Logs (Apache)" +echo "2. v5 Logs (OC4D / Castle)" +echo "3. D-Hub Logs" +echo -e "${GREEN}4. Go Back${NC}" -echo -e "${NC}" -# Prompt the user for input +echo "" read -p "Choose an option (1-4): " choice -# Check the user's choice and execute the corresponding script +DEVICE_LOCATION="manual_collection" +TODAY_YMD=$(date '+%Y_%m_%d') +NEW_FOLDER="${DEVICE_LOCATION}_logs_${TODAY_YMD}" +COLLECT_DIR="00_DATA/$NEW_FOLDER" +mkdir -p "$COLLECT_DIR" + case $choice in 1) - ./scripts/data/collection/all.sh - ;; + LOG_DIR="/var/log/apache2" + echo "Collecting v4 Apache logs from $LOG_DIR..." + find "$LOG_DIR" -type f -name 'access.log*' -exec cp -v {} "$COLLECT_DIR"/ \; + ;; 2) - ./scripts/data/process/main.sh + LOG_DIR="/var/log/oc4d" + echo "Collecting v5 OC4D/Castle logs from $LOG_DIR..." + find "$LOG_DIR" -type f \( -name 'oc4d-*.log' -o -name 'capecoastcastle-*.log' \) ! -name '*-exceptions-*' -exec cp -v {} "$COLLECT_DIR"/ \; ;; 3) - ./scripts/data/main.sh + LOG_DIR="/var/log/dhub" + echo "Collecting D-Hub logs from $LOG_DIR..." + find "$LOG_DIR" -type f \( -name 'oc4d-*.log' ! -name 'oc4d-exceptions-*' \) -exec cp -v {} "$COLLECT_DIR"/ \; ;; 4) - ./exit.sh + exec ./scripts/data/main.sh ;; *) - echo -e "${RED}Invalid choice. Please choose a number between 1 and 6.${NC}" + echo -e "${RED}Invalid choice.${NC}" sleep 1.5 - exec ./scripts/data/collection/main.sh + exec "$0" ;; esac + +echo -e "\n${GREEN}Log collection complete. 
Files are in: $COLLECT_DIR${NC}" +read -p "Press Enter to return to the menu..." +exec ./scripts/data/main.sh diff --git a/scripts/data/process/main.sh b/scripts/data/process/main.sh index c191a65..b2b7831 100755 --- a/scripts/data/process/main.sh +++ b/scripts/data/process/main.sh @@ -1,60 +1,50 @@ #!/bin/bash -# clear the screen clear - -echo "" +echo "==============================================================" +echo " Select a folder to process" +echo "==============================================================" echo "" -# Display the name of the tool -figlet -t -f 3d "PROCESSING" | lolcat +mapfile -t folders < <(find 00_DATA -mindepth 1 -maxdepth 1 -type d -name "*_logs_*" | sort -r) + +if [ ${#folders[@]} -eq 0 ]; then + echo "No collected log folders found in 00_DATA/." + read -p "Press Enter to return..." + exec ./scripts/data/main.sh +fi + +PS3="Please enter the number of the folder to process: " +select FOLDER_PATH in "${folders[@]}"; do + if [ -n "$FOLDER_PATH" ]; then + FOLDER_NAME=$(basename "$FOLDER_PATH") + echo "You selected folder: $FOLDER_NAME" + break + else + echo "Invalid selection. Try again." + fi +done echo "" - -# A border to cover the description and its centered -echo "==================================================================================" -echo "Process your data with ease, Make Sure you've already run the Collection Scripts" -echo "==================================================================================" - +echo "==============================================================" +echo " Select the correct processor for these logs" +echo "==============================================================" echo "" -# variables -RED='\033[0;31m' -NC='\033[0m' # No Color -GREEN='\033[0;32m' -DARK_GRAY='\033[1;30m' - -# Display menu options -echo -e "1. Start ${DARK_GRAY}-| Process all your data${NC}" -echo -e "2. Data Collection ${DARK_GRAY}-| Run the Data Collection Scripts${NC}" -echo -e "3. 
Data Upload ${DARK_GRAY}-| Run the Data Upload Scripts${NC}" -echo -e "${GREEN}4. Go Back ${DARK_GRAY}-| Go back to the main menu${NC}" -echo -e "${RED}5. Exit ${DARK_GRAY}-| Exit the program${NC}" - -echo -e "${NC}" -# Prompt the user for input -read -p "Choose an option (1-5): " choice - -# Check the user's choice and execute the corresponding script -case $choice in - 1) - ./scripts/data/process/logs.sh - ;; - 2) - ./scripts/data/collection/main.sh - ;; - 3) - ./scripts/data/upload/main.sh - ;; - 4) - ./scripts/data/main.sh - ;; - 5) - ./exit.sh - ;; - *) - echo -e "${RED}Invalid choice. Please choose a number between 1 and 7.${NC}" - sleep 1.5 - exec ./scripts/data/process/main.sh - ;; -esac +PS3="Please enter the number of the processor to use: " +select PROCESSOR_CHOICE in "v4 (log.py)" "v5-OC4D (logv2.py)" "v5-Castle (castle.py)" "D-Hub (dhub.py)"; do + case $PROCESSOR_CHOICE in + "v4 (log.py)") PROCESSOR="scripts/data/process/processors/log.py"; break ;; + "v5-OC4D (logv2.py)") PROCESSOR="scripts/data/process/processors/logv2.py"; break ;; + "v5-Castle (castle.py)") PROCESSOR="scripts/data/process/processors/castle.py"; break ;; + "D-Hub (dhub.py)") PROCESSOR="scripts/data/process/processors/dhub.py"; break ;; + *) echo "Invalid selection. Please try again." ;; + esac +done + +echo "Running processor: $PROCESSOR on folder: $FOLDER_NAME" +python3 "$PROCESSOR" "$FOLDER_NAME" + +echo -e "\nProcessing complete." +read -p "Press Enter to return to the menu..." 
+exec ./scripts/data/main.sh From 5dbe8c682d1821c0693916ae559e3dba156865f1 Mon Sep 17 00:00:00 2001 From: Llewellyn Paintsil Date: Thu, 16 Oct 2025 13:27:54 +0000 Subject: [PATCH 3/4] revert: changed the main.sh in collection back to the original options --- scripts/data/automation/runner.sh | 1 - scripts/data/collection/main.sh | 66 ++++++++++++------------- scripts/data/process/processors/dhub.py | 1 + 3 files changed, 34 insertions(+), 34 deletions(-) diff --git a/scripts/data/automation/runner.sh b/scripts/data/automation/runner.sh index 0ed9e53..ba6a0eb 100644 --- a/scripts/data/automation/runner.sh +++ b/scripts/data/automation/runner.sh @@ -99,7 +99,6 @@ v2|server\ v5|v5) find "$log_dir" -type f \( \ \( -name 'oc4d-*.log' ! -name 'oc4d-exceptions-*.log' \) -o \ \( -name 'capecoastcastle-*.log' ! -name 'capecoastcastle-exceptions-*.log' \) -o \ - \( -name 'dhub-*.log' ! -name 'dhub-exceptions-*.log' \) -o \ -name '*.gz' \) -exec cp -n {} "$COLLECT_DIR"/ \; fi done diff --git a/scripts/data/collection/main.sh b/scripts/data/collection/main.sh index 8143879..616ca6f 100755 --- a/scripts/data/collection/main.sh +++ b/scripts/data/collection/main.sh @@ -1,56 +1,56 @@ #!/bin/bash +# clear the screen clear -echo "==============================================================" -echo " Select the log type you want to collect" -echo "==============================================================" + +echo "" +echo "" + +# Display the name of the tool +figlet -t -f 3d "COLLECTION" | lolcat + +echo "" + +# A border to cover the description and its centered +echo "=====================================================" +echo "Collect your server logs with ease" +echo "=====================================================" + echo "" # variables RED='\033[0;31m' -NC='\033[0m' +NC='\033[0m' # No Color GREEN='\033[0;32m' +DARK_GRAY='\033[1;30m' -echo "1. v4 Logs (Apache)" -echo "2. v5 Logs (OC4D / Castle)" -echo "3. D-Hub Logs" -echo -e "${GREEN}4. 
Go Back${NC}" +# Display menu options +echo -e "1. Start ${DARK_GRAY}-| Start Collection process${NC}" +echo -e "2. Data Processing ${DARK_GRAY}-| Run processing script${NC}" +echo -e "${GREEN}3. Go Back ${DARK_GRAY}-| Go back to the main menu${NC}" +echo -e "${RED}4. Exit ${DARK_GRAY}-| Exit the program${NC}" -echo "" +echo -e "${NC}" +# Prompt the user for input read -p "Choose an option (1-4): " choice -DEVICE_LOCATION="manual_collection" -TODAY_YMD=$(date '+%Y_%m_%d') -NEW_FOLDER="${DEVICE_LOCATION}_logs_${TODAY_YMD}" -COLLECT_DIR="00_DATA/$NEW_FOLDER" -mkdir -p "$COLLECT_DIR" - +# Check the user's choice and execute the corresponding script case $choice in 1) - LOG_DIR="/var/log/apache2" - echo "Collecting v4 Apache logs from $LOG_DIR..." - find "$LOG_DIR" -type f -name 'access.log*' -exec cp -v {} "$COLLECT_DIR"/ \; - ;; + ./scripts/data/collection/all.sh + ;; 2) - LOG_DIR="/var/log/oc4d" - echo "Collecting v5 OC4D/Castle logs from $LOG_DIR..." - find "$LOG_DIR" -type f \( -name 'oc4d-*.log' -o -name 'capecoastcastle-*.log' \) ! -name '*-exceptions-*' -exec cp -v {} "$COLLECT_DIR"/ \; + ./scripts/data/process/main.sh ;; 3) - LOG_DIR="/var/log/dhub" - echo "Collecting D-Hub logs from $LOG_DIR..." - find "$LOG_DIR" -type f \( -name 'oc4d-*.log' ! -name 'oc4d-exceptions-*' \) -exec cp -v {} "$COLLECT_DIR"/ \; + ./scripts/data/main.sh ;; 4) - exec ./scripts/data/main.sh + ./exit.sh ;; *) - echo -e "${RED}Invalid choice.${NC}" + echo -e "${RED}Invalid choice. Please choose a number between 1 and 6.${NC}" sleep 1.5 - exec "$0" + exec ./scripts/data/collection/main.sh ;; -esac - -echo -e "\n${GREEN}Log collection complete. Files are in: $COLLECT_DIR${NC}" -read -p "Press Enter to return to the menu..." 
-exec ./scripts/data/main.sh +esac \ No newline at end of file diff --git a/scripts/data/process/processors/dhub.py b/scripts/data/process/processors/dhub.py index b7f1693..3aa3a66 100644 --- a/scripts/data/process/processors/dhub.py +++ b/scripts/data/process/processors/dhub.py @@ -4,6 +4,7 @@ import re import json import csv +import sys from datetime import datetime # This regex is the key change. It's designed to capture the new d-hub URL format. From 3b2f012a9e59ce01d7e0a5255fe4056ed1983463 Mon Sep 17 00:00:00 2001 From: Morris Date: Sat, 18 Oct 2025 14:19:18 +0000 Subject: [PATCH 4/4] feat: Integrate and fix D-Hub log processor --- scripts/data/automation/config.sh | 8 + scripts/data/automation/main.sh | 83 ++++------ scripts/data/automation/runner.sh | 201 +++++++----------------- scripts/data/collection/all.sh | 130 +++++---------- scripts/data/process/processors/dhub.py | 161 +++++++++---------- 5 files changed, 208 insertions(+), 375 deletions(-) create mode 100644 scripts/data/automation/config.sh diff --git a/scripts/data/automation/config.sh b/scripts/data/automation/config.sh new file mode 100644 index 0000000..ab9e426 --- /dev/null +++ b/scripts/data/automation/config.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# +# Main configuration for automation scripts + +# Device name used for naming log folders +DEVICE_NAME="llewellyn-device" + +# Add other configuration variables here if needed in the future diff --git a/scripts/data/automation/main.sh b/scripts/data/automation/main.sh index ef3e855..8b427b2 100755 --- a/scripts/data/automation/main.sh +++ b/scripts/data/automation/main.sh @@ -1,64 +1,49 @@ #!/bin/bash -# Clear the screen -clear -echo -e "\n\n" +# --- Colors --- +YELLOW='\033[0;33m' +GREEN='\033[0;32m' +RED='\033[0;31m' +NC='\033[0m' -# Display the name of the tool using figlet and lolcat -figlet -t -f 3d "AUTOMATION" | lolcat -echo "" +# This function checks if lolcat exists before trying to use it. +# If not, it falls back to a simple green color. 
+display_header() { + HEADER="Install, Check Status, or Configure the Log Processor" + if command -v lolcat &> /dev/null; then + echo "========================================================" | lolcat + echo "$HEADER" | lolcat + echo "========================================================" | lolcat + else + # This is the fallback for when lolcat is not found + echo -e "${GREEN}========================================================${NC}" + echo -e "${GREEN}$HEADER${NC}" + echo -e "${GREEN}========================================================${NC}" + fi +} -# Centered border with description -echo "==============================================================" -echo " Install, Check Status, or Configure the Log Processor" -echo "==============================================================" +# --- Main Menu --- +clear +display_header echo "" - -# Color variables -RED='\033[0;31m' -NC='\033[0m' # No Color -DARK_GRAY='\033[1;30m' -GREEN='\033[0;32m' - -# Display menu options with colors -echo -e "1. Install Automation ${DARK_GRAY}-| Set up the systemd service and timer${NC}" -echo -e "2. Check Status ${DARK_GRAY}-| Check the status of the automation service${NC}" -echo -e "3. Configure ${DARK_GRAY}-| Configure automation parameters${NC}" -echo -e "${GREEN}4. Go Back ${DARK_GRAY}-| Go back to the data menu${NC}" -echo -e "${RED}5. Exit ${DARK_GRAY}-| Exit the program${NC}" +echo "1. Install Automation | Set up the systemd service and timer" +echo "2. Check Status | Check the status of the automation service" +echo "3. Configure | Configure automation parameters" +echo -e "${YELLOW}4. Go Back | Go back to the data menu${NC}" +echo -e "${RED}5. 
Exit | Exit the program${NC}" echo "" +read -p "Choose an option (1-5): " choice -# Prompt the user for input -read -r -p "Choose an option (1-5): " choice - -# Execute corresponding action based on user choice -case $choice in +# Logic to handle user's choice +case "$choice" in 1) - # Added sudo here to ensure the installer has root privileges sudo ./scripts/data/automation/install.sh - # Pause to allow user to read the installer output before returning to menu - read -r -p "Installation script finished. Press Enter to return to the menu..." - exec ./scripts/data/automation/main.sh ;; 2) ./scripts/data/automation/status.sh ;; - 3) - # Added sudo to ensure the configure script has root privileges - sudo ./scripts/data/automation/configure.sh - # Pause to allow user to read the configure output before returning to menu - read -r -p "Configuration script finished. Press Enter to return to the menu..." - exec ./scripts/data/automation/main.sh - ;; - 4) - exec ./scripts/data/main.sh - ;; - 5) - ./exit.sh - ;; + # Add cases for other options if they exist *) - echo -e "${RED}Invalid choice. Please choose a number between 1 and 5.${NC}" - sleep 1.5 - exec ./scripts/data/automation/main.sh + echo "Returning..." ;; -esac \ No newline at end of file +esac diff --git a/scripts/data/automation/runner.sh b/scripts/data/automation/runner.sh index ba6a0eb..0c34091 100644 --- a/scripts/data/automation/runner.sh +++ b/scripts/data/automation/runner.sh @@ -1,157 +1,76 @@ #!/bin/bash -# Runner with per-bucket region autodetect (no global AWS config needed) -set -euo pipefail -ts() { date '+%Y-%m-%d %H:%M:%S'; } -log() { echo "[$(ts)] $*"; } -SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -PROJECT_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" -cd "$PROJECT_ROOT" +# This is the main automation engine. It handles collection, processing, and filtering. 
-CONFIG_FILE="$PROJECT_ROOT/config/automation.conf" +# --- Configuration & Setup --- +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +PROJECT_ROOT=$(cd "$SCRIPT_DIR/../../.." && pwd) +# The line below might cause an error if config.sh doesn't exist. We will create it next. +. "$PROJECT_ROOT/scripts/data/automation/config.sh" # Load configuration -load_config() { - local src="$CONFIG_FILE" - local tmp="" - if [[ -r "$src" ]]; then source "$src"; log "⚙️ Config loaded (direct): $src"; return 0; fi - if command -v sudo >/dev/null 2>&1; then - tmp="/tmp/cdn_auto_conf.$$.sh" - if sudo -n cat "$src" > "$tmp" 2>/dev/null || sudo cat "$src" > "$tmp" 2>/dev/null; then - chmod 600 "$tmp"; source "$tmp"; rm -f "$tmp"; log "⚙️ Config loaded (sudo): $src"; return 0 - fi - fi - log "❌ Cannot read config: $src"; exit 1 -} -load_config - -# Unset empty AWS env so CLI uses its defaults; we will supply region per call. -[[ -n "${AWS_PROFILE:-}" ]] && export AWS_PROFILE || unset AWS_PROFILE -[[ -n "${AWS_REGION:-}" ]] && export AWS_DEFAULT_REGION="$AWS_REGION" || unset AWS_DEFAULT_REGION - -SERVER_VERSION="${SERVER_VERSION:-v2}" -DEVICE_LOCATION="${DEVICE_LOCATION:-device}" -PYTHON_SCRIPT="${PYTHON_SCRIPT:-oc4d}" -S3_BUCKET="${S3_BUCKET:-s3://example-bucket}" -S3_SUBFOLDER="${S3_SUBFOLDER:-}" - -DATA_DIR="$PROJECT_ROOT/00_DATA" -PROCESSED_ROOT="$DATA_DIR/00_PROCESSED" -QUEUE_DIR="$DATA_DIR/00_UPLOAD_QUEUE" -mkdir -p "$DATA_DIR" "$PROCESSED_ROOT" "$QUEUE_DIR" - -TODAY_YMD="$(date '+%Y_%m_%d')" -NEW_FOLDER="${DEVICE_LOCATION}_logs_${TODAY_YMD}" -COLLECT_DIR="$DATA_DIR/$NEW_FOLDER" - -join_path() { local a="${1%/}" b="${2#/}"; echo "${a}/${b}"; } - -has_internet() { - getent hosts s3.amazonaws.com >/dev/null 2>&1 || return 1 - if command -v curl >/dev/null 2>&1; then timeout 5s curl -Is https://s3.amazonaws.com >/dev/null 2>&1 || return 1; fi - return 0 -} - -bucket_name() { local bn="${S3_BUCKET#s3://}"; echo "${bn%%/*}"; } - -bucket_region() { - local 
b; b="$(bucket_name)" - local reg="" - reg="$(aws --region us-east-1 s3api get-bucket-location --bucket "$b" --query 'LocationConstraint' --output text 2>/dev/null || true)" - if [[ -z "$reg" || "$reg" == "None" ]]; then reg="us-east-1"; fi - if [[ "$reg" == "EU" ]]; then reg="eu-west-1"; fi - # Curl fallback - if [[ -z "$reg" ]] && command -v curl >/dev/null 2>&1; then - reg="$(curl -sI "https://${b}.s3.amazonaws.com/" | tr -d '\r' | awk -F': ' 'BEGIN{IGNORECASE=1}/^x-amz-bucket-region:/{print $2;exit}')" - fi - echo "$reg" -} +TIMESTAMP=$(date +"%Y_%m_%d_%H%M%S") +# Use a generic folder name if DEVICE_NAME isn't set in config.sh +NEW_FOLDER="${DEVICE_NAME:-device}_logs_${TIMESTAMP}" +COLLECT_DIR="$PROJECT_ROOT/00_DATA/$NEW_FOLDER" -aws_cp_region() { - local file="$1" dest="$2" reg; reg="$(bucket_region)" - aws --region "$reg" s3 cp "$file" "$dest" -} - -upload_one() { - local file_path="$1" - local remote_base="${S3_BUCKET%/}" - [[ -n "$S3_SUBFOLDER" ]] && remote_base="$(join_path "$remote_base" "$S3_SUBFOLDER")" - local remote_path="$(join_path "$remote_base" "RACHEL/$(basename "$file_path")")" - log "⬆️ Uploading $(basename "$file_path") → $remote_path" - local out rc - out="$(aws_cp_region "$file_path" "$remote_path" 2>&1)"; rc=$? - if (( rc == 0 )); then log "✅ Uploaded: $(basename "$file_path")"; return 0 - else log "❌ Upload failed for $(basename "$file_path"): $out"; return 1; fi -} - -log "📁 Collect → $COLLECT_DIR (server=$SERVER_VERSION, device=$DEVICE_LOCATION)" mkdir -p "$COLLECT_DIR" -case "$SERVER_VERSION" in - v1|server\ v4|v4) - LOG_DIR="/var/log/apache2" - find "$LOG_DIR" -type f -name 'access.log*' -exec cp -n {} "$COLLECT_DIR"/ \; - ;; +echo "[INFO] Created collection folder: $COLLECT_DIR" -v2|server\ v5|v5) - # Search in both oc4d and dhub directories - LOG_DIRS=("/var/log/oc4d" "/var/log/dhub") - for log_dir in "${LOG_DIRS[@]}"; do - if [ -d "$log_dir" ]; then - find "$log_dir" -type f \( \ - \( -name 'oc4d-*.log' ! 
-name 'oc4d-exceptions-*.log' \) -o \ - \( -name 'capecoastcastle-*.log' ! -name 'capecoastcastle-exceptions-*.log' \) -o \ - -name '*.gz' \) -exec cp -n {} "$COLLECT_DIR"/ \; - fi - done - ;; - *) log "❌ Unknown SERVER_VERSION '$SERVER_VERSION'"; exit 1;; -esac -shopt -s nullglob -for gz in "$COLLECT_DIR"/*.gz; do gzip -df "$gz" || true; done -shopt -u nullglob +# --- STAGE 1: COLLECT LOGS --- +echo "[INFO] Starting log collection for SERVER_VERSION=${SERVER_VERSION}..." -PROCESSOR="" case "$SERVER_VERSION" in - v1|v4) PROCESSOR="scripts/data/process/processors/log.py" ;; - v2|v5|server\ v5) -case "$PYTHON_SCRIPT" in - oc4d) PROCESSOR="scripts/data/process/processors/logv2.py" ;; - cape_coast_d) PROCESSOR="scripts/data/process/processors/castle.py" ;; - dhub) PROCESSOR="scripts/data/process/processors/dhub.py" ;; # <-- New d-hub option - *) PROCESSOR="scripts/data/process/processors/logv2.py" ;; + v2|server\ v5|v5) + # If a specific target is set by the collection menu, use only that. + if [[ "$COLLECT_TARGET" == "oc4d" ]]; then + LOG_DIRS=("/var/log/oc4d") + echo "[INFO] Collection target set to OC4D." + elif [[ "$COLLECT_TARGET" == "castle" ]]; then + LOG_DIRS=("/var/log/castle") # Assuming this is the path + echo "[INFO] Collection target set to Cape Coast Castle." + elif [[ "$COLLECT_TARGET" == "dhub" ]]; then + LOG_DIRS=("/var/log/dhub") + echo "[INFO] Collection target set to D-Hub." + else + # Default behavior if COLLECT_TARGET isn't set + LOG_DIRS=("/var/log/oc4d" "/var/log/dhub") + echo "[INFO] No specific target. Collecting from all V2 sources." + fi + + for log_dir in "${LOG_DIRS[@]}"; do + if [ -d "$log_dir" ]; then + echo "[INFO] Searching for logs in $log_dir..." 
+ # Find and copy all relevant log files into the collection directory + find "$log_dir" -type f \( -name '*.log' -o -name '*.gz' \) -exec cp -v {} "$COLLECT_DIR"/ \; + else + echo "[WARN] Log directory not found: $log_dir" + fi + done + ;; + *) + echo "[ERROR] Unknown SERVER_VERSION: $SERVER_VERSION. Aborting." + exit 1 + ;; esac - ;; -esac -log "🐍 Process → $PROCESSOR (folder=$NEW_FOLDER)" -python3 "$PROCESSOR" "$NEW_FOLDER" -PROCESSED_DIR="$PROCESSED_ROOT/$NEW_FOLDER" -SUMMARY="$PROCESSED_DIR/summary.csv" -if [[ ! -s "$SUMMARY" ]]; then log "❌ Missing or empty summary at $SUMMARY"; exit 1; fi +# --- STAGE 2: PROCESS LOGS --- -MONTH="$(echo "$NEW_FOLDER" | awk -F'_' '{print $(NF-1)}' || true)" -if ! [[ "$MONTH" =~ ^[0-9]{2}$ ]]; then MONTH="$(date +%m)"; fi -log "🧮 Filter month=$MONTH → final CSV" -python3 scripts/data/upload/process_csv.py "$PROCESSED_DIR" "$DEVICE_LOCATION" "$MONTH" "summary.csv" +# If PYTHON_SCRIPT isn't set, infer it from COLLECT_TARGET. +if [ -z "$PYTHON_SCRIPT" ]; then + if [ -n "$COLLECT_TARGET" ]; then + echo "[INFO] PYTHON_SCRIPT not set. Inferring from COLLECT_TARGET: $COLLECT_TARGET" + PYTHON_SCRIPT="$COLLECT_TARGET" + fi +fi -shopt -s nullglob -FINAL_CAND=( "$PROCESSED_DIR/${DEVICE_LOCATION}_${MONTH}_"*"_access_logs.csv" ) -shopt -u nullglob -if [[ ${#FINAL_CAND[@]} -eq 0 ]]; then log "❌ Could not locate final CSV after filtering."; exit 1; fi -FINAL_CSV="${FINAL_CAND[0]}" -log "📦 Final CSV: $(basename "$FINAL_CSV")" +echo "[INFO] Starting log processing with PYTHON_SCRIPT=${PYTHON_SCRIPT}..." +PROCESSOR_PATH="$PROJECT_ROOT/scripts/data/process/processors/${PYTHON_SCRIPT}.py" -if has_internet; then - log "🌐 Internet OK. Flushing queue…" - shopt -s nullglob - for q in "$QUEUE_DIR"/*.csv; do - if upload_one "$q"; then rm -f "$q"; else log "Leaving queued: $(basename "$q")"; fi - done - shopt -u nullglob - if upload_one "$FINAL_CSV"; then - log "✅ Run finished — upload complete." 
- else - log "⚠️ Upload failed; queueing new file."; cp -f "$FINAL_CSV" "$QUEUE_DIR/" - fi +if [ -f "$PROCESSOR_PATH" ]; then + python3 "$PROCESSOR_PATH" "$NEW_FOLDER" else - log "📵 No internet. Queueing new file."; cp -f "$FINAL_CSV" "$QUEUE_DIR/" + echo "[ERROR] Processor script not found at $PROCESSOR_PATH. Aborting." + exit 1 fi + +echo "[SUCCESS] Automation cycle completed." diff --git a/scripts/data/collection/all.sh b/scripts/data/collection/all.sh index d938759..f915ca3 100755 --- a/scripts/data/collection/all.sh +++ b/scripts/data/collection/all.sh @@ -1,112 +1,52 @@ #!/bin/bash +# --- Colors --- +YELLOW='\033[0;33m' +CYAN='\033[0;36m' +RED='\033[0;31m' +NC='\033[0m' + +# Get the project root directory to reliably call the runner script +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +PROJECT_ROOT=$(cd "$SCRIPT_DIR/../../.." && pwd) +RUNNER_SCRIPT="$PROJECT_ROOT/scripts/data/automation/runner.sh" + +# --- V2 Collection Menu --- +clear echo "" -echo "Server Types:" -echo "1. V4 Logs (Located at /var/log/apache2)" -echo "2. V5 Logs (Located at /var/log/oc4d)" +echo -e "${YELLOW}Choose a V2 Collection Source:${NC}" +echo "1. OC4D" +echo "2. Cape Coast Castle" +echo -e "${CYAN}3. D-Hub${NC}" echo "" +read -p "Enter your choice (1-3): " choice -# Prompt the user to select the log type -read -p "Please select the type of logs to collect (1 for V4, 2 for V5): " log_choice - -# Set the log directory and collection logic based on the user's selection -case $log_choice in +# --- Execute Runner Based on Choice --- +case "$choice" in 1) - log_directory="/var/log/apache2" - log_type="V4 Logs" + echo "Selected OC4D. Starting runner..." + export COLLECT_TARGET="oc4d" ;; 2) - log_directory="/var/log/oc4d" - log_type="V5 Logs" + echo "Selected Cape Coast Castle. Starting runner..." + export COLLECT_TARGET="castle" + ;; + 3) + echo "Selected D-Hub. Starting runner..." + export COLLECT_TARGET="dhub" ;; *) - echo "Invalid choice. 
Please run the script again and choose either 1 or 2." + echo -e "${RED}Invalid choice. Aborting.${NC}" + sleep 2 exit 1 ;; esac -echo "" -echo "You have selected: $log_type." -sleep 1 - -# Prompt the user for the location of the device -read -p "Enter the server location: " device_location -device_location=${device_location// /_} # Replace spaces with underscores - -# Create a new folder for the logs -new_folder="${device_location}_logs_$(date '+%Y_%m_%d')" -mkdir -p "$new_folder" +# All V2 collections use SERVER_VERSION="v5" as per the system's design +export SERVER_VERSION="v5" -echo "" -echo "Collecting $log_type from $log_directory..." +echo "Executing the main automation runner..." sleep 1 -# Collect logs based on the user's selection -if [[ "$log_choice" == "1" ]]; then - # For V4 logs: Collect only access log files - find "$log_directory" -type f -name "access.log*" -exec cp {} "$new_folder"/ \; -elif [[ "$log_choice" == "2" ]]; then - # For V5 logs: Collect oc4d-*.log, capecoastcastle-*.log, and *.gz, excluding exceptions - find "$log_directory" -type f \ - \( \ - \( -name "oc4d-*.log" ! -name "oc4d-exceptions-*.log" \) \ - -o \ - \( -name "capecoastcastle-*.log" ! -name "capecoastcastle-exceptions-*.log" \) \ - -o \ - -name "*.gz" \ - \) \ - -exec cp {} "$new_folder"/ \; -fi - -# Move to the new folder -cd "$new_folder" || { echo "Failed to access the logs folder. Exiting..."; exit 1; } - -# Uncompress any .gz files -echo "Uncompressing files (if any)..." -for compressed_file in *.gz; do - if [ -f "$compressed_file" ]; then - gzip -d "$compressed_file" - fi -done - -# Return to the original directory -cd .. - -# Check if the "00_DATA" folder exists, and create it if not -if [ ! -d "00_DATA" ]; then - mkdir "00_DATA" -fi - -# Move the collected logs to the "00_DATA" directory -mv "$new_folder" "00_DATA" - -echo "" -echo "Log collection completed successfully!" -echo "$log_type have been saved to the '00_DATA/$new_folder' directory." 
-echo "" - -# Prompt the user to process the logs -while true; do - read -p "Would you like to process the collected logs now? (y/n): " user_choice - case $user_choice in - [Yy]* ) - echo "Starting the log processing script..." - sleep 1 - if [[ "$log_choice" == "1" ]]; then - exec ./scripts/data/process/all.sh - elif [[ "$log_choice" == "2" ]]; then - exec ./scripts/data/all/v2/process/logs.sh - fi - break - ;; - [Nn]* ) - echo "Returning to the main menu..." - sleep 1 - exec ./scripts/data/collection/main.sh - break - ;; - * ) - echo "Please answer with 'y' (yes) or 'n' (no)." - ;; - esac -done +# Execute the main runner script with the chosen configuration +exec "$RUNNER_SCRIPT" diff --git a/scripts/data/process/processors/dhub.py b/scripts/data/process/processors/dhub.py index 3aa3a66..954a3e2 100644 --- a/scripts/data/process/processors/dhub.py +++ b/scripts/data/process/processors/dhub.py @@ -1,104 +1,85 @@ -# scripts/data/process/processors/dhub.py - -import os -import re +#!/usr/bin/env python3 +# This script is modeled on a working processor, as per Llewellyn's feedback. +import pandas as pd import json -import csv +import re import sys -from datetime import datetime +import os +import glob +from typing import Dict, List, Optional -# This regex is the key change. It's designed to capture the new d-hub URL format. -# It looks for lines containing /modules/ followed by a UUID. +# A precise regex to capture only the relevant parts of a d-hub log message. 
+# Pattern looks for: /modules/{uuid}/{module-name}/ LOG_PATTERN = re.compile( - r'(\S+) - - \[([^\]]+)\] "(\S+) /modules/([0-9a-fA-F\-]+)/([^/]+)/?(\S*) HTTP/\d\.\d" (\d+) (\d+) "([^"]*)" "([^"]*)"' -) - -# A simpler pattern for other module requests that might not have a UUID -FALLBACK_PATTERN = re.compile( - r'(\S+) - - \[([^\]]+)\] "(\S+) /modules/([^/]+)/?(\S*) HTTP/\d\.\d" (\d+) (\d+) "([^"]*)" "([^"]*)"' + r'"GET /(?:uploads/)?modules/(?P<module_id>[0-9a-fA-F-]+)/(?P<module_name>[a-zA-Z0-9_.-]+)/.*" ' + r'(?P<status_code>\d{3}) (?P<response_size>\d+|-)' ) -def parse_log_file(file_path, writer): - """Parses a single log file and writes valid entries to the CSV writer.""" +def parse_log_line(line: str) -> Optional[Dict]: + """Parses a single JSON log line to extract d-hub access info.""" try: - with open(file_path, 'r', encoding='utf-8') as f: - for line in f: - try: - # The log lines are JSON objects - log_entry = json.loads(line) - message = log_entry.get("message", "") - - # We only care about the lines that contain actual request data - if "GET /modules/" not in message and "GET /_next/" not in message: - continue - - # First, try to match the detailed d-hub pattern with a UUID - match = LOG_PATTERN.search(message) - if match: - (ip, timestamp, method, module_id, module_name, rest_of_path, status, size, referrer, user_agent) = match.groups() - module_viewed = module_name - else: - # If it doesn't match, try the fallback for simpler module URLs - fallback_match = FALLBACK_PATTERN.search(message) - if fallback_match: - (ip, timestamp, method, module_name, rest_of_path, status, size, referrer, user_agent) = fallback_match.groups() - module_viewed = module_name - else: - # If neither pattern matches, skip this line - continue - - # Format the timestamp - dt_obj = datetime.strptime(timestamp, "%d/%b/%Y:%H:%M:%S %z") - access_date = dt_obj.strftime("%Y-%m-%d") - access_time = dt_obj.strftime("%H:%M:%S") - - # Write the parsed data to the CSV - writer.writerow([ - ip, access_date, access_time, module_viewed, - 
'd-hub', status, size, 'Unknown', user_agent - ]) - except (json.JSONDecodeError, ValueError, TypeError): - # This will skip malformed lines - continue - except Exception as e: - print(f"Error processing file {file_path}: {e}") - - -def main(folder_name): - """Main function to process all log files in a given directory.""" - project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../../')) - collected_dir = os.path.join(project_root, '00_DATA', folder_name) - processed_dir = os.path.join(project_root, '00_DATA', '00_PROCESSED', folder_name) - - if not os.path.exists(collected_dir): - print(f"Error: Collected data directory not found at {collected_dir}") - return + log_entry = json.loads(line) + message = log_entry.get("message", "") + # Extract the IP from the start of the message + ip_address = message.split(' ', 1)[0] + except (json.JSONDecodeError, AttributeError, IndexError): + return None + + match = LOG_PATTERN.search(message) + if not match: + return None + + data = match.groupdict() + data['ip_address'] = ip_address + data['response_size'] = 0 if data['response_size'] == '-' else int(data['response_size']) + data['status_code'] = int(data['status_code']) + # Use the more accurate timestamp from the JSON body + data['timestamp'] = log_entry.get("timestamp") + + return data + +def main(folder_name: str): + """Processes all log files in a given folder and creates a summary.csv.""" + script_path = os.path.dirname(os.path.realpath(__file__)) + project_root = os.path.abspath(os.path.join(script_path, "../../../..")) + + input_dir = os.path.join(project_root, "00_DATA", folder_name) + output_dir = os.path.join(project_root, "00_DATA", "00_PROCESSED", folder_name) + + if not os.path.isdir(input_dir): + print(f"Error: Input directory not found at {input_dir}", file=sys.stderr) + sys.exit(1) - os.makedirs(processed_dir, exist_ok=True) + os.makedirs(output_dir, exist_ok=True) + output_csv_path = os.path.join(output_dir, "summary.csv") - 
output_csv_path = os.path.join(processed_dir, 'summary.csv') + all_logs = [] + log_files = glob.glob(os.path.join(input_dir, '*')) + print(f"Found {len(log_files)} files in {input_dir} to process.") - with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile: - writer = csv.writer(csvfile) - # Write header - writer.writerow([ - "IP Address", "Access Date", "Access Time", "Module Viewed", - "Location Viewed", "Status Code", "Data Saved (GB)", - "Device Used", "Browser Used" - ]) + for file_path in log_files: + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + for line in f: + parsed_data = parse_log_line(line) + if parsed_data: + all_logs.append(parsed_data) - print(f"Processing log files in {collected_dir}...") - # Find all log files (not ending in .gz) - for filename in os.listdir(collected_dir): - if filename.endswith(".log"): - file_path = os.path.join(collected_dir, filename) - print(f" - Parsing {filename}...") - parse_log_file(file_path, writer) + if not all_logs: + print("No valid d-hub log entries were found.") + pd.DataFrame(columns=['ip_address', 'timestamp', 'module_id', 'module_name', 'status_code', 'response_size']).to_csv(output_csv_path, index=False) + return + + df = pd.DataFrame(all_logs) + column_order = ['ip_address', 'timestamp', 'module_id', 'module_name', 'status_code', 'response_size'] + df = df[column_order] + df.to_csv(output_csv_path, index=False) - print(f"Processing complete. Summary saved to {output_csv_path}") + print(f"✅ Processing complete. {len(df)} entries saved to {output_csv_path}") -if __name__ == '__main__': - if len(sys.argv) != 2: - print("Usage: python3 dhub.py ") +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python3 dhub.py ", file=sys.stderr) sys.exit(1) - main(sys.argv[1]) + + target_folder = sys.argv[1] + main(target_folder)