diff --git a/scripts/data/automation/config.sh b/scripts/data/automation/config.sh new file mode 100644 index 0000000..ab9e426 --- /dev/null +++ b/scripts/data/automation/config.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# +# Main configuration for automation scripts + +# Device name used for naming log folders +DEVICE_NAME="llewellyn-device" + +# Add other configuration variables here if needed in the future diff --git a/scripts/data/automation/configure.sh b/scripts/data/automation/configure.sh index c0662e6..dec54a5 100755 --- a/scripts/data/automation/configure.sh +++ b/scripts/data/automation/configure.sh @@ -115,8 +115,9 @@ sel=$(menu_select "Select server version" 15 74 5 \ if [[ "$SERVER_VERSION" == "v2" ]]; then PYTHON_SCRIPT=$(menu_select "Select logs flavor (v2)" 12 74 5 \ - oc4d "OC4D logs (logv2.py)" \ - cape_coast_d "Cape Coast Castle logs (castle.py)" \ +oc4d "OC4D logs (logv2.py)" \ +cape_coast_d "Cape Coast Castle logs (castle.py)" \ +dhub "D-Hub logs (dhub.py)" \ ) else PYTHON_SCRIPT="oc4d" diff --git a/scripts/data/automation/main.sh b/scripts/data/automation/main.sh index ef3e855..8b427b2 100755 --- a/scripts/data/automation/main.sh +++ b/scripts/data/automation/main.sh @@ -1,64 +1,57 @@ #!/bin/bash -# Clear the screen -clear -echo -e "\n\n" +# --- Colors --- +YELLOW='\033[0;33m' +GREEN='\033[0;32m' +RED='\033[0;31m' +NC='\033[0m' -# Display the name of the tool using figlet and lolcat -figlet -t -f 3d "AUTOMATION" | lolcat -echo "" +# This function checks if lolcat exists before trying to use it. +# If not, it falls back to a simple green color. 
+display_header() { + HEADER="Install, Check Status, or Configure the Log Processor" + if command -v lolcat &> /dev/null; then + echo "========================================================" | lolcat + echo "$HEADER" | lolcat + echo "========================================================" | lolcat + else + # This is the fallback for when lolcat is not found + echo -e "${GREEN}========================================================${NC}" + echo -e "${GREEN}$HEADER${NC}" + echo -e "${GREEN}========================================================${NC}" + fi +} -# Centered border with description -echo "==============================================================" -echo " Install, Check Status, or Configure the Log Processor" -echo "==============================================================" +# --- Main Menu --- +clear +display_header echo "" - -# Color variables -RED='\033[0;31m' -NC='\033[0m' # No Color -DARK_GRAY='\033[1;30m' -GREEN='\033[0;32m' - -# Display menu options with colors -echo -e "1. Install Automation ${DARK_GRAY}-| Set up the systemd service and timer${NC}" -echo -e "2. Check Status ${DARK_GRAY}-| Check the status of the automation service${NC}" -echo -e "3. Configure ${DARK_GRAY}-| Configure automation parameters${NC}" -echo -e "${GREEN}4. Go Back ${DARK_GRAY}-| Go back to the data menu${NC}" -echo -e "${RED}5. Exit ${DARK_GRAY}-| Exit the program${NC}" +echo "1. Install Automation | Set up the systemd service and timer" +echo "2. Check Status | Check the status of the automation service" +echo "3. Configure | Configure automation parameters" +echo -e "${YELLOW}4. Go Back | Go back to the data menu${NC}" +echo -e "${RED}5. 
Exit | Exit the program${NC}" echo "" +read -r -p "Choose an option (1-5): " choice -# Prompt the user for input -read -r -p "Choose an option (1-5): " choice - -# Execute corresponding action based on user choice -case $choice in +# Logic to handle user's choice +case "$choice" in 1) - # Added sudo here to ensure the installer has root privileges sudo ./scripts/data/automation/install.sh - # Pause to allow user to read the installer output before returning to menu - read -r -p "Installation script finished. Press Enter to return to the menu..." - exec ./scripts/data/automation/main.sh ;; 2) ./scripts/data/automation/status.sh ;; 3) - # Added sudo to ensure the configure script has root privileges sudo ./scripts/data/automation/configure.sh - # Pause to allow user to read the configure output before returning to menu - read -r -p "Configuration script finished. Press Enter to return to the menu..." - exec ./scripts/data/automation/main.sh ;; 4) exec ./scripts/data/main.sh ;; 5) ./exit.sh ;; *) - echo -e "${RED}Invalid choice. Please choose a number between 1 and 5.${NC}" - sleep 1.5 - exec ./scripts/data/automation/main.sh + echo "Returning..." ;; -esac \ No newline at end of file +esac diff --git a/scripts/data/automation/runner.sh b/scripts/data/automation/runner.sh index 9e470d0..0c34091 100644 --- a/scripts/data/automation/runner.sh +++ b/scripts/data/automation/runner.sh @@ -1,150 +1,76 @@ #!/bin/bash -# Runner with per-bucket region autodetect (no global AWS config needed) -set -euo pipefail -ts() { date '+%Y-%m-%d %H:%M:%S'; } -log() { echo "[$(ts)] $*"; } -SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -PROJECT_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" -cd "$PROJECT_ROOT" +# This is the main automation engine. It handles collection, processing, and filtering. 
-CONFIG_FILE="$PROJECT_ROOT/config/automation.conf" +# --- Configuration & Setup --- +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +PROJECT_ROOT=$(cd "$SCRIPT_DIR/../../.." && pwd) +# Load DEVICE_NAME from config.sh; if the file is missing, NEW_FOLDER below falls back to "device". +. "$PROJECT_ROOT/scripts/data/automation/config.sh" # Load configuration -load_config() { - local src="$CONFIG_FILE" - local tmp="" - if [[ -r "$src" ]]; then source "$src"; log "⚙️ Config loaded (direct): $src"; return 0; fi - if command -v sudo >/dev/null 2>&1; then - tmp="/tmp/cdn_auto_conf.$$.sh" - if sudo -n cat "$src" > "$tmp" 2>/dev/null || sudo cat "$src" > "$tmp" 2>/dev/null; then - chmod 600 "$tmp"; source "$tmp"; rm -f "$tmp"; log "⚙️ Config loaded (sudo): $src"; return 0 - fi - fi - log "❌ Cannot read config: $src"; exit 1 -} -load_config - -# Unset empty AWS env so CLI uses its defaults; we will supply region per call. -[[ -n "${AWS_PROFILE:-}" ]] && export AWS_PROFILE || unset AWS_PROFILE -[[ -n "${AWS_REGION:-}" ]] && export AWS_DEFAULT_REGION="$AWS_REGION" || unset AWS_DEFAULT_REGION - -SERVER_VERSION="${SERVER_VERSION:-v2}" -DEVICE_LOCATION="${DEVICE_LOCATION:-device}" -PYTHON_SCRIPT="${PYTHON_SCRIPT:-oc4d}" -S3_BUCKET="${S3_BUCKET:-s3://example-bucket}" -S3_SUBFOLDER="${S3_SUBFOLDER:-}" - -DATA_DIR="$PROJECT_ROOT/00_DATA" -PROCESSED_ROOT="$DATA_DIR/00_PROCESSED" -QUEUE_DIR="$DATA_DIR/00_UPLOAD_QUEUE" -mkdir -p "$DATA_DIR" "$PROCESSED_ROOT" "$QUEUE_DIR" - -TODAY_YMD="$(date '+%Y_%m_%d')" -NEW_FOLDER="${DEVICE_LOCATION}_logs_${TODAY_YMD}" -COLLECT_DIR="$DATA_DIR/$NEW_FOLDER" - -join_path() { local a="${1%/}" b="${2#/}"; echo "${a}/${b}"; } - -has_internet() { - getent hosts s3.amazonaws.com >/dev/null 2>&1 || return 1 - if command -v curl >/dev/null 2>&1; then timeout 5s curl -Is https://s3.amazonaws.com >/dev/null 2>&1 || return 1; fi - return 0 -} - -bucket_name() { local bn="${S3_BUCKET#s3://}"; echo "${bn%%/*}"; } - -bucket_region() { - local 
b; b="$(bucket_name)" - local reg="" - reg="$(aws --region us-east-1 s3api get-bucket-location --bucket "$b" --query 'LocationConstraint' --output text 2>/dev/null || true)" - if [[ -z "$reg" || "$reg" == "None" ]]; then reg="us-east-1"; fi - if [[ "$reg" == "EU" ]]; then reg="eu-west-1"; fi - # Curl fallback - if [[ -z "$reg" ]] && command -v curl >/dev/null 2>&1; then - reg="$(curl -sI "https://${b}.s3.amazonaws.com/" | tr -d '\r' | awk -F': ' 'BEGIN{IGNORECASE=1}/^x-amz-bucket-region:/{print $2;exit}')" - fi - echo "$reg" -} +TIMESTAMP=$(date +"%Y_%m_%d_%H%M%S") +# Use a generic folder name if DEVICE_NAME isn't set in config.sh +NEW_FOLDER="${DEVICE_NAME:-device}_logs_${TIMESTAMP}" +COLLECT_DIR="$PROJECT_ROOT/00_DATA/$NEW_FOLDER" -aws_cp_region() { - local file="$1" dest="$2" reg; reg="$(bucket_region)" - aws --region "$reg" s3 cp "$file" "$dest" -} - -upload_one() { - local file_path="$1" - local remote_base="${S3_BUCKET%/}" - [[ -n "$S3_SUBFOLDER" ]] && remote_base="$(join_path "$remote_base" "$S3_SUBFOLDER")" - local remote_path="$(join_path "$remote_base" "RACHEL/$(basename "$file_path")")" - log "⬆️ Uploading $(basename "$file_path") → $remote_path" - local out rc - out="$(aws_cp_region "$file_path" "$remote_path" 2>&1)"; rc=$? - if (( rc == 0 )); then log "✅ Uploaded: $(basename "$file_path")"; return 0 - else log "❌ Upload failed for $(basename "$file_path"): $out"; return 1; fi -} - -log "📁 Collect → $COLLECT_DIR (server=$SERVER_VERSION, device=$DEVICE_LOCATION)" mkdir -p "$COLLECT_DIR" -case "$SERVER_VERSION" in - v1|server\ v4|v4) - LOG_DIR="/var/log/apache2" - find "$LOG_DIR" -type f -name 'access.log*' -exec cp -n {} "$COLLECT_DIR"/ \; - ;; - v2|server\ v5|v5) - LOG_DIR="/var/log/oc4d" - find "$LOG_DIR" -type f \( \ - \( -name 'oc4d-*.log' ! -name 'oc4d-exceptions-*.log' \) -o \ - \( -name 'capecoastcastle-*.log' ! 
-name 'capecoastcastle-exceptions-*.log' \) -o \ - -name '*.gz' \) -exec cp -n {} "$COLLECT_DIR"/ \; - ;; - *) log "❌ Unknown SERVER_VERSION '$SERVER_VERSION'"; exit 1;; -esac -shopt -s nullglob -for gz in "$COLLECT_DIR"/*.gz; do gzip -df "$gz" || true; done -shopt -u nullglob +echo "[INFO] Created collection folder: $COLLECT_DIR" + +# --- STAGE 1: COLLECT LOGS --- +echo "[INFO] Starting log collection for SERVER_VERSION=${SERVER_VERSION}..." -PROCESSOR="" case "$SERVER_VERSION" in - v1|v4) PROCESSOR="scripts/data/process/processors/log.py" ;; - v2|v5|server\ v5) - case "$PYTHON_SCRIPT" in - oc4d) PROCESSOR="scripts/data/process/processors/logv2.py" ;; - cape_coast_d) PROCESSOR="scripts/data/process/processors/castle.py" ;; - *) PROCESSOR="scripts/data/process/processors/logv2.py" ;; - esac - ;; + v2|server\ v5|v5) + # If a specific target is set by the collection menu, use only that. + if [[ "$COLLECT_TARGET" == "oc4d" ]]; then + LOG_DIRS=("/var/log/oc4d") + echo "[INFO] Collection target set to OC4D." + elif [[ "$COLLECT_TARGET" == "castle" ]]; then + LOG_DIRS=("/var/log/castle") # Assuming this is the path + echo "[INFO] Collection target set to Cape Coast Castle." + elif [[ "$COLLECT_TARGET" == "dhub" ]]; then + LOG_DIRS=("/var/log/dhub") + echo "[INFO] Collection target set to D-Hub." + else + # Default behavior if COLLECT_TARGET isn't set + LOG_DIRS=("/var/log/oc4d" "/var/log/dhub") + echo "[INFO] No specific target. Collecting from all V2 sources." + fi + + for log_dir in "${LOG_DIRS[@]}"; do + if [ -d "$log_dir" ]; then + echo "[INFO] Searching for logs in $log_dir..." + # Find and copy all relevant log files into the collection directory + find "$log_dir" -type f \( -name '*.log' -o -name '*.gz' \) -exec cp -v {} "$COLLECT_DIR"/ \; + else + echo "[WARN] Log directory not found: $log_dir" >&2 + fi + done + ;; + *) + echo "[ERROR] Unknown SERVER_VERSION: $SERVER_VERSION. Aborting." 
>&2 
+ exit 1 + ;; esac -log "🐍 Process → $PROCESSOR (folder=$NEW_FOLDER)" -python3 "$PROCESSOR" "$NEW_FOLDER" -PROCESSED_DIR="$PROCESSED_ROOT/$NEW_FOLDER" -SUMMARY="$PROCESSED_DIR/summary.csv" -if [[ ! -s "$SUMMARY" ]]; then log "❌ Missing or empty summary at $SUMMARY"; exit 1; fi +# --- STAGE 2: PROCESS LOGS --- -MONTH="$(echo "$NEW_FOLDER" | awk -F'_' '{print $(NF-1)}' || true)" -if ! [[ "$MONTH" =~ ^[0-9]{2}$ ]]; then MONTH="$(date +%m)"; fi -log "🧮 Filter month=$MONTH → final CSV" -python3 scripts/data/upload/process_csv.py "$PROCESSED_DIR" "$DEVICE_LOCATION" "$MONTH" "summary.csv" +# If PYTHON_SCRIPT isn't set, infer it from COLLECT_TARGET. +if [[ -z "${PYTHON_SCRIPT:-}" && -n "${COLLECT_TARGET:-}" ]]; then + echo "[INFO] PYTHON_SCRIPT not set. Inferring from COLLECT_TARGET: $COLLECT_TARGET" + PYTHON_SCRIPT="$COLLECT_TARGET" +fi +# Menu keys are not processor file names: oc4d -> logv2.py, cape_coast_d -> castle.py. +case "$PYTHON_SCRIPT" in oc4d) PYTHON_SCRIPT="logv2" ;; cape_coast_d) PYTHON_SCRIPT="castle" ;; esac -shopt -s nullglob -FINAL_CAND=( "$PROCESSED_DIR/${DEVICE_LOCATION}_${MONTH}_"*"_access_logs.csv" ) -shopt -u nullglob -if [[ ${#FINAL_CAND[@]} -eq 0 ]]; then log "❌ Could not locate final CSV after filtering."; exit 1; fi -FINAL_CSV="${FINAL_CAND[0]}" -log "📦 Final CSV: $(basename "$FINAL_CSV")" +echo "[INFO] Starting log processing with PYTHON_SCRIPT=${PYTHON_SCRIPT}..." +PROCESSOR_PATH="$PROJECT_ROOT/scripts/data/process/processors/${PYTHON_SCRIPT}.py" -if has_internet; then - log "🌐 Internet OK. Flushing queue…" - shopt -s nullglob - for q in "$QUEUE_DIR"/*.csv; do - if upload_one "$q"; then rm -f "$q"; else log "Leaving queued: $(basename "$q")"; fi - done - shopt -u nullglob - if upload_one "$FINAL_CSV"; then - log "✅ Run finished — upload complete." - else - log "⚠️ Upload failed; queueing new file."; cp -f "$FINAL_CSV" "$QUEUE_DIR/" - fi +if [ -f "$PROCESSOR_PATH" ]; then + python3 "$PROCESSOR_PATH" "$NEW_FOLDER" || { echo "[ERROR] Processor exited with an error. Aborting." >&2; exit 1; } else - log "📵 No internet. Queueing new file."; cp -f "$FINAL_CSV" "$QUEUE_DIR/" + echo "[ERROR] Processor script not found at $PROCESSOR_PATH. Aborting." >&2 + exit 1 fi + +echo "[SUCCESS] Automation cycle completed." 
diff --git a/scripts/data/collection/all.sh b/scripts/data/collection/all.sh index d938759..f915ca3 100755 --- a/scripts/data/collection/all.sh +++ b/scripts/data/collection/all.sh @@ -1,112 +1,52 @@ #!/bin/bash +# --- Colors --- +YELLOW='\033[0;33m' +CYAN='\033[0;36m' +RED='\033[0;31m' +NC='\033[0m' + +# Get the project root directory to reliably call the runner script +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +PROJECT_ROOT=$(cd "$SCRIPT_DIR/../../.." && pwd) +RUNNER_SCRIPT="$PROJECT_ROOT/scripts/data/automation/runner.sh" + +# --- V2 Collection Menu --- +clear echo "" -echo "Server Types:" -echo "1. V4 Logs (Located at /var/log/apache2)" -echo "2. V5 Logs (Located at /var/log/oc4d)" +echo -e "${YELLOW}Choose a V2 Collection Source:${NC}" +echo "1. OC4D" +echo "2. Cape Coast Castle" +echo -e "${CYAN}3. D-Hub${NC}" echo "" +read -r -p "Enter your choice (1-3): " choice -# Prompt the user to select the log type -read -p "Please select the type of logs to collect (1 for V4, 2 for V5): " log_choice - -# Set the log directory and collection logic based on the user's selection -case $log_choice in +# --- Execute Runner Based on Choice --- +case "$choice" in 1) - log_directory="/var/log/apache2" - log_type="V4 Logs" + echo "Selected OC4D. Starting runner..." + export COLLECT_TARGET="oc4d" ;; 2) - log_directory="/var/log/oc4d" - log_type="V5 Logs" + echo "Selected Cape Coast Castle. Starting runner..." + export COLLECT_TARGET="castle" + ;; + 3) + echo "Selected D-Hub. Starting runner..." + export COLLECT_TARGET="dhub" ;; *) - echo "Invalid choice. Please run the script again and choose either 1 or 2." + echo -e "${RED}Invalid choice. Aborting.${NC}" + sleep 2 exit 1 ;; esac -echo "" -echo "You have selected: $log_type." 
-sleep 1 - -# Prompt the user for the location of the device -read -p "Enter the server location: " device_location -device_location=${device_location// /_} # Replace spaces with underscores - -# Create a new folder for the logs -new_folder="${device_location}_logs_$(date '+%Y_%m_%d')" -mkdir -p "$new_folder" +# All V2 collections use SERVER_VERSION="v5" as per the system's design +export SERVER_VERSION="v5" -echo "" -echo "Collecting $log_type from $log_directory..." +echo "Executing the main automation runner..." sleep 1 -# Collect logs based on the user's selection -if [[ "$log_choice" == "1" ]]; then - # For V4 logs: Collect only access log files - find "$log_directory" -type f -name "access.log*" -exec cp {} "$new_folder"/ \; -elif [[ "$log_choice" == "2" ]]; then - # For V5 logs: Collect oc4d-*.log, capecoastcastle-*.log, and *.gz, excluding exceptions - find "$log_directory" -type f \ - \( \ - \( -name "oc4d-*.log" ! -name "oc4d-exceptions-*.log" \) \ - -o \ - \( -name "capecoastcastle-*.log" ! -name "capecoastcastle-exceptions-*.log" \) \ - -o \ - -name "*.gz" \ - \) \ - -exec cp {} "$new_folder"/ \; -fi - -# Move to the new folder -cd "$new_folder" || { echo "Failed to access the logs folder. Exiting..."; exit 1; } - -# Uncompress any .gz files -echo "Uncompressing files (if any)..." -for compressed_file in *.gz; do - if [ -f "$compressed_file" ]; then - gzip -d "$compressed_file" - fi -done - -# Return to the original directory -cd .. - -# Check if the "00_DATA" folder exists, and create it if not -if [ ! -d "00_DATA" ]; then - mkdir "00_DATA" -fi - -# Move the collected logs to the "00_DATA" directory -mv "$new_folder" "00_DATA" - -echo "" -echo "Log collection completed successfully!" -echo "$log_type have been saved to the '00_DATA/$new_folder' directory." -echo "" - -# Prompt the user to process the logs -while true; do - read -p "Would you like to process the collected logs now? 
(y/n): " user_choice - case $user_choice in - [Yy]* ) - echo "Starting the log processing script..." - sleep 1 - if [[ "$log_choice" == "1" ]]; then - exec ./scripts/data/process/all.sh - elif [[ "$log_choice" == "2" ]]; then - exec ./scripts/data/all/v2/process/logs.sh - fi - break - ;; - [Nn]* ) - echo "Returning to the main menu..." - sleep 1 - exec ./scripts/data/collection/main.sh - break - ;; - * ) - echo "Please answer with 'y' (yes) or 'n' (no)." - ;; - esac -done +# Execute the main runner script with the chosen configuration +exec "$RUNNER_SCRIPT" diff --git a/scripts/data/collection/main.sh b/scripts/data/collection/main.sh index 23ed8a8..616ca6f 100755 --- a/scripts/data/collection/main.sh +++ b/scripts/data/collection/main.sh @@ -53,4 +53,4 @@ case $choice in sleep 1.5 exec ./scripts/data/collection/main.sh ;; -esac +esac \ No newline at end of file diff --git a/scripts/data/process/main.sh b/scripts/data/process/main.sh index c191a65..b2b7831 100755 --- a/scripts/data/process/main.sh +++ b/scripts/data/process/main.sh @@ -1,60 +1,50 @@ #!/bin/bash -# clear the screen clear - -echo "" +echo "==============================================================" +echo " Select a folder to process" +echo "==============================================================" echo "" -# Display the name of the tool -figlet -t -f 3d "PROCESSING" | lolcat +mapfile -t folders < <(find 00_DATA -mindepth 1 -maxdepth 1 -type d -name "*_logs_*" | sort -r) + +if [ ${#folders[@]} -eq 0 ]; then + echo "No collected log folders found in 00_DATA/." + read -r -p "Press Enter to return..." + exec ./scripts/data/main.sh +fi + +PS3="Please enter the number of the folder to process: " +select FOLDER_PATH in "${folders[@]}"; do + if [ -n "$FOLDER_PATH" ]; then + FOLDER_NAME=$(basename "$FOLDER_PATH") + echo "You selected folder: $FOLDER_NAME" + break + else + echo "Invalid selection. Try again." 
+ fi +done echo "" - -# A border to cover the description and its centered -echo "==================================================================================" -echo "Process your data with ease, Make Sure you've already run the Collection Scripts" -echo "==================================================================================" - +echo "==============================================================" +echo " Select the correct processor for these logs" +echo "==============================================================" echo "" -# variables -RED='\033[0;31m' -NC='\033[0m' # No Color -GREEN='\033[0;32m' -DARK_GRAY='\033[1;30m' - -# Display menu options -echo -e "1. Start ${DARK_GRAY}-| Process all your data${NC}" -echo -e "2. Data Collection ${DARK_GRAY}-| Run the Data Collection Scripts${NC}" -echo -e "3. Data Upload ${DARK_GRAY}-| Run the Data Upload Scripts${NC}" -echo -e "${GREEN}4. Go Back ${DARK_GRAY}-| Go back to the main menu${NC}" -echo -e "${RED}5. Exit ${DARK_GRAY}-| Exit the program${NC}" - -echo -e "${NC}" -# Prompt the user for input -read -p "Choose an option (1-5): " choice - -# Check the user's choice and execute the corresponding script -case $choice in - 1) - ./scripts/data/process/logs.sh - ;; - 2) - ./scripts/data/collection/main.sh - ;; - 3) - ./scripts/data/upload/main.sh - ;; - 4) - ./scripts/data/main.sh - ;; - 5) - ./exit.sh - ;; - *) - echo -e "${RED}Invalid choice. 
Please choose a number between 1 and 7.${NC}" - sleep 1.5 - exec ./scripts/data/process/main.sh - ;; -esac +PS3="Please enter the number of the processor to use: " +select PROCESSOR_CHOICE in "v4 (log.py)" "v5-OC4D (logv2.py)" "v5-Castle (castle.py)" "D-Hub (dhub.py)"; do + case $PROCESSOR_CHOICE in + "v4 (log.py)") PROCESSOR="scripts/data/process/processors/log.py"; break ;; + "v5-OC4D (logv2.py)") PROCESSOR="scripts/data/process/processors/logv2.py"; break ;; + "v5-Castle (castle.py)") PROCESSOR="scripts/data/process/processors/castle.py"; break ;; + "D-Hub (dhub.py)") PROCESSOR="scripts/data/process/processors/dhub.py"; break ;; + *) echo "Invalid selection. Please try again." ;; + esac +done + +echo "Running processor: $PROCESSOR on folder: $FOLDER_NAME" +python3 "$PROCESSOR" "$FOLDER_NAME" + +echo -e "\nProcessing complete." +read -r -p "Press Enter to return to the menu..." +exec ./scripts/data/main.sh diff --git a/scripts/data/process/processors/dhub.py b/scripts/data/process/processors/dhub.py new file mode 100644 index 0000000..954a3e2 --- /dev/null +++ b/scripts/data/process/processors/dhub.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +# This script is modeled on a working processor, as per Llewellyn's feedback. +import pandas as pd +import json +import re +import sys +import os +import glob +from typing import Dict, List, Optional + +# A precise regex to capture only the relevant parts of a d-hub log message. 
+# Pattern looks for: /modules/{uuid}/{module-name}/
+LOG_PATTERN = re.compile(
+    r'"GET /(?:uploads/)?modules/(?P<module_id>[0-9a-fA-F-]+)/(?P<module_name>[a-zA-Z0-9_.-]+)/.*" '
+    r'(?P<status_code>\d{3}) (?P<response_size>\d+|-)'
+)
+
+def parse_log_line(line: str) -> Optional[Dict]:
+    """Parses a single JSON log line to extract d-hub access info; returns None for non-matching lines."""
+    try:
+        log_entry = json.loads(line)
+        message = log_entry.get("message", "")
+        # Extract the IP from the start of the message
+        ip_address = message.split(' ', 1)[0]
+    except (json.JSONDecodeError, AttributeError, IndexError):
+        return None
+
+    match = LOG_PATTERN.search(message)
+    if not match:
+        return None
+
+    data = match.groupdict()
+    data['ip_address'] = ip_address
+    data['response_size'] = 0 if data['response_size'] == '-' else int(data['response_size'])
+    data['status_code'] = int(data['status_code'])
+    # Use the more accurate timestamp from the JSON body
+    data['timestamp'] = log_entry.get("timestamp")
+
+    return data
+
+def main(folder_name: str):
+    """Processes all log files in 00_DATA/<folder_name> and writes 00_DATA/00_PROCESSED/<folder_name>/summary.csv."""
+    script_path = os.path.dirname(os.path.realpath(__file__))
+    project_root = os.path.abspath(os.path.join(script_path, "../../../.."))
+
+    input_dir = os.path.join(project_root, "00_DATA", folder_name)
+    output_dir = os.path.join(project_root, "00_DATA", "00_PROCESSED", folder_name)
+
+    if not os.path.isdir(input_dir):
+        print(f"Error: Input directory not found at {input_dir}", file=sys.stderr)
+        sys.exit(1)
+
+    os.makedirs(output_dir, exist_ok=True)
+    output_csv_path = os.path.join(output_dir, "summary.csv")
+
+    all_logs = []
+    log_files = [p for p in glob.glob(os.path.join(input_dir, '*')) if os.path.isfile(p)]
+    print(f"Found {len(log_files)} files in {input_dir} to process.")
+
+    for file_path in log_files:
+        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+            for line in f:
+                parsed_data = parse_log_line(line)
+                if parsed_data:
+                    all_logs.append(parsed_data)
+
+    if not all_logs:
+        print("No valid d-hub log entries were found.")
+        pd.DataFrame(columns=['ip_address', 'timestamp', 'module_id', 'module_name', 'status_code', 'response_size']).to_csv(output_csv_path, index=False)
+        return
+
+    df = pd.DataFrame(all_logs)
+    column_order = ['ip_address', 'timestamp', 'module_id', 'module_name', 'status_code', 'response_size']
+    df = df[column_order]
+    df.to_csv(output_csv_path, index=False)
+
+    print(f"✅ Processing complete. {len(df)} entries saved to {output_csv_path}")
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python3 dhub.py <folder_name>", file=sys.stderr)
+        sys.exit(1)
+
+    target_folder = sys.argv[1]
+    main(target_folder)