diff --git a/scanner/k8s-image-scanner.sh b/scanner/k8s-image-scanner.sh index 6eef411..86fa3f9 100755 --- a/scanner/k8s-image-scanner.sh +++ b/scanner/k8s-image-scanner.sh @@ -26,7 +26,7 @@ Usage: Options: --namespace NAMESPACE Kubernetes namespace to scan (default: pharia-ai) --ignore-file FILE File containing images to ignore (one per line) - --output-dir DIR Output directory for reports (default: ./scan-results) + --output-dir DIR Output directory for reports (default: ./scan-results/k8s-images) --kubeconfig FILE Path to kubeconfig file (optional) --context CONTEXT Kubernetes context to use (optional) --trivy-config FILE Custom Trivy configuration file (optional) @@ -78,7 +78,7 @@ EOF # Default configuration NAMESPACE="pharia-ai" IGNORE_FILE="" -OUTPUT_DIR="./scan-results" +OUTPUT_DIR="./scan-results/k8s-images" KUBECONFIG="" CONTEXT="" TRIVY_CONFIG="" @@ -1999,7 +1999,8 @@ if [[ -d "$OUTPUT_DIR" ]]; then printf ",\n" fi printf " " - cat "$file" | jq -c . + # Use jq with error handling to ensure valid JSON output + jq -c . "$file" 2>/dev/null || echo "{}" fi done < <(find "$OUTPUT_DIR" -name "cve_details.json" -print0 2>/dev/null) fi) diff --git a/scanner/postgres-dependency-scanner.sh b/scanner/postgres-dependency-scanner.sh new file mode 100755 index 0000000..0f816a2 --- /dev/null +++ b/scanner/postgres-dependency-scanner.sh @@ -0,0 +1,1015 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Postgres Dependency Scanner +# This script analyzes SBOMs from k8s-image-scanner.sh results to identify images with Postgres dependencies +# It also orchestrates the complete analysis workflow including repository discovery and dependency analysis + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +show_help() { + cat <&2 + show_help + exit 1 + ;; + esac +done + +log_verbose() { + if $VERBOSE; then + echo "[VERBOSE] $*" >&2 + fi +} + +log_info() { + echo "[INFO] $*" >&2 +} + +log_error() { + echo "[ERROR] $*" >&2 +} + +# Print step header with visual separator +print_step_header() { + local step_num="$1" + local step_title="$2" + local step_desc="$3" + + echo "" >&2 + echo "╔══════════════════════════════════════════════════════════════════════════════╗" >&2 + printf "║ STEP %s: %-65s ║\n" "$step_num" "$step_title" >&2 + echo "╠══════════════════════════════════════════════════════════════════════════════╣" >&2 + if [[ -n "$step_desc" ]]; then + printf "║ %-76s ║\n" "$step_desc" >&2 + echo "╚══════════════════════════════════════════════════════════════════════════════╝" >&2 + else + echo "╚══════════════════════════════════════════════════════════════════════════════╝" >&2 + fi + echo "" >&2 +} + +# Print step summary +print_step_summary() { + local summary_title="$1" + shift + local summary_items=("$@") + + log_info "📋 $summary_title:" + for item in "${summary_items[@]}"; do + log_info " • $item" + done + echo "" >&2 +} + +# Postgres-related package patterns to search for +# These cover common Postgres packages across different ecosystems +declare -a POSTGRES_PATTERNS=( + "postgresql" + "postgres" + "psycopg" + "psycopg2" + "psycopg3" + "pg" + "libpq" + "postgresql-client" + "postgres-client" + "pg_dump" + "pg_restore" + "postgresql-server" + "postgresql-contrib" + "postgresql-dev" + "postgresql-devel" + "libpq-dev" + "libpq5" + "node-postgres" + "pg-promise" + "sequelize" + "typeorm" + "prisma" + "diesel" + "sqlx" + "gorm" + "activerecord" + "sqlalchemy" + "asyncpg" + "tortoise-orm" +) + +# Check if a package name matches Postgres patterns +is_postgres_package() { + local 
package_name="$1" + local package_lower="${package_name,,}" # Convert to lowercase + + for pattern in "${POSTGRES_PATTERNS[@]}"; do + if [[ "$package_lower" == *"${pattern,,}"* ]]; then + return 0 + fi + done + + return 1 +} + +# Analyze CycloneDX SBOM +analyze_cyclonedx_sbom() { + local sbom_file="$1" + local image="$2" + local postgres_packages=() + + if [[ ! -f "$sbom_file" ]]; then + return 1 + fi + + # Check if valid JSON + if ! jq empty "$sbom_file" 2>/dev/null; then + log_verbose "Invalid JSON in SBOM: $sbom_file" + return 1 + fi + + # Extract components from CycloneDX format + # CycloneDX has components array with name, purl, etc. + local components + if ! components=$(jq -r '.components[]? | "\(.name)|\(.purl // "")|\(.type // "")"' "$sbom_file" 2>/dev/null); then + log_verbose "No components found in CycloneDX SBOM: $sbom_file" + return 1 + fi + + while IFS='|' read -r name purl type; do + if [[ -z "$name" ]]; then + continue + fi + + # Check package name + if is_postgres_package "$name"; then + postgres_packages+=("$name") + log_verbose "Found Postgres package in $image: $name" + fi + + # Check PURL (Package URL) for Postgres references + if [[ -n "$purl" ]] && is_postgres_package "$purl"; then + postgres_packages+=("$purl") + log_verbose "Found Postgres package in $image (PURL): $purl" + fi + done <<< "$components" + + # Also check dependencies + local dependencies + if dependencies=$(jq -r '.dependencies[]?.ref // empty' "$sbom_file" 2>/dev/null); then + while read -r dep_ref; do + if [[ -n "$dep_ref" ]] && is_postgres_package "$dep_ref"; then + postgres_packages+=("$dep_ref") + log_verbose "Found Postgres dependency in $image: $dep_ref" + fi + done <<< "$dependencies" + fi + + # Return unique packages + if [[ ${#postgres_packages[@]} -gt 0 ]]; then + printf '%s\n' "${postgres_packages[@]}" | sort -u | tr '\n' '|' + return 0 + fi + + return 1 +} + +# Analyze SPDX SBOM +analyze_spdx_sbom() { + local sbom_file="$1" + local image="$2" + local postgres_packages=() + + if [[ ! -f "$sbom_file" ]]; then + return 1 + fi + + # Check if valid JSON + if ! jq empty "$sbom_file" 2>/dev/null; then + log_verbose "Invalid JSON in SBOM: $sbom_file" + return 1 + fi + + # Extract packages from SPDX format + # SPDX has packages array with name, externalRefs, etc. + local packages + if ! packages=$(jq -r '.packages[]? | "\(.name)|\(.externalRefs[]?.referenceLocator // "")"' "$sbom_file" 2>/dev/null); then + log_verbose "No packages found in SPDX SBOM: $sbom_file" + return 1 + fi + + while IFS='|' read -r name ref; do + if [[ -z "$name" ]]; then + continue + fi + + # Check package name + if is_postgres_package "$name"; then + postgres_packages+=("$name") + log_verbose "Found Postgres package in $image: $name" + fi + + # Check external reference + if [[ -n "$ref" ]] && is_postgres_package "$ref"; then + postgres_packages+=("$ref") + log_verbose "Found Postgres package in $image (ref): $ref" + fi + done <<< "$packages" + + # Return unique packages + if [[ ${#postgres_packages[@]} -gt 0 ]]; then + printf '%s\n' "${postgres_packages[@]}" | sort -u | tr '\n' '|' + return 0 + fi + + return 1 +} + +# Analyze SBOM file (auto-detect format) +analyze_sbom() { + local sbom_file="$1" + local image="$2" + local packages="" + + if [[ ! 
-f "$sbom_file" ]]; then + return 1 + fi + + # Detect SBOM format + if jq -e '.bomFormat // .spdxVersion' "$sbom_file" >/dev/null 2>&1; then + # Check for CycloneDX + if jq -e '.bomFormat' "$sbom_file" >/dev/null 2>&1; then + log_verbose "Detected CycloneDX format: $sbom_file" + packages=$(analyze_cyclonedx_sbom "$sbom_file" "$image") + # Check for SPDX + elif jq -e '.spdxVersion' "$sbom_file" >/dev/null 2>&1; then + log_verbose "Detected SPDX format: $sbom_file" + packages=$(analyze_spdx_sbom "$sbom_file" "$image") + else + log_verbose "Unknown SBOM format: $sbom_file" + return 1 + fi + else + log_verbose "Invalid SBOM format: $sbom_file" + return 1 + fi + + if [[ -n "$packages" ]]; then + echo "$packages" + return 0 + fi + + return 1 +} + +# Show spinner while command runs in background +show_spinner() { + local pid=$1 + local message="$2" + local spin='-\|/' + local i=0 + + while kill -0 "$pid" 2>/dev/null; do + i=$(((i + 1) % 4)) + printf "\r%s %s" "$message" "${spin:$i:1}" >&2 + sleep 0.1 + done + printf "\r%*s\r" 100 "" >&2 # Clear the spinner line +} + +# Use Trivy to scan image for Postgres (fallback for images without SBOMs) +scan_image_with_trivy() { + local image="$1" + local postgres_packages=() + + if ! command -v trivy &> /dev/null; then + log_error "Trivy not found. Install Trivy to use --use-trivy option" + return 1 + fi + + log_verbose "Scanning image with Trivy: $image" + + # Use Trivy to get installed packages + # Run in background and show spinner + local trivy_output + local temp_file + temp_file=$(mktemp) + + # Start Trivy in background + (timeout "$TRIVY_TIMEOUT" trivy image --format json --timeout "${TRIVY_TIMEOUT}s" "$image" 2>/dev/null > "$temp_file") & + local trivy_pid=$! + + # Show spinner while Trivy runs + show_spinner "$trivy_pid" "[INFO] Scanning with Trivy..." + + # Wait for process and get exit code + wait "$trivy_pid" + local trivy_exit=$? 
+ + if [[ $trivy_exit -eq 0 ]] && [[ -s "$temp_file" ]]; then + trivy_output=$(cat "$temp_file") + rm -f "$temp_file" + else + log_verbose "Trivy scan failed for: $image" + rm -f "$temp_file" + return 1 + fi + + # Extract package names from Trivy JSON output + local packages + if packages=$(echo "$trivy_output" | jq -r '.Results[]?.Packages[]?.Name // empty' 2>/dev/null); then + while read -r pkg_name; do + if [[ -n "$pkg_name" ]] && is_postgres_package "$pkg_name"; then + postgres_packages+=("$pkg_name") + log_verbose "Found Postgres package via Trivy in $image: $pkg_name" + fi + done <<< "$packages" + fi + + # Return unique packages + if [[ ${#postgres_packages[@]} -gt 0 ]]; then + printf '%s\n' "${postgres_packages[@]}" | sort -u | tr '\n' '|' + return 0 + fi + + return 1 +} + +# Write incremental results to temporary file +write_incremental_results() { + local temp_file="$1" + local current="$2" + local total="$3" + local postgres_count="$4" + local without_sbom_count="$5" + shift 5 + # Handle empty array case - if no arguments left, array is empty + local postgres_entries=() + if [[ $# -gt 0 ]]; then + postgres_entries=("$@") + fi + + # Calculate percentage + local percent=0 + if [[ $total -gt 0 ]]; then + percent=$(awk -v c="$current" -v t="$total" 'BEGIN {printf "%.1f", (c / t) * 100}') + fi + + { + echo "{" + echo " \"status\": \"in_progress\"," + echo " \"progress\": {" + echo " \"current\": $current," + echo " \"total\": $total," + echo " \"percent\": $percent" + echo " }," + echo " \"timestamp\": \"$(date -u +"%Y-%m-%dT%H:%M:%SZ")\"," + echo " \"images_with_postgres\": $postgres_count," + echo " \"images_without_sbom\": $without_sbom_count," + echo " \"images\": [" + local first=true + if [[ ${#postgres_entries[@]} -gt 0 ]]; then + for entry in "${postgres_entries[@]}"; do + if [[ -n "$entry" ]]; then + IFS='|' read -r image pkg_list <<< "$entry" + if [[ "$first" == "true" ]]; then + first=false + else + echo "," + fi + echo -n " {" + echo -n "\"image\": \"$image\"," + echo -n "\"packages\": [" + IFS=' ' read -ra pkgs <<< "$pkg_list" + local pkg_first=true + for pkg in "${pkgs[@]}"; do + if [[ -n "$pkg" ]]; then + if [[ "$pkg_first" == "true" ]]; then + pkg_first=false + else + echo -n ", " + fi + echo -n "\"$pkg\"" + fi + done + echo -n "]" + echo -n "}" + fi + done + fi + echo "" + echo " ]" + echo "}" + } > "$temp_file" +} + +# Main analysis function +main() { + local scan_dir + + # Resolve scan results directory (relative to script directory if not absolute) + if [[ -z "$SCAN_RESULTS_DIR" ]]; then + # Default: scan-results in parent directory of script + scan_dir="$(cd "$SCRIPT_DIR/../scan-results" && pwd)" + elif [[ "$SCAN_RESULTS_DIR" == /* ]]; then + # Absolute path + scan_dir="$SCAN_RESULTS_DIR" + else + # Relative path - resolve from script directory + scan_dir="$(cd "$SCRIPT_DIR/$SCAN_RESULTS_DIR" && pwd)" + fi + + if [[ ! 
-d "$scan_dir" ]]; then + log_error "Scan results directory not found: $scan_dir" + log_error "Expected location: $SCRIPT_DIR/../scan-results" + exit 1 + fi + + log_info "Analyzing scan results in: $scan_dir" + echo "" >&2 + + # Resolve output file path + local output_path + if [[ "$OUTPUT_FILE" == /* ]]; then + # Absolute path + output_path="$OUTPUT_FILE" + elif [[ "$OUTPUT_FILE" == ../* ]]; then + # Relative path starting with ../ - resolve relative to script directory + local rel_dir + rel_dir="$(dirname "$OUTPUT_FILE")" + local filename + filename="$(basename "$OUTPUT_FILE")" + # Resolve directory part (may not exist yet) + if [[ -d "$SCRIPT_DIR/$rel_dir" ]]; then + output_path="$(cd "$SCRIPT_DIR/$rel_dir" && pwd)/$filename" + else + # Directory doesn't exist yet, construct absolute path + output_path="$(cd "$SCRIPT_DIR" && pwd)/$rel_dir/$filename" + fi + else + # Relative path - resolve relative to scan results directory + local rel_dir + rel_dir="$(dirname "$OUTPUT_FILE")" + local filename + filename="$(basename "$OUTPUT_FILE")" + # Resolve directory part (may not exist yet) + if [[ -d "$scan_dir/$rel_dir" ]]; then + output_path="$(cd "$scan_dir/$rel_dir" && pwd)/$filename" + else + # Directory doesn't exist yet, construct absolute path + output_path="$(cd "$scan_dir" && pwd)/$rel_dir/$filename" + fi + fi + + # STEP 1: SBOM Analysis + print_step_header "1" "SBOM Analysis" "Analyzing SBOM files to identify images with PostgreSQL dependencies" + + # Check if output file already exists + local skip_sbom_analysis=false + if [[ -f "$output_path" ]]; then + log_info "✓ Output file already exists: $output_path" + log_info " Skipping SBOM analysis step" + skip_sbom_analysis=true + + # Show summary from existing file if possible + if command -v jq &>/dev/null; then + local total_images_in_file + total_images_in_file=$(jq '.images | length' "$output_path" 2>/dev/null || echo "0") + if [[ "$total_images_in_file" != "0" ]]; then + print_step_summary "Summary from existing file" \ + "Images with Postgres: $total_images_in_file" + fi + fi + echo "" >&2 + else + # Print step summary before starting + print_step_summary "SBOM Analysis Summary" \ + "Input: Scan results directory ($scan_dir)" \ + "Output: $output_path" \ + "Process: Analyze SBOM files for PostgreSQL packages" \ + "Fallback: Use Trivy for images without SBOMs (if enabled)" + + # Skip SBOM analysis if output already exists + if [[ "$skip_sbom_analysis" == "true" ]]; then + # Continue to complete analysis workflow if enabled + # (the workflow check happens later in the function) + : + else + # Ensure output directory exists + local output_dir + output_dir="$(dirname "$output_path")" + if [[ ! 
-d "$output_dir" ]]; then + log_info "Creating output directory: $output_dir" + mkdir -p "$output_dir" + fi + + # Create temporary file for incremental results + local temp_output_file="${output_path}.tmp" + local start_time + start_time=$(date +%s) + + # Results storage + declare -a images_with_postgres=() + declare -a images_without_sbom=() + local total_images=0 + local images_analyzed=0 + + # Count total images first for progress display + while IFS= read -r -d '' image_dir; do + total_images=$((total_images + 1)) + done < <(find "$scan_dir" -mindepth 1 -maxdepth 1 -type d -print0) + + if [[ $total_images -eq 0 ]]; then + log_error "No image directories found in: $scan_dir" + exit 1 + fi + + log_info "Found $total_images images to analyze (from scan result directories)" + log_info "" + log_info "Note: The SBOM analysis above only processes images with scan directories." + log_info " The repository discovery step will process ALL images from scan-summary.json." + log_info "" + + # Process each image directory + local current_image=0 + while IFS= read -r -d '' image_dir; do + current_image=$((current_image + 1)) + + # Extract image name from directory + # Directory format: registry_domain_path_image_tag -> registry/domain/path/image:tag + local image_name + local dir_name + dir_name=$(basename "$image_dir") + # The last underscore-separated segment is typically the tag + # Convert all underscores to slashes, then convert the last slash to a colon + image_name=$(echo "$dir_name" | sed 's|_|/|g' | sed 's|\(.*\)/\([^/]*\)$|\1:\2|') + + # Show progress + printf "[INFO] Processing: [%d/%d] %s\n" "$current_image" "$total_images" "$image_name" >&2 + log_verbose "Processing: $image_name (from dir: $dir_name)" + + # Check for SBOM file + local sbom_file="$image_dir/sbom.json" + local packages="" + + if [[ -f "$sbom_file" ]]; then + log_verbose "Found SBOM: $sbom_file" + if packages=$(analyze_sbom "$sbom_file" "$image_name"); then + images_analyzed=$((images_analyzed + 1)) + # Convert packages string back to array + IFS='|' read -ra pkg_array <<< "$packages" + images_with_postgres+=("$image_name|${pkg_array[*]}") + printf " ✓ Found Postgres packages\n" >&2 + # Write incremental result + if [[ ${#images_with_postgres[@]} -gt 0 ]]; then + write_incremental_results "$temp_output_file" "$current_image" "$total_images" "${#images_with_postgres[@]}" "${#images_without_sbom[@]}" "${images_with_postgres[@]}" + else + write_incremental_results "$temp_output_file" "$current_image" "$total_images" "${#images_with_postgres[@]}" "${#images_without_sbom[@]}" + fi + else + log_verbose "No Postgres packages found in: $image_name" + fi + else + log_verbose "No SBOM found for: $image_name" + images_without_sbom+=("$image_name") + + # Try Trivy if enabled + if $USE_TRIVY; then + if packages=$(scan_image_with_trivy "$image_name"); then + images_analyzed=$((images_analyzed + 1)) + IFS='|' read -ra pkg_array <<< "$packages" + images_with_postgres+=("$image_name|${pkg_array[*]}") + printf " ✓ Found Postgres packages (via Trivy)\n" >&2 + # Write incremental result + if [[ ${#images_with_postgres[@]} -gt 0 ]]; then + write_incremental_results "$temp_output_file" "$current_image" "$total_images" "${#images_with_postgres[@]}" "${#images_without_sbom[@]}" "${images_with_postgres[@]}" + else + write_incremental_results "$temp_output_file" "$current_image" "$total_images" "${#images_with_postgres[@]}" "${#images_without_sbom[@]}" + fi + else + # No Postgres found via Trivy + log_verbose "No Postgres packages found via Trivy 
in: $image_name" + fi + fi + # Update temp file even if no Postgres found (to show progress) + if [[ ${#images_with_postgres[@]} -gt 0 ]]; then + write_incremental_results "$temp_output_file" "$current_image" "$total_images" "${#images_with_postgres[@]}" "${#images_without_sbom[@]}" "${images_with_postgres[@]}" + else + write_incremental_results "$temp_output_file" "$current_image" "$total_images" "${#images_with_postgres[@]}" "${#images_without_sbom[@]}" + fi + fi + done < <(find "$scan_dir" -mindepth 1 -maxdepth 1 -type d -print0) + + # Generate final output + echo "" >&2 + log_info "Analysis complete:" + log_info " Total images: $total_images" + log_info " Images with Postgres: ${#images_with_postgres[@]}" + log_info " Images without SBOM: ${#images_without_sbom[@]}" + log_info " Temporary results: $temp_output_file" + + case "$FORMAT" in + json) + { + echo "{" + echo " \"analysis_timestamp\": \"$(date -u +"%Y-%m-%dT%H:%M:%SZ")\"," + echo " \"total_images\": $total_images," + echo " \"images_with_postgres\": ${#images_with_postgres[@]}," + echo " \"images_without_sbom\": ${#images_without_sbom[@]}," + echo " \"images\": [" + local first=true + if [[ ${#images_with_postgres[@]} -gt 0 ]]; then + for entry in "${images_with_postgres[@]}"; do + IFS='|' read -r image pkg_list <<< "$entry" + if [[ "$first" == "true" ]]; then + first=false + else + echo "," + fi + echo -n " {" + echo -n "\"image\": \"$image\"," + echo -n "\"packages\": [" + IFS=' ' read -ra pkgs <<< "$pkg_list" + local pkg_first=true + for pkg in "${pkgs[@]}"; do + if [[ -n "$pkg" ]]; then + if [[ "$pkg_first" == "true" ]]; then + pkg_first=false + else + echo -n ", " + fi + echo -n "\"$pkg\"" + fi + done + echo -n "]" + echo -n "}" + done + fi + echo "" + echo " ]," + echo " \"images_without_sbom\": [" + first=true + if [[ ${#images_without_sbom[@]} -gt 0 ]]; then + for image in "${images_without_sbom[@]}"; do + if [[ "$first" == "true" ]]; then + first=false + else + echo "," + fi + echo -n " \"$image\"" + done + fi + echo "" + echo " ]" + echo "}" + } > "$output_path" + # Remove temporary file now that final output is written + rm -f "$temp_output_file" + log_info "Results written to: $output_path" + ;; + table) + { + echo "Images Using Postgres:" + echo "======================" + echo "" + printf "%-60s %s\n" "IMAGE" "POSTGRES PACKAGES" + printf "%-60s %s\n" "-----" "-----------------" + if [[ ${#images_with_postgres[@]} -gt 0 ]]; then + for entry in "${images_with_postgres[@]}"; do + IFS='|' read -r image pkg_list <<< "$entry" + printf "%-60s %s\n" "$image" "$pkg_list" + done + else + echo "(none)" + fi + echo "" + echo "Images Without SBOM (${#images_without_sbom[@]}):" + echo "==========================================" + if [[ ${#images_without_sbom[@]} -gt 0 ]]; then + for image in "${images_without_sbom[@]}"; do + echo " - $image" + done + else + echo " (none)" + fi + } > "$output_path" + # Remove temporary file now that final output is written + rm -f "$temp_output_file" + log_info "Results written to: $output_path" + ;; + list) + { + if [[ ${#images_with_postgres[@]} -gt 0 ]]; then + for entry in "${images_with_postgres[@]}"; do + IFS='|' read -r image pkg_list <<< "$entry" + echo "$image" + done + fi + } > "$output_path" + # Remove temporary file now that final output is written + rm -f "$temp_output_file" + log_info "Results written to: $output_path" + ;; + esac + fi # End of skip_sbom_analysis else block + fi # End of if [[ -f "$output_path" ]] else block + + # Run full analysis workflow if enabled + if 
$FULL_ANALYSIS; then + # Setup paths for complete analysis workflow + # Note: scan_dir already points to k8s-images directory + local scan_summary_file="$scan_dir/scan-summary.json" + # postgres-analysis is in the parent directory (scan-results), not inside k8s-images + local postgres_analysis_dir="$(dirname "$scan_dir")/postgres-analysis" + local image_source_analysis_file="$postgres_analysis_dir/image-source-analysis.json" + local generate_script="$SCRIPT_DIR/postgres-dependency-scanner/generate-image-source-analysis.py" + local analyze_script="$SCRIPT_DIR/postgres-dependency-scanner/analyze-postgres-dependencies.py" + + # Create postgres-analysis directory if it doesn't exist + mkdir -p "$postgres_analysis_dir" + + # Ensure generate script is executable + if [[ -f "$generate_script" ]]; then + chmod +x "$generate_script" 2>/dev/null || true + fi + + # Ensure analyze script is executable + if [[ -f "$analyze_script" ]]; then + chmod +x "$analyze_script" 2>/dev/null || true + fi + + # Check if scan-summary.json exists + if [[ ! -f "$scan_summary_file" ]]; then + log_error "scan-summary.json not found: $scan_summary_file" + log_error "Skipping repository discovery and dependency analysis" + log_info "Run k8s-image-scanner.sh first to generate scan-summary.json" + return 0 + fi + + # STEP 2: Repository Discovery + print_step_header "2" "Repository Discovery" "Discovering source repositories for images with PostgreSQL dependencies" + + print_step_summary "Repository Discovery Summary" \ + "Input: scan-summary.json ($scan_summary_file)" \ + "Output: image-source-analysis.json ($image_source_analysis_file)" \ + "Process: Match images to their source repositories" \ + "Note: Processes ALL images from scan-summary.json (not just those with scan directories)" + + # Check if image-source-analysis.json already exists + if [[ -f "$image_source_analysis_file" ]]; then + log_info "✓ image-source-analysis.json already exists: $image_source_analysis_file" + log_info " Skipping repository discovery step" + echo "" >&2 + + # Show summary from existing file + if command -v jq &>/dev/null; then + local total_processed + local found_count + local not_found_count + + total_processed=$(jq '.results | length' "$image_source_analysis_file" 2>/dev/null || echo "0") + found_count=$(jq '[.results[] | select(.status == "found")] | length' "$image_source_analysis_file" 2>/dev/null || echo "0") + not_found_count=$(jq '[.results[] | select(.status == "not_found")] | length' "$image_source_analysis_file" 2>/dev/null || echo "0") + + if [[ "$total_processed" != "0" ]]; then + print_step_summary "Summary from existing file" \ + "Total images processed: $total_processed" \ + "Repositories found: $found_count" \ + "Repositories not found: $not_found_count" + fi + fi + elif [[ ! 
-f "$generate_script" ]]; then + log_error "generate-image-source-analysis.py not found: $generate_script" + log_error "Skipping repository discovery" + else + # Count images in scan-summary.json for reporting + if [[ -f "$scan_summary_file" ]] && command -v jq &>/dev/null; then + local total_in_summary + local successful_count + local failed_count + local skipped_count + + total_in_summary=$(jq '[.successful_scans[], .failed_scans[], .skipped_scans[]] | length' "$scan_summary_file" 2>/dev/null || echo "0") + successful_count=$(jq '.successful_scans | length' "$scan_summary_file" 2>/dev/null || echo "0") + failed_count=$(jq '.failed_scans | length' "$scan_summary_file" 2>/dev/null || echo "0") + skipped_count=$(jq '.skipped_scans | length' "$scan_summary_file" 2>/dev/null || echo "0") + + if [[ "$total_in_summary" != "0" ]]; then + log_info " Images in scan-summary.json: $total_in_summary total" + log_info " - Successful: $successful_count" + log_info " - Failed: $failed_count" + log_info " - Skipped: $skipped_count" + fi + fi + local verbose_flag="" + if $VERBOSE; then + verbose_flag="--verbose" + fi + + log_info " Running: python3 $generate_script --scan-summary $scan_summary_file --output $image_source_analysis_file" + log_info "" + + # Run the script with live output (unbuffered for real-time progress) + python3 -u "$generate_script" \ + --scan-summary "$scan_summary_file" \ + --output "$image_source_analysis_file" \ + $verbose_flag + local script_exit_code=$? + + log_info "" + + if [[ $script_exit_code -eq 0 ]]; then + log_info "" + log_info "✓ Successfully generated: $image_source_analysis_file" + + # Show summary from the generated file if it exists + if [[ -f "$image_source_analysis_file" ]] && command -v jq &>/dev/null; then + local total_processed + local found_count + local not_found_count + + total_processed=$(jq '.results | length' "$image_source_analysis_file" 2>/dev/null || echo "0") + found_count=$(jq '[.results[] | select(.status == "found")] | length' "$image_source_analysis_file" 2>/dev/null || echo "0") + not_found_count=$(jq '[.results[] | select(.status == "not_found")] | length' "$image_source_analysis_file" 2>/dev/null || echo "0") + + if [[ "$total_processed" != "0" ]]; then + log_info " Summary:" + log_info " - Total images processed: $total_processed" + log_info " - Repositories found: $found_count" + log_info " - Repositories not found: $not_found_count" + fi + fi + else + local exit_code=$? 
+ log_error "" + log_error "Failed to generate image-source-analysis.json (exit code: $exit_code)" + log_error "Skipping dependency analysis" + return 1 + fi + fi + + # STEP 3: Dependency Analysis + print_step_header "3" "Dependency Analysis" "Analyzing PostgreSQL dependencies in source repositories" + + local postgres_dependency_analysis_file="$postgres_analysis_dir/postgres-dependency-analysis.json" + + print_step_summary "Dependency Analysis Summary" \ + "Input: image-source-analysis.json ($image_source_analysis_file)" \ + "Output: postgres-dependency-analysis.json ($postgres_dependency_analysis_file)" \ + "Process: Scan repositories for PostgreSQL dependencies and migration effort" + + # Check if postgres-dependency-analysis.json already exists + if [[ -f "$postgres_dependency_analysis_file" ]]; then + log_info "✓ postgres-dependency-analysis.json already exists: $postgres_dependency_analysis_file" + log_info " Skipping dependency analysis step" + echo "" >&2 + + # Show summary from existing file if possible + if command -v jq &>/dev/null; then + local total_projects + total_projects=$(jq '.projects | length' "$postgres_dependency_analysis_file" 2>/dev/null || echo "0") + if [[ "$total_projects" != "0" ]]; then + print_step_summary "Summary from existing file" \ + "Total projects analyzed: $total_projects" + fi + fi + elif [[ ! -f "$analyze_script" ]]; then + log_error "analyze-postgres-dependencies.py not found: $analyze_script" + log_error "Skipping dependency analysis" + elif [[ ! -f "$image_source_analysis_file" ]]; then + log_error "image-source-analysis.json not found: $image_source_analysis_file" + log_error "Skipping dependency analysis" + else + # Run the script from its directory + local original_dir + original_dir="$(pwd)" + cd "$(dirname "$analyze_script")" || exit 1 + + log_info " Running: python3 $(basename "$analyze_script")" + log_info " Working directory: $(pwd)" + log_info " Input file: $image_source_analysis_file" + log_info " Output file: $postgres_dependency_analysis_file" + log_info "" + + # Run the script with live output (unbuffered for real-time progress) + python3 -u "$analyze_script" + local script_exit_code=$? 
+ + log_info "" + + if [[ $script_exit_code -eq 0 ]]; then + log_info "" + log_info "✓ Successfully generated postgres-dependency-analysis.json" + log_info " Location: $postgres_dependency_analysis_file" + else + log_error "" + log_error "Failed to analyze PostgreSQL dependencies (exit code: $script_exit_code)" + if [[ -n "$script_output" ]]; then + log_error "Script output:" + echo "$script_output" | while IFS= read -r line; do + log_error " $line" + done + fi + cd "$original_dir" || exit 1 + return 1 + fi + + cd "$original_dir" || exit 1 + fi + + echo "" >&2 + echo "╔══════════════════════════════════════════════════════════════════════════════╗" >&2 + echo "║ ✓ COMPLETE ANALYSIS WORKFLOW FINISHED ║" >&2 + echo "╚══════════════════════════════════════════════════════════════════════════════╝" >&2 + echo "" >&2 + print_step_summary "Generated Files" \ + "SBOM Analysis: $output_path" \ + "Repository Discovery: $image_source_analysis_file" \ + "Dependency Analysis: $postgres_dependency_analysis_file" + else + log_info "" + log_info "Skipping repository discovery and dependency analysis (--no-repo-analysis)" + fi +} + +main "$@" + diff --git a/scanner/postgres-dependency-scanner/analyze-postgres-dependencies.py b/scanner/postgres-dependency-scanner/analyze-postgres-dependencies.py new file mode 100755 index 0000000..17e9d56 --- /dev/null +++ b/scanner/postgres-dependency-scanner/analyze-postgres-dependencies.py @@ -0,0 +1,469 @@ +#!/usr/bin/env python3 +""" +Analyze PostgreSQL dependencies across projects from image-source-analysis.json +to assess migration effort to MSDB (Microsoft SQL Server Database). +""" + +import json +import re +from pathlib import Path +from collections import defaultdict +from typing import Dict, List, Set, Optional +import sys + +# PostgreSQL-related patterns to search for +POSTGRES_PATTERNS = [ + # Python packages + r'psycopg[23]?', + r'asyncpg', + r'pg8000', + r'py-postgresql', + r'postgresql', + r'postgres', + r'sqlalchemy.*postgres', + r'tortoise-orm', + r'django.*postgres', + r'peewee.*postgres', + # Go packages + r'github\.com/lib/pq', + r'github\.com/jackc/pgx', + r'gorm\.io/driver/postgres', + r'postgres', + # Rust crates + r'postgres', + r'tokio-postgres', + r'sqlx.*postgres', + r'diesel.*postgres', + # Node.js packages + r'pg[^a-z]', + r'node-postgres', + r'pg-promise', + r'sequelize', + r'typeorm', + r'prisma', + r'knex', + # Generic + r'libpq', + r'postgresql-client', + r'postgres-client', +] + +def is_postgres_dependency(text: str) -> bool: + """Check if text contains PostgreSQL-related dependencies.""" + text_lower = text.lower() + for pattern in POSTGRES_PATTERNS: + if re.search(pattern, text_lower, re.IGNORECASE): + return True + return False + +def analyze_python_dependencies(file_path: Path) -> List[str]: + """Analyze Python dependency files for PostgreSQL packages.""" + postgres_deps = [] + + try: + content = file_path.read_text(encoding='utf-8', errors='ignore') + + # Check pyproject.toml + if file_path.name == 'pyproject.toml': + # Look in dependencies and optional-dependencies + for line in content.split('\n'): + if is_postgres_dependency(line): + # Extract package name + match = re.search(r'["\']([^"\']*postgres[^"\']*)["\']', line, re.IGNORECASE) + if match: + postgres_deps.append(match.group(1)) + elif 'postgres' in line.lower() or 'psycopg' in line.lower(): + # Try to extract from the line + parts = line.split('=') + if len(parts) > 0: + dep = parts[0].strip().strip('"').strip("'") + if dep: + postgres_deps.append(dep) + + # Check 
requirements.txt + elif file_path.name == 'requirements.txt': + for line in content.split('\n'): + line = line.strip() + if line and not line.startswith('#'): + if is_postgres_dependency(line): + # Extract package name (before == or other version specifiers) + dep = re.split(r'[=<>!]', line)[0].strip() + postgres_deps.append(dep) + + # Check poetry.lock and uv.lock (JSON format) + elif file_path.name in ['poetry.lock', 'uv.lock']: + try: + data = json.loads(content) + # Search through the JSON structure + json_str = json.dumps(data) + if is_postgres_dependency(json_str): + # Try to find specific package names + if isinstance(data, dict): + # Check packages or dependencies + for key in ['package', 'dependencies', 'packages']: + if key in data: + items = data[key] if isinstance(data[key], list) else [data[key]] + for item in items: + if isinstance(item, dict): + name = item.get('name', '') + if is_postgres_dependency(name): + postgres_deps.append(name) + except json.JSONDecodeError: + # If not valid JSON, search as text + if is_postgres_dependency(content): + postgres_deps.append('postgres-related') + + except Exception as e: + print(f"Error reading {file_path}: {e}", file=sys.stderr) + + return list(set(postgres_deps)) + +def analyze_go_dependencies(file_path: Path) -> List[str]: + """Analyze Go dependency files for PostgreSQL packages.""" + postgres_deps = [] + + try: + content = file_path.read_text(encoding='utf-8', errors='ignore') + + for line in content.split('\n'): + if is_postgres_dependency(line): + # Extract module path + match = re.search(r'([^\s]+postgres[^\s]*)', line, re.IGNORECASE) + if match: + postgres_deps.append(match.group(1)) + else: + # Try to extract from require or replace statements + parts = line.split() + for part in parts: + if is_postgres_dependency(part): + postgres_deps.append(part) + + except Exception as e: + print(f"Error reading {file_path}: {e}", file=sys.stderr) + + return list(set(postgres_deps)) + +def analyze_rust_dependencies(file_path: Path) -> List[str]: + """Analyze Rust dependency files for PostgreSQL packages.""" + postgres_deps = [] + + try: + content = file_path.read_text(encoding='utf-8', errors='ignore') + + # Check Cargo.toml + if file_path.name == 'Cargo.toml': + in_deps = False + for line in content.split('\n'): + line_lower = line.lower() + if '[dependencies]' in line_lower or '[dev-dependencies]' in line_lower: + in_deps = True + continue + if line.strip().startswith('[') and in_deps: + in_deps = False + + if in_deps and is_postgres_dependency(line): + # Extract crate name + match = re.search(r'([a-z0-9_-]*postgres[a-z0-9_-]*)', line, re.IGNORECASE) + if match: + postgres_deps.append(match.group(1)) + else: + # Try to extract from = "version" format + parts = line.split('=') + if len(parts) > 0: + dep = parts[0].strip().strip('"').strip("'") + if dep and is_postgres_dependency(dep): + postgres_deps.append(dep) + + # Check Cargo.lock (TOML format) + elif file_path.name == 'Cargo.lock': + if is_postgres_dependency(content): + # Try to extract package names + matches = re.findall(r'name\s*=\s*"([^"]*postgres[^"]*)"', content, re.IGNORECASE) + postgres_deps.extend(matches) + + except Exception as e: + print(f"Error reading {file_path}: {e}", file=sys.stderr) + + return list(set(postgres_deps)) + +def analyze_nodejs_dependencies(file_path: Path) -> List[str]: + """Analyze Node.js dependency files for PostgreSQL packages.""" + postgres_deps = [] + + try: + if file_path.name == 'package.json': + data = 
json.loads(file_path.read_text(encoding='utf-8', errors='ignore')) + + # Check dependencies, devDependencies, peerDependencies + for dep_type in ['dependencies', 'devDependencies', 'peerDependencies', 'optionalDependencies']: + if dep_type in data: + for dep_name, version in data[dep_type].items(): + if is_postgres_dependency(dep_name): + postgres_deps.append(dep_name) + + elif file_path.name in ['package-lock.json', 'pnpm-lock.yaml', 'yarn.lock']: + content = file_path.read_text(encoding='utf-8', errors='ignore') + if is_postgres_dependency(content): + # Try to extract package names + if file_path.name == 'package-lock.json': + try: + data = json.loads(content) + json_str = json.dumps(data) + # Search for postgres packages + matches = re.findall(r'"([^"]*postgres[^"]*)"', json_str, re.IGNORECASE) + postgres_deps.extend(matches) + except json.JSONDecodeError: + pass + else: + # For pnpm-lock.yaml and yarn.lock, search as text + matches = re.findall(r'([a-z0-9_-]*postgres[a-z0-9_-]*)', content, re.IGNORECASE) + postgres_deps.extend(matches) + + except Exception as e: + print(f"Error reading {file_path}: {e}", file=sys.stderr) + + return list(set(postgres_deps)) + +def analyze_dockerfile(file_path: Path) -> List[str]: + """Analyze Dockerfile for PostgreSQL installations.""" + postgres_refs = [] + + try: + content = file_path.read_text(encoding='utf-8', errors='ignore') + + if is_postgres_dependency(content): + # Look for specific PostgreSQL references + patterns = [ + r'postgresql[^\s]*', + r'psycopg[23]?', + r'libpq', + r'postgres-client', + ] + for pattern in patterns: + matches = re.findall(pattern, content, re.IGNORECASE) + postgres_refs.extend(matches) + + except Exception as e: + print(f"Error reading {file_path}: {e}", file=sys.stderr) + + return list(set(postgres_refs)) + +def analyze_project(result: Dict) -> Dict: + """Analyze a single project for PostgreSQL dependencies.""" + repo_path = result.get('repo_path') + if not repo_path or not Path(repo_path).exists(): + return { + 'has_postgres': False, + 'dependencies': {}, + 'dockerfiles': [], + 'note': 'Repository not found locally' + } + + repo_path_obj = Path(repo_path) + analysis = { + 'has_postgres': False, + 'dependencies': defaultdict(list), + 'dockerfiles': [], + 'languages': [] + } + + # Analyze dependency files + deps = result.get('dependencies', {}) + + for lang, files in deps.items(): + if lang not in analysis['languages']: + analysis['languages'].append(lang) + + for file_rel_path in files: + # Handle relative paths from repo root + file_path = repo_path_obj / file_rel_path.replace(f"{result.get('repo', '')}/", "") + + if not file_path.exists(): + # Try alternative path + file_path = Path(file_path) + if not file_path.exists(): + continue + + postgres_deps = [] + + if lang == 'python': + postgres_deps = analyze_python_dependencies(file_path) + elif lang == 'go': + postgres_deps = analyze_go_dependencies(file_path) + elif lang == 'rust': + postgres_deps = analyze_rust_dependencies(file_path) + elif lang == 'nodejs': + postgres_deps = analyze_nodejs_dependencies(file_path) + + if postgres_deps: + analysis['has_postgres'] = True + analysis['dependencies'][lang].extend(postgres_deps) + analysis['dependencies'][lang] = list(set(analysis['dependencies'][lang])) + + # Analyze Dockerfiles + dockerfiles = result.get('dockerfiles', []) + for dockerfile_rel_path in dockerfiles: + dockerfile_path = repo_path_obj / dockerfile_rel_path.replace(f"{result.get('repo', '')}/", "") + + if not dockerfile_path.exists(): + # Try alternative 
path + dockerfile_path = Path(dockerfile_path) + if not dockerfile_path.exists(): + continue + + postgres_refs = analyze_dockerfile(dockerfile_path) + if postgres_refs: + analysis['has_postgres'] = True + analysis['dockerfiles'].append({ + 'file': dockerfile_rel_path, + 'references': postgres_refs + }) + + return analysis + +def main(): + """Main analysis function.""" + # Try multiple possible locations for image-source-analysis.json + script_dir = Path(__file__).parent + support_dir = script_dir.parent.parent # ../.. from postgres-dependency-scanner + possible_locations = [ + support_dir / 'scan-results' / 'postgres-analysis' / 'image-source-analysis.json', # Default location + script_dir / 'image-source-analysis.json', # Same directory as script (fallback) + Path('scan-results/postgres-analysis/image-source-analysis.json'), # Relative to current directory (fallback) + ] + + analysis_file = None + for location in possible_locations: + if location.exists(): + analysis_file = location + break + + if not analysis_file: + print(f"Error: image-source-analysis.json not found in any of these locations:", file=sys.stderr) + for loc in possible_locations: + print(f" - {loc}", file=sys.stderr) + sys.exit(1) + + print(f"Using image-source-analysis.json from: {analysis_file}", file=sys.stderr) + + with open(analysis_file, 'r') as f: + data = json.load(f) + + results = data.get('results', []) + + # Filter to only non-public images with repositories + projects_to_analyze = [ + r for r in results + if not r.get('is_public', False) and r.get('repo_path') and r.get('status') == 'found' + ] + + print(f"Analyzing {len(projects_to_analyze)} projects for PostgreSQL dependencies...\n") + + postgres_projects = [] + no_postgres_projects = [] + + for result in projects_to_analyze: + image = result.get('image', 'unknown') + repo = result.get('repo', 'unknown') + + print(f"Analyzing {repo} ({image})...", end=' ', flush=True) + + analysis = analyze_project(result) + + if analysis.get('has_postgres'): + print("✓ PostgreSQL dependencies found") + postgres_projects.append({ + 'image': image, + 'repo': repo, + 'repo_path': result.get('repo_path'), + 'git_remote_url': result.get('git_remote_url'), + 'analysis': analysis, + 'evidence': result.get('evidence', {}) + }) + else: + print("✗ No PostgreSQL dependencies") + no_postgres_projects.append({ + 'image': image, + 'repo': repo, + 'git_remote_url': result.get('git_remote_url') + }) + + # Generate report + report = { + 'summary': { + 'total_projects': len(projects_to_analyze), + 'with_postgres': len(postgres_projects), + 'without_postgres': len(no_postgres_projects), + 'migration_effort': { + 'high': 0, + 'medium': 0, + 'low': 0 + } + }, + 'projects_with_postgres': postgres_projects, + 'projects_without_postgres': [p['repo'] for p in no_postgres_projects] + } + + # Categorize migration effort + for project in postgres_projects: + analysis = project['analysis'] + lang_count = len(analysis.get('languages', [])) + dep_count = sum(len(deps) for deps in analysis.get('dependencies', {}).values()) + dockerfile_count = len(analysis.get('dockerfiles', [])) + + # Simple heuristic: more languages/dependencies = higher effort + if lang_count > 1 or dep_count > 3 or dockerfile_count > 2: + report['summary']['migration_effort']['high'] += 1 + elif dep_count > 1 or dockerfile_count > 0: + report['summary']['migration_effort']['medium'] += 1 + else: + report['summary']['migration_effort']['low'] += 1 + + # Output report (use scan-results/postgres-analysis directory) + output_dir = 
support_dir / 'scan-results' / 'postgres-analysis' + output_dir.mkdir(parents=True, exist_ok=True) + output_file = output_dir / 'postgres-dependency-analysis.json' + with open(output_file, 'w') as f: + json.dump(report, f, indent=2) + + print(f"\n{'='*80}") + print("POSTGRESQL DEPENDENCY ANALYSIS REPORT") + print(f"{'='*80}\n") + print(f"Total projects analyzed: {report['summary']['total_projects']}") + print(f"Projects with PostgreSQL dependencies: {report['summary']['with_postgres']}") + print(f"Projects without PostgreSQL dependencies: {report['summary']['without_postgres']}\n") + + print("Migration Effort Breakdown:") + print(f" High effort: {report['summary']['migration_effort']['high']} projects") + print(f" Medium effort: {report['summary']['migration_effort']['medium']} projects") + print(f" Low effort: {report['summary']['migration_effort']['low']} projects\n") + + if postgres_projects: + print(f"\n{'='*80}") + print("PROJECTS WITH POSTGRESQL DEPENDENCIES:") + print(f"{'='*80}\n") + + for project in postgres_projects: + print(f"Repository: {project['repo']}") + print(f"Image: {project['image']}") + print(f"Languages: {', '.join(project['analysis'].get('languages', []))}") + + deps = project['analysis'].get('dependencies', {}) + if deps: + print("Dependencies:") + for lang, packages in deps.items(): + print(f" {lang}: {', '.join(packages)}") + + dockerfiles = project['analysis'].get('dockerfiles', []) + if dockerfiles: + print("Dockerfile references:") + for df in dockerfiles: + print(f" {df['file']}: {', '.join(df['references'])}") + + print() + + print(f"\nDetailed report saved to: {output_file}") + +if __name__ == '__main__': + main() + diff --git a/scanner/postgres-dependency-scanner/generate-image-source-analysis.py b/scanner/postgres-dependency-scanner/generate-image-source-analysis.py new file mode 100755 index 0000000..c598c16 --- /dev/null +++ b/scanner/postgres-dependency-scanner/generate-image-source-analysis.py @@ -0,0 +1,1455 @@ +#!/usr/bin/env python3 +""" +Generate image-source-analysis.json from scan-summary.json by finding repositories +and analyzing their dependencies and Dockerfiles. + +This script searches for repositories matching images from scan-summary.json in: +1. Recursively in ../ (excluding support folder) +2. 
Recursively in ../../ (excluding aleph-alpha folder) + +Usage: + python3 generate-image-source-analysis.py [--verbose] [--scan-summary FILE] [--output FILE] +""" + +import json +import argparse +import re +import yaml +import time +from pathlib import Path +from typing import Dict, List, Optional, Set, Tuple, Any +import sys +import subprocess +from datetime import datetime + +# Try to import toml (may not be available) +try: + import toml + HAS_TOML = True +except ImportError: + HAS_TOML = False + # Fallback: basic TOML parsing for simple cases + def toml_load(file_path): + """Basic TOML parser for simple cases.""" + content = file_path.read_text(encoding='utf-8') + result = {} + current_section = result + for line in content.split('\n'): + line = line.strip() + if not line or line.startswith('#'): + continue + if line.startswith('[') and line.endswith(']'): + section = line[1:-1].strip() + if section not in result: + result[section] = {} + current_section = result[section] + elif '=' in line: + key, value = line.split('=', 1) + key = key.strip() + value = value.strip().strip('"').strip("'") + current_section[key] = value + return result + +# Dependency file patterns by language +DEPENDENCY_FILES = { + 'python': [ + 'requirements.txt', + 'pyproject.toml', + 'poetry.lock', + 'Pipfile', + 'Pipfile.lock', + 'setup.py', + 'setup.cfg', + 'uv.lock', + ], + 'go': [ + 'go.mod', + 'go.sum', + 'Gopkg.toml', + 'Gopkg.lock', + 'glide.yaml', + 'glide.lock', + ], + 'rust': [ + 'Cargo.toml', + 'Cargo.lock', + ], + 'nodejs': [ + 'package.json', + 'package-lock.json', + 'pnpm-lock.yaml', + 'yarn.lock', + 'npm-shrinkwrap.json', + ], +} + +# Dockerfile patterns +DOCKERFILE_PATTERNS = [ + 'Dockerfile', + 'Dockerfile.*', + '*.dockerfile', + 'docker/Dockerfile', + 'docker/Dockerfile.*', + '.dockerfile', +] + + +def extract_image_name(image: str) -> str: + """ + Extract the base image name from a full image string. + + Examples: + harbor.management-prod01.stackit.run/pharia-data/data:v0.52.2 -> data + harbor.management-prod01.stackit.run/pharia-os-images/pharia-os-app:1.24.1 -> pharia-os-app + quay.io/prometheuscommunity/postgres-exporter:v0.17.1 -> postgres-exporter + """ + # Remove registry prefix + parts = image.split('/') + if len(parts) > 1: + # Get the last part (image:tag) + image_part = parts[-1] + else: + image_part = parts[0] + + # Remove tag + if ':' in image_part: + image_part = image_part.split(':')[0] + + return image_part + + +def extract_image_tag(image: str) -> Optional[str]: + """Extract the tag/version from a full image string.""" + parts = image.split('/') + if len(parts) > 0: + image_part = parts[-1] + if ':' in image_part: + return image_part.split(':', 1)[1] + return None + + +def extract_artifact_from_helm_chart(chart_dir: Path, image: str) -> Optional[Dict[str, str]]: + """ + Extract artifact name and version from Helm chart values.yaml and templates. + + Returns dict with 'name' and 'version' keys, or None if not found. 
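+    Example: for 'harbor.management-prod01.stackit.run/pharia-data/data:v0.52.2',
+    a successful match yields {'name': 'data', 'version': 'v0.52.2'}.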
+ """ + image_name = extract_image_name(image) + image_tag = extract_image_tag(image) + + # Try to parse values.yaml + for values_file in [chart_dir / "values.yaml", chart_dir / "values.yml"]: + if not values_file.exists(): + continue + + try: + content = values_file.read_text(encoding='utf-8') + # Check if image is referenced + if image_name not in content and image not in content: + continue + + # Try to parse as YAML + try: + values = yaml.safe_load(content) + if not isinstance(values, dict): + continue + + # Search for image references in values structure + def find_image_ref(obj, path=""): + """Recursively search for image references.""" + if isinstance(obj, dict): + for key, value in obj.items(): + new_path = f"{path}.{key}" if path else key + if key in ['image', 'repository', 'imageName']: + if isinstance(value, str): + if image_name in value or image in value: + return {'name': image_name, 'version': image_tag} + elif isinstance(value, (dict, list)): + result = find_image_ref(value, new_path) + if result: + return result + elif isinstance(obj, list): + for item in obj: + result = find_image_ref(item, path) + if result: + return result + return None + + result = find_image_ref(values) + if result: + return result + except yaml.YAMLError: + # If YAML parsing fails, try regex matching + pass + + # Fallback: regex search for image patterns + # Look for patterns like: image: "repo/image:tag" or repository: "repo/image" + patterns = [ + rf'image:\s*["\']?([^"\'\s]+{re.escape(image_name)}[^"\'\s]*)["\']?', + rf'repository:\s*["\']?([^"\'\s]+{re.escape(image_name)}[^"\'\s]*)["\']?', + rf'["\']?([^"\'\s]*{re.escape(image)}[^"\'\s]*)["\']?', + ] + + for pattern in patterns: + match = re.search(pattern, content, re.IGNORECASE) + if match: + return { + 'name': image_name, + 'version': image_tag or '' + } + except (IOError, UnicodeDecodeError): + continue + + # Try templates directory + templates_dir = chart_dir / "templates" + if templates_dir.exists(): + for template_file in templates_dir.rglob("*.yaml"): + try: + content = template_file.read_text(encoding='utf-8') + if image_name in content or image in content: + # Look for image references in templates + patterns = [ + rf'image:\s*["\']?([^"\'\s]+{re.escape(image_name)}[^"\'\s]*)["\']?', + rf'["\']?([^"\'\s]*{re.escape(image)}[^"\'\s]*)["\']?', + ] + for pattern in patterns: + match = re.search(pattern, content, re.IGNORECASE) + if match: + return { + 'name': image_name, + 'version': image_tag or '' + } + except (IOError, UnicodeDecodeError): + continue + + return None + + +def extract_repo_name_from_image(image: str) -> Optional[str]: + """ + Try to extract repository name from image path. 
+ + Examples: + harbor.management-prod01.stackit.run/pharia-data/data:v0.52.2 -> pharia-data + harbor.management-prod01.stackit.run/pharia-os-images/pharia-os-app:1.24.1 -> pharia-os + """ + parts = image.split('/') + if len(parts) >= 2: + # Usually the second-to-last part is the repo/org name + namespace = parts[-2] + + # Remove common suffixes to get base name + # e.g., "pharia-os-images" -> "pharia-os" + for suffix in ['-images', '-container-images', 'container-images']: + if namespace.endswith(suffix): + namespace = namespace[:-len(suffix)] + break + + return namespace + return None + + +def normalize_repo_name(name: str) -> str: + """Normalize repository name for matching (lowercase, remove special chars).""" + return name.lower().replace('_', '-').replace(' ', '-') + + +def normalize_for_comparison(name: str) -> str: + """ + Normalize name for comparison by removing hyphens. + This helps match 'pharia-os' with 'phariaos'. + """ + return normalize_repo_name(name).replace('-', '').replace('_', '') + + +def is_repository_directory(path: Path) -> bool: + """Check if a directory looks like a repository.""" + # Must have .git or dependency files + if (path / '.git').exists(): + return True + + # Or have dependency files + if has_dependency_files(path): + return True + + return False + + +def calculate_name_similarity(repo_name: str, image_name: str) -> float: + """ + Calculate similarity score between repo name and image name. + Returns a score from 0.0 to 1.0, where 1.0 is exact match. + """ + repo_norm = normalize_repo_name(repo_name) + image_norm = normalize_repo_name(image_name) + + # Exact match + if repo_norm == image_norm: + return 1.0 + + # Normalize without hyphens for comparison + repo_no_hyphen = normalize_for_comparison(repo_name) + image_no_hyphen = normalize_for_comparison(image_name) + + # Exact match without hyphens (e.g., pharia-os vs phariaos) + if repo_no_hyphen == image_no_hyphen: + return 0.95 + + # Prefix match: image name is a prefix of repo name (e.g., inference -> inference-worker) + if repo_norm.startswith(image_norm + '-') or repo_norm.startswith(image_norm + '_'): + return 0.9 + + # Prefix match: repo name is a prefix of image name (e.g., catch -> catch-backend) + # This handles cases where image is "catch-api" and repo is "catch-backend" + if image_norm.startswith(repo_norm + '-') or image_norm.startswith(repo_norm + '_'): + return 0.85 + + # One contains the other (substring match) + if image_norm in repo_norm: + # Longer the match relative to image name, higher the score + return 0.7 + (len(image_norm) / len(repo_norm)) * 0.15 + if repo_norm in image_norm: + return 0.7 + (len(repo_norm) / len(image_norm)) * 0.15 + + # Handle compound names (e.g., "pharia-os-app" matches "pharia-os") + image_parts = image_norm.split('-') + repo_parts = repo_norm.split('-') + + # If image name parts are all in repo name + if len(image_parts) > 1 and all(part in repo_norm for part in image_parts): + matched_parts = sum(1 for part in image_parts if part in repo_norm) + return 0.6 + (matched_parts / len(image_parts)) * 0.2 + + # If repo name parts are all in image name + if len(repo_parts) > 1 and all(part in image_norm for part in repo_parts): + matched_parts = sum(1 for part in repo_parts if part in image_norm) + return 0.6 + (matched_parts / len(repo_parts)) * 0.2 + + # Check without hyphens for partial matches + if image_no_hyphen in repo_no_hyphen or repo_no_hyphen in image_no_hyphen: + return 0.5 + + return 0.0 + + +def matches_image_name(repo_name: str, image_name: str, 
repo_hint: Optional[str] = None, min_similarity: float = 0.5) -> bool: + """ + Check if repository name matches image name with various heuristics. + + Args: + repo_name: Repository name to check + image_name: Image name to match against + repo_hint: Optional hint from image path (e.g., 'pharia-os' from 'pharia-os-images') + min_similarity: Minimum similarity score to consider a match (default: 0.5) + + Returns: + True if similarity score >= min_similarity + """ + # Check direct match + similarity = calculate_name_similarity(repo_name, image_name) + if similarity >= min_similarity: + return True + + # Check with repo hint if provided + if repo_hint: + hint_norm = normalize_repo_name(repo_hint) + repo_norm = normalize_repo_name(repo_name) + + # Hint matches repo name + if hint_norm in repo_norm or repo_norm in hint_norm: + # Then check if image name matches + similarity = calculate_name_similarity(repo_name, image_name) + if similarity >= 0.3: # Lower threshold when hint matches + return True + + # Check hint without hyphens + hint_no_hyphen = normalize_for_comparison(repo_hint) + repo_no_hyphen = normalize_for_comparison(repo_name) + if hint_no_hyphen in repo_no_hyphen or repo_no_hyphen in hint_no_hyphen: + similarity = calculate_name_similarity(repo_name, image_name) + if similarity >= 0.3: + return True + + return False + + +def extract_artifact_from_repo(repo_path: Path) -> List[Dict[str, str]]: + """ + Extract artifact names and versions from repository dependency files. + + Returns list of dicts with 'name', 'version', and 'file' keys. + """ + artifacts = [] + + # Check Cargo.toml (Rust) + for cargo_file in repo_path.rglob("Cargo.toml"): + try: + if HAS_TOML: + cargo_data = toml.load(cargo_file) + else: + cargo_data = toml_load(cargo_file) + if 'package' in cargo_data: + pkg = cargo_data['package'] + if 'name' in pkg: + artifact = { + 'name': str(pkg['name']), + 'version': str(pkg.get('version', '')), + 'file': str(cargo_file.relative_to(repo_path)) + } + artifacts.append(artifact) + except (Exception, KeyError, AttributeError): + pass + + # Check package.json (Node.js) + for pkg_file in repo_path.rglob("package.json"): + try: + with open(pkg_file, 'r', encoding='utf-8') as f: + pkg_data = json.load(f) + if 'name' in pkg_data: + artifact = { + 'name': pkg_data['name'], + 'version': pkg_data.get('version', ''), + 'file': str(pkg_file.relative_to(repo_path)) + } + artifacts.append(artifact) + except Exception: + pass + + # Check pyproject.toml (Python) + for pyproject_file in repo_path.rglob("pyproject.toml"): + try: + if HAS_TOML: + pyproject_data = toml.load(pyproject_file) + else: + pyproject_data = toml_load(pyproject_file) + # Check [project] section + if 'project' in pyproject_data: + project = pyproject_data['project'] + if isinstance(project, dict) and 'name' in project: + artifact = { + 'name': str(project['name']), + 'version': str(project.get('version', '')), + 'file': str(pyproject_file.relative_to(repo_path)) + } + artifacts.append(artifact) + # Check [tool.poetry] section + elif ('tool' in pyproject_data and + isinstance(pyproject_data['tool'], dict) and + 'poetry' in pyproject_data['tool']): + poetry = pyproject_data['tool']['poetry'] + if isinstance(poetry, dict) and 'name' in poetry: + artifact = { + 'name': str(poetry['name']), + 'version': str(poetry.get('version', '')), + 'file': str(pyproject_file.relative_to(repo_path)) + } + artifacts.append(artifact) + except (Exception, KeyError, AttributeError): + pass + + # Check setup.py (Python - basic regex extraction) + 
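+    # setup.py is plain Python, so pull the name/version out with regular expressions
+    # rather than importing or executing the file.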
for setup_file in repo_path.rglob("setup.py"): + try: + content = setup_file.read_text(encoding='utf-8') + # Look for name= and version= patterns + name_match = re.search(r'name\s*=\s*["\']([^"\']+)["\']', content) + version_match = re.search(r'version\s*=\s*["\']([^"\']+)["\']', content) + if name_match: + artifact = { + 'name': name_match.group(1), + 'version': version_match.group(1) if version_match else '', + 'file': str(setup_file.relative_to(repo_path)) + } + artifacts.append(artifact) + except Exception: + pass + + return artifacts + + +def matches_artifact(repo_artifact: Dict[str, str], helm_artifact: Dict[str, str]) -> bool: + """ + Check if a repository artifact matches a Helm chart artifact. + + Matches based on: + 1. Name similarity (normalized, handles hyphens) + 2. Version compatibility (exact match or semantic version compatibility) + """ + repo_name = normalize_for_comparison(repo_artifact['name']) + helm_name = normalize_for_comparison(helm_artifact['name']) + + # Name must match (with hyphen normalization) + if repo_name != helm_name: + # Try with image name extraction + image_name_norm = normalize_for_comparison(extract_image_name(helm_artifact['name'])) + if repo_name != image_name_norm: + return False + + # Version matching (if both have versions) + repo_version = repo_artifact.get('version', '') + helm_version = helm_artifact.get('version', '') + + if repo_version and helm_version: + # Exact match + if repo_version == helm_version: + return True + + # Try semantic version comparison (basic) + # Remove 'v' prefix if present + repo_v = repo_version.lstrip('v') + helm_v = helm_version.lstrip('v') + + # Check if major.minor matches (allows patch differences) + repo_parts = repo_v.split('.') + helm_parts = helm_v.split('.') + if len(repo_parts) >= 2 and len(helm_parts) >= 2: + if repo_parts[0] == helm_parts[0] and repo_parts[1] == helm_parts[1]: + return True + + # If name matches but no version info, still consider it a match + return True + + +def find_helm_charts_for_image(image: str, search_paths: List[Path], helm_chart_repos: List[Path]) -> List[Path]: + """ + Find Helm charts that reference the given image. + + Strategy: + 1. First search in dedicated Helm chart repositories (faster, more reliable) + 2. Then search in wider paths if needed + + This confirms the image is actually used and helps identify the source repository. 
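+
+    Example: for 'harbor.management-prod01.stackit.run/pharia-data/data:v0.52.2',
+    a chart is treated as a match if its values.yaml or any template file
+    contains the image reference in any of the checked forms: the full string,
+    the extracted image name, or the final path segment without its tag
+    (here 'data').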
+ """ + image_name = extract_image_name(image) + + # Also try without registry prefix + image_without_registry = image.split('/')[-1] if '/' in image else image + if ':' in image_without_registry: + image_without_registry = image_without_registry.split(':')[0] + + found_charts = [] + + def search_in_path(search_path: Path): + """Helper to search for charts in a specific path.""" + if not search_path.exists(): + return + + # Search for Chart.yaml files + try: + for chart_file in search_path.rglob("Chart.yaml"): + chart_dir = chart_file.parent + + # Skip if already found + if chart_dir in found_charts: + continue + + # Check values.yaml files + for values_file in [chart_dir / "values.yaml", chart_dir / "values.yml"]: + if not values_file.exists(): + continue + + try: + content = values_file.read_text() + # Check if image is referenced + if image in content or image_without_registry in content or image_name in content: + found_charts.append(chart_dir) + return # Found in this path, can stop + except (IOError, UnicodeDecodeError): + pass + + # Check template files + templates_dir = chart_dir / "templates" + if templates_dir.exists(): + for template_file in templates_dir.rglob("*.yaml"): + try: + content = template_file.read_text() + if image in content or image_without_registry in content or image_name in content: + found_charts.append(chart_dir) + return # Found in this path, can stop + except (IOError, UnicodeDecodeError): + pass + except (PermissionError, OSError): + pass + + # Step 1: Search in Helm chart repositories first (prioritized) + for helm_repo in helm_chart_repos: + search_in_path(helm_repo) + if found_charts: # If found, we can stop early + return found_charts + + # Step 2: Search in wider paths if not found in Helm chart repos + for search_path in search_paths: + search_in_path(search_path) + + return found_charts + + +def get_git_commit_date(repo_path: Path) -> Optional[datetime]: + """ + Get the date of the most recent commit in a git repository. + Returns None if not a git repo or if git command fails. + """ + if not (repo_path / '.git').exists(): + return None + + try: + result = subprocess.run( + ['git', 'log', '-1', '--format=%ct', 'HEAD'], + cwd=repo_path, + capture_output=True, + text=True, + timeout=5 + ) + if result.returncode == 0 and result.stdout.strip(): + timestamp = int(result.stdout.strip()) + return datetime.fromtimestamp(timestamp) + except (subprocess.TimeoutExpired, ValueError, OSError): + pass + + return None + + +def rank_candidates(candidates: List[Tuple[Path, int]], image_name: str) -> List[Tuple[Path, float]]: + """ + Rank candidates by multiple factors: + 1. Name similarity score + 2. Git commit recency (newest commits preferred) + 3. Depth (shallower preferred) + + Returns list of (path, score) tuples sorted by score (highest first). 
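+
+    Worked example using the weights below: a candidate with name similarity 0.9,
+    found at depth 1, whose most recent commit was 30 days ago scores roughly
+    0.9 * 0.6 + (5/6) * 0.2 + (335/365) * 0.2 ≈ 0.89.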
+ """ + ranked = [] + + for repo_path, depth in candidates: + # Calculate name similarity + similarity = calculate_name_similarity(repo_path.name, image_name) + + # Get git commit date + commit_date = get_git_commit_date(repo_path) + + # Calculate composite score + # Base score: similarity (0.0-1.0) weighted at 60% + score = similarity * 0.6 + + # Depth penalty: shallower is better (0-5 depth, normalized to 0-0.2) + depth_score = max(0, (6 - depth) / 6) * 0.2 + score += depth_score + + # Git recency bonus: newer commits get higher score (0-0.2) + if commit_date: + # Calculate days since commit (more recent = higher score) + days_ago = (datetime.now() - commit_date).days + # Normalize: 0 days = 0.2, 365 days = 0.0 + recency_score = max(0, (365 - days_ago) / 365) * 0.2 + score += recency_score + else: + # No git info: small penalty + score += 0.05 + + ranked.append((repo_path, score)) + + # Sort by score (highest first) + ranked.sort(key=lambda x: x[1], reverse=True) + return ranked + + +def find_repos_by_artifact_name(artifact_name: str, search_paths: List[Path], + exclude_dirs: Set[str]) -> List[Path]: + """ + Fast search for repositories containing artifact name in dependency files. + + Uses grep-like search across TOML, package.json, and go.mod files. + Returns list of repository paths that might contain the artifact. + """ + candidates = [] + artifact_norm = normalize_for_comparison(artifact_name) + artifact_lower = artifact_name.lower() + + # File patterns to search + search_patterns = [ + '**/Cargo.toml', + '**/pyproject.toml', + '**/package.json', + '**/go.mod', + '**/setup.py', + ] + + for search_path in search_paths: + if not search_path.exists(): + continue + + for pattern in search_patterns: + try: + for file_path in search_path.rglob(pattern): + # Skip excluded directories + if any(part in exclude_dirs for part in file_path.parts): + continue + + # Skip if in .git or node_modules + if '.git' in file_path.parts or 'node_modules' in file_path.parts: + continue + + try: + content = file_path.read_text(encoding='utf-8', errors='ignore') + + # Quick check: does the artifact name appear? 
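+                        # (cheap case-insensitive substring test; the per-format
+                        # name checks below only run when this pre-filter hits)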
+ if artifact_lower not in content.lower(): + continue + + # More specific checks based on file type + found = False + + if file_path.name == 'Cargo.toml': + # Look for [package] name = "artifact_name" + if (f'name = "{artifact_name}"' in content or + f"name = '{artifact_name}'" in content or + f'name = "{artifact_lower}"' in content): + found = True + # Also check normalized version + elif artifact_norm in normalize_for_comparison(content): + # Verify it's in a name field + if re.search(r'name\s*=\s*["\']([^"\']*)["\']', + content, re.IGNORECASE): + found = True + + elif file_path.name == 'package.json': + # Look for "name": "artifact_name" + if (f'"name": "{artifact_name}"' in content or + f'"name": "{artifact_lower}"' in content): + found = True + + elif file_path.name == 'pyproject.toml': + # Look for name = "artifact_name" in [project] or [tool.poetry] + if (f'name = "{artifact_name}"' in content or + f"name = '{artifact_name}'" in content or + f'name = "{artifact_lower}"' in content): + found = True + + elif file_path.name == 'go.mod': + # Look for module artifact_name + if (f'module {artifact_name}' in content or + f'module {artifact_lower}' in content): + found = True + + elif file_path.name == 'setup.py': + # Look for name="artifact_name" + if (f'name="{artifact_name}"' in content or + f"name='{artifact_name}'" in content or + f'name="{artifact_lower}"' in content): + found = True + + if found: + # Find the repository root (directory with .git or parent) + repo_path = file_path.parent + # Walk up to find .git or dependency files + for _ in range(5): # Max 5 levels up + if (repo_path / '.git').exists(): + break + if has_dependency_files(repo_path): + break + if repo_path.parent == repo_path: + break + repo_path = repo_path.parent + + if repo_path not in candidates: + candidates.append(repo_path) + except (IOError, UnicodeDecodeError, PermissionError): + continue + except (PermissionError, OSError): + continue + + return candidates + + +def find_repository_for_image(image: str, search_paths: List[Path], exclude_dirs: Set[str], helm_chart_repos: List[Path], verbose: bool = False) -> Tuple[Optional[Path], Dict]: + """ + Find repository directory that matches the given image. + + Strategy: + 1. Find Helm charts that reference the image and extract artifact metadata + 2. Search repositories for matching artifact names/versions in dependency files + 3. Fall back to name-based matching if artifact matching fails + 4. Rank candidates by artifact match quality, git recency, and depth + + Searches in the provided paths, excluding specified directories. + + Returns: + Tuple of (repo_path, evidence_dict) where evidence contains: + - match_method: How the repo was found (artifact_match, grep_name_match, etc.) 
+ - match_score: Confidence score (0.0-1.0) + - matched_artifact: Artifact info if found via artifact matching + - artifact_files: List of files where artifact was found + - helm_charts_found: Number of Helm charts found + - helm_artifacts: List of artifact metadata from Helm charts + """ + import time + + image_name = extract_image_name(image) + repo_name_hint = extract_repo_name_from_image(image) + + if verbose: + print(f"\n → Searching for: {image_name}", flush=True) + + # Step 1: Find Helm charts and extract artifact metadata + step_start = time.time() + if verbose: + print(f" → Step 1: Searching Helm charts in {len(helm_chart_repos)} repos...", end='', flush=True) + + helm_charts = find_helm_charts_for_image(image, search_paths, helm_chart_repos) + helm_artifacts = [] + + for chart_dir in helm_charts: + artifact = extract_artifact_from_helm_chart(chart_dir, image) + if artifact: + helm_artifacts.append(artifact) + + if verbose: + print(f" found {len(helm_charts)} charts, {len(helm_artifacts)} artifacts ({time.time()-step_start:.1f}s)", flush=True) + + # Step 2: Fast grep-based search for artifact names in dependency files + candidates = [] + + step_start = time.time() + if verbose: + print(f" → Step 2: Grep search in dependency files...", end='', flush=True) + + # First, try fast grep search for the image name + grep_candidates = find_repos_by_artifact_name(image_name, search_paths, + exclude_dirs) + + if verbose: + print(f" found {len(grep_candidates)} candidates ({time.time()-step_start:.1f}s)", flush=True) + + # If we have artifact info from Helm charts, also search for those names + if helm_artifacts: + if verbose: + print(f" → Step 2b: Grep search for Helm artifact names...", end='', flush=True) + step_start = time.time() + for helm_artifact in helm_artifacts: + artifact_name = helm_artifact.get('name', image_name) + additional_candidates = find_repos_by_artifact_name( + artifact_name, search_paths, exclude_dirs) + grep_candidates.extend(additional_candidates) + if verbose: + print(f" found {len(grep_candidates)} total ({time.time()-step_start:.1f}s)", flush=True) + + # Remove duplicates while preserving order + seen = set() + unique_grep_candidates = [] + for candidate in grep_candidates: + if candidate not in seen: + seen.add(candidate) + unique_grep_candidates.append(candidate) + + # Step 3: Verify grep candidates by extracting and matching artifacts + if unique_grep_candidates or helm_artifacts: + step_start = time.time() + if verbose: + print(f" → Step 3: Verifying {len(unique_grep_candidates)} grep candidates...", end='', flush=True) + + # First, check the grep candidates (fast path) + for repo_path in unique_grep_candidates: + if not is_repository_directory(repo_path): + continue + + # Extract artifacts from this repo + repo_artifacts = extract_artifact_from_repo(repo_path) + + # Check if any repo artifact matches + artifact_match_found = False + if helm_artifacts and repo_artifacts: + # Match against Helm artifacts + for repo_artifact in repo_artifacts: + for helm_artifact in helm_artifacts: + if matches_artifact(repo_artifact, helm_artifact): + # Strong match based on artifact metadata + depth = len(repo_path.parts) - len(search_paths[0].parts) if search_paths else 0 + candidates.append(( + repo_path, depth, 1.0, repo_artifact + )) + artifact_match_found = True + break + if artifact_match_found: + break + + # If found via grep but no artifact match, still add it + if not artifact_match_found: + # Check if name matches + if matches_image_name(repo_path.name, image_name, 
repo_name_hint): + depth = len(repo_path.parts) - len(search_paths[0].parts) if search_paths else 0 + # Grep-based name match (good confidence) + score = 0.7 if repo_artifacts else 0.6 + candidates.append(( + repo_path, depth, score, repo_artifacts[0] if repo_artifacts else None + )) + + if verbose: + print(f" {len(candidates)} matches ({time.time()-step_start:.1f}s)", flush=True) + + # Also do recursive search for completeness (but prioritize grep results) + step_start = time.time() + if verbose: + print(f" → Step 3b: Recursive search (depth 5) in {len(search_paths)} paths...", end='', flush=True) + + for search_path in search_paths: + if not search_path.exists(): + continue + + # Search recursively for repos with matching artifacts + try: + def search_recursive(path: Path, current_depth: int, max_depth: int): + """Recursively search directories up to max_depth.""" + if current_depth > max_depth: + return + + try: + for item in path.iterdir(): + if not item.is_dir(): + continue + + # Skip excluded directories + if item.name in exclude_dirs: + continue + + # Check if any parent is excluded + if any(part in exclude_dirs for part in item.parts): + continue + + # Skip if already found via grep + if item in unique_grep_candidates: + continue + + # Check if it's a repository + if not is_repository_directory(item): + # Recurse deeper + if current_depth < max_depth: + search_recursive(item, current_depth + 1, max_depth) + continue + + # Check if name matches (always check this) + name_matches = matches_image_name(item.name, image_name, + repo_name_hint) + + if name_matches: + # Extract artifacts to check for bonus score + repo_artifacts = extract_artifact_from_repo(item) + + # Check if any repo artifact matches Helm artifacts + artifact_match_found = False + if helm_artifacts and repo_artifacts: + for repo_artifact in repo_artifacts: + for helm_artifact in helm_artifacts: + if matches_artifact(repo_artifact, + helm_artifact): + # Strong match: name + artifact + candidates.append(( + item, current_depth, 1.0, + repo_artifact + )) + artifact_match_found = True + break + if artifact_match_found: + break + + # If no artifact match, still add based on name + if not artifact_match_found: + # Name match only (lower score) + candidates.append(( + item, current_depth, 0.5, None + )) + + # Recurse deeper + if current_depth < max_depth: + search_recursive(item, current_depth + 1, max_depth) + except (PermissionError, OSError): + pass + + # Search up to 5 levels deep (handles nested monorepos) + search_recursive(search_path, 0, 5) + except (PermissionError, OSError): + continue + + if verbose: + print(f" {len(candidates)} total candidates ({time.time()-step_start:.1f}s)", flush=True) + + # Step 4: Additional name-based search if still no candidates + if not candidates: + step_start = time.time() + if verbose: + print(f" → Step 4: Fallback name-based search...", end='', flush=True) + + for search_path in search_paths: + if not search_path.exists(): + continue + + # First, try direct children (most common case) + try: + for item in search_path.iterdir(): + if not item.is_dir(): + continue + + # Skip excluded directories + if item.name in exclude_dirs: + continue + + # Check if it matches and is a repository + if matches_image_name(item.name, image_name, repo_name_hint): + if is_repository_directory(item): + candidates.append(( + item, 0, 0.0, None + )) # depth, score, artifact + except (PermissionError, OSError): + continue + + # Then try recursive search (limited depth for performance) + try: + def 
search_recursive_fallback(path: Path, current_depth: int, + max_depth: int): + """Recursively search directories up to max_depth.""" + if current_depth > max_depth: + return + + try: + for item in path.iterdir(): + if not item.is_dir(): + continue + + # Skip excluded directories + if item.name in exclude_dirs: + continue + + # Check if any parent is excluded + if any(part in exclude_dirs for part in item.parts): + continue + + # Check if it matches and is a repository + if matches_image_name(item.name, image_name, + repo_name_hint): + if is_repository_directory(item): + candidates.append(( + item, current_depth, 0.0, None + )) + + # Recurse deeper + if current_depth < max_depth: + search_recursive_fallback(item, + current_depth + 1, + max_depth) + except (PermissionError, OSError): + pass + + # Search up to 5 levels deep + search_recursive_fallback(search_path, 1, 5) + except (PermissionError, OSError): + continue + + if verbose: + print(f" {len(candidates)} candidates ({time.time()-step_start:.1f}s)", flush=True) + + # Rank and return best match + if candidates: + # Sort by score (high to low), then by depth (shallow first) + candidates.sort(key=lambda x: (-x[2], x[1], len(x[0].parts))) + + best_repo_path, best_depth, best_score, best_artifact = candidates[0] + + # Build evidence dict + evidence = { + 'match_score': best_score, + 'search_depth': best_depth, + 'helm_charts_found': len(helm_charts), + 'helm_artifacts': helm_artifacts, + } + + # Add artifact evidence if available + if best_artifact: + evidence['matched_artifact'] = best_artifact + evidence['artifact_files'] = [] + # Find which files contained the artifact + for lang, dep_files in DEPENDENCY_FILES.items(): + for dep_file in dep_files: + file_path = best_repo_path / dep_file + if file_path.exists(): + try: + content = file_path.read_text(encoding='utf-8', errors='ignore') + artifact_name = best_artifact.get('name', '') + if artifact_name and artifact_name.lower() in content.lower(): + evidence['artifact_files'].append(str(file_path.relative_to(best_repo_path))) + except: + pass + + # Determine match method + if best_score >= 1.0: + evidence['match_method'] = 'artifact_match' + elif best_score >= 0.6: + evidence['match_method'] = 'grep_name_match' + elif best_score >= 0.3: + evidence['match_method'] = 'recursive_name_match' + else: + evidence['match_method'] = 'fallback_name_match' + + # Additional ranking for name-based candidates with same score + if candidates[0][2] < 1.0: # No perfect artifact match + # Use name similarity and git recency for final ranking + name_based_candidates = [(c[0], c[1]) for c in candidates] + ranked = rank_candidates(name_based_candidates, image_name) + if ranked and ranked[0][1] >= 0.3: + evidence['git_ranking_applied'] = True + evidence['name_similarity'] = ranked[0][1] + return (ranked[0][0], evidence) + + # Return the best candidate + if candidates[0][2] >= 0.3: # Minimum score threshold + return (best_repo_path, evidence) + + return (None, {}) + + +def has_dependency_files(repo_path: Path) -> bool: + """Check if a directory looks like a repository with dependency files.""" + for lang, files in DEPENDENCY_FILES.items(): + for dep_file in files: + if (repo_path / dep_file).exists(): + return True + return False + + +def find_dependency_files(repo_path: Path) -> Dict[str, List[str]]: + """Find all dependency files in a repository.""" + dependencies = {} + + for lang, files in DEPENDENCY_FILES.items(): + found_files = [] + for dep_file in files: + # Check exact match + if (repo_path / 
dep_file).exists(): + found_files.append(dep_file) + else: + # Check for glob patterns + if '*' in dep_file: + pattern = dep_file.replace('*', '**') + for match in repo_path.rglob(pattern): + if match.is_file(): + rel_path = str(match.relative_to(repo_path)) + if rel_path not in found_files: + found_files.append(rel_path) + + if found_files: + dependencies[lang] = found_files + + return dependencies + + +def find_dockerfiles(repo_path: Path) -> List[str]: + """Find all Dockerfiles in a repository.""" + dockerfiles = [] + + for pattern in DOCKERFILE_PATTERNS: + if '*' in pattern: + # Glob pattern + for match in repo_path.rglob(pattern): + if match.is_file(): + rel_path = str(match.relative_to(repo_path)) + if rel_path not in dockerfiles: + dockerfiles.append(rel_path) + else: + # Exact match + if (repo_path / pattern).exists(): + dockerfiles.append(pattern) + + # Also check common locations + common_locations = [ + 'Dockerfile', + 'docker/Dockerfile', + '.docker/Dockerfile', + 'build/Dockerfile', + ] + + for location in common_locations: + if (repo_path / location).exists() and location not in dockerfiles: + dockerfiles.append(location) + + return sorted(dockerfiles) + + +def get_git_remote_url(repo_path: Path) -> Optional[str]: + """ + Get the git remote URL for a repository. + + Returns the URL of the 'origin' remote, or None if not a git repo. + """ + try: + result = subprocess.run( + ['git', 'config', '--get', 'remote.origin.url'], + cwd=repo_path, + capture_output=True, + text=True, + timeout=5 + ) + if result.returncode == 0: + return result.stdout.strip() + except (subprocess.TimeoutExpired, subprocess.SubprocessError, FileNotFoundError): + pass + return None + + +def is_public_image(image: str) -> bool: + """Determine if an image is public (from public registries).""" + public_registries = [ + 'docker.io', + 'quay.io', + 'gcr.io', + 'k8s.gcr.io', + 'ghcr.io', + ] + + for registry in public_registries: + if image.startswith(registry): + return True + + return False + + +def get_repo_name_from_path(repo_path: Path) -> str: + """Extract repository name from path.""" + return repo_path.name + + +def main(): + """Main function to generate image-source-analysis.json.""" + parser = argparse.ArgumentParser( + description='Generate image-source-analysis.json from scan-summary.json', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +This script searches for repositories matching images from scan-summary.json in: +1. Recursively in ../ (excluding support folder) +2. Recursively in ../../ (excluding aleph-alpha folder) + """ + ) + parser.add_argument( + '--scan-summary', + type=str, + help='Path to scan-summary.json (default: ../k8s-images/scan-summary.json)' + ) + parser.add_argument( + '--output', + type=str, + help='Path to output file (default: image-source-analysis.json in script directory)' + ) + parser.add_argument( + '--verbose', '-v', + action='store_true', + help='Enable verbose output' + ) + + args = parser.parse_args() + + script_dir = Path(__file__).parent + support_dir = script_dir.parent.parent # ../.. 
from postgres-dependency-scanner + + # Input file + if args.scan_summary: + scan_summary_file = Path(args.scan_summary) + else: + # Default: scan-results/k8s-images/scan-summary.json + scan_summary_file = support_dir / 'scan-results' / 'k8s-images' / 'scan-summary.json' + + if not scan_summary_file.exists(): + print(f"Error: {scan_summary_file} not found", file=sys.stderr) + sys.exit(1) + + # Output file + if args.output: + output_file = Path(args.output) + else: + # Default: scan-results/postgres-analysis/image-source-analysis.json + output_file = support_dir / 'scan-results' / 'postgres-analysis' / 'image-source-analysis.json' + + verbose = args.verbose + + # Load scan summary + if verbose: + print(f"Loading scan summary from: {scan_summary_file}") + with open(scan_summary_file, 'r') as f: + scan_summary = json.load(f) + + # Collect all images + all_images = set() + all_images.update(scan_summary.get('successful_scans', [])) + all_images.update(scan_summary.get('failed_scans', [])) + all_images.update(scan_summary.get('skipped_scans', [])) + + print(f"Found {len(all_images)} unique images to analyze") + if verbose: + print(f" - Successful scans: {len(scan_summary.get('successful_scans', []))}") + print(f" - Failed scans: {len(scan_summary.get('failed_scans', []))}") + print(f" - Skipped scans: {len(scan_summary.get('skipped_scans', []))}") + print() + + # Define search paths + # 1. ../ (aleph-alpha directory, excluding support folder) + search_path_1 = support_dir.parent + # 2. ../../ (all repositories) + search_path_2 = support_dir.parent.parent + + # Only exclude support directory (where this script lives) + exclude_dirs = {'support'} + + # Define Helm chart repositories (searched first, prioritized) + helm_chart_repos = [] + helm_chart_base = search_path_2 # Start from repositories root + + # Common Helm chart repository patterns + helm_chart_patterns = [ + '**/pharia-ai-helm-chart', + '**/pharia-ai-prod-deployments', + '**/inference-helm-charts', + '**/catch-deployment', + '**/creance-helm-chart', + '**/pharia-studio-deployment', + '**/phariaos-deployment', + '**/*-deployment', + '**/*-helm-chart', + '**/*-helm-charts', + ] + + # Find Helm chart repositories + if verbose: + print(f"Searching for Helm chart repositories in {helm_chart_base}...") + sys.stdout.flush() + + helm_search_start = time.time() + if helm_chart_base.exists(): + for pattern_idx, pattern in enumerate(helm_chart_patterns, 1): + if verbose: + print(f" [{pattern_idx}/{len(helm_chart_patterns)}] Searching pattern: {pattern}", end=' ', flush=True) + pattern_start = time.time() + try: + for helm_repo in helm_chart_base.rglob(pattern): + if (helm_repo.is_dir() and + ((helm_repo / 'Chart.yaml').exists() or + any((helm_repo / f).exists() + for f in ['values.yaml', 'templates']))): + if helm_repo not in helm_chart_repos: + helm_chart_repos.append(helm_repo) + except (PermissionError, OSError): + pass + if verbose: + pattern_time = time.time() - pattern_start + print(f"({pattern_time:.2f}s, found {len(helm_chart_repos)} total)") + + helm_search_time = time.time() - helm_search_start + if verbose: + print(f"✓ Helm chart search completed in {helm_search_time:.2f}s\n") + + if verbose: + print(f"Search paths:") + print(f" 1. {search_path_1} (excluding: support)") + print(f" 2. 
{search_path_2} (excluding: support)") + print(f"\nHelm chart repositories (searched first): {len(helm_chart_repos)}") + if verbose and helm_chart_repos: + for helm_repo in helm_chart_repos[:10]: # Show first 10 + print(f" - {helm_repo}") + if len(helm_chart_repos) > 10: + print(f" ... and {len(helm_chart_repos) - 10} more") + print() + + # Process each image + results = [] + found_count = 0 + not_found_count = 0 + + start_time = time.time() + + for idx, image in enumerate(sorted(all_images), 1): + image_start = time.time() + + if verbose: + print(f"[{idx}/{len(all_images)}] Processing: {image}...", end=' ', flush=True) + else: + # Show progress for non-verbose mode + elapsed = time.time() - start_time + avg_time = elapsed / idx if idx > 0 else 0 + remaining = (len(all_images) - idx) * avg_time + print(f"[{idx}/{len(all_images)} | {elapsed:.1f}s | ~{remaining:.0f}s left] {image[:50]}...", end=' ', flush=True) + + # Check if public + is_public = is_public_image(image) + + # Try to find repository + repo_path, evidence = find_repository_for_image( + image, + [search_path_1, search_path_2], + exclude_dirs, + helm_chart_repos, + verbose=verbose + ) + + if repo_path: + image_time = time.time() - image_start + if verbose: + print(f"✓ Found: {repo_path.name} ({image_time:.2f}s)") + else: + print(f"✓ {repo_path.name} ({image_time:.1f}s)") + found_count += 1 + + # Analyze repository + dependencies = find_dependency_files(repo_path) + dockerfiles = find_dockerfiles(repo_path) + repo_name = get_repo_name_from_path(repo_path) + git_remote_url = get_git_remote_url(repo_path) + + results.append({ + 'image': image, + 'repo': repo_name, + 'repo_path': str(repo_path.resolve()), + 'git_remote_url': git_remote_url, + 'status': 'found', + 'is_public': is_public, + 'dependencies': dependencies, + 'dockerfiles': dockerfiles, + 'evidence': evidence + }) + else: + image_time = time.time() - image_start + if verbose: + print(f"✗ Not found ({image_time:.2f}s)") + else: + print(f"✗ ({image_time:.1f}s)") + not_found_count += 1 + + # Still add to results with status 'not_found' + results.append({ + 'image': image, + 'repo': extract_image_name(image), + 'repo_path': None, + 'status': 'not_found', + 'is_public': is_public, + 'dependencies': {}, + 'dockerfiles': [], + }) + + # Create output structure + output_data = { + 'results': results + } + + # Write output + total_time = time.time() - start_time + print(f"\n{'='*80}") + print("SUMMARY") + print(f"{'='*80}") + print(f"Total images: {len(all_images)}") + print(f"Repositories found: {found_count}") + print(f"Repositories not found: {not_found_count}") + print(f"Total time: {total_time:.1f}s ({total_time/len(all_images):.2f}s per image)") + print(f"\nWriting results to: {output_file}") + + with open(output_file, 'w') as f: + json.dump(output_data, f, indent=2) + + print(f"✓ Successfully generated: {output_file}") + + +if __name__ == '__main__': + main()
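+
+
+# Illustrative sketch of the structure written to image-source-analysis.json.
+# The field values below are hypothetical examples; only the key layout mirrors
+# the result dicts assembled in main() above.
+#
+# {
+#   "results": [
+#     {
+#       "image": "harbor.management-prod01.stackit.run/pharia-data/data:v0.52.2",
+#       "repo": "pharia-data",
+#       "repo_path": "/home/user/repos/pharia-data",
+#       "git_remote_url": "git@example.com:org/pharia-data.git",
+#       "status": "found",
+#       "is_public": false,
+#       "dependencies": {"python": ["pyproject.toml"]},
+#       "dockerfiles": ["Dockerfile"],
+#       "evidence": {"match_method": "artifact_match", "match_score": 1.0}
+#     }
+#   ]
+# }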