Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/paperbot/utils/log_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Add new log files here, then import them in Python:
# from paperbot.utils.logging_config import Logger, LogFiles
# Logger.info("message", file=LogFiles.HARVEST)

# Base directory for all logs (relative to project root)
base_dir: "logs"

# Log file paths (relative to base_dir)
files:
HARVEST: "harvest/harvest.log"
api: "api/api.log"
error: "errors/error.log"
database: "db/database.log"
Comment on lines +9 to +13

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The keys under files use mixed casing (HARVEST vs api, error, database). The corresponding Python code in logging_config.py suggests that these are meant to be accessed like constants (e.g., LogFiles.HARVEST), which by convention (PEP 8) should be uppercase. For consistency and improved readability, it's recommended to use uppercase for all keys.

files:
  HARVEST: "harvest/harvest.log"
  API: "api/api.log"
  ERROR: "errors/error.log"
  DATABASE: "db/database.log"

370 changes: 370 additions & 0 deletions src/paperbot/utils/logging_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,370 @@
# src/paperbot/utils/logging_config.py
"""
Centralized logging configuration for PaperBot.

Usage:
from paperbot.utils.logging_config import Logger

# Log to specific file with automatic line number
Logger.info("Processing started", file="harvest/harvest.log")
Logger.error("Failed to connect", file="errors/error.log")

# Log to default file (logs/paperbot.log)
Logger.info("General message")

Configuration via environment variables:
PAPERBOT_LOG_LEVEL: DEBUG, INFO, WARNING, ERROR, CRITICAL (default: INFO)
PAPERBOT_LOG_DIR: Base directory for log files (default: logs/)
PAPERBOT_LOG_MAX_BYTES: Max size per log file in bytes (default: 10MB)
PAPERBOT_LOG_BACKUP_COUNT: Number of backup files to keep (default: 5)
"""

from __future__ import annotations

import inspect
import os
import uuid
from contextvars import ContextVar
from datetime import datetime
from logging.handlers import RotatingFileHandler
from pathlib import Path
from typing import Optional

# Context variable for trace_id (thread-safe, async-safe)
_trace_id_var: ContextVar[Optional[str]] = ContextVar("trace_id", default=None)

# Default configuration
DEFAULT_LOG_LEVEL = "INFO"
DEFAULT_LOG_DIR = "logs"
DEFAULT_LOG_FILE = "paperbot.log"
DEFAULT_MAX_BYTES = 10 * 1024 * 1024 # 10 MB
DEFAULT_BACKUP_COUNT = 5
DEFAULT_FORMAT = "{timestamp} [{level}] [{trace_id}] {filename}:{lineno} - {message}"
DEFAULT_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
# Config file in the same directory as this module
LOG_CONFIG_FILE = Path(__file__).parent / "log_config.yaml"


class _LogFilesMeta(type):
"""Metaclass to allow attribute access like LogFiles.HARVEST."""

def __getattr__(cls, name: str) -> str:
cls._load()
# Try exact match first, then case-insensitive
if name in cls._files:
return cls._files[name]
key = name.lower()
if key in cls._files:
return cls._files[key]
raise AttributeError(f"Log file '{name}' not found in config")


class LogFiles(metaclass=_LogFilesMeta):
"""
Log file paths loaded from src/paperbot/utils/log_config.yaml.

Usage:
from paperbot.utils.logging_config import Logger, LogFiles

Logger.info("Message", file=LogFiles.HARVEST)
Logger.error("Error", file=LogFiles.ERROR)

To add a new log file:
1. Edit src/paperbot/utils/log_config.yaml
2. Add entry under 'files' section
3. Access via LogFiles.YOUR_NAME (uppercase)
"""

_loaded = False
_files: dict = {}

@classmethod
def _load(cls) -> None:
if cls._loaded:
return

# Default values
cls._files = {
"harvest": "harvest/harvest.log",
"api": "api/api.log",
"error": "errors/error.log",
}
Comment on lines +87 to +91

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The hardcoded default values for log files are incomplete. The log_config.yaml file includes a database log, but it's missing from these defaults. If the YAML file fails to load, any attempt to use LogFiles.DATABASE will result in an AttributeError. To make the fallback behavior more robust and consistent, the default values should include all expected log files.

Suggested change
cls._files = {
"harvest": "harvest/harvest.log",
"api": "api/api.log",
"error": "errors/error.log",
}
cls._files = {
"harvest": "harvest/harvest.log",
"api": "api/api.log",
"error": "errors/error.log",
"database": "db/database.log",
}


# Try to load from config file
try:
import yaml
config_path = LOG_CONFIG_FILE # Already a Path object
if config_path.exists():
with open(config_path, "r", encoding="utf-8") as f:
config = yaml.safe_load(f)
if config and "files" in config:
cls._files.update(config["files"])
except Exception:
pass
Comment on lines +102 to +103

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

This broad except Exception: pass will silently ignore all errors during the loading of log_config.yaml, including a missing pyyaml dependency or a malformed YAML file. This can make debugging configuration issues very difficult. It's better to at least log a warning to standard error when the configuration can't be loaded.

Suggested change
except Exception:
pass
except Exception as e:
import sys
print(f"WARNING: Failed to load log config from {LOG_CONFIG_FILE}, using defaults. Error: {e}", file=sys.stderr)


cls._loaded = True

@classmethod
def get(cls, name: str) -> str:
"""Get log file path by name."""
cls._load()
# Try exact match first, then case-insensitive
if name in cls._files:
return cls._files[name]
key = name.lower()
if key in cls._files:
return cls._files[key]
return f"{name}/{name}.log"

# Log levels
LOG_LEVELS = {
"DEBUG": 10,
"INFO": 20,
"WARNING": 30,
"ERROR": 40,
"CRITICAL": 50,
}

# Module-level state
_initialized = False
_config: dict = {}
_file_handlers: dict[str, RotatingFileHandler] = {}


def _get_config() -> dict:
"""Get logging configuration from environment variables."""
return {
"level": os.environ.get("PAPERBOT_LOG_LEVEL", DEFAULT_LOG_LEVEL).upper(),
"base_dir": os.environ.get("PAPERBOT_LOG_DIR", DEFAULT_LOG_DIR),
"max_bytes": int(os.environ.get("PAPERBOT_LOG_MAX_BYTES", DEFAULT_MAX_BYTES)),
"backup_count": int(os.environ.get("PAPERBOT_LOG_BACKUP_COUNT", DEFAULT_BACKUP_COUNT)),
}


def _ensure_dir(path: Path) -> None:
"""Ensure directory exists."""
path.mkdir(parents=True, exist_ok=True)


def _get_file_handler(file_path: str) -> RotatingFileHandler:
"""Get or create a file handler for the given path."""
if file_path not in _file_handlers:
path = Path(file_path)
_ensure_dir(path.parent)

handler = RotatingFileHandler(
filename=str(path),
maxBytes=_config.get("max_bytes", DEFAULT_MAX_BYTES),
backupCount=_config.get("backup_count", DEFAULT_BACKUP_COUNT),
encoding="utf-8",
)
_file_handlers[file_path] = handler

return _file_handlers[file_path]
Comment on lines +149 to +163

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

security-medium medium

The _file_handlers dictionary acts as an unbounded cache for file handlers. Since handlers are never removed or closed (except on global shutdown), providing dynamic file names to the logger can lead to a resource exhaustion attack (Denial of Service). An attacker could trigger the creation of thousands of file handlers, exhausting file descriptors and memory.



def _format_message(
level: str,
message: str,
filename: str,
lineno: int,
trace_id: Optional[str] = None,
) -> str:
"""Format a log message."""
timestamp = datetime.now().strftime(DEFAULT_DATE_FORMAT)
tid = trace_id or _trace_id_var.get() or "-"
return DEFAULT_FORMAT.format(
timestamp=timestamp,
level=level,
trace_id=tid,
filename=filename,
lineno=lineno,
message=message,
)


def _resolve_file_path(file: Optional[str]) -> str:
"""Resolve the full file path for logging."""
base_dir = _config.get("base_dir", DEFAULT_LOG_DIR)

if file is None:
return str(Path(base_dir) / DEFAULT_LOG_FILE)

# If file contains directory separator, use as relative path under base_dir
if "/" in file or "\\" in file:
return str(Path(base_dir) / file)

# Otherwise, just a filename in base_dir
return str(Path(base_dir) / file)
Comment on lines +186 to +198

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

security-high high

The _resolve_file_path function is vulnerable to path traversal. It joins the base_dir with the file argument without validating that the resulting path remains within the base_dir. If the file argument is an absolute path or contains traversal sequences like .., it can point to arbitrary files on the system. This could allow an attacker to overwrite sensitive files if they can influence the log file name.

Suggested change
def _resolve_file_path(file: Optional[str]) -> str:
"""Resolve the full file path for logging."""
base_dir = _config.get("base_dir", DEFAULT_LOG_DIR)
if file is None:
return str(Path(base_dir) / DEFAULT_LOG_FILE)
# If file contains directory separator, use as relative path under base_dir
if "/" in file or "\\" in file:
return str(Path(base_dir) / file)
# Otherwise, just a filename in base_dir
return str(Path(base_dir) / file)
def _resolve_file_path(file: Optional[str]) -> str:
"""Resolve the full file path for logging and prevent path traversal."""
base_dir = Path(_config.get("base_dir", DEFAULT_LOG_DIR)).resolve()
if file is None:
return str(base_dir / DEFAULT_LOG_FILE)
# Resolve the path and ensure it's under base_dir
target_path = (base_dir / file).resolve()
try:
target_path.relative_to(base_dir)
except ValueError:
# Fallback or raise error if path is outside base_dir
return str(base_dir / DEFAULT_LOG_FILE)
return str(target_path)



def _should_log(level: str) -> bool:
"""Check if message should be logged based on current level."""
current_level = _config.get("level", DEFAULT_LOG_LEVEL)
return LOG_LEVELS.get(level, 0) >= LOG_LEVELS.get(current_level, 0)


def _write_log(level: str, message: str, file: Optional[str] = None) -> None:
"""Write a log message to the specified file."""
if not _should_log(level):
return

# Get caller info (skip _write_log and the public method)
frame = inspect.currentframe()
caller_frame = frame.f_back.f_back if frame and frame.f_back else None
Comment on lines +213 to +214

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Using inspect.currentframe() to get caller information can have a significant performance impact, especially if logging is done frequently in tight loops. The standard logging module is highly optimized for this (often in C). While this approach works, it's important to be aware of the potential overhead. If performance becomes a concern, consider refactoring to leverage the standard logging module's more efficient mechanisms for capturing caller info (e.g., using %(filename)s and %(lineno)d in the format string).


if caller_frame:
filename = os.path.basename(caller_frame.f_code.co_filename)
lineno = caller_frame.f_lineno
else:
filename = "unknown"
lineno = 0

# Format message (trace_id is automatically obtained from context)
formatted = _format_message(level, message, filename, lineno)

# Get file path and handler
file_path = _resolve_file_path(file)
handler = _get_file_handler(file_path)

# Write to file
handler.stream.write(formatted + "\n")
handler.stream.flush()
Comment on lines +231 to +232

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

Writing directly to handler.stream bypasses the thread-safe mechanisms (i.e., locks) provided by Python's logging.Handler. In a multi-threaded application, this can lead to race conditions and result in garbled or interleaved log messages. To ensure thread safety, you should acquire the handler's lock before writing to the stream.

    handler.acquire()
    try:
        handler.stream.write(formatted + "\n")
        handler.stream.flush()
    finally:
        handler.release()



class Logger:
"""
Static logger class for logging to specific files.

Usage:
from paperbot.utils.logging_config import Logger

# Initialize once at application startup (optional, auto-initializes on first use)
Logger.init()

# Log to specific file
Logger.info("Processing started", file="harvest/harvest.log")
Logger.error("Failed", file="errors/error.log")

# Log to default file (logs/paperbot.log)
Logger.info("General message")
"""

@staticmethod
def init(
level: Optional[str] = None,
base_dir: Optional[str] = None,
max_bytes: Optional[int] = None,
backup_count: Optional[int] = None,
) -> None:
"""
Initialize the logging system.

Args:
level: Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
base_dir: Base directory for all log files
max_bytes: Maximum size of each log file before rotation
backup_count: Number of backup files to keep
"""
global _initialized, _config

if _initialized:
return

_config = _get_config()

if level:
_config["level"] = level.upper()
if base_dir:
_config["base_dir"] = base_dir
if max_bytes:
_config["max_bytes"] = max_bytes
if backup_count:
_config["backup_count"] = backup_count

_initialized = True

@staticmethod
def _ensure_init() -> None:
"""Ensure logger is initialized."""
if not _initialized:
Logger.init()

@staticmethod
def debug(message: str, file: Optional[str] = None) -> None:
"""Log a debug message."""
Logger._ensure_init()
_write_log("DEBUG", message, file)

@staticmethod
def info(message: str, file: Optional[str] = None) -> None:
"""Log an info message."""
Logger._ensure_init()
_write_log("INFO", message, file)

@staticmethod
def warning(message: str, file: Optional[str] = None) -> None:
"""Log a warning message."""
Logger._ensure_init()
_write_log("WARNING", message, file)

@staticmethod
def error(message: str, file: Optional[str] = None) -> None:
"""Log an error message."""
Logger._ensure_init()
_write_log("ERROR", message, file)

@staticmethod
def critical(message: str, file: Optional[str] = None) -> None:
"""Log a critical message."""
Logger._ensure_init()
_write_log("CRITICAL", message, file)

@staticmethod
def set_level(level: str) -> None:
"""Change the log level at runtime."""
Logger._ensure_init()
_config["level"] = level.upper()

@staticmethod
def close() -> None:
"""Close all file handlers."""
for handler in _file_handlers.values():
handler.close()
_file_handlers.clear()


# ============================================================================
# Trace ID Management
# ============================================================================

def generate_trace_id() -> str:
"""Generate a new trace ID."""
return f"req-{uuid.uuid4().hex[:12]}"


def set_trace_id(trace_id: Optional[str] = None) -> str:
"""
Set the trace ID for the current context.

If no trace_id is provided, generates a new one.
Returns the trace_id that was set.

Usage:
# At the start of a request handler
trace_id = set_trace_id()
Logger.info("Request started", file=LogFiles.HARVEST) # auto-includes trace_id
"""
tid = trace_id or generate_trace_id()
_trace_id_var.set(tid)
return tid


def get_trace_id() -> Optional[str]:
"""Get the current trace ID."""
return _trace_id_var.get()


def clear_trace_id() -> None:
"""Clear the current trace ID."""
_trace_id_var.set(None)
Loading