diff --git a/.dockerignore b/.dockerignore
index ebeb9a09..7b2e3a90 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -38,3 +38,7 @@
 .venv/lib/python3.12/site-packages/debugpy/_vendored/pydevd/pydevd_attach_to_process/linux_and_mac/attach_linux_x86.o
 .venv/lib/python3.12/site-packages/debugpy/_vendored/pydevd/pydevd_attach_to_process/linux_and_mac/attach_linux_x86_64.o
 fly.toml
+
+node_modules
+**/node_modules
+morphik.toml
\ No newline at end of file
diff --git a/Dockerfile.gpu b/Dockerfile.gpu
new file mode 100644
index 00000000..6326c29a
--- /dev/null
+++ b/Dockerfile.gpu
@@ -0,0 +1,217 @@
+# syntax=docker/dockerfile:1
+
+# Purpose: Production Docker image for Morphik Core
+# This file builds the official Morphik image that gets published to ghcr.io
+# It includes all dependencies and uses start_server.py which reads config from morphik.toml
+# Used by: GitHub Actions (docker-publish.yml) and developers building locally
+
+# Build stage (CUDA devel image so we have nvcc & headers for flash-attn)
+FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS builder
+
+# Install uv
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+# Set working directory
+WORKDIR /app
+
+# Install build dependencies and Python 3.11 (to match main image / lockfile)
+RUN DEBIAN_FRONTEND=noninteractive apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    software-properties-common \
+    curl \
+    git \
+    gcc \
+    g++ \
+    cmake && \
+    DEBIAN_FRONTEND=noninteractive add-apt-repository ppa:deadsnakes/ppa && \
+    DEBIAN_FRONTEND=noninteractive apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    python3.11 \
+    python3.11-dev \
+    python3.11-venv \
+    python3.11-distutils \
+    python3-pip && \
+    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install Rust using the simpler method
+RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
+# Activating cargo env for this RUN instruction and subsequent ones in this stage.
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Set uv environment variables
+ENV UV_LINK_MODE=copy
+ENV UV_CACHE_DIR=/root/.cache/uv
+ENV VIRTUAL_ENV=/app/.venv
+ENV PATH="/app/.venv/bin:${PATH}"
+
+# Copy project definition and lock file
+COPY pyproject.toml uv.lock ./
+COPY fde ./fde
+COPY morphik_rust ./morphik_rust
+
+# Create venv and install dependencies from lockfile (excluding the project itself initially for better caching)
+# This also creates the /app/.venv directory
+# morphik-rust is a required dependency - Rust toolchain installed above
+RUN --mount=type=cache,target=${UV_CACHE_DIR} \
+    uv sync --verbose --locked --no-install-project
+
+# Copy the rest of the application code
+# Assuming start_server.py is at the root or handled by pyproject.toml structure.
+COPY . .
+
+# Copy the UI component (including it in the image for optional use)
+# This ensures the UI is available when users want to enable it
+COPY ee/ui-component /app/ee/ui-component
+
+# Install the project itself into the venv in non-editable mode
+RUN --mount=type=cache,target=${UV_CACHE_DIR} \
+    uv sync --verbose --locked --no-editable
+
+# Install flash-attn AFTER the final sync to ensure it's not removed
+# This requires access to CUDA toolchain (nvcc) available in this builder image
+RUN --mount=type=cache,target=${UV_CACHE_DIR} \
+    uv pip install "flash-attn==2.8.3" --no-build-isolation
+
+# Install GCC 11 and set it as default (Ubuntu 22.04 base image already has proper repos)
+RUN DEBIAN_FRONTEND=noninteractive apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y gcc-11 g++-11 && \
+    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 && \
+    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100
+
+# Production stage
+FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04
+
+# Set working directory
+WORKDIR /app
+
+# Install runtime dependencies
+# Note: tesseract-ocr removed - docling uses rapidocr (pure Python) instead
+# LibreOffice needed for ColPali processing of Office docs (docx/xlsx/pptx -> PDF -> images)
+RUN DEBIAN_FRONTEND=noninteractive apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    software-properties-common \
+    ffmpeg \
+    libsm6 \
+    libxext6 \
+    libmagic1 \
+    postgresql-client \
+    poppler-utils \
+    gcc \
+    g++ \
+    cmake \
+    git \
+    libreoffice-writer \
+    libreoffice-calc \
+    libreoffice-impress && \
+    DEBIAN_FRONTEND=noninteractive add-apt-repository ppa:deadsnakes/ppa && \
+    DEBIAN_FRONTEND=noninteractive apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    python3.11 \
+    python3.11-venv \
+    python3.11-distutils \
+    python3-pip && \
+    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
+    rm -rf /var/lib/apt/lists/*
+
+# Copy the virtual environment from the builder stage
+COPY --from=builder /app/.venv /app/.venv
+# Copy uv binaries from the builder stage
+COPY --from=builder /bin/uv /bin/uv
+COPY --from=builder /bin/uvx /bin/uvx
+
+## copy fde package to avoid error at server startup
+COPY --from=builder /app/fde ./fde
+
+## copy morphik_rust package (path dependency in pyproject.toml)
+COPY --from=builder /app/morphik_rust ./morphik_rust
+
+# Create necessary directories
+RUN mkdir -p storage logs
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+ENV VIRTUAL_ENV=/app/.venv
+ENV PATH="/app/.venv/bin:/usr/local/bin:${PATH}"
+
+# Ensure the virtualenv Python interpreter path is valid in this runtime image
+# This prevents uv from deleting the prebuilt .venv and recreating it at container start.
+RUN ln -sf /usr/bin/python3.11 /app/.venv/bin/python
+
+# Create default configuration
+COPY morphik.docker.toml /app/morphik.toml.default
+
+# Create startup script
+RUN echo '#!/bin/bash\n\
+set -e\n\
+\n\
+# Copy default config if none exists\n\
+if [ ! -f /app/morphik.toml ]; then\n\
+    cp /app/morphik.toml.default /app/morphik.toml\n\
+fi\n\
+\n\
+# Function to check PostgreSQL\n\
+check_postgres() {\n\
+    if [ -n "$POSTGRES_URI" ]; then\n\
+        echo "Waiting for PostgreSQL..."\n\
+        max_retries=30\n\
+        retries=0\n\
+        until PGPASSWORD=$PGPASSWORD pg_isready -h postgres -U morphik -d morphik; do\n\
+            retries=$((retries + 1))\n\
+            if [ $retries -eq $max_retries ]; then\n\
+                echo "Error: PostgreSQL did not become ready in time"\n\
+                exit 1\n\
+            fi\n\
+            echo "Waiting for PostgreSQL... (Attempt $retries/$max_retries)"\n\
+            sleep 2\n\
+        done\n\
+        echo "PostgreSQL is ready!"\n\
+        \n\
+        # Verify database connection\n\
+        if ! PGPASSWORD=$PGPASSWORD psql -h postgres -U morphik -d morphik -c "SELECT 1" > /dev/null 2>&1; then\n\
+            echo "Error: Could not connect to PostgreSQL database"\n\
+            exit 1\n\
+        fi\n\
+        echo "PostgreSQL connection verified!"\n\
+    fi\n\
+}\n\
+\n\
+# Check PostgreSQL\n\
+check_postgres\n\
+\n\
+# Check if command arguments were passed ($# is the number of arguments)\n\
+if [ $# -gt 0 ]; then\n\
+    # If arguments exist, run them via uv so Python environment is consistent (e.g., "arq core.workers...")\n\
+    exec "$@"\n\
+else\n\
+    # Otherwise, execute the default command (uv run start_server.py)\n\
+    exec uv run start_server.py --skip-redis-check\n\
+fi\n\
+' > /app/docker-entrypoint.sh && chmod +x /app/docker-entrypoint.sh
+
+# Copy application code
+# pyproject.toml is needed for uv to identify the project context for `uv run`
+COPY pyproject.toml uv.lock ./
+
+## copy the fde package also to fix distribution not found error
+COPY fde ./fde
+
+COPY core ./core
+COPY ee ./ee
+COPY utils ./utils
+COPY README.md LICENSE ./
+# Assuming start_server.py is at the root of your project
+COPY start_server.py ./
+
+# Labels for the image
+LABEL org.opencontainers.image.title="Morphik Core"
+LABEL org.opencontainers.image.description="Morphik Core - A powerful document processing and retrieval system"
+LABEL org.opencontainers.image.source="https://github.com/morphik-org/morphik-core"
+LABEL org.opencontainers.image.version="1.0.0"
+LABEL org.opencontainers.image.licenses="MIT"
+
+# Expose port
+EXPOSE 8000
+
+# Set the entrypoint
+ENTRYPOINT ["/app/docker-entrypoint.sh"]
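The builder stage compiles flash-attn against the CUDA 12.4 devel toolchain, but the runtime stage ships only the CUDA runtime, so a broken wheel would surface only when model code first imports it. A minimal smoke test, assuming the image's venv is on PATH and `--gpus` is passed to `docker run` (the script name is hypothetical):

```python
# smoke_test_gpu.py -- hypothetical sanity check; run inside the built image, e.g.:
#   docker run --rm --gpus all <image> python smoke_test_gpu.py
import torch

# Fail fast if the container cannot see a GPU (missing --gpus or driver mismatch).
assert torch.cuda.is_available(), "CUDA runtime not visible inside the container"

import flash_attn  # wheel built with nvcc in the builder stage

print("torch", torch.__version__, "| flash-attn", flash_attn.__version__)
```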
diff --git a/core/config.py b/core/config.py
index c5635181..294c56aa 100644
--- a/core/config.py
+++ b/core/config.py
@@ -110,6 +110,7 @@ class Settings(BaseSettings):
     STORAGE_PATH: Optional[str] = None
     AWS_REGION: Optional[str] = None
     S3_BUCKET: Optional[str] = None
+    S3_ENDPOINT_URL: Optional[str] = None
     S3_UPLOAD_CONCURRENCY: int = 16
     CACHE_ENABLED: bool = False
     CACHE_MAX_BYTES: int = 10 * 1024 * 1024 * 1024
@@ -307,6 +308,7 @@ def get_settings() -> Settings:
                 {
                     "STORAGE_PROVIDER": config["storage"]["provider"],
                     "STORAGE_PATH": config["storage"]["storage_path"],
+                    "S3_ENDPOINT_URL": config["storage"].get("endpoint_url"),
                 }
             )
             upload_conc = config["storage"].get("s3_upload_concurrency", 16)
@@ -325,6 +327,7 @@ def get_settings() -> Settings:
                 {
                     "S3_BUCKET": config["storage"]["bucket_name"],
                     "AWS_ACCESS_KEY": os.environ["AWS_ACCESS_KEY"],
                     "AWS_SECRET_ACCESS_KEY": os.environ["AWS_SECRET_ACCESS_KEY"],
+                    "S3_ENDPOINT_URL": config["storage"].get("endpoint_url"),
                 }
            )
        case "aws-s3":
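Since `endpoint_url` is optional (`S3_ENDPOINT_URL` defaults to `None`) and is also read in the local-storage branch, the lookup uses `.get()`, matching the existing `s3_upload_concurrency` handling: a `morphik.toml` written before this change has no `endpoint_url` key, and direct indexing would raise `KeyError` at startup. A minimal sketch of the difference, using a hypothetical inline config:

```python
import tomllib  # stdlib TOML parser, Python 3.11+

# Hypothetical pre-existing config without the new key.
config = tomllib.loads('[storage]\nprovider = "local"\nstorage_path = "./storage"\n')

print(config["storage"].get("endpoint_url"))  # None -> the Settings default applies
try:
    config["storage"]["endpoint_url"]  # direct indexing
except KeyError:
    print("KeyError: endpoint_url missing from [storage]")
```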
diff --git a/core/database/postgres_database.py b/core/database/postgres_database.py
index 64bf3959..d3300fb7 100644
--- a/core/database/postgres_database.py
+++ b/core/database/postgres_database.py
@@ -2023,7 +2023,7 @@ async def upsert_chat_history(
     ) -> bool:
         """Store or update chat history."""
         try:
-            now = datetime.now(UTC).isoformat()
+            now = datetime.now(UTC)
 
             # Auto-generate title from first user message if not provided
             if title is None and history:
@@ -2038,39 +2038,34 @@
                     break
 
             async with self.async_session() as session:
-                # Check if conversation exists to determine if we need to preserve existing title
+                # Check if conversation exists
                 result = await session.execute(
-                    text("SELECT title FROM chat_conversations WHERE conversation_id = :cid"), {"cid": conversation_id}
+                    select(ChatConversationModel).where(ChatConversationModel.conversation_id == conversation_id)
                 )
-                existing = result.fetchone()
-
-                # If conversation exists and has a title, preserve it unless a new title is provided
-                if existing and existing[0] and title is None:
-                    title = existing[0]
+                existing_convo = result.scalar_one_or_none()
+
+                if existing_convo:
+                    # Update existing conversation
+                    existing_convo.user_id = user_id
+                    existing_convo.app_id = app_id
+                    existing_convo.history = history
+                    # Preserve existing title if no new title is provided
+                    if title is not None:
+                        existing_convo.title = title
+                    existing_convo.updated_at = now
+                else:
+                    # Create new conversation
+                    new_convo = ChatConversationModel(
+                        conversation_id=conversation_id,
+                        user_id=user_id,
+                        app_id=app_id,
+                        history=history,
+                        title=title,
+                        created_at=now,
+                        updated_at=now,
+                    )
+                    session.add(new_convo)
 
-                await session.execute(
-                    text(
-                        """
-                        INSERT INTO chat_conversations (conversation_id, user_id, app_id, history, title, created_at, updated_at)
-                        VALUES (:cid, :uid, :aid, :hist, :title, CAST(:now AS TEXT), CAST(:now AS TEXT))
-                        ON CONFLICT (conversation_id)
-                        DO UPDATE SET
-                            user_id = EXCLUDED.user_id,
-                            app_id = EXCLUDED.app_id,
-                            history = EXCLUDED.history,
-                            title = COALESCE(EXCLUDED.title, chat_conversations.title),
-                            updated_at = CAST(:now AS TEXT)
-                        """
-                    ),
-                    {
-                        "cid": conversation_id,
-                        "uid": user_id,
-                        "aid": app_id,
-                        "hist": json.dumps(history),
-                        "title": title,
-                        "now": now,
-                    },
-                )
                 await session.commit()
                 return True
         except Exception as e:
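Two representation changes ride along with the ORM rewrite above: `now` stays a timezone-aware `datetime` instead of an `isoformat()` string (the old raw-SQL path CAST it to TEXT), and `history` is assigned as a Python object instead of going through `json.dumps`. A small illustration of the timestamp half, assuming `ChatConversationModel` maps `created_at`/`updated_at` to timezone-aware datetime columns:

```python
from datetime import UTC, datetime  # the UTC alias requires Python 3.11+

now = datetime.now(UTC)
print(now.tzinfo)       # UTC -> the value carries its timezone
print(now.isoformat())  # e.g. "2025-01-01T12:00:00.000000+00:00"

# SQLAlchemy binds a datetime natively to a DateTime column; storing
# now.isoformat() instead would require TEXT columns, as the old SQL did.
```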
"service_name": "s3", + "aws_access_key_id": aws_access_key, + "aws_secret_access_key": aws_secret_key, + "region_name": region_name, + "config": boto_cfg, + } + if endpoint_url: + client_args["endpoint_url"] = endpoint_url + + self.s3_client = boto3.client(**client_args) + # Cap concurrent uploads to avoid overwhelming the pool/S3 while still allowing parallelism. self._upload_semaphore = asyncio.Semaphore(max(1, upload_concurrency)) diff --git a/core/vector_store/fast_multivector_store.py b/core/vector_store/fast_multivector_store.py index d5af3975..0e9f5266 100644 --- a/core/vector_store/fast_multivector_store.py +++ b/core/vector_store/fast_multivector_store.py @@ -288,9 +288,9 @@ def _init_chunk_storage(self) -> Tuple[BaseStorage, Optional[str]]: provider = settings.STORAGE_PROVIDER storage_path = settings.STORAGE_PATH or "./storage" bucket = (settings.S3_BUCKET or MULTIVECTOR_CHUNKS_BUCKET) if provider == "aws-s3" else "" - + endpoint_url = settings.S3_ENDPOINT_URL logger.info("Initializing %s storage for chunk payloads", provider) - storage = self._create_storage(provider, storage_path=storage_path, default_bucket=bucket) + storage = self._create_storage(provider, storage_path=storage_path, default_bucket=bucket, endpoint_url=endpoint_url) # Track meta for later reuse decisions self.chunk_storage_provider = provider @@ -304,7 +304,7 @@ def _init_vector_storage(self) -> Tuple[BaseStorage, Optional[str]]: provider = settings.STORAGE_PROVIDER storage_path = settings.STORAGE_PATH or "./storage" bucket = (settings.S3_BUCKET or MULTIVECTOR_CHUNKS_BUCKET) if provider == "aws-s3" else "" - + endpoint_url = settings.S3_ENDPOINT_URL # Reuse chunk storage instance when configuration matches if provider == getattr(self, "chunk_storage_provider", None) and ( (provider == "local" and storage_path == getattr(self, "chunk_storage_path", None)) @@ -316,12 +316,12 @@ def _init_vector_storage(self) -> Tuple[BaseStorage, Optional[str]]: return self.chunk_storage, getattr(self, "chunk_bucket", None) logger.info("Initializing %s storage for vector tensors", provider) - storage = self._create_storage(provider, storage_path=storage_path, default_bucket=bucket) + storage = self._create_storage(provider, storage_path=storage_path, default_bucket=bucket, endpoint_url=endpoint_url) resolved_bucket = bucket if provider == "aws-s3" else "" return storage, resolved_bucket def _create_storage( - self, provider: str, *, storage_path: Optional[str], default_bucket: Optional[str] + self, provider: str, *, storage_path: Optional[str], default_bucket: Optional[str], endpoint_url: Optional[str] ) -> BaseStorage: """Factory helper to instantiate storage implementations.""" settings = get_settings() @@ -334,6 +334,7 @@ def _create_storage( aws_secret_key=settings.AWS_SECRET_ACCESS_KEY, region_name=settings.AWS_REGION, default_bucket=default_bucket or MULTIVECTOR_CHUNKS_BUCKET, + endpoint_url=endpoint_url, upload_concurrency=settings.S3_UPLOAD_CONCURRENCY, ) case "local": diff --git a/core/vector_store/multi_vector_store.py b/core/vector_store/multi_vector_store.py index 05e2f8df..de5bcbac 100644 --- a/core/vector_store/multi_vector_store.py +++ b/core/vector_store/multi_vector_store.py @@ -116,6 +116,7 @@ def _init_storage(self) -> BaseStorage: aws_secret_key=settings.AWS_SECRET_ACCESS_KEY, region_name=settings.AWS_REGION, default_bucket=MULTIVECTOR_CHUNKS_BUCKET, + endpoint_url=settings.S3_ENDPOINT_URL, upload_concurrency=settings.S3_UPLOAD_CONCURRENCY, ) else: diff --git a/core/workers/ingestion_worker.py 
diff --git a/core/workers/ingestion_worker.py b/core/workers/ingestion_worker.py
index fe8c6a3d..7fa1ed23 100644
--- a/core/workers/ingestion_worker.py
+++ b/core/workers/ingestion_worker.py
@@ -1039,8 +1039,13 @@ async def startup(ctx):
             aws_secret_key=settings.AWS_SECRET_ACCESS_KEY,
             region_name=settings.AWS_REGION,
             default_bucket=settings.S3_BUCKET,
+            endpoint_url=settings.S3_ENDPOINT_URL,
             upload_concurrency=settings.S3_UPLOAD_CONCURRENCY,
         )
+        if settings.S3_ENDPOINT_URL:
+            logger.info("Initialized S3Storage with endpoint URL: %s", settings.S3_ENDPOINT_URL)
+        else:
+            logger.info("Initialized S3Storage with default endpoint URL")
     else:
         raise ValueError(f"Unsupported storage provider: {settings.STORAGE_PROVIDER}")
     ctx["storage"] = storage
diff --git a/scripts/migrate_multivector_to_external_storage.py b/scripts/migrate_multivector_to_external_storage.py
index 74f77447..f9bb5c2b 100644
--- a/scripts/migrate_multivector_to_external_storage.py
+++ b/scripts/migrate_multivector_to_external_storage.py
@@ -62,6 +62,7 @@ def _init_storage(self) -> BaseStorage:
                 aws_secret_key=settings.AWS_SECRET_ACCESS_KEY,
                 region_name=settings.AWS_REGION,
                 default_bucket=MULTIVECTOR_CHUNKS_BUCKET,
+                endpoint_url=settings.S3_ENDPOINT_URL,
             )
         else:
             # logger.info("Initializing local storage for multi-vector chunks")
diff --git a/start_server.py b/start_server.py
index dad551a7..7578772c 100644
--- a/start_server.py
+++ b/start_server.py
@@ -242,13 +242,17 @@ def get_ollama_usage_info():
 
 
 def main():
+    # Get default log level from environment variable if available
+    env_log_level = os.getenv("LOG_LEVEL", "").lower()
+    default_log_level = env_log_level if env_log_level in ["debug", "info", "warning", "error"] else "info"
+
     # Parse command line arguments
     parser = argparse.ArgumentParser(description="Start the Morphik server")
     parser.add_argument(
         "--log",
         choices=["debug", "info", "warning", "error"],
-        default="info",
-        help="Set the logging level",
+        default=default_log_level,
+        help=f"Set the logging level (default: {default_log_level}, can be set via LOG_LEVEL env var)",
     )
     parser.add_argument(
         "--skip-ollama-check",
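For reference, the resulting log-level precedence is: explicit `--log` flag, then the `LOG_LEVEL` environment variable, then `info`. A standalone sketch of the same pattern:

```python
import argparse
import os

env_log_level = os.getenv("LOG_LEVEL", "").lower()
default = env_log_level if env_log_level in ["debug", "info", "warning", "error"] else "info"

parser = argparse.ArgumentParser()
parser.add_argument("--log", choices=["debug", "info", "warning", "error"], default=default)

print(parser.parse_args([]).log)                  # LOG_LEVEL (or "info") supplies the default
print(parser.parse_args(["--log", "error"]).log)  # an explicit flag always wins -> "error"
```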