141 changes: 141 additions & 0 deletions alembic/versions/0003_paper_harvest_tables.py
@@ -0,0 +1,141 @@
"""paper harvest tables

Revision ID: 0003_paper_harvest_tables
Revises: 0002_research_eval_runs
Create Date: 2026-02-06

Adds:
- papers: harvested paper metadata with multi-source deduplication
- harvest_runs: harvest execution tracking and audit
"""

from __future__ import annotations

import sqlalchemy as sa
from alembic import context, op

revision = "0003_paper_harvest_tables"
down_revision = "0002_research_eval_runs"
branch_labels = None
depends_on = None


def _is_offline() -> bool:
    try:
        return bool(context.is_offline_mode())
    except Exception:
        return False


def _insp():
    return sa.inspect(op.get_bind())


def _has_table(name: str) -> bool:
    return _insp().has_table(name)


def _get_indexes(table: str) -> set[str]:
    idx = set()
    for i in _insp().get_indexes(table):
        idx.add(str(i.get("name") or ""))
    return idx


def _create_index(name: str, table: str, cols: list[str]) -> None:
    if _is_offline():
        op.create_index(name, table, cols)
        return
    if name in _get_indexes(table):
        return
    op.create_index(name, table, cols)


def upgrade() -> None:
    if _is_offline():
        _upgrade_create_tables()
        return
    _upgrade_create_tables()
    _upgrade_create_indexes()
Comment on lines +54 to +59

⚠️ Potential issue | 🟠 Major

Indexes are skipped in offline mode — generated SQL will be incomplete.

When _is_offline() is True, upgrade() returns early after creating tables and never calls _upgrade_create_indexes(). Since _create_index already handles offline mode (lines 46-48 call op.create_index unconditionally), the indexes should be emitted in the offline-generated SQL as well.

Proposed fix
 def upgrade() -> None:
-    if _is_offline():
-        _upgrade_create_tables()
-        return
     _upgrade_create_tables()
     _upgrade_create_indexes()
🤖 Prompt for AI Agents
In `@alembic/versions/0003_paper_harvest_tables.py` around lines 54-59, the
upgrade() function returns early when _is_offline() is true, so
_upgrade_create_indexes() is never called and generated offline SQL omits index
creation; remove the early return or restructure upgrade() so it always calls
_upgrade_create_indexes() after _upgrade_create_tables() (rely on
_create_index/op.create_index which already handles offline mode) — modify
upgrade() to invoke _upgrade_create_tables() then _upgrade_create_indexes()
regardless of _is_offline() so indexes are emitted in offline-generated SQL.
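
As a quick sanity check once the early return is removed, something like the sketch below could render the migration as offline SQL without touching a database and let you confirm the CREATE INDEX statements appear. This is illustrative only, not part of the PR; it assumes an alembic.ini at the repository root and uses Alembic's revision-range syntax for offline upgrades.

# Illustrative sketch (not part of this PR): render the migration as offline SQL
# and check the output for the CREATE INDEX statements.
# Assumes alembic.ini lives at the repository root; adjust the path if it does not.
from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")
# sql=True puts Alembic in offline mode and prints SQL instead of executing it.
# Offline upgrades that do not start at base take a "start:end" revision range.
command.upgrade(cfg, "0002_research_eval_runs:0003_paper_harvest_tables", sql=True)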



def _upgrade_create_tables() -> None:
    # Papers table - harvested paper metadata
    if _is_offline() or not _has_table("papers"):
        op.create_table(
            "papers",
            sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
            # Canonical identifiers (for deduplication)
            sa.Column("doi", sa.String(length=256), nullable=True),
            sa.Column("arxiv_id", sa.String(length=64), nullable=True),
            sa.Column("semantic_scholar_id", sa.String(length=64), nullable=True),
            sa.Column("openalex_id", sa.String(length=64), nullable=True),
Comment on lines +69 to +72


medium

The lengths for doi, arxiv_id, semantic_scholar_id, and openalex_id columns in the papers table seem arbitrary. It's best practice to define these lengths based on the maximum possible length of these identifiers from their respective sources to prevent truncation or unnecessary over-allocation. For example, DOIs can be up to 256 characters, arXiv IDs are typically shorter (e.g., 10-15 chars plus optional version), Semantic Scholar IDs are UUID-like (32-36 chars), and OpenAlex IDs are also UUID-like. Consider reviewing the actual maximum lengths from the sources to set more precise limits.
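
For illustration only, tighter bounds might look roughly like the following. The concrete numbers are assumptions based on typical identifier formats, not values confirmed against the upstream APIs or the project's models.py, and would need to be verified before changing the migration.

            # Assumed lengths for illustration; verify against the source APIs and models.py.
            sa.Column("doi", sa.String(length=255), nullable=True),                 # no hard spec limit; 255 is a common practical cap
            sa.Column("arxiv_id", sa.String(length=32), nullable=True),             # e.g. "2402.01234v2" or legacy "math.GT/0309136"
            sa.Column("semantic_scholar_id", sa.String(length=40), nullable=True),  # 40-char hex paper IDs
            sa.Column("openalex_id", sa.String(length=32), nullable=True),          # e.g. "W2741809807"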

sa.Column("title_hash", sa.String(length=64), nullable=False),
# Core metadata
sa.Column("title", sa.Text(), nullable=False),
sa.Column("abstract", sa.Text(), server_default="", nullable=False),
sa.Column("authors_json", sa.Text(), server_default="[]", nullable=False),
sa.Column("year", sa.Integer(), nullable=True),
sa.Column("venue", sa.String(length=256), nullable=True),
sa.Column("publication_date", sa.String(length=32), nullable=True),
sa.Column("citation_count", sa.Integer(), server_default="0", nullable=False),
# URLs
sa.Column("url", sa.String(length=1024), nullable=True),
sa.Column("pdf_url", sa.String(length=1024), nullable=True),
# Classification
sa.Column("keywords_json", sa.Text(), server_default="[]", nullable=False),
sa.Column("fields_of_study_json", sa.Text(), server_default="[]", nullable=False),
# Source tracking
sa.Column("primary_source", sa.String(length=32), nullable=False),
sa.Column("sources_json", sa.Text(), server_default="[]", nullable=False),
Comment on lines +67 to +90

Copilot AI Feb 11, 2026


The Alembic table definitions don’t match the SQLAlchemy models for PaperModel (e.g. doi length 256 here vs 128 in the model; url/pdf_url 1024 here vs 512 in the model; arxiv_id 64 vs 32). This can cause migration drift and runtime issues across environments. Align the migration column sizes/types with models.py (and ensure unique constraints in the migration match unique=True in the model).
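
One way to catch this kind of drift mechanically is Alembic's autogenerate comparison; a minimal sketch follows. The model import path and database URL are assumptions for illustration, not names taken from this PR.

# Minimal drift check (illustrative): compare the live schema against the models.
import sqlalchemy as sa
from alembic.autogenerate import compare_metadata
from alembic.migration import MigrationContext

from app.models import Base  # hypothetical import path for the project's models

engine = sa.create_engine("sqlite:///dev.db")  # assumed development database URL
with engine.connect() as conn:
    ctx = MigrationContext.configure(conn)
    # Any entries here mean the migrated schema and the models disagree,
    # e.g. differing String lengths or a missing unique constraint.
    for diff in compare_metadata(ctx, Base.metadata):
        print(diff)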

            # Timestamps
            sa.Column("created_at", sa.DateTime(timezone=True), nullable=True),
            sa.Column("updated_at", sa.DateTime(timezone=True), nullable=True),
            sa.Column("deleted_at", sa.DateTime(timezone=True), nullable=True),
        )

    # Harvest runs table - execution tracking
    if _is_offline() or not _has_table("harvest_runs"):
        op.create_table(
            "harvest_runs",
            sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
            sa.Column("run_id", sa.String(length=64), unique=True, nullable=False),
            # Input parameters
            sa.Column("keywords_json", sa.Text(), server_default="[]", nullable=False),
            sa.Column("venues_json", sa.Text(), server_default="[]", nullable=False),
            sa.Column("sources_json", sa.Text(), server_default="[]", nullable=False),
            sa.Column("max_results_per_source", sa.Integer(), server_default="100", nullable=False),
            # Results
            sa.Column("status", sa.String(length=32), server_default="running", nullable=False),
            sa.Column("papers_found", sa.Integer(), server_default="0", nullable=False),
            sa.Column("papers_new", sa.Integer(), server_default="0", nullable=False),
            sa.Column("papers_deduplicated", sa.Integer(), server_default="0", nullable=False),
            sa.Column("error_json", sa.Text(), server_default="{}", nullable=False),
            # Timestamps
            sa.Column("started_at", sa.DateTime(timezone=True), nullable=True),
            sa.Column("ended_at", sa.DateTime(timezone=True), nullable=True),
        )


def _upgrade_create_indexes() -> None:
    # Papers indexes
    _create_index("ix_papers_doi", "papers", ["doi"])
    _create_index("ix_papers_arxiv_id", "papers", ["arxiv_id"])
    _create_index("ix_papers_semantic_scholar_id", "papers", ["semantic_scholar_id"])
    _create_index("ix_papers_openalex_id", "papers", ["openalex_id"])
    _create_index("ix_papers_title_hash", "papers", ["title_hash"])
    _create_index("ix_papers_year", "papers", ["year"])
    _create_index("ix_papers_venue", "papers", ["venue"])
    _create_index("ix_papers_citation_count", "papers", ["citation_count"])
    _create_index("ix_papers_primary_source", "papers", ["primary_source"])
    _create_index("ix_papers_created_at", "papers", ["created_at"])

    # Harvest runs indexes
    _create_index("ix_harvest_runs_run_id", "harvest_runs", ["run_id"])
    _create_index("ix_harvest_runs_status", "harvest_runs", ["status"])
    _create_index("ix_harvest_runs_started_at", "harvest_runs", ["started_at"])


def downgrade() -> None:
    op.drop_table("harvest_runs")
    op.drop_table("papers")