From 04e13269810e239345619c8a19f43a77dd88724a Mon Sep 17 00:00:00 2001 From: boyu Date: Tue, 10 Feb 2026 19:51:03 +0100 Subject: [PATCH 1/3] feat(Harvest): add -- Paper Search and Storage Closes #26 Signed-off-by: LIU BOYU --- alembic/versions/0003_paper_harvest_tables.py | 141 ++ docs/paper_harvest_v1_design.md | 2211 +++++++++++++++++ src/paperbot/api/main.py | 8 + src/paperbot/api/routes/harvest.py | 429 ++++ src/paperbot/api/routes/research.py | 72 +- .../application/ports/harvester_port.py | 50 + src/paperbot/application/services/__init__.py | 11 +- .../services/paper_deduplicator.py | 190 ++ .../application/services/query_rewriter.py | 151 ++ .../application/services/venue_recommender.py | 157 ++ .../application/workflows/harvest_pipeline.py | 376 +++ src/paperbot/context_engine/engine.py | 10 +- src/paperbot/domain/harvest.py | 160 ++ .../infrastructure/harvesters/__init__.py | 17 + .../harvesters/arxiv_harvester.py | 168 ++ .../harvesters/openalex_harvester.py | 212 ++ .../harvesters/semantic_scholar_harvester.py | 133 + src/paperbot/infrastructure/stores/models.py | 127 + .../infrastructure/stores/paper_store.py | 524 ++++ .../infrastructure/stores/research_store.py | 11 + tests/integration/test_harvest_pipeline.py | 537 ++++ tests/integration/test_harvesters.py | 478 ++++ tests/integration/test_paper_store.py | 580 +++++ tests/unit/test_harvested_paper.py | 328 +++ tests/unit/test_paper_deduplicator.py | 292 +++ tests/unit/test_query_rewriter.py | 136 + tests/unit/test_venue_recommender.py | 175 ++ web/package-lock.json | 8 - .../app/api/papers/[paperId]/save/route.ts | 20 + web/src/app/api/papers/library/route.ts | 7 + .../components/research/ResearchDashboard.tsx | 36 +- .../components/research/SavedPapersList.tsx | 108 +- web/src/lib/api.ts | 48 +- 33 files changed, 7817 insertions(+), 94 deletions(-) create mode 100644 alembic/versions/0003_paper_harvest_tables.py create mode 100644 docs/paper_harvest_v1_design.md create mode 100644 src/paperbot/api/routes/harvest.py create mode 100644 src/paperbot/application/ports/harvester_port.py create mode 100644 src/paperbot/application/services/paper_deduplicator.py create mode 100644 src/paperbot/application/services/query_rewriter.py create mode 100644 src/paperbot/application/services/venue_recommender.py create mode 100644 src/paperbot/application/workflows/harvest_pipeline.py create mode 100644 src/paperbot/domain/harvest.py create mode 100644 src/paperbot/infrastructure/harvesters/__init__.py create mode 100644 src/paperbot/infrastructure/harvesters/arxiv_harvester.py create mode 100644 src/paperbot/infrastructure/harvesters/openalex_harvester.py create mode 100644 src/paperbot/infrastructure/harvesters/semantic_scholar_harvester.py create mode 100644 tests/integration/test_harvest_pipeline.py create mode 100644 tests/integration/test_harvesters.py create mode 100644 tests/integration/test_paper_store.py create mode 100644 tests/unit/test_harvested_paper.py create mode 100644 tests/unit/test_paper_deduplicator.py create mode 100644 tests/unit/test_query_rewriter.py create mode 100644 tests/unit/test_venue_recommender.py create mode 100644 web/src/app/api/papers/[paperId]/save/route.ts create mode 100644 web/src/app/api/papers/library/route.ts diff --git a/alembic/versions/0003_paper_harvest_tables.py b/alembic/versions/0003_paper_harvest_tables.py new file mode 100644 index 0000000..ecf3803 --- /dev/null +++ b/alembic/versions/0003_paper_harvest_tables.py @@ -0,0 +1,141 @@ +"""paper harvest tables + +Revision ID: 
0003_paper_harvest_tables +Revises: 0002_research_eval_runs +Create Date: 2026-02-06 + +Adds: +- papers: harvested paper metadata with multi-source deduplication +- harvest_runs: harvest execution tracking and audit +""" + +from __future__ import annotations + +import sqlalchemy as sa +from alembic import context, op + +revision = "0003_paper_harvest_tables" +down_revision = "0002_research_eval_runs" +branch_labels = None +depends_on = None + + +def _is_offline() -> bool: + try: + return bool(context.is_offline_mode()) + except Exception: + return False + + +def _insp(): + return sa.inspect(op.get_bind()) + + +def _has_table(name: str) -> bool: + return _insp().has_table(name) + + +def _get_indexes(table: str) -> set[str]: + idx = set() + for i in _insp().get_indexes(table): + idx.add(str(i.get("name") or "")) + return idx + + +def _create_index(name: str, table: str, cols: list[str]) -> None: + if _is_offline(): + op.create_index(name, table, cols) + return + if name in _get_indexes(table): + return + op.create_index(name, table, cols) + + +def upgrade() -> None: + if _is_offline(): + _upgrade_create_tables() + return + _upgrade_create_tables() + _upgrade_create_indexes() + + +def _upgrade_create_tables() -> None: + # Papers table - harvested paper metadata + if _is_offline() or not _has_table("papers"): + op.create_table( + "papers", + sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), + # Canonical identifiers (for deduplication) + sa.Column("doi", sa.String(length=256), nullable=True), + sa.Column("arxiv_id", sa.String(length=64), nullable=True), + sa.Column("semantic_scholar_id", sa.String(length=64), nullable=True), + sa.Column("openalex_id", sa.String(length=64), nullable=True), + sa.Column("title_hash", sa.String(length=64), nullable=False), + # Core metadata + sa.Column("title", sa.Text(), nullable=False), + sa.Column("abstract", sa.Text(), server_default="", nullable=False), + sa.Column("authors_json", sa.Text(), server_default="[]", nullable=False), + sa.Column("year", sa.Integer(), nullable=True), + sa.Column("venue", sa.String(length=256), nullable=True), + sa.Column("publication_date", sa.String(length=32), nullable=True), + sa.Column("citation_count", sa.Integer(), server_default="0", nullable=False), + # URLs + sa.Column("url", sa.String(length=1024), nullable=True), + sa.Column("pdf_url", sa.String(length=1024), nullable=True), + # Classification + sa.Column("keywords_json", sa.Text(), server_default="[]", nullable=False), + sa.Column("fields_of_study_json", sa.Text(), server_default="[]", nullable=False), + # Source tracking + sa.Column("primary_source", sa.String(length=32), nullable=False), + sa.Column("sources_json", sa.Text(), server_default="[]", nullable=False), + # Timestamps + sa.Column("created_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("updated_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("deleted_at", sa.DateTime(timezone=True), nullable=True), + ) + + # Harvest runs table - execution tracking + if _is_offline() or not _has_table("harvest_runs"): + op.create_table( + "harvest_runs", + sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), + sa.Column("run_id", sa.String(length=64), unique=True, nullable=False), + # Input parameters + sa.Column("keywords_json", sa.Text(), server_default="[]", nullable=False), + sa.Column("venues_json", sa.Text(), server_default="[]", nullable=False), + sa.Column("sources_json", sa.Text(), server_default="[]", nullable=False), + sa.Column("max_results_per_source", 
sa.Integer(), server_default="100", nullable=False), + # Results + sa.Column("status", sa.String(length=32), server_default="running", nullable=False), + sa.Column("papers_found", sa.Integer(), server_default="0", nullable=False), + sa.Column("papers_new", sa.Integer(), server_default="0", nullable=False), + sa.Column("papers_deduplicated", sa.Integer(), server_default="0", nullable=False), + sa.Column("error_json", sa.Text(), server_default="{}", nullable=False), + # Timestamps + sa.Column("started_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("ended_at", sa.DateTime(timezone=True), nullable=True), + ) + + +def _upgrade_create_indexes() -> None: + # Papers indexes + _create_index("ix_papers_doi", "papers", ["doi"]) + _create_index("ix_papers_arxiv_id", "papers", ["arxiv_id"]) + _create_index("ix_papers_semantic_scholar_id", "papers", ["semantic_scholar_id"]) + _create_index("ix_papers_openalex_id", "papers", ["openalex_id"]) + _create_index("ix_papers_title_hash", "papers", ["title_hash"]) + _create_index("ix_papers_year", "papers", ["year"]) + _create_index("ix_papers_venue", "papers", ["venue"]) + _create_index("ix_papers_citation_count", "papers", ["citation_count"]) + _create_index("ix_papers_primary_source", "papers", ["primary_source"]) + _create_index("ix_papers_created_at", "papers", ["created_at"]) + + # Harvest runs indexes + _create_index("ix_harvest_runs_run_id", "harvest_runs", ["run_id"]) + _create_index("ix_harvest_runs_status", "harvest_runs", ["status"]) + _create_index("ix_harvest_runs_started_at", "harvest_runs", ["started_at"]) + + +def downgrade() -> None: + op.drop_table("harvest_runs") + op.drop_table("papers") diff --git a/docs/paper_harvest_v1_design.md b/docs/paper_harvest_v1_design.md new file mode 100644 index 0000000..2c54602 --- /dev/null +++ b/docs/paper_harvest_v1_design.md @@ -0,0 +1,2211 @@ +# Paper Collection & Resource Pool v1 - Technical Design Document + +> **Status**: Draft +> **Author**: Claude Code +> **Date**: 2026-02-03 +> **Estimated Effort**: 5-7 days (~40h) + +--- + +## 0. Architecture Context: Where v1 Fits + +v1 spans **three layers** of the PaperBot architecture, focusing on **Paper Harvesting, Storage, and Search capabilities**. 
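+
+For orientation, the finished v1 surface is reachable over HTTP through the endpoints specified in Section 2.9. The sketch below is illustrative only: it assumes the API is running locally on port 8000 and sends only fields defined by `PaperSearchRequest`; the exact response fields depend on `paper_to_dict()`, which is not shown in this document.
+
+```python
+import requests
+
+# Assumes the service from Section 2.9 is reachable at localhost:8000.
+resp = requests.post(
+    "http://localhost:8000/api/papers/search",
+    json={"query": "ransomware", "year_from": 2020, "limit": 20},
+    timeout=30,
+)
+resp.raise_for_status()
+for paper in resp.json()["papers"]:
+    # "title" is assumed to be part of the serialized paper payload.
+    print(paper["title"])
+```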
+ +### 0.1 PaperBot Architecture with Harvest Layer + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ PaperBot Standard Architecture │ +│ (Offline Ingestion → Storage → Online Retrieval → Generation → Feedback)│ +└─────────────────────────────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Layer 1 · Ingestion (Async) - HARVEST LAYER │ +│ ┌──────────────────────────────────────────────────────────────────────────┐ │ +│ │ ╔════════════════════════════════════════════════════════════════════╗ │ │ +│ │ ║ v1: Paper Harvesters ║ │ │ +│ │ ║ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ║ │ │ +│ │ ║ │ arXiv │ │ Semantic │ │ OpenAlex │ ║ │ │ +│ │ ║ │ Harvester │ │ Scholar │ │ Harvester │ ║ │ │ +│ │ ║ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ ║ │ │ +│ │ ║ └─────────────────┼─────────────────┘ ║ │ │ +│ │ ║ ▼ ║ │ │ +│ │ ║ ┌───────────────────────┐ ║ │ │ +│ │ ║ │ PaperDeduplicator │ ║ │ │ +│ │ ║ │ (DOI/Title/ID match) │ ║ │ │ +│ │ ║ └───────────┬───────────┘ ║ │ │ +│ │ ╚═══════════════════════════╪════════════════════════════════════════╝ │ │ +│ └─────────────────────────────┼────────────────────────────────────────────┘ │ +│ ▼ │ +└────────────────────────────────┼─────────────────────────────────────────────────┘ + │ +┌────────────────────────────────┼─────────────────────────────────────────────────┐ +│ Layer 2 · Storage │ +│ ┌─────────────────────────────▼────────────────────────────────────────────┐ │ +│ │ SQL 主库 (SQLite/Postgres) │ │ +│ │ ╔═══════════════════════════════════════════════════════════════════╗ │ │ +│ │ ║ v1: papers table (NEW) ║ │ │ +│ │ ║ - doi, arxiv_id, semantic_scholar_id, openalex_id ║ │ │ +│ │ ║ - title, abstract, authors, year, venue, citations ║ │ │ +│ │ ║ - title_hash (dedup), primary_source, sources_json ║ │ │ +│ │ ╠═══════════════════════════════════════════════════════════════════╣ │ │ +│ │ ║ v1: harvest_runs table (NEW) ║ │ │ +│ │ ║ - run_id, keywords, venues, status, papers_found/new/deduped ║ │ │ +│ │ ╚═══════════════════════════════════════════════════════════════════╝ │ │ +│ │ research_tracks / tasks / paper_feedback (existing) │ │ +│ └──────────────────────────────────────────────────────────────────────────┘ │ +└────────────────────────────────┼─────────────────────────────────────────────────┘ + │ +┌────────────────────────────────┼─────────────────────────────────────────────────┐ +│ Layer 3 · Retrieval (Online) │ +│ ┌─────────────────────────────▼────────────────────────────────────────────┐ │ +│ │ ╔═══════════════════════════════════════════════════════════════════╗ │ │ +│ │ ║ v1: PaperStore.search_papers() (NEW) ║ │ │ +│ │ ║ - Full-text search in title/abstract ║ │ │ +│ │ ║ - Filter by: keywords, venues, year range, citations, sources ║ │ │ +│ │ ║ - Sort by: citation_count, year, created_at ║ │ │ +│ │ ║ - Pagination with limit/offset (TopN) ║ │ │ +│ │ ╚═══════════════════════════════════════════════════════════════════╝ │ │ +│ │ ContextEngine / TrackRouter / Paper Searcher (existing) │ │ +│ └──────────────────────────────────────────────────────────────────────────┘ │ +└────────────────────────────────┼─────────────────────────────────────────────────┘ + │ +┌────────────────────────────────┼─────────────────────────────────────────────────┐ +│ Layer 4-5 · Generation & Feedback (Existing - No Changes) │ +│ ┌─────────────────────────────▼────────────────────────────────────────────┐ │ +│ │ PromptComposer → LLM → 
Output Parser → Paper Feedback │ │ +│ └──────────────────────────────────────────────────────────────────────────┘ │ +└──────────────────────────────────────────────────────────────────────────────────┘ + +Legend: + ╔═══╗ v1 Focus Area (Paper Harvest & Storage) + ╚═══╝ + ───▶ Data Flow +``` + +### 0.2 v1 Components Mapped to Architecture Layers + +| Layer | Component | v1 Deliverable | +|-------|-----------|----------------| +| **Layer 1: Ingestion** | Harvesters | ArxivHarvester, SemanticScholarHarvester, OpenAlexHarvester | +| | Query Services | VenueRecommender, QueryRewriter | +| | Deduplication | PaperDeduplicator (multi-strategy) | +| **Layer 2: Storage** | `papers` table | Paper metadata with multi-source IDs | +| | `harvest_runs` table | Harvest execution tracking | +| | PaperStore | SQLAlchemy repository implementation | +| **Layer 3: Retrieval** | Search API | Filter-based TopN retrieval | + +### 0.3 v1 Focus: Harvest Pipeline + +``` + ┌─────────────────────────────────────────┐ + │ v1: Harvest Pipeline │ + └─────────────────────────────────────────┘ + │ + ┌───────────────────────────┼───────────────────────────┐ + │ │ │ + ▼ ▼ ▼ +┌─────────────────────┐ ┌─────────────────────┐ ┌─────────────────────┐ +│ Query Services │ │ Harvesters │ │ Storage & Search │ +│ │ │ │ │ │ +│ - VenueRecommender │ │ - ArxivHarvester │ │ - PaperStore │ +│ keyword→venues │ │ - S2Harvester │ │ upsert/search │ +│ - QueryRewriter │ │ - OpenAlexHarvester│ │ - Deduplication │ +│ expand/synonyms │ │ │ │ DOI/title/ID │ +│ │ │ │ │ │ +└─────────────────────┘ └─────────────────────┘ └─────────────────────┘ + │ │ │ + └───────────────────────────┼───────────────────────────┘ + │ + ▼ + ┌─────────────────────────────────────────┐ + │ Implementation Artifacts │ + │ │ + │ src/paperbot/domain/harvest.py │ + │ src/paperbot/infrastructure/harvesters/│ + │ src/paperbot/infrastructure/stores/ │ + │ src/paperbot/application/services/ │ + │ src/paperbot/api/routes/harvest.py │ + └─────────────────────────────────────────┘ +``` + +### 0.4 Data Flow with v1 Touch Points + +``` + USER INPUT + │ + ┌─────────────────────────────┼─────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────┐ │ + │ │ POST /api/harvest │ │ + │ │ keywords: ["ransomware", "machine learning"] │ │ + │ │ venues: ["USENIX Security", "CCS"] (optional) │ │ + │ │ year_from: 2020, year_to: 2024 (optional) │ │ + │ └─────────────────────────────────────────────────────┘ │ + │ │ │ + └─────────────────────────────┼─────────────────────────────┘ + │ + QUERY SERVICES + │ + ┌─────────────────────────────┼─────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────┐ │ + │ │ VenueRecommender.recommend() │ │ + │ │ ╔═══════════════════════════════════════════════╗ │ │ + │ │ ║ v1: keyword→venue mapping from config ║ │ │ + │ │ ║ "ransomware" → security: [CCS, S&P, USENIX] ║ │ │ + │ │ ╚═══════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────┘ │ + │ │ │ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────┐ │ + │ │ QueryRewriter.rewrite() │ │ + │ │ ╔═══════════════════════════════════════════════╗ │ │ + │ │ ║ v1: abbreviation expansion + synonyms ║ │ │ + │ │ ║ "ML" → "machine learning" ║ │ │ + │ │ ║ "LLM" → "large language model" ║ │ │ + │ │ ╚═══════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────┘ │ + │ │ + └─────────────────────────────┼─────────────────────────────┘ + │ + 
PARALLEL HARVEST + │ + ┌─────────────────────────────┼─────────────────────────────┐ + │ ┌──────────────┼──────────────┐ │ + │ ▼ ▼ ▼ │ + │ ┌────────────────┐ ┌────────────────┐ ┌────────────────┐ │ + │ │ ArxivHarvester │ │ S2Harvester │ │OpenAlexHarvest │ │ + │ │ ╔════════════╗ │ │ ╔════════════╗ │ │ ╔════════════╗ │ │ + │ │ ║ v1: Atom ║ │ │ ║ v1: REST ║ │ │ ║ v1: REST ║ │ │ + │ │ ║ XML API ║ │ │ ║ API wrap ║ │ │ ║ API (new) ║ │ │ + │ │ ╚════════════╝ │ │ ╚════════════╝ │ │ ╚════════════╝ │ │ + │ └───────┬────────┘ └───────┬────────┘ └───────┬────────┘ │ + │ └──────────────────┼──────────────────┘ │ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────┐ │ + │ │ List[HarvestedPaper] (unified format) │ │ + │ └─────────────────────────────────────────────────────┘ │ + │ │ + └─────────────────────────────┼─────────────────────────────┘ + │ + DEDUPLICATION + │ + ┌─────────────────────────────┼─────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────┐ │ + │ │ PaperDeduplicator.deduplicate() │ │ + │ │ ╔═══════════════════════════════════════════════╗ │ │ + │ │ ║ v1: Multi-strategy matching (priority order): ║ │ │ + │ │ ║ 1. DOI (canonical, most reliable) ║ │ │ + │ │ ║ 2. arXiv ID ║ │ │ + │ │ ║ 3. Semantic Scholar ID ║ │ │ + │ │ ║ 4. OpenAlex ID ║ │ │ + │ │ ║ 5. Normalized title hash (fallback) ║ │ │ + │ │ ╚═══════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────┘ │ + │ │ + └─────────────────────────────┼─────────────────────────────┘ + │ + STORAGE + │ + ┌─────────────────────────────┼─────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────┐ │ + │ │ PaperStore.upsert_papers_batch() │ │ + │ │ ╔═══════════════════════════════════════════════╗ │ │ + │ │ ║ v1: Atomic upsert with dedup at DB level ║ │ │ + │ │ ║ - Unique constraints on DOI, arxiv_id, etc. ║ │ │ + │ │ ║ - Merge metadata from duplicates ║ │ │ + │ │ ║ - Track sources that returned each paper ║ │ │ + │ │ ╚═══════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────┘ │ + │ │ + └─────────────────────────────┼─────────────────────────────┘ + │ + RETRIEVAL + │ + ┌─────────────────────────────┼─────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────┐ │ + │ │ POST /api/papers/search │ │ + │ │ ╔═══════════════════════════════════════════════╗ │ │ + │ │ ║ v1: Filter-based search with TopN ║ │ │ + │ │ ║ - Full-text: title LIKE '%query%' ║ │ │ + │ │ ║ - Filters: year, venue, citations, source ║ │ │ + │ │ ║ - Sort: citation_count DESC (default) ║ │ │ + │ │ ║ - Pagination: limit=50, offset=0 ║ │ │ + │ │ ╚═══════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────┘ │ + │ │ + └───────────────────────────────────────────────────────────┘ +``` + +--- + +## 1. Executive Summary + +**Objective**: Build a stable pipeline for "keywords → recommend venues → pull papers → store → search", enabling paper collection from 3 open sources with deduplication and filter-based retrieval. + +**Current State**: +- ArxivConnector exists (XML parsing only, no search) +- SemanticScholarClient exists (async API wrapper) +- No unified harvester interface +- No persistent paper storage +- No deduplication across sources + +**Scope**: This document covers the v1 deliverables: +1. Unified harvester interface and 3 implementations +2. Paper storage with multi-strategy deduplication +3. 
Query services (VenueRecommender, QueryRewriter) +4. API endpoints for harvest and search + +**Non-Goals (deferred to v2)**: +- PDF downloading and parsing +- Full-text search (FTS5/Elasticsearch) +- Embedding-based semantic search +- Authenticated sources (IEEE, ACM) + +--- + +## 2. Technical Solution Design + +### 2.1 Domain Models + +#### 2.1.1 HarvestedPaper (Unified Format) + +**File**: `src/paperbot/domain/harvest.py` + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `title` | str | Yes | Paper title | +| `abstract` | str | No | Paper abstract | +| `authors` | List[str] | No | Author names | +| `doi` | str | No | Digital Object Identifier | +| `arxiv_id` | str | No | arXiv identifier (e.g., 2301.12345) | +| `semantic_scholar_id` | str | No | S2 paper ID | +| `openalex_id` | str | No | OpenAlex work ID | +| `year` | int | No | Publication year | +| `venue` | str | No | Conference/journal name | +| `publication_date` | str | No | ISO date string | +| `citation_count` | int | No | Number of citations | +| `url` | str | No | Paper URL | +| `pdf_url` | str | No | PDF URL (metadata only, no download) | +| `keywords` | List[str] | No | Author keywords | +| `fields_of_study` | List[str] | No | Research fields | +| `source` | HarvestSource | Yes | Which harvester found this | +| `source_rank` | int | No | Position in source results | + +```python +@dataclass +class HarvestedPaper: + title: str + source: HarvestSource + abstract: str = "" + authors: List[str] = field(default_factory=list) + doi: Optional[str] = None + arxiv_id: Optional[str] = None + semantic_scholar_id: Optional[str] = None + openalex_id: Optional[str] = None + year: Optional[int] = None + venue: Optional[str] = None + publication_date: Optional[str] = None + citation_count: int = 0 + url: Optional[str] = None + pdf_url: Optional[str] = None + keywords: List[str] = field(default_factory=list) + fields_of_study: List[str] = field(default_factory=list) + source_rank: Optional[int] = None +``` + +#### 2.1.2 HarvestSource Enum + +```python +class HarvestSource(str, Enum): + ARXIV = "arxiv" + SEMANTIC_SCHOLAR = "semantic_scholar" + OPENALEX = "openalex" +``` + +#### 2.1.3 HarvestResult + +```python +@dataclass +class HarvestResult: + """Result from a single harvester.""" + source: HarvestSource + papers: List[HarvestedPaper] + total_found: int + error: Optional[str] = None + +@dataclass +class HarvestRunResult: + """Aggregated result from all harvesters.""" + run_id: str + status: str # running/success/partial/failed + papers_found: int + papers_new: int + papers_deduplicated: int + source_results: Dict[HarvestSource, HarvestResult] + started_at: datetime + ended_at: Optional[datetime] = None +``` + +### 2.2 Database Schema + +#### 2.2.1 papers Table (NEW) + +**File**: `alembic/versions/0003_paper_harvest_tables.py` + +```sql +CREATE TABLE papers ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + + -- Canonical identifiers (for deduplication) + doi TEXT UNIQUE, + arxiv_id TEXT UNIQUE, + semantic_scholar_id TEXT UNIQUE, + openalex_id TEXT UNIQUE, + title_hash TEXT NOT NULL, -- SHA256 of normalized title + + -- Core metadata + title TEXT NOT NULL, + abstract TEXT DEFAULT '', + authors_json TEXT DEFAULT '[]', + year INTEGER, + venue TEXT, + publication_date TEXT, + citation_count INTEGER DEFAULT 0, + + -- URLs (no PDF download, just references) + url TEXT, + pdf_url TEXT, + + -- Classification + keywords_json TEXT DEFAULT '[]', + fields_of_study_json TEXT DEFAULT '[]', + + -- Source tracking + 
primary_source TEXT NOT NULL, -- First source that found this paper + sources_json TEXT DEFAULT '[]', -- All sources that returned this paper + + -- Timestamps + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + deleted_at TIMESTAMP -- Soft delete +); + +-- Indexes +CREATE INDEX idx_papers_doi ON papers(doi); +CREATE INDEX idx_papers_arxiv_id ON papers(arxiv_id); +CREATE INDEX idx_papers_title_hash ON papers(title_hash); +CREATE INDEX idx_papers_year ON papers(year); +CREATE INDEX idx_papers_venue ON papers(venue); +CREATE INDEX idx_papers_citation_count ON papers(citation_count); +CREATE INDEX idx_papers_created_at ON papers(created_at); +``` + +#### 2.2.2 harvest_runs Table (NEW) + +```sql +CREATE TABLE harvest_runs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id TEXT UNIQUE NOT NULL, + + -- Input + keywords_json TEXT DEFAULT '[]', + venues_json TEXT DEFAULT '[]', + sources_json TEXT DEFAULT '[]', + max_results_per_source INTEGER DEFAULT 100, + + -- Results + status TEXT DEFAULT 'running', -- running/success/partial/failed + papers_found INTEGER DEFAULT 0, + papers_new INTEGER DEFAULT 0, + papers_deduplicated INTEGER DEFAULT 0, + error_json TEXT DEFAULT '{}', + + -- Timestamps + started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + ended_at TIMESTAMP +); + +CREATE INDEX idx_harvest_runs_run_id ON harvest_runs(run_id); +CREATE INDEX idx_harvest_runs_status ON harvest_runs(status); +CREATE INDEX idx_harvest_runs_started_at ON harvest_runs(started_at); +``` + +#### 2.2.3 SQLAlchemy Models + +**File**: `src/paperbot/infrastructure/stores/models.py` (additions) + +```python +class PaperModel(Base): + __tablename__ = "papers" + + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + + # Canonical identifiers + doi: Mapped[Optional[str]] = mapped_column(String(128), unique=True, nullable=True, index=True) + arxiv_id: Mapped[Optional[str]] = mapped_column(String(64), unique=True, nullable=True, index=True) + semantic_scholar_id: Mapped[Optional[str]] = mapped_column(String(64), unique=True, nullable=True, index=True) + openalex_id: Mapped[Optional[str]] = mapped_column(String(64), unique=True, nullable=True, index=True) + title_hash: Mapped[str] = mapped_column(String(64), nullable=False, index=True) + + # Core metadata + title: Mapped[str] = mapped_column(Text, nullable=False) + abstract: Mapped[str] = mapped_column(Text, default="") + authors_json: Mapped[str] = mapped_column(Text, default="[]") + year: Mapped[Optional[int]] = mapped_column(Integer, nullable=True, index=True) + venue: Mapped[Optional[str]] = mapped_column(String(256), nullable=True, index=True) + publication_date: Mapped[Optional[str]] = mapped_column(String(32), nullable=True) + citation_count: Mapped[int] = mapped_column(Integer, default=0, index=True) + + # URLs + url: Mapped[Optional[str]] = mapped_column(String(512), nullable=True) + pdf_url: Mapped[Optional[str]] = mapped_column(String(512), nullable=True) + + # Classification + keywords_json: Mapped[str] = mapped_column(Text, default="[]") + fields_of_study_json: Mapped[str] = mapped_column(Text, default="[]") + + # Source tracking + primary_source: Mapped[str] = mapped_column(String(32), nullable=False) + sources_json: Mapped[str] = mapped_column(Text, default="[]") + + # Timestamps + created_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True, index=True) + updated_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + 
deleted_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + + +class HarvestRunModel(Base): + __tablename__ = "harvest_runs" + + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + run_id: Mapped[str] = mapped_column(String(64), unique=True, nullable=False, index=True) + + # Input + keywords_json: Mapped[str] = mapped_column(Text, default="[]") + venues_json: Mapped[str] = mapped_column(Text, default="[]") + sources_json: Mapped[str] = mapped_column(Text, default="[]") + max_results_per_source: Mapped[int] = mapped_column(Integer, default=100) + + # Results + status: Mapped[str] = mapped_column(String(32), default="running", index=True) + papers_found: Mapped[int] = mapped_column(Integer, default=0) + papers_new: Mapped[int] = mapped_column(Integer, default=0) + papers_deduplicated: Mapped[int] = mapped_column(Integer, default=0) + error_json: Mapped[str] = mapped_column(Text, default="{}") + + # Timestamps + started_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True, index=True) + ended_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) +``` + +### 2.3 Harvester Interface + +**File**: `src/paperbot/application/ports/harvester_port.py` + +```python +from typing import Protocol, runtime_checkable, Optional, List +from paperbot.domain.harvest import HarvestSource, HarvestResult + +@runtime_checkable +class HarvesterPort(Protocol): + """Abstract interface for all paper harvesters.""" + + @property + def source(self) -> HarvestSource: + """Return the harvest source identifier.""" + ... + + async def search( + self, + query: str, + *, + max_results: int = 100, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + venues: Optional[List[str]] = None, + ) -> HarvestResult: + """ + Search for papers matching the query. + + Args: + query: Search query string + max_results: Maximum number of results to return + year_from: Filter papers published on or after this year + year_to: Filter papers published on or before this year + venues: Filter papers from these venues (if supported) + + Returns: + HarvestResult with papers and metadata + """ + ... + + async def close(self) -> None: + """Release resources (HTTP sessions, etc.).""" + ... +``` + +### 2.4 Harvester Implementations + +#### 2.4.1 ArxivHarvester + +**File**: `src/paperbot/infrastructure/harvesters/arxiv_harvester.py` + +```python +class ArxivHarvester: + """ + arXiv paper harvester using the Atom API. + + API: https://export.arxiv.org/api/query + Rate limit: 1 request per 3 seconds (be conservative) + """ + + ARXIV_API_URL = "https://export.arxiv.org/api/query" + REQUEST_INTERVAL = 3.0 # seconds between requests + + def __init__(self, connector: ArxivConnector): + self.connector = connector + self._session: Optional[aiohttp.ClientSession] = None + + @property + def source(self) -> HarvestSource: + return HarvestSource.ARXIV + + async def search( + self, + query: str, + *, + max_results: int = 100, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + venues: Optional[List[str]] = None, # Not supported by arXiv + ) -> HarvestResult: + """ + Search arXiv using the Atom API. 
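+
+        Results are requested in relevance order. The arXiv API has no venue
+        filter, so the ``venues`` argument is accepted for interface parity
+        but ignored.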
+ + Query syntax: https://arxiv.org/help/api/user-manual#query_details + """ + # Build query with year filters if provided + search_query = self._build_query(query, year_from, year_to) + + params = { + "search_query": search_query, + "start": 0, + "max_results": max_results, + "sortBy": "relevance", + "sortOrder": "descending", + } + + try: + async with self._get_session().get(self.ARXIV_API_URL, params=params) as resp: + xml_text = await resp.text() + + records = self.connector.parse_atom(xml_text) + papers = [self._record_to_paper(r, rank=i) for i, r in enumerate(records)] + + return HarvestResult( + source=self.source, + papers=papers, + total_found=len(papers), + ) + except Exception as e: + return HarvestResult( + source=self.source, + papers=[], + total_found=0, + error=str(e), + ) + + def _record_to_paper(self, record: ArxivRecord, rank: int) -> HarvestedPaper: + """Convert ArxivRecord to HarvestedPaper.""" + # Extract arxiv_id from full URL (e.g., "http://arxiv.org/abs/2301.12345v1") + arxiv_id = record.arxiv_id.split("/")[-1].split("v")[0] + + # Extract year from published date + year = None + if record.published: + try: + year = int(record.published[:4]) + except ValueError: + pass + + return HarvestedPaper( + title=record.title, + source=HarvestSource.ARXIV, + abstract=record.summary, + authors=record.authors, + arxiv_id=arxiv_id, + year=year, + publication_date=record.published, + url=record.abs_url, + pdf_url=record.pdf_url, + source_rank=rank, + ) +``` + +#### 2.4.2 SemanticScholarHarvester + +**File**: `src/paperbot/infrastructure/harvesters/semantic_scholar_harvester.py` + +```python +class SemanticScholarHarvester: + """ + Semantic Scholar paper harvester. + + API: https://api.semanticscholar.org/graph/v1/paper/search + Rate limit: 100 req/min (with API key), 5000/day without key + """ + + FIELDS = [ + "paperId", "title", "abstract", "year", "venue", + "citationCount", "authors", "publicationDate", + "externalIds", "fieldsOfStudy", "url", "openAccessPdf" + ] + + def __init__(self, client: SemanticScholarClient): + self.client = client + + @property + def source(self) -> HarvestSource: + return HarvestSource.SEMANTIC_SCHOLAR + + async def search( + self, + query: str, + *, + max_results: int = 100, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + venues: Optional[List[str]] = None, + ) -> HarvestResult: + """Search Semantic Scholar API.""" + try: + # S2 API supports year filter in query + year_filter = "" + if year_from and year_to: + year_filter = f" year:{year_from}-{year_to}" + elif year_from: + year_filter = f" year:{year_from}-" + elif year_to: + year_filter = f" year:-{year_to}" + + results = await self.client.search_papers( + query=query + year_filter, + limit=max_results, + fields=self.FIELDS, + ) + + papers = [self._to_paper(r, rank=i) for i, r in enumerate(results)] + + # Filter by venue if specified + if venues: + venue_set = {v.lower() for v in venues} + papers = [p for p in papers if p.venue and p.venue.lower() in venue_set] + + return HarvestResult( + source=self.source, + papers=papers, + total_found=len(papers), + ) + except Exception as e: + return HarvestResult( + source=self.source, + papers=[], + total_found=0, + error=str(e), + ) + + def _to_paper(self, data: Dict[str, Any], rank: int) -> HarvestedPaper: + """Convert S2 API response to HarvestedPaper.""" + authors = [a.get("name", "") for a in data.get("authors", [])] + external_ids = data.get("externalIds", {}) or {} + + pdf_url = None + if data.get("openAccessPdf"): + pdf_url = 
data["openAccessPdf"].get("url") + + return HarvestedPaper( + title=data.get("title", ""), + source=HarvestSource.SEMANTIC_SCHOLAR, + abstract=data.get("abstract") or "", + authors=authors, + doi=external_ids.get("DOI"), + arxiv_id=external_ids.get("ArXiv"), + semantic_scholar_id=data.get("paperId"), + year=data.get("year"), + venue=data.get("venue"), + publication_date=data.get("publicationDate"), + citation_count=data.get("citationCount", 0), + url=data.get("url"), + pdf_url=pdf_url, + fields_of_study=data.get("fieldsOfStudy") or [], + source_rank=rank, + ) +``` + +#### 2.4.3 OpenAlexHarvester + +**File**: `src/paperbot/infrastructure/harvesters/openalex_harvester.py` + +```python +class OpenAlexHarvester: + """ + OpenAlex paper harvester. + + API: https://docs.openalex.org/api-entities/works + Rate limit: 10 req/s (polite pool with email), 100K/day + """ + + OPENALEX_API_URL = "https://api.openalex.org/works" + REQUEST_INTERVAL = 0.1 # 10 req/s + + def __init__(self, email: Optional[str] = None): + self.email = email # For polite pool + self._session: Optional[aiohttp.ClientSession] = None + + @property + def source(self) -> HarvestSource: + return HarvestSource.OPENALEX + + async def search( + self, + query: str, + *, + max_results: int = 100, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + venues: Optional[List[str]] = None, + ) -> HarvestResult: + """Search OpenAlex API.""" + params = { + "search": query, + "per_page": min(max_results, 200), # API max is 200 + "sort": "cited_by_count:desc", + } + + # Add email for polite pool + if self.email: + params["mailto"] = self.email + + # Build filter string + filters = [] + if year_from: + filters.append(f"publication_year:>={year_from}") + if year_to: + filters.append(f"publication_year:<={year_to}") + if filters: + params["filter"] = ",".join(filters) + + try: + async with self._get_session().get(self.OPENALEX_API_URL, params=params) as resp: + data = await resp.json() + + results = data.get("results", []) + papers = [self._to_paper(r, rank=i) for i, r in enumerate(results)] + + # Filter by venue if specified + if venues: + venue_set = {v.lower() for v in venues} + papers = [p for p in papers if p.venue and p.venue.lower() in venue_set] + + return HarvestResult( + source=self.source, + papers=papers, + total_found=data.get("meta", {}).get("count", len(papers)), + ) + except Exception as e: + return HarvestResult( + source=self.source, + papers=[], + total_found=0, + error=str(e), + ) + + def _to_paper(self, data: Dict[str, Any], rank: int) -> HarvestedPaper: + """Convert OpenAlex API response to HarvestedPaper.""" + # Extract authors + authors = [] + for authorship in data.get("authorships", []): + author = authorship.get("author", {}) + if author.get("display_name"): + authors.append(author["display_name"]) + + # Extract identifiers + ids = data.get("ids", {}) + doi = ids.get("doi", "").replace("https://doi.org/", "") if ids.get("doi") else None + openalex_id = ids.get("openalex", "").replace("https://openalex.org/", "") + + # Extract venue + venue = None + if data.get("primary_location"): + source = data["primary_location"].get("source") or {} + venue = source.get("display_name") + + # Extract PDF URL + pdf_url = None + if data.get("open_access", {}).get("oa_url"): + pdf_url = data["open_access"]["oa_url"] + + return HarvestedPaper( + title=data.get("title", ""), + source=HarvestSource.OPENALEX, + abstract=self._get_abstract(data), + authors=authors, + doi=doi, + openalex_id=openalex_id, + 
year=data.get("publication_year"), + venue=venue, + publication_date=data.get("publication_date"), + citation_count=data.get("cited_by_count", 0), + url=data.get("doi") or ids.get("openalex"), + pdf_url=pdf_url, + keywords=self._extract_keywords(data), + fields_of_study=[c.get("display_name", "") for c in data.get("concepts", [])[:5]], + source_rank=rank, + ) + + def _get_abstract(self, data: Dict[str, Any]) -> str: + """Reconstruct abstract from inverted index.""" + abstract_index = data.get("abstract_inverted_index") + if not abstract_index: + return "" + + # OpenAlex stores abstract as inverted index: {"word": [positions]} + words = [] + for word, positions in abstract_index.items(): + for pos in positions: + words.append((pos, word)) + words.sort(key=lambda x: x[0]) + return " ".join(w[1] for w in words) +``` + +### 2.5 Query Services + +#### 2.5.1 VenueRecommender + +**File**: `src/paperbot/application/services/venue_recommender.py` + +```python +class VenueRecommender: + """ + Recommend relevant venues based on keywords. + + Uses a static mapping from keywords/domains to top venues. + Configuration loaded from config/venue_mappings.yaml. + """ + + # Default keyword→venue mappings (can be overridden by config) + DEFAULT_MAPPINGS = { + # Security + "security": ["CCS", "S&P", "USENIX Security", "NDSS"], + "ransomware": ["CCS", "S&P", "USENIX Security", "NDSS"], + "malware": ["CCS", "S&P", "USENIX Security", "NDSS"], + "cryptography": ["CRYPTO", "EUROCRYPT", "CCS"], + "privacy": ["S&P", "PETS", "CCS", "USENIX Security"], + + # ML/AI + "machine learning": ["NeurIPS", "ICML", "ICLR"], + "deep learning": ["NeurIPS", "ICML", "ICLR", "CVPR"], + "llm": ["NeurIPS", "ICML", "ACL", "EMNLP"], + "large language model": ["NeurIPS", "ICML", "ACL", "EMNLP"], + "transformer": ["NeurIPS", "ICML", "ACL", "EMNLP"], + "nlp": ["ACL", "EMNLP", "NAACL", "NeurIPS"], + "computer vision": ["CVPR", "ICCV", "ECCV", "NeurIPS"], + + # Systems + "database": ["SIGMOD", "VLDB", "ICDE"], + "systems": ["OSDI", "SOSP", "EuroSys", "ATC"], + "networking": ["SIGCOMM", "NSDI", "MobiCom"], + + # Software Engineering + "software engineering": ["ICSE", "FSE", "ASE"], + "testing": ["ICSE", "ISSTA", "FSE"], + "program analysis": ["PLDI", "POPL", "OOPSLA"], + } + + def __init__(self, config_path: Optional[str] = None): + self.mappings = self.DEFAULT_MAPPINGS.copy() + if config_path: + self._load_config(config_path) + + def recommend( + self, + keywords: List[str], + *, + max_venues: int = 5, + ) -> List[str]: + """ + Recommend venues based on keywords. + + Args: + keywords: List of search keywords + max_venues: Maximum number of venues to recommend + + Returns: + List of recommended venue names, ordered by relevance + """ + venue_scores: Dict[str, int] = {} + + for keyword in keywords: + keyword_lower = keyword.lower() + + # Exact match + if keyword_lower in self.mappings: + for venue in self.mappings[keyword_lower]: + venue_scores[venue] = venue_scores.get(venue, 0) + 2 + + # Partial match + for mapped_kw, venues in self.mappings.items(): + if keyword_lower in mapped_kw or mapped_kw in keyword_lower: + for venue in venues: + venue_scores[venue] = venue_scores.get(venue, 0) + 1 + + # Sort by score descending + sorted_venues = sorted(venue_scores.items(), key=lambda x: -x[1]) + return [v[0] for v in sorted_venues[:max_venues]] +``` + +#### 2.5.2 QueryRewriter + +**File**: `src/paperbot/application/services/query_rewriter.py` + +```python +class QueryRewriter: + """ + Expand and rewrite queries for better search coverage. 
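+
+    Example:
+        rewrite("LLM security")
+        → ["LLM security", "large language model security"]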
+ + Handles: + - Abbreviation expansion (LLM → large language model) + - Synonym addition (ML → machine learning) + - Query normalization + """ + + # Abbreviation → full form mappings + ABBREVIATIONS = { + "llm": "large language model", + "llms": "large language models", + "ml": "machine learning", + "dl": "deep learning", + "nlp": "natural language processing", + "cv": "computer vision", + "rl": "reinforcement learning", + "gan": "generative adversarial network", + "gans": "generative adversarial networks", + "cnn": "convolutional neural network", + "cnns": "convolutional neural networks", + "rnn": "recurrent neural network", + "rnns": "recurrent neural networks", + "lstm": "long short-term memory", + "bert": "bidirectional encoder representations from transformers", + "gpt": "generative pre-trained transformer", + "rag": "retrieval augmented generation", + "vae": "variational autoencoder", + "asr": "automatic speech recognition", + "tts": "text to speech", + "ocr": "optical character recognition", + "sql": "structured query language", + "api": "application programming interface", + } + + def __init__(self, abbreviations: Optional[Dict[str, str]] = None): + self.abbreviations = {**self.ABBREVIATIONS} + if abbreviations: + self.abbreviations.update(abbreviations) + + def rewrite(self, query: str) -> List[str]: + """ + Rewrite query to produce expanded variations. + + Args: + query: Original search query + + Returns: + List of query variations (original + expanded) + """ + queries = [query] + + # Tokenize and expand abbreviations + words = query.lower().split() + expanded_words = [] + has_expansion = False + + for word in words: + # Remove punctuation for matching + clean_word = word.strip(".,;:!?()[]{}\"'") + + if clean_word in self.abbreviations: + expanded_words.append(self.abbreviations[clean_word]) + has_expansion = True + else: + expanded_words.append(word) + + if has_expansion: + expanded_query = " ".join(expanded_words) + if expanded_query != query.lower(): + queries.append(expanded_query) + + return queries + + def normalize(self, query: str) -> str: + """ + Normalize query for consistent matching. + + - Lowercase + - Remove extra whitespace + - Remove special characters (except alphanumeric and space) + """ + import re + normalized = query.lower() + normalized = re.sub(r"[^\w\s]", " ", normalized) + normalized = re.sub(r"\s+", " ", normalized).strip() + return normalized +``` + +### 2.6 Deduplication Service + +**File**: `src/paperbot/application/services/paper_deduplicator.py` + +```python +class PaperDeduplicator: + """ + Multi-strategy paper deduplication. + + Priority order: + 1. DOI (most reliable) + 2. arXiv ID + 3. Semantic Scholar ID + 4. OpenAlex ID + 5. Normalized title hash (fallback) + """ + + def __init__(self): + self._doi_index: Dict[str, int] = {} + self._arxiv_index: Dict[str, int] = {} + self._s2_index: Dict[str, int] = {} + self._openalex_index: Dict[str, int] = {} + self._title_hash_index: Dict[str, int] = {} + + def deduplicate( + self, + papers: List[HarvestedPaper], + ) -> Tuple[List[HarvestedPaper], int]: + """ + Deduplicate papers in-memory. 
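+
+        Duplicates are merged into the first occurrence: missing identifiers
+        are filled in, the longer abstract and higher citation count are
+        kept, and keyword/field lists are unioned.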
+ + Args: + papers: List of papers from all sources + + Returns: + Tuple of (deduplicated papers, count of duplicates removed) + """ + unique_papers: List[HarvestedPaper] = [] + duplicates_count = 0 + + for paper in papers: + existing_idx = self._find_duplicate(paper) + + if existing_idx is not None: + # Merge metadata into existing paper + self._merge_paper(unique_papers[existing_idx], paper) + duplicates_count += 1 + else: + # Add new paper + idx = len(unique_papers) + self._index_paper(paper, idx) + unique_papers.append(paper) + + return unique_papers, duplicates_count + + def _find_duplicate(self, paper: HarvestedPaper) -> Optional[int]: + """Find existing paper index if duplicate exists.""" + # 1. DOI match + if paper.doi: + doi_lower = paper.doi.lower() + if doi_lower in self._doi_index: + return self._doi_index[doi_lower] + + # 2. arXiv ID match + if paper.arxiv_id: + arxiv_lower = paper.arxiv_id.lower() + if arxiv_lower in self._arxiv_index: + return self._arxiv_index[arxiv_lower] + + # 3. Semantic Scholar ID match + if paper.semantic_scholar_id: + s2_lower = paper.semantic_scholar_id.lower() + if s2_lower in self._s2_index: + return self._s2_index[s2_lower] + + # 4. OpenAlex ID match + if paper.openalex_id: + openalex_lower = paper.openalex_id.lower() + if openalex_lower in self._openalex_index: + return self._openalex_index[openalex_lower] + + # 5. Title hash match (fallback) + title_hash = self._compute_title_hash(paper.title) + if title_hash in self._title_hash_index: + return self._title_hash_index[title_hash] + + return None + + def _index_paper(self, paper: HarvestedPaper, idx: int) -> None: + """Add paper to all relevant indexes.""" + if paper.doi: + self._doi_index[paper.doi.lower()] = idx + if paper.arxiv_id: + self._arxiv_index[paper.arxiv_id.lower()] = idx + if paper.semantic_scholar_id: + self._s2_index[paper.semantic_scholar_id.lower()] = idx + if paper.openalex_id: + self._openalex_index[paper.openalex_id.lower()] = idx + + title_hash = self._compute_title_hash(paper.title) + self._title_hash_index[title_hash] = idx + + def _merge_paper(self, existing: HarvestedPaper, new: HarvestedPaper) -> None: + """Merge metadata from new paper into existing.""" + # Fill in missing identifiers + if not existing.doi and new.doi: + existing.doi = new.doi + if not existing.arxiv_id and new.arxiv_id: + existing.arxiv_id = new.arxiv_id + if not existing.semantic_scholar_id and new.semantic_scholar_id: + existing.semantic_scholar_id = new.semantic_scholar_id + if not existing.openalex_id and new.openalex_id: + existing.openalex_id = new.openalex_id + + # Prefer longer abstract + if len(new.abstract) > len(existing.abstract): + existing.abstract = new.abstract + + # Prefer higher citation count + if new.citation_count > existing.citation_count: + existing.citation_count = new.citation_count + + # Merge keywords and fields + existing.keywords = list(set(existing.keywords + new.keywords)) + existing.fields_of_study = list(set(existing.fields_of_study + new.fields_of_study)) + + @staticmethod + def _compute_title_hash(title: str) -> str: + """Compute normalized title hash for deduplication.""" + import hashlib + import re + + # Normalize: lowercase, remove punctuation, collapse whitespace + normalized = title.lower() + normalized = re.sub(r"[^\w\s]", "", normalized) + normalized = re.sub(r"\s+", " ", normalized).strip() + + return hashlib.sha256(normalized.encode()).hexdigest() +``` + +### 2.7 PaperStore Repository + +**File**: `src/paperbot/infrastructure/stores/paper_store.py` + 
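+Intended call pattern (a sketch only): it assumes a `session_provider` from the
+existing infrastructure bootstrap and a list of harvested papers produced by the
+pipeline in Section 2.8; both methods are coroutines, so callers await them.
+
+```python
+async def example_usage(session_provider, unique_papers):
+    """Illustrative only; arguments are supplied by the surrounding app code."""
+    store = PaperStore(session_provider)
+    new_count, updated_count = await store.upsert_papers_batch(unique_papers)
+    papers, total = await store.search_papers(
+        query="ransomware", year_from=2020, min_citations=10, limit=20
+    )
+    return papers, total
+```
+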
+```python +class PaperStore: + """ + Paper storage repository. + + Handles: + - Batch upsert with DB-level deduplication + - Filter-based search with pagination + - Source tracking + """ + + def __init__(self, session_provider: SessionProvider): + self.session_provider = session_provider + + async def upsert_papers_batch( + self, + papers: List[HarvestedPaper], + ) -> Tuple[int, int]: + """ + Upsert papers with deduplication. + + Returns: + Tuple of (new_count, updated_count) + """ + new_count = 0 + updated_count = 0 + + with self.session_provider() as session: + for paper in papers: + existing = self._find_existing(session, paper) + + if existing: + self._update_paper(existing, paper) + updated_count += 1 + else: + model = self._create_model(paper) + session.add(model) + new_count += 1 + + session.commit() + + return new_count, updated_count + + async def search_papers( + self, + *, + query: Optional[str] = None, + keywords: Optional[List[str]] = None, + venues: Optional[List[str]] = None, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + min_citations: Optional[int] = None, + sources: Optional[List[str]] = None, + sort_by: str = "citation_count", + sort_order: str = "desc", + limit: int = 50, + offset: int = 0, + ) -> Tuple[List[PaperModel], int]: + """ + Search papers with filters and pagination. + + Returns: + Tuple of (papers, total_count) + """ + with self.session_provider() as session: + stmt = select(PaperModel).where(PaperModel.deleted_at.is_(None)) + + # Full-text search (LIKE for v1) + if query: + pattern = f"%{query}%" + stmt = stmt.where( + or_( + PaperModel.title.ilike(pattern), + PaperModel.abstract.ilike(pattern), + ) + ) + + # Filters + if year_from: + stmt = stmt.where(PaperModel.year >= year_from) + if year_to: + stmt = stmt.where(PaperModel.year <= year_to) + if min_citations: + stmt = stmt.where(PaperModel.citation_count >= min_citations) + if venues: + stmt = stmt.where(PaperModel.venue.in_(venues)) + if sources: + stmt = stmt.where(PaperModel.primary_source.in_(sources)) + + # Count total + count_stmt = select(func.count()).select_from(stmt.subquery()) + total_count = session.execute(count_stmt).scalar() or 0 + + # Sort + sort_col = getattr(PaperModel, sort_by, PaperModel.citation_count) + if sort_order == "desc": + stmt = stmt.order_by(sort_col.desc()) + else: + stmt = stmt.order_by(sort_col.asc()) + + # Pagination + stmt = stmt.offset(offset).limit(limit) + + papers = session.execute(stmt).scalars().all() + + return list(papers), total_count +``` + +### 2.8 Harvest Pipeline Orchestrator + +**File**: `src/paperbot/application/workflows/harvest_pipeline.py` + +```python +class HarvestPipeline: + """ + Orchestrates the paper harvest pipeline. + + Stages: + 1. Query expansion (QueryRewriter) + 2. Venue recommendation (VenueRecommender) + 3. Parallel harvesting (all harvesters) + 4. Deduplication (PaperDeduplicator) + 5. 
Storage (PaperStore) + """ + + def __init__( + self, + harvesters: List[HarvesterPort], + paper_store: PaperStore, + query_rewriter: QueryRewriter, + venue_recommender: VenueRecommender, + deduplicator: PaperDeduplicator, + ): + self.harvesters = harvesters + self.paper_store = paper_store + self.query_rewriter = query_rewriter + self.venue_recommender = venue_recommender + self.deduplicator = deduplicator + + async def run( + self, + keywords: List[str], + *, + venues: Optional[List[str]] = None, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + max_results_per_source: int = 50, + sources: Optional[List[str]] = None, + progress_callback: Optional[Callable[[str, str], None]] = None, + ) -> HarvestRunResult: + """ + Run the full harvest pipeline. + + Args: + keywords: Search keywords + venues: Venue filter (optional, will recommend if not provided) + year_from: Publication year lower bound + year_to: Publication year upper bound + max_results_per_source: Max papers per source + sources: Which sources to use (default: all) + progress_callback: Optional callback for progress updates + + Returns: + HarvestRunResult with all papers and statistics + """ + run_id = f"harvest-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{uuid4().hex[:6]}" + started_at = datetime.now(timezone.utc) + + def emit(phase: str, message: str): + if progress_callback: + progress_callback(phase, message) + + # Stage 1: Query expansion + emit("Expanding", "Expanding keywords...") + expanded_queries = [] + for kw in keywords: + expanded_queries.extend(self.query_rewriter.rewrite(kw)) + combined_query = " ".join(expanded_queries) + + # Stage 2: Venue recommendation + if not venues: + emit("Recommending", "Recommending venues...") + venues = self.venue_recommender.recommend(keywords) + + # Stage 3: Parallel harvesting + emit("Harvesting", "Fetching from sources...") + + selected_harvesters = self.harvesters + if sources: + source_set = {HarvestSource(s) for s in sources} + selected_harvesters = [h for h in self.harvesters if h.source in source_set] + + # Run all harvesters in parallel + tasks = [ + h.search( + combined_query, + max_results=max_results_per_source, + year_from=year_from, + year_to=year_to, + venues=venues, + ) + for h in selected_harvesters + ] + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Collect results + source_results: Dict[HarvestSource, HarvestResult] = {} + all_papers: List[HarvestedPaper] = [] + + for harvester, result in zip(selected_harvesters, results): + if isinstance(result, Exception): + source_results[harvester.source] = HarvestResult( + source=harvester.source, + papers=[], + total_found=0, + error=str(result), + ) + else: + source_results[harvester.source] = result + all_papers.extend(result.papers) + emit("Harvesting", f"Found {result.total_found} from {harvester.source.value}") + + papers_found = len(all_papers) + + # Stage 4: Deduplication + emit("Deduplicating", "Removing duplicates...") + unique_papers, duplicates_count = self.deduplicator.deduplicate(all_papers) + + # Stage 5: Storage + emit("Storing", "Saving to database...") + new_count, updated_count = await self.paper_store.upsert_papers_batch(unique_papers) + + # Determine final status + has_errors = any(r.error for r in source_results.values()) + has_results = any(r.papers for r in source_results.values()) + + if has_errors and not has_results: + status = "failed" + elif has_errors: + status = "partial" + else: + status = "success" + + return HarvestRunResult( + run_id=run_id, + 
status=status, + papers_found=papers_found, + papers_new=new_count, + papers_deduplicated=duplicates_count, + source_results=source_results, + started_at=started_at, + ended_at=datetime.now(timezone.utc), + ) +``` + +### 2.9 API Endpoints + +**File**: `src/paperbot/api/routes/harvest.py` + +```python +router = APIRouter(prefix="/api", tags=["harvest"]) + + +class HarvestRequest(BaseModel): + keywords: List[str] + venues: Optional[List[str]] = None + year_from: Optional[int] = None + year_to: Optional[int] = None + max_results_per_source: int = Field(default=50, ge=1, le=200) + sources: Optional[List[str]] = None + + +class PaperSearchRequest(BaseModel): + query: Optional[str] = None + keywords: Optional[List[str]] = None + venues: Optional[List[str]] = None + year_from: Optional[int] = None + year_to: Optional[int] = None + min_citations: Optional[int] = None + sources: Optional[List[str]] = None + sort_by: str = Field(default="citation_count") + sort_order: str = Field(default="desc") + limit: int = Field(default=50, ge=1, le=500) + offset: int = Field(default=0, ge=0) + + +@router.post("/harvest") +async def harvest_papers(request: HarvestRequest): + """ + Start paper harvesting pipeline. + + Returns SSE stream with progress updates and final result. + """ + async def generate(): + pipeline = get_harvest_pipeline() # From DI container + + async def on_progress(phase: str, message: str): + yield sse_event("progress", {"phase": phase, "message": message}) + + result = await pipeline.run( + keywords=request.keywords, + venues=request.venues, + year_from=request.year_from, + year_to=request.year_to, + max_results_per_source=request.max_results_per_source, + sources=request.sources, + progress_callback=on_progress, + ) + + yield sse_event("result", { + "run_id": result.run_id, + "status": result.status, + "papers_found": result.papers_found, + "papers_new": result.papers_new, + "papers_deduplicated": result.papers_deduplicated, + "sources": { + source.value: { + "papers": len(r.papers), + "error": r.error, + } + for source, r in result.source_results.items() + }, + }) + yield sse_event("done", {}) + + return StreamingResponse( + generate(), + media_type="text/event-stream", + ) + + +@router.post("/papers/search") +async def search_papers(request: PaperSearchRequest): + """ + Search harvested papers with filters. + """ + store = get_paper_store() # From DI container + + papers, total = await store.search_papers( + query=request.query, + venues=request.venues, + year_from=request.year_from, + year_to=request.year_to, + min_citations=request.min_citations, + sources=request.sources, + sort_by=request.sort_by, + sort_order=request.sort_order, + limit=request.limit, + offset=request.offset, + ) + + return { + "papers": [paper_to_dict(p) for p in papers], + "total": total, + "limit": request.limit, + "offset": request.offset, + } +``` + +### 2.10 Papers Library Integration + +The **Papers Library** (web UI at `/papers`) displays the user's personal paper collection. When a user clicks "Save" on a paper from search results or recommendations, that paper should appear in their Papers Library. 
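+
+A minimal sketch of the query this flow implies, using SQLAlchemy and the
+`PaperModel` from Section 2.2.3. `PaperFeedbackModel` is an assumed ORM mapping
+of the existing `paper_feedback` table (its real name and import path live in
+the research store and may differ); the join mirrors the SQL shown in the
+diagram below.
+
+```python
+from sqlalchemy import Integer, cast, select
+
+from paperbot.infrastructure.stores.models import PaperModel
+
+# PaperFeedbackModel is an assumed ORM mapping of the existing paper_feedback
+# table (user_id, paper_id stored as text, action, ts); import path omitted.
+
+
+def library_query(user_id: str):
+    """Statement returning a user's saved papers, newest save first."""
+    return (
+        select(PaperModel, PaperFeedbackModel.ts.label("saved_at"))
+        .join(
+            PaperFeedbackModel,
+            PaperModel.id == cast(PaperFeedbackModel.paper_id, Integer),
+        )
+        .where(
+            PaperFeedbackModel.user_id == user_id,
+            PaperFeedbackModel.action == "save",
+            PaperModel.deleted_at.is_(None),
+        )
+        .order_by(PaperFeedbackModel.ts.desc())
+    )
+```
+
+Sections 2.10.3 and 2.10.4 wrap this statement in `GET /api/papers/library`
+with pagination applied.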
+ +#### 2.10.1 Data Flow: Save → Papers Library + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Save Action → Papers Library Flow │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌──────────────────────────────────────────────────────────────────────────┐ │ +│ │ Research Page / Recommendations │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ Paper: "Attention Is All You Need" │ │ │ +│ │ │ [Like] [Save] [Dislike] │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────┬──────────────────────────────────────┘ │ +│ │ User clicks "Save" │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────────────────────┐ │ +│ │ POST /api/research/feedback │ │ +│ │ { │ │ +│ │ "user_id": "user123", │ │ +│ │ "track_id": 1, │ │ +│ │ "paper_id": 42, ← papers.id from paper_store │ │ +│ │ "action": "save" │ │ +│ │ } │ │ +│ └───────────────────────────────────┬──────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────────────────────┐ │ +│ │ paper_feedback table (existing in research_store) │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ id: 1 │ │ │ +│ │ │ user_id: "user123" │ │ │ +│ │ │ track_id: 1 │ │ │ +│ │ │ paper_id: "42" ← Reference to papers.id │ │ │ +│ │ │ action: "save" │ │ │ +│ │ │ ts: 2026-02-06T10:30:00Z │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────┬──────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────────────────────┐ │ +│ │ GET /api/papers/library │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ SELECT p.*, pf.action, pf.ts AS saved_at │ │ │ +│ │ │ FROM papers p │ │ │ +│ │ │ JOIN paper_feedback pf ON p.id = CAST(pf.paper_id AS INTEGER) │ │ │ +│ │ │ WHERE pf.user_id = ? AND pf.action = 'save' │ │ │ +│ │ │ ORDER BY pf.ts DESC │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────┬──────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────────────────────┐ │ +│ │ Papers Library Page (/papers) │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ 📄 Attention Is All You Need [Transformer] [NLP] │ │ │ +│ │ │ NeurIPS 2017 · Vaswani et al. 
· 100k+ citations │ │ │ +│ │ │ Saved: Feb 6, 2026 [Analyze] [Remove]│ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └──────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +#### 2.10.2 Key Design Decisions + +| Decision | Rationale | +|----------|-----------| +| **Use `paper_feedback.paper_id` to reference `papers.id`** | Links user actions to the local paper pool | +| **Papers Library = papers WHERE action='save'** | Simple query, no new table needed | +| **Store `papers.id` (integer) not external IDs** | Consistent internal reference, supports papers from any source | +| **Keep track_id in feedback** | Papers can be saved in context of a research track | + +#### 2.10.3 New API Endpoint: GET /api/papers/library + +**File**: `src/paperbot/api/routes/harvest.py` (addition) + +```python +class PaperLibraryRequest(BaseModel): + user_id: str + track_id: Optional[int] = None # Filter by track, or all if None + include_actions: List[str] = Field(default=["save"]) # "save", "like", "cite" + sort_by: str = Field(default="saved_at") # saved_at, title, citation_count + sort_order: str = Field(default="desc") + limit: int = Field(default=50, ge=1, le=500) + offset: int = Field(default=0, ge=0) + + +@router.get("/papers/library") +async def get_user_library( + user_id: str, + track_id: Optional[int] = None, + sort_by: str = "saved_at", + limit: int = 50, + offset: int = 0, +): + """ + Get user's saved papers (Papers Library). + + Joins paper_feedback (action='save') with papers table to return + full paper metadata for the user's personal collection. + """ + store = get_paper_store() + + papers, total = await store.get_user_library( + user_id=user_id, + track_id=track_id, + actions=["save"], + sort_by=sort_by, + limit=limit, + offset=offset, + ) + + return { + "papers": [ + { + **paper_to_dict(p.paper), + "saved_at": p.saved_at.isoformat() if p.saved_at else None, + "track_id": p.track_id, + "action": p.action, + } + for p in papers + ], + "total": total, + "limit": limit, + "offset": offset, + } + + +@router.delete("/papers/library/{paper_id}") +async def remove_from_library(paper_id: int, user_id: str): + """ + Remove a paper from user's library (soft-delete the 'save' feedback). + """ + store = get_paper_store() + success = await store.remove_from_library(user_id=user_id, paper_id=paper_id) + return {"success": success} +``` + +#### 2.10.4 PaperStore Addition: get_user_library() + +**File**: `src/paperbot/infrastructure/stores/paper_store.py` (addition) + +```python +@dataclass +class LibraryPaper: + """Paper with library metadata.""" + paper: PaperModel + saved_at: datetime + track_id: Optional[int] + action: str + + +class PaperStore: + # ... existing methods ... + + async def get_user_library( + self, + user_id: str, + *, + track_id: Optional[int] = None, + actions: List[str] = ["save"], + sort_by: str = "saved_at", + limit: int = 50, + offset: int = 0, + ) -> Tuple[List[LibraryPaper], int]: + """ + Get papers in user's library (saved papers). + + Joins papers table with paper_feedback where action in actions. 
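+
+        Illustrative usage (user id and limit are example values, not part
+        of the design)::
+
+            papers, total = await store.get_user_library(
+                user_id="user123", actions=["save"], limit=20
+            )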
+ """ + with self.session_provider() as session: + # Build query joining papers with paper_feedback + stmt = ( + select(PaperModel, PaperFeedbackModel) + .join( + PaperFeedbackModel, + PaperModel.id == cast(PaperFeedbackModel.paper_id, Integer) + ) + .where( + PaperFeedbackModel.user_id == user_id, + PaperFeedbackModel.action.in_(actions), + PaperModel.deleted_at.is_(None), + ) + ) + + if track_id is not None: + stmt = stmt.where(PaperFeedbackModel.track_id == track_id) + + # Count total + count_stmt = select(func.count()).select_from(stmt.subquery()) + total = session.execute(count_stmt).scalar() or 0 + + # Sort + if sort_by == "saved_at": + stmt = stmt.order_by(PaperFeedbackModel.ts.desc()) + elif sort_by == "title": + stmt = stmt.order_by(PaperModel.title.asc()) + elif sort_by == "citation_count": + stmt = stmt.order_by(PaperModel.citation_count.desc()) + else: + stmt = stmt.order_by(PaperFeedbackModel.ts.desc()) + + # Pagination + stmt = stmt.offset(offset).limit(limit) + + results = session.execute(stmt).all() + + return [ + LibraryPaper( + paper=row[0], + saved_at=row[1].ts, + track_id=row[1].track_id, + action=row[1].action, + ) + for row in results + ], total + + async def remove_from_library( + self, + user_id: str, + paper_id: int, + ) -> bool: + """Remove paper from user's library by deleting 'save' feedback.""" + with self.session_provider() as session: + stmt = ( + PaperFeedbackModel.__table__.delete() + .where( + PaperFeedbackModel.user_id == user_id, + PaperFeedbackModel.paper_id == str(paper_id), + PaperFeedbackModel.action == "save", + ) + ) + result = session.execute(stmt) + session.commit() + return result.rowcount > 0 +``` + +#### 2.10.5 Frontend Update: Connect Papers Library to API + +**File**: `web/src/lib/api.ts` (update) + +```typescript +// Replace mock fetchPapers with real API call +export async function fetchPapers(userId: string): Promise { + const res = await fetch(`${API_BASE}/api/papers/library?user_id=${userId}`); + if (!res.ok) { + throw new Error('Failed to fetch papers library'); + } + const data = await res.json(); + return data.papers.map((p: any) => ({ + id: p.id.toString(), + title: p.title, + venue: p.venue || 'Unknown', + authors: p.authors?.join(', ') || 'Unknown', + citations: p.citation_count?.toString() || '0', + status: p.status || 'Saved', // Could track analysis status separately + tags: p.keywords || p.fields_of_study || [], + savedAt: p.saved_at, + })); +} +``` + +#### 2.10.6 Relationship Diagram + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Papers Library Data Relationships │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────┐ ┌─────────────────────┐ │ +│ │ papers (v1 NEW) │ │ paper_feedback │ │ +│ │ │ │ (existing) │ │ +│ ├─────────────────────┤ ├─────────────────────┤ │ +│ │ id (PK) │◄────────│ paper_id (FK) │ │ +│ │ doi │ │ user_id │ │ +│ │ arxiv_id │ │ track_id (FK) │──────┐ │ +│ │ title │ │ action │ │ │ +│ │ abstract │ │ ts │ │ │ +│ │ authors_json │ │ weight │ │ │ +│ │ year │ └─────────────────────┘ │ │ +│ │ venue │ │ │ +│ │ citation_count │ ┌─────────────────────┐ │ │ +│ │ ... 
│ │ research_tracks │ │ │ +│ └─────────────────────┘ │ (existing) │◄─────┘ │ +│ ├─────────────────────┤ │ +│ │ id (PK) │ │ +│ │ user_id │ │ +│ │ name │ │ +│ │ keywords_json │ │ +│ │ venues_json │ │ +│ └─────────────────────┘ │ +│ │ +│ Query: Papers Library for User │ +│ ─────────────────────────────── │ +│ SELECT p.*, pf.ts AS saved_at, pf.track_id │ +│ FROM papers p │ +│ JOIN paper_feedback pf ON p.id = CAST(pf.paper_id AS INTEGER) │ +│ WHERE pf.user_id = :user_id AND pf.action = 'save' │ +│ ORDER BY pf.ts DESC │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 3. Implementation Principles + +### 3.1 Core Design Principles + +| Principle | Description | Implementation | +|-----------|-------------|----------------| +| **Open Sources First** | Prioritize free, no-auth APIs | arXiv, S2, OpenAlex (no IEEE/ACM) | +| **Metadata Only** | No PDF download or parsing | Store URLs only, defer PDF to v2 | +| **Graceful Degradation** | Partial results if some sources fail | Continue pipeline, report errors | +| **Idempotent Upserts** | Same paper → same record | Multi-strategy deduplication | +| **Audit Trail** | Track all harvest runs | harvest_runs table with timing/counts | + +### 3.2 Deduplication Strategy (Priority Order) + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Paper Arrives from Source │ +└─────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 1. DOI Match? (most reliable) │ +│ doi.lower() in doi_index → DUPLICATE │ +└─────────────────────────────────────────────────────────────┘ + │ No + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 2. arXiv ID Match? │ +│ arxiv_id.lower() in arxiv_index → DUPLICATE │ +└─────────────────────────────────────────────────────────────┘ + │ No + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 3. Semantic Scholar ID Match? │ +│ s2_id.lower() in s2_index → DUPLICATE │ +└─────────────────────────────────────────────────────────────┘ + │ No + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 4. OpenAlex ID Match? │ +│ openalex_id.lower() in openalex_index → DUPLICATE │ +└─────────────────────────────────────────────────────────────┘ + │ No + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 5. Title Hash Match? (fallback) │ +│ sha256(normalize(title)) in title_hash_index → DUPLICATE│ +└─────────────────────────────────────────────────────────────┘ + │ No + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ NEW PAPER → Insert │ +└─────────────────────────────────────────────────────────────┘ +``` + +### 3.3 Component Responsibilities + +| Component | Responsibility | Should NOT Do | +|-----------|----------------|---------------| +| **Harvester** | Fetch papers from source, normalize to HarvestedPaper | Deduplicate, store, apply business rules | +| **QueryRewriter** | Expand/transform keywords | Fetch papers, access database | +| **VenueRecommender** | Map keywords to venues | Fetch papers, access database | +| **Deduplicator** | Find duplicates in memory, merge metadata | Access database, make API calls | +| **PaperStore** | Persist papers, DB-level dedup, search | Fetch from external APIs | +| **HarvestPipeline** | Orchestrate all stages | Implement stage logic | + +--- + +## 4. 
Technology Selection Rationale + +### 4.1 Third Source: OpenAlex + +| Criterion | OpenAlex | CrossRef | PubMed | +|-----------|----------|----------|--------| +| Coverage | 240M+ works | 140M+ | 35M+ (biomedical only) | +| API Cost | Free | Free | Free | +| Rate Limit | 10 req/s | 50 req/s | 3 req/s | +| Auth Required | No | No (polite pool) | No | +| DOI Support | Yes | Yes | Limited | +| CS Coverage | Excellent | Good | Poor | + +**Decision**: OpenAlex (best coverage, generous rate limit, no auth) + +### 4.2 Storage: SQLite + +| Criterion | SQLite | PostgreSQL | +|-----------|--------|------------| +| Consistency with stack | Same DB | New infra | +| Deployment simplicity | Single file | Server required | +| Full-text search | FTS5 (v2) | pg_trgm | +| Scale limit | ~10M rows | Unlimited | + +**Decision**: SQLite (consistent with existing stack, sufficient for v1) + +### 4.3 Search: LIKE Queries (v1) + +| Criterion | LIKE | FTS5 | Elasticsearch | +|-----------|------|------|---------------| +| Setup complexity | None | Index creation | New infra | +| Query speed | Slow | Fast | Fastest | +| Relevance ranking | None | BM25 | Full control | + +**Decision**: LIKE queries for v1 (simple, sufficient for TopN), defer FTS5 to v2 + +--- + +## 5. Best Practices and References + +### 5.1 API Documentation + +| Source | API Docs | Key Endpoints | +|--------|----------|---------------| +| **arXiv** | https://arxiv.org/help/api | `export.arxiv.org/api/query` | +| **Semantic Scholar** | https://api.semanticscholar.org/api-docs/ | `/graph/v1/paper/search` | +| **OpenAlex** | https://docs.openalex.org/ | `/works?search=...` | + +### 5.2 Open Source References + +| Project | Relevance | +|---------|-----------| +| **semanticscholar** (PyPI) | Python client for S2 API | +| **arxiv-sanity-lite** | Query handling patterns | +| **paperetl** | Metadata extraction + dedup patterns | + +### 5.3 Internal Documents + +| Document | Content | +|----------|---------| +| `config/top_venues.yaml` | Venue tier rankings | +| `src/paperbot/infrastructure/connectors/arxiv_connector.py` | Existing arXiv XML parsing | +| `src/paperbot/infrastructure/api_clients/semantic_scholar.py` | Existing S2 client | + +--- + +## 6. Risks and Mitigations + +### 6.1 Technical Risks + +| Risk | Likelihood | Impact | Mitigation | +|------|------------|--------|------------| +| API rate limiting | High | Medium | Respect rate limits, exponential backoff | +| Source API changes | Low | High | Version harvesters, monitor for changes | +| Dedup misses duplicates | Medium | Low | Multiple strategies, title hash fallback | +| Large result sets slow DB | Medium | Medium | Pagination, indexes, defer FTS to v2 | + +### 6.2 Operational Risks + +| Risk | Likelihood | Impact | Mitigation | +|------|------------|--------|------------| +| OpenAlex API unreliable | Low | Medium | Continue with other sources | +| Stale venue mappings | Medium | Low | Config-driven, easy to update | +| Disk space from paper storage | Low | Low | Metadata only, no PDFs | + +--- + +## 7. 
Workload Estimation + +### 7.1 Task Breakdown + +| Task | Effort | Dependencies | +|------|--------|--------------| +| **Infrastructure** | | | +| Domain models (`domain/harvest.py`) | 2h | None | +| Database migration (papers, harvest_runs) | 2h | Models | +| PaperStore implementation | 4h | Migration | +| **Harvesters** | | | +| HarvesterPort interface | 1h | Models | +| ArxivHarvester | 3h | Interface | +| SemanticScholarHarvester | 2h | Interface | +| OpenAlexHarvester | 4h | Interface | +| **Services** | | | +| VenueRecommender | 2h | Config | +| QueryRewriter | 2h | None | +| PaperDeduplicator | 3h | Models | +| **Pipeline & API** | | | +| HarvestPipeline orchestrator | 4h | All above | +| API routes (harvest, search) | 3h | Pipeline, Store | +| **Papers Library Integration** | | | +| PaperStore.get_user_library() | 2h | PaperStore | +| API route (/api/papers/library) | 1h | PaperStore | +| Frontend update (web/src/lib/api.ts) | 1h | API | +| **Testing** | | | +| Unit tests (dedup, rewriter, recommender) | 3h | Services | +| Integration tests (harvesters, store) | 3h | Harvesters | +| E2E test (full pipeline) | 2h | API | + +### 7.2 Summary + +| Category | Hours | +|----------|-------| +| Infrastructure | 8h | +| Harvesters | 10h | +| Services | 7h | +| Pipeline & API | 7h | +| Papers Library Integration | 4h | +| Testing | 8h | +| **Total** | **44h (~6-7 days)** | + +### 7.3 Suggested Timeline + +``` +Day 1: Infrastructure + - Domain models + - Database migration + - PaperStore (partial) + +Day 2: Infrastructure + Harvesters + - PaperStore completion + - HarvesterPort interface + - ArxivHarvester + +Day 3: Harvesters + - SemanticScholarHarvester + - OpenAlexHarvester + - Unit tests for harvesters + +Day 4: Services + - VenueRecommender + - QueryRewriter + - PaperDeduplicator + - Unit tests + +Day 5: Pipeline & API + - HarvestPipeline orchestrator + - API routes + - Integration tests + +Day 6: Testing & Polish + - E2E tests + - Error handling improvements + - Documentation + +Day 7: Buffer / Review + - Code review + - Bug fixes + - Update docs +``` + +--- + +## 8. 
Deliverables Checklist + +### 8.1 Domain Models +- [ ] `src/paperbot/domain/harvest.py` - HarvestedPaper, HarvestSource, HarvestResult + +### 8.2 Database +- [ ] `alembic/versions/0003_paper_harvest_tables.py` - Migration +- [ ] `src/paperbot/infrastructure/stores/models.py` - PaperModel, HarvestRunModel + +### 8.3 Harvesters +- [ ] `src/paperbot/application/ports/harvester_port.py` - HarvesterPort interface +- [ ] `src/paperbot/infrastructure/harvesters/__init__.py` +- [ ] `src/paperbot/infrastructure/harvesters/arxiv_harvester.py` +- [ ] `src/paperbot/infrastructure/harvesters/semantic_scholar_harvester.py` +- [ ] `src/paperbot/infrastructure/harvesters/openalex_harvester.py` + +### 8.4 Services +- [ ] `src/paperbot/application/services/venue_recommender.py` +- [ ] `src/paperbot/application/services/query_rewriter.py` +- [ ] `src/paperbot/application/services/paper_deduplicator.py` + +### 8.5 Pipeline & Storage +- [ ] `src/paperbot/application/workflows/harvest_pipeline.py` +- [ ] `src/paperbot/infrastructure/stores/paper_store.py` + +### 8.6 API +- [ ] `src/paperbot/api/routes/harvest.py` - POST /api/harvest, POST /api/papers/search, GET /api/papers/library +- [ ] `src/paperbot/api/main.py` - Register router + +### 8.7 Papers Library Integration +- [ ] `src/paperbot/infrastructure/stores/paper_store.py` - Add `get_user_library()`, `remove_from_library()` methods +- [ ] `web/src/lib/api.ts` - Update `fetchPapers()` to call real API +- [ ] `web/src/app/papers/page.tsx` - Connect to `/api/papers/library` endpoint + +### 8.8 Tests +- [ ] `tests/unit/test_paper_deduplicator.py` +- [ ] `tests/unit/test_query_rewriter.py` +- [ ] `tests/unit/test_venue_recommender.py` +- [ ] `tests/integration/test_paper_store.py` +- [ ] `tests/integration/test_harvesters.py` +- [ ] `tests/e2e/test_harvest_api.py` +- [ ] `tests/e2e/test_papers_library.py` - Papers Library integration test + +### 8.9 Documentation +- [ ] `docs/paper_harvest_v1.md` - User guide + +--- + +## 9. Open Questions + +The following questions require user input before implementation: + +1. **Venue configuration format**: Should VenueRecommender use existing `config/top_venues.yaml` or a separate config file with keyword→venue mappings? + +2. **Rate limiting strategy**: Should we implement global rate limiting across all harvesters, or per-harvester limits? + +3. **Search scope**: Should `/api/papers/search` search only harvested papers, or also query external APIs in real-time? + +4. **Frontend integration**: Should harvest progress be shown on a new page, or integrated into the existing Research page? + +5. **Retention policy**: Should old harvest_runs records be automatically cleaned up after N days? + +--- + +## Appendix A: Existing Implementation Summary + +### A.1 Existing Connectors + +| Connector | Status | Reusable? 
|
+|-----------|--------|-----------|
+| ArxivConnector | XML parsing only | Use for response parsing |
+| SemanticScholarClient | Async API wrapper | Wrap with harvester |
+| RedditConnector | RSS parsing | Not relevant |
+
+### A.2 Existing Infrastructure
+
+| Component | Status |
+|-----------|--------|
+| SessionProvider | Ready |
+| SQLAlchemy Base | Ready |
+| Alembic migrations | Ready |
+| FastAPI streaming | Ready |
+| EventLogPort | Ready |
+
+### A.3 API Patterns to Follow
+
+| Pattern | Example File |
+|---------|--------------|
+| SSE streaming | `src/paperbot/api/routes/track.py` |
+| Pydantic models | `src/paperbot/api/routes/research.py` |
+| Store initialization | `src/paperbot/infrastructure/stores/research_store.py` |
diff --git a/src/paperbot/api/main.py b/src/paperbot/api/main.py
index 8a41f07..45ea821 100644
--- a/src/paperbot/api/main.py
+++ b/src/paperbot/api/main.py
@@ -20,7 +20,8 @@
     memory,
     research,
     paperscool,
     newsletter,
+    harvest,
 )
 from paperbot.infrastructure.event_log.logging_event_log import LoggingEventLog
 from paperbot.infrastructure.event_log.composite_event_log import CompositeEventLog
@@ -64,7 +65,8 @@ async def health_check():
 app.include_router(memory.router, prefix="/api", tags=["Memory"])
 app.include_router(research.router, prefix="/api", tags=["Research"])
 app.include_router(paperscool.router, prefix="/api", tags=["PapersCool"])
 app.include_router(newsletter.router, prefix="/api", tags=["Newsletter"])
+app.include_router(harvest.router, prefix="/api", tags=["Harvest"])
 
 
 @app.on_event("startup")
diff --git a/src/paperbot/api/routes/harvest.py b/src/paperbot/api/routes/harvest.py
new file mode 100644
index 0000000..10ad62f
--- /dev/null
+++ b/src/paperbot/api/routes/harvest.py
@@ -0,0 +1,429 @@
+# src/paperbot/api/routes/harvest.py
+"""
+Paper Harvest API Routes.
+ +Provides endpoints for: +- Paper harvesting from multiple sources +- Paper search and retrieval +- User's paper library management +- Harvest run history +""" + +from __future__ import annotations + +from typing import Any, Dict, List, Optional + +from fastapi import APIRouter, HTTPException, Query, Request +from fastapi.responses import StreamingResponse +from pydantic import BaseModel, Field + +from paperbot.api.streaming import StreamEvent, wrap_generator +from paperbot.application.workflows.harvest_pipeline import ( + HarvestConfig, + HarvestFinalResult, + HarvestPipeline, + HarvestProgress, +) +from paperbot.utils.logging_config import Logger, LogFiles, set_trace_id, clear_trace_id +from paperbot.infrastructure.stores.paper_store import PaperStore, paper_to_dict + +router = APIRouter() + +# Lazy-initialized stores +_paper_store: Optional[PaperStore] = None + + +def _get_paper_store() -> PaperStore: + """Lazy initialization of paper store.""" + global _paper_store + if _paper_store is None: + _paper_store = PaperStore() + return _paper_store + + +# ============================================================================ +# Harvest Endpoints +# ============================================================================ + + +class HarvestRequest(BaseModel): + """Request body for harvest endpoint.""" + + keywords: List[str] = Field(..., min_items=1, description="Search keywords") + venues: Optional[List[str]] = Field(None, description="Filter to specific venues") + year_from: Optional[int] = Field(None, ge=1900, le=2100, description="Start year") + year_to: Optional[int] = Field(None, ge=1900, le=2100, description="End year") + max_results_per_source: int = Field( + 50, ge=1, le=200, description="Max papers per source" + ) + sources: Optional[List[str]] = Field( + None, description="Sources to harvest (arxiv, semantic_scholar, openalex)" + ) + expand_keywords: bool = Field(True, description="Expand abbreviations") + recommend_venues: bool = Field(True, description="Auto-recommend venues if not specified") + + +async def harvest_stream(request: HarvestRequest): + """Stream harvest progress via SSE.""" + config = HarvestConfig( + keywords=request.keywords, + venues=request.venues, + year_from=request.year_from, + year_to=request.year_to, + sources=request.sources, + max_results_per_source=request.max_results_per_source, + expand_keywords=request.expand_keywords, + recommend_venues=request.recommend_venues, + ) + + pipeline = HarvestPipeline() + try: + async for item in pipeline.run(config): + if isinstance(item, HarvestProgress): + yield StreamEvent( + type="progress", + data={ + "phase": item.phase, + "message": item.message, + "details": item.details, + }, + ) + elif isinstance(item, HarvestFinalResult): + yield StreamEvent( + type="result", + data={ + "run_id": item.run_id, + "status": item.status, + "papers_found": item.papers_found, + "papers_new": item.papers_new, + "papers_deduplicated": item.papers_deduplicated, + "sources": item.source_results, + "errors": item.errors, + "duration_seconds": item.duration_seconds, + }, + ) + except Exception as e: + yield StreamEvent(type="error", message=str(e)) + finally: + await pipeline.close() + + +@router.post("/harvest") +async def harvest_papers(request: HarvestRequest): + """ + Harvest papers from multiple sources. + + Returns Server-Sent Events with progress updates. 
+ """ + trace_id = set_trace_id() + Logger.info(f"Starting harvest request: keywords={request.keywords}", file=LogFiles.HARVEST) + return StreamingResponse( + wrap_generator(harvest_stream(request)), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + }, + ) + + +class HarvestRunResponse(BaseModel): + """Response for harvest run details.""" + + run_id: str + keywords: List[str] + venues: List[str] + sources: List[str] + max_results_per_source: int + status: str + papers_found: int + papers_new: int + papers_deduplicated: int + errors: Dict[str, Any] + started_at: Optional[str] + ended_at: Optional[str] + + +class HarvestRunListResponse(BaseModel): + """Response for list of harvest runs.""" + + runs: List[HarvestRunResponse] + + +@router.get("/harvest/runs", response_model=HarvestRunListResponse) +def list_harvest_runs( + status: Optional[str] = Query(None, description="Filter by status"), + limit: int = Query(50, ge=1, le=500), + offset: int = Query(0, ge=0), +): + """List harvest runs with optional filtering.""" + store = _get_paper_store() + runs = store.list_harvest_runs(status=status, limit=limit, offset=offset) + + return HarvestRunListResponse( + runs=[ + HarvestRunResponse( + run_id=run.run_id, + keywords=run.get_keywords(), + venues=run.get_venues(), + sources=run.get_sources(), + max_results_per_source=run.max_results_per_source or 50, + status=run.status or "unknown", + papers_found=run.papers_found or 0, + papers_new=run.papers_new or 0, + papers_deduplicated=run.papers_deduplicated or 0, + errors=run.get_errors(), + started_at=run.started_at.isoformat() if run.started_at else None, + ended_at=run.ended_at.isoformat() if run.ended_at else None, + ) + for run in runs + ] + ) + + +@router.get("/harvest/runs/{run_id}", response_model=HarvestRunResponse) +def get_harvest_run(run_id: str): + """Get details of a specific harvest run.""" + store = _get_paper_store() + run = store.get_harvest_run(run_id) + + if not run: + raise HTTPException(status_code=404, detail="Harvest run not found") + + return HarvestRunResponse( + run_id=run.run_id, + keywords=run.get_keywords(), + venues=run.get_venues(), + sources=run.get_sources(), + max_results_per_source=run.max_results_per_source or 50, + status=run.status or "unknown", + papers_found=run.papers_found or 0, + papers_new=run.papers_new or 0, + papers_deduplicated=run.papers_deduplicated or 0, + errors=run.get_errors(), + started_at=run.started_at.isoformat() if run.started_at else None, + ended_at=run.ended_at.isoformat() if run.ended_at else None, + ) + + +# ============================================================================ +# Paper Search Endpoints +# ============================================================================ + + +class PaperSearchRequest(BaseModel): + """Request body for paper search.""" + + query: Optional[str] = Field(None, description="Full-text search query") + keywords: Optional[List[str]] = Field(None, description="Keyword filters") + venues: Optional[List[str]] = Field(None, description="Venue filters") + year_from: Optional[int] = Field(None, ge=1900, le=2100) + year_to: Optional[int] = Field(None, ge=1900, le=2100) + min_citations: Optional[int] = Field(None, ge=0) + sources: Optional[List[str]] = Field(None, description="Source filters") + sort_by: str = Field("citation_count", description="Sort field") + sort_order: str = Field("desc", description="Sort order (asc/desc)") + limit: int = Field(50, ge=1, le=500) + offset: int = Field(0, ge=0) + 
+ +class PaperResponse(BaseModel): + """Single paper response.""" + + id: int + doi: Optional[str] + arxiv_id: Optional[str] + semantic_scholar_id: Optional[str] + openalex_id: Optional[str] + title: str + abstract: str + authors: List[str] + year: Optional[int] + venue: Optional[str] + publication_date: Optional[str] + citation_count: int + url: Optional[str] + pdf_url: Optional[str] + keywords: List[str] + fields_of_study: List[str] + primary_source: str + sources: List[str] + created_at: Optional[str] + updated_at: Optional[str] + + +class PaperSearchResponse(BaseModel): + """Response for paper search.""" + + papers: List[Dict[str, Any]] + total: int + limit: int + offset: int + + +@router.post("/papers/search", response_model=PaperSearchResponse) +def search_papers(request: PaperSearchRequest): + """Search papers with filters and pagination.""" + set_trace_id() # Initialize trace_id for this request + Logger.info(f"Searching papers: query={request.query}", file=LogFiles.HARVEST) + store = _get_paper_store() + + papers, total = store.search_papers( + query=request.query, + keywords=request.keywords, + venues=request.venues, + year_from=request.year_from, + year_to=request.year_to, + min_citations=request.min_citations, + sources=request.sources, + sort_by=request.sort_by, + sort_order=request.sort_order, + limit=request.limit, + offset=request.offset, + ) + + return PaperSearchResponse( + papers=[paper_to_dict(p) for p in papers], + total=total, + limit=request.limit, + offset=request.offset, + ) + + +@router.get("/papers/stats") +def get_paper_stats(): + """Get paper collection statistics.""" + store = _get_paper_store() + return {"total_papers": store.get_paper_count()} + + +# ============================================================================ +# User Library Endpoints +# ============================================================================ + + +class LibraryPaperResponse(BaseModel): + """Paper in user's library.""" + + paper: Dict[str, Any] + saved_at: str + track_id: Optional[int] + action: str + + +class LibraryResponse(BaseModel): + """Response for user library.""" + + papers: List[LibraryPaperResponse] + total: int + limit: int + offset: int + + +@router.get("/papers/library", response_model=LibraryResponse) +def get_user_library( + user_id: str = Query("default", description="User ID"), + track_id: Optional[int] = Query(None, description="Filter by track"), + actions: Optional[str] = Query(None, description="Filter by actions (comma-separated)"), + sort_by: str = Query("saved_at", description="Sort field"), + sort_order: str = Query("desc", description="Sort order"), + limit: int = Query(50, ge=1, le=500), + offset: int = Query(0, ge=0), +): + """Get user's paper library (saved papers).""" + set_trace_id() # Initialize trace_id for this request + Logger.info("Received request to get user library", file=LogFiles.HARVEST) + store = _get_paper_store() + + action_list = None + if actions: + action_list = [a.strip() for a in actions.split(",") if a.strip()] + + Logger.info("Fetching papers from library store", file=LogFiles.HARVEST) + library_papers, total = store.get_user_library( + user_id=user_id, + track_id=track_id, + actions=action_list, + sort_by=sort_by, + sort_order=sort_order, + limit=limit, + offset=offset, + ) + + Logger.info(f"Retrieved {len(library_papers)} papers from library, total={total}", file=LogFiles.HARVEST) + return LibraryResponse( + papers=[ + LibraryPaperResponse( + paper=paper_to_dict(lp.paper), + saved_at=lp.saved_at.isoformat() if 
lp.saved_at else "", + track_id=lp.track_id, + action=lp.action, + ) + for lp in library_papers + ], + total=total, + limit=limit, + offset=offset, + ) + + +# NOTE: Parameterized routes must come AFTER specific routes like /papers/stats and /papers/library +@router.get("/papers/{paper_id}") +def get_paper(paper_id: int): + """Get a paper by ID.""" + store = _get_paper_store() + paper = store.get_paper_by_id(paper_id) + + if not paper: + raise HTTPException(status_code=404, detail="Paper not found") + + return {"paper": paper_to_dict(paper)} + + +class SavePaperRequest(BaseModel): + """Request to save paper to library.""" + + user_id: str = Field("default", description="User ID") + track_id: Optional[int] = Field(None, description="Associated track ID") + + +@router.post("/papers/{paper_id}/save") +def save_paper_to_library(paper_id: int, request: SavePaperRequest): + """ + Save a paper to user's library. + + Uses paper_feedback table with action='save'. + """ + from paperbot.infrastructure.stores.research_store import SqlAlchemyResearchStore + + # Verify paper exists + store = _get_paper_store() + paper = store.get_paper_by_id(paper_id) + if not paper: + raise HTTPException(status_code=404, detail="Paper not found") + + # Use research store to record feedback + research_store = SqlAlchemyResearchStore() + feedback = research_store.record_paper_feedback( + user_id=request.user_id, + paper_id=str(paper_id), + action="save", + track_id=request.track_id, + ) + + return {"success": True, "feedback": feedback} + + +@router.delete("/papers/{paper_id}/save") +def remove_paper_from_library( + paper_id: int, + user_id: str = Query("default", description="User ID"), +): + """Remove a paper from user's library.""" + store = _get_paper_store() + removed = store.remove_from_library(user_id, paper_id) + + if not removed: + raise HTTPException(status_code=404, detail="Paper not in library") + + return {"success": True} diff --git a/src/paperbot/api/routes/research.py b/src/paperbot/api/routes/research.py index abb50f4..2f01503 100644 --- a/src/paperbot/api/routes/research.py +++ b/src/paperbot/api/routes/research.py @@ -11,6 +11,7 @@ from paperbot.context_engine import ContextEngine, ContextEngineConfig from paperbot.context_engine.track_router import TrackRouter +from paperbot.utils.logging_config import Logger, LogFiles, set_trace_id from paperbot.infrastructure.stores.memory_store import SqlAlchemyMemoryStore from paperbot.infrastructure.api_clients.semantic_scholar import SemanticScholarClient from paperbot.infrastructure.stores.research_store import SqlAlchemyResearchStore @@ -624,18 +625,32 @@ class PaperFeedbackRequest(BaseModel): metadata: Dict[str, Any] = {} context_run_id: Optional[int] = None context_rank: Optional[int] = None + # Paper metadata (optional, used when saving to library) + paper_title: Optional[str] = None + paper_abstract: Optional[str] = None + paper_authors: Optional[List[str]] = None + paper_year: Optional[int] = None + paper_venue: Optional[str] = None + paper_citation_count: Optional[int] = None + paper_url: Optional[str] = None class PaperFeedbackResponse(BaseModel): feedback: Dict[str, Any] + library_paper_id: Optional[int] = None # ID in papers table if saved @router.post("/research/papers/feedback", response_model=PaperFeedbackResponse) def add_paper_feedback(req: PaperFeedbackRequest): + set_trace_id() # Initialize trace_id for this request + Logger.info(f"Received paper feedback request, action={req.action}", file=LogFiles.HARVEST) + track_id = req.track_id if track_id 
is None: + Logger.info("No track specified, getting active track", file=LogFiles.HARVEST) active = _research_store.get_active_track(user_id=req.user_id) if not active: + Logger.error("No active track found", file=LogFiles.HARVEST) raise HTTPException(status_code=400, detail="track_id missing and no active track") track_id = int(active["id"]) @@ -645,17 +660,61 @@ def add_paper_feedback(req: PaperFeedbackRequest): if req.context_rank is not None: meta["context_rank"] = int(req.context_rank) + library_paper_id: Optional[int] = None + actual_paper_id = req.paper_id + + # If action is "save" and we have paper metadata, insert into papers table + if req.action == "save" and req.paper_title: + Logger.info("Save action detected, inserting paper into papers table", file=LogFiles.HARVEST) + try: + from paperbot.domain.harvest import HarvestedPaper, HarvestSource + from paperbot.infrastructure.stores.paper_store import PaperStore + + paper_store = PaperStore() + paper = HarvestedPaper( + title=req.paper_title, + source=HarvestSource.SEMANTIC_SCHOLAR, + abstract=req.paper_abstract or "", + authors=req.paper_authors or [], + semantic_scholar_id=req.paper_id, + year=req.paper_year, + venue=req.paper_venue, + citation_count=req.paper_citation_count or 0, + url=req.paper_url, + ) + Logger.info("Calling paper store to upsert paper", file=LogFiles.HARVEST) + new_count, _ = paper_store.upsert_papers_batch([paper]) + + # Get the paper ID from database + from paperbot.infrastructure.stores.models import PaperModel + from sqlalchemy import select + with paper_store._provider.session() as session: + result = session.execute( + select(PaperModel).where( + PaperModel.semantic_scholar_id == req.paper_id + ) + ).scalar_one_or_none() + if result: + library_paper_id = result.id + actual_paper_id = str(result.id) # Use integer ID for feedback + Logger.info(f"Paper saved to library with id={library_paper_id}", file=LogFiles.HARVEST) + except Exception as e: + Logger.warning(f"Failed to save paper to library: {e}", file=LogFiles.HARVEST) + + Logger.info("Recording paper feedback to research store", file=LogFiles.HARVEST) fb = _research_store.add_paper_feedback( user_id=req.user_id, track_id=track_id, - paper_id=req.paper_id, + paper_id=actual_paper_id, action=req.action, weight=req.weight, metadata=meta, ) if not fb: + Logger.error("Failed to record feedback - track not found", file=LogFiles.HARVEST) raise HTTPException(status_code=404, detail="Track not found") - return PaperFeedbackResponse(feedback=fb) + Logger.info("Paper feedback recorded successfully", file=LogFiles.HARVEST) + return PaperFeedbackResponse(feedback=fb, library_paper_id=library_paper_id) class PaperFeedbackListResponse(BaseModel): @@ -769,13 +828,19 @@ class ContextResponse(BaseModel): @router.post("/research/context", response_model=ContextResponse) async def build_context(req: ContextRequest): + set_trace_id() # Initialize trace_id for this request + Logger.info("Received build context request", file=LogFiles.HARVEST) + if req.activate_track_id is not None: + Logger.info("Activating research track", file=LogFiles.HARVEST) activated = _research_store.activate_track( user_id=req.user_id, track_id=req.activate_track_id ) if not activated: + Logger.error("Research track not found", file=LogFiles.HARVEST) raise HTTPException(status_code=404, detail="Track not found") + Logger.info("Initializing context engine", file=LogFiles.HARVEST) engine = ContextEngine( research_store=_research_store, memory_store=_memory_store, @@ -794,12 +859,15 @@ async def 
build_context(req: ContextRequest): ), ) try: + Logger.info("Building context pack with paper recommendations", file=LogFiles.HARVEST) pack = await engine.build_context_pack( user_id=req.user_id, query=req.query, track_id=req.track_id, include_cross_track=req.include_cross_track, ) + paper_count = len(pack.get("paper_recommendations", [])) + Logger.info(f"Context pack built successfully, found {paper_count} papers", file=LogFiles.HARVEST) return ContextResponse(context_pack=pack) finally: await engine.close() diff --git a/src/paperbot/application/ports/harvester_port.py b/src/paperbot/application/ports/harvester_port.py new file mode 100644 index 0000000..5716c45 --- /dev/null +++ b/src/paperbot/application/ports/harvester_port.py @@ -0,0 +1,50 @@ +# src/paperbot/application/ports/harvester_port.py +""" +Harvester port interface. + +Defines the abstract interface for all paper harvesters. +""" + +from __future__ import annotations + +from typing import List, Optional, Protocol, runtime_checkable + +from paperbot.domain.harvest import HarvestResult, HarvestSource + + +@runtime_checkable +class HarvesterPort(Protocol): + """Abstract interface for all paper harvesters.""" + + @property + def source(self) -> HarvestSource: + """Return the harvest source identifier.""" + ... + + async def search( + self, + query: str, + *, + max_results: int = 100, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + venues: Optional[List[str]] = None, + ) -> HarvestResult: + """ + Search for papers matching the query. + + Args: + query: Search query string + max_results: Maximum number of results to return + year_from: Filter papers published on or after this year + year_to: Filter papers published on or before this year + venues: Filter papers from these venues (if supported by source) + + Returns: + HarvestResult with papers and metadata + """ + ... + + async def close(self) -> None: + """Release resources (HTTP sessions, etc.).""" + ... diff --git a/src/paperbot/application/services/__init__.py b/src/paperbot/application/services/__init__.py index 423829e..a319fd2 100644 --- a/src/paperbot/application/services/__init__.py +++ b/src/paperbot/application/services/__init__.py @@ -1,3 +1,12 @@ from paperbot.application.services.llm_service import LLMService, get_llm_service +from paperbot.application.services.paper_deduplicator import PaperDeduplicator +from paperbot.application.services.query_rewriter import QueryRewriter +from paperbot.application.services.venue_recommender import VenueRecommender -__all__ = ["LLMService", "get_llm_service"] +__all__ = [ + "LLMService", + "get_llm_service", + "PaperDeduplicator", + "QueryRewriter", + "VenueRecommender", +] diff --git a/src/paperbot/application/services/paper_deduplicator.py b/src/paperbot/application/services/paper_deduplicator.py new file mode 100644 index 0000000..954fa64 --- /dev/null +++ b/src/paperbot/application/services/paper_deduplicator.py @@ -0,0 +1,190 @@ +# src/paperbot/application/services/paper_deduplicator.py +""" +Paper deduplication service. + +Multi-strategy deduplication for papers from multiple sources. +""" + +from __future__ import annotations + +import logging +from typing import Dict, List, Optional, Tuple + +from paperbot.domain.harvest import HarvestedPaper + +logger = logging.getLogger(__name__) + + +class PaperDeduplicator: + """ + Multi-strategy paper deduplication. + + Priority order: + 1. DOI (most reliable) + 2. arXiv ID + 3. Semantic Scholar ID + 4. OpenAlex ID + 5. 
Normalized title hash (fallback) + + When duplicates are found, metadata is merged to preserve + the most complete information from all sources. + """ + + def __init__(self): + self._doi_index: Dict[str, int] = {} + self._arxiv_index: Dict[str, int] = {} + self._s2_index: Dict[str, int] = {} + self._openalex_index: Dict[str, int] = {} + self._title_hash_index: Dict[str, int] = {} + + def reset(self) -> None: + """Clear all indexes for a fresh deduplication run.""" + self._doi_index.clear() + self._arxiv_index.clear() + self._s2_index.clear() + self._openalex_index.clear() + self._title_hash_index.clear() + + def deduplicate( + self, + papers: List[HarvestedPaper], + ) -> Tuple[List[HarvestedPaper], int]: + """ + Deduplicate papers in-memory. + + Args: + papers: List of papers from all sources + + Returns: + Tuple of (deduplicated papers, count of duplicates removed) + """ + self.reset() + unique_papers: List[HarvestedPaper] = [] + duplicates_count = 0 + + for paper in papers: + existing_idx = self._find_duplicate(paper) + + if existing_idx is not None: + # Merge metadata into existing paper + self._merge_paper(unique_papers[existing_idx], paper) + duplicates_count += 1 + else: + # Add new paper + idx = len(unique_papers) + self._index_paper(paper, idx) + unique_papers.append(paper) + + logger.info( + f"Deduplication complete: {len(papers)} → {len(unique_papers)} " + f"({duplicates_count} duplicates removed)" + ) + return unique_papers, duplicates_count + + def _find_duplicate(self, paper: HarvestedPaper) -> Optional[int]: + """Find existing paper index if duplicate exists.""" + # 1. DOI match (most reliable) + if paper.doi: + doi_lower = paper.doi.lower().strip() + if doi_lower in self._doi_index: + return self._doi_index[doi_lower] + + # 2. arXiv ID match + if paper.arxiv_id: + arxiv_lower = paper.arxiv_id.lower().strip() + if arxiv_lower in self._arxiv_index: + return self._arxiv_index[arxiv_lower] + + # 3. Semantic Scholar ID match + if paper.semantic_scholar_id: + s2_lower = paper.semantic_scholar_id.lower().strip() + if s2_lower in self._s2_index: + return self._s2_index[s2_lower] + + # 4. OpenAlex ID match + if paper.openalex_id: + openalex_lower = paper.openalex_id.lower().strip() + if openalex_lower in self._openalex_index: + return self._openalex_index[openalex_lower] + + # 5. Title hash match (fallback) + title_hash = paper.compute_title_hash() + if title_hash in self._title_hash_index: + return self._title_hash_index[title_hash] + + return None + + def _index_paper(self, paper: HarvestedPaper, idx: int) -> None: + """Add paper to all relevant indexes.""" + if paper.doi: + self._doi_index[paper.doi.lower().strip()] = idx + if paper.arxiv_id: + self._arxiv_index[paper.arxiv_id.lower().strip()] = idx + if paper.semantic_scholar_id: + self._s2_index[paper.semantic_scholar_id.lower().strip()] = idx + if paper.openalex_id: + self._openalex_index[paper.openalex_id.lower().strip()] = idx + + title_hash = paper.compute_title_hash() + self._title_hash_index[title_hash] = idx + + def _merge_paper(self, existing: HarvestedPaper, new: HarvestedPaper) -> None: + """ + Merge metadata from new paper into existing. 
+ + Strategy: + - Fill in missing identifiers + - Prefer longer/more complete text fields + - Prefer higher citation counts + - Merge lists (keywords, fields of study) + """ + # Fill in missing identifiers + if not existing.doi and new.doi: + existing.doi = new.doi + self._doi_index[new.doi.lower().strip()] = self._find_index(existing) + if not existing.arxiv_id and new.arxiv_id: + existing.arxiv_id = new.arxiv_id + self._arxiv_index[new.arxiv_id.lower().strip()] = self._find_index(existing) + if not existing.semantic_scholar_id and new.semantic_scholar_id: + existing.semantic_scholar_id = new.semantic_scholar_id + self._s2_index[new.semantic_scholar_id.lower().strip()] = self._find_index(existing) + if not existing.openalex_id and new.openalex_id: + existing.openalex_id = new.openalex_id + self._openalex_index[new.openalex_id.lower().strip()] = self._find_index(existing) + + # Prefer longer abstract + if len(new.abstract) > len(existing.abstract): + existing.abstract = new.abstract + + # Prefer higher citation count + if new.citation_count > existing.citation_count: + existing.citation_count = new.citation_count + + # Fill in missing metadata + if not existing.year and new.year: + existing.year = new.year + if not existing.venue and new.venue: + existing.venue = new.venue + if not existing.publication_date and new.publication_date: + existing.publication_date = new.publication_date + if not existing.url and new.url: + existing.url = new.url + if not existing.pdf_url and new.pdf_url: + existing.pdf_url = new.pdf_url + + # Prefer more complete author list + if len(new.authors) > len(existing.authors): + existing.authors = new.authors + + # Merge keywords and fields (deduplicate) + existing.keywords = list(set(existing.keywords + new.keywords)) + existing.fields_of_study = list(set(existing.fields_of_study + new.fields_of_study)) + + def _find_index(self, paper: HarvestedPaper) -> int: + """Find the index of a paper in the title hash index.""" + title_hash = paper.compute_title_hash() + return self._title_hash_index.get(title_hash, -1) + + def is_duplicate(self, paper: HarvestedPaper) -> bool: + """Check if a paper would be considered a duplicate.""" + return self._find_duplicate(paper) is not None diff --git a/src/paperbot/application/services/query_rewriter.py b/src/paperbot/application/services/query_rewriter.py new file mode 100644 index 0000000..c87306f --- /dev/null +++ b/src/paperbot/application/services/query_rewriter.py @@ -0,0 +1,151 @@ +# src/paperbot/application/services/query_rewriter.py +""" +Query rewriting service. + +Expands and rewrites search queries for better coverage. +""" + +from __future__ import annotations + +import logging +import re +from typing import Dict, List, Optional + +logger = logging.getLogger(__name__) + + +class QueryRewriter: + """ + Expand and rewrite queries for better search coverage. 
+ + Handles: + - Abbreviation expansion (LLM → large language model) + - Synonym addition + - Query normalization + """ + + # Abbreviation → full form mappings + DEFAULT_ABBREVIATIONS: Dict[str, str] = { + "llm": "large language model", + "llms": "large language models", + "ml": "machine learning", + "dl": "deep learning", + "nlp": "natural language processing", + "cv": "computer vision", + "rl": "reinforcement learning", + "gan": "generative adversarial network", + "gans": "generative adversarial networks", + "cnn": "convolutional neural network", + "cnns": "convolutional neural networks", + "rnn": "recurrent neural network", + "rnns": "recurrent neural networks", + "lstm": "long short-term memory", + "bert": "bidirectional encoder representations from transformers", + "gpt": "generative pre-trained transformer", + "rag": "retrieval augmented generation", + "vae": "variational autoencoder", + "asr": "automatic speech recognition", + "tts": "text to speech", + "ocr": "optical character recognition", + "sql": "structured query language", + "api": "application programming interface", + "ai": "artificial intelligence", + "nn": "neural network", + "dnn": "deep neural network", + "mlp": "multilayer perceptron", + "svm": "support vector machine", + "knn": "k-nearest neighbors", + "pca": "principal component analysis", + "ssl": "self-supervised learning", + "ner": "named entity recognition", + "qa": "question answering", + "ir": "information retrieval", + "kg": "knowledge graph", + "gcn": "graph convolutional network", + "gnn": "graph neural network", + "vit": "vision transformer", + "clip": "contrastive language-image pre-training", + } + + def __init__(self, abbreviations: Optional[Dict[str, str]] = None): + self.abbreviations = {**self.DEFAULT_ABBREVIATIONS} + if abbreviations: + self.abbreviations.update(abbreviations) + + def rewrite(self, query: str) -> List[str]: + """ + Rewrite query to produce expanded variations. + + Args: + query: Original search query + + Returns: + List of query variations (original + expanded) + """ + queries = [query] + + # Tokenize and expand abbreviations + words = query.lower().split() + expanded_words = [] + has_expansion = False + + for word in words: + # Remove punctuation for matching + clean_word = re.sub(r"[^\w]", "", word) + + if clean_word in self.abbreviations: + expanded_words.append(self.abbreviations[clean_word]) + has_expansion = True + else: + expanded_words.append(word) + + if has_expansion: + expanded_query = " ".join(expanded_words) + if expanded_query != query.lower(): + queries.append(expanded_query) + + logger.debug(f"Query rewrite: '{query}' → {queries}") + return queries + + def expand_all(self, keywords: List[str]) -> List[str]: + """ + Expand all keywords, returning unique expanded terms. + + Args: + keywords: List of search keywords + + Returns: + List of unique expanded keywords + """ + expanded: List[str] = [] + seen: set[str] = set() + + for keyword in keywords: + for variation in self.rewrite(keyword): + normalized = self.normalize(variation) + if normalized and normalized not in seen: + seen.add(normalized) + expanded.append(variation) + + return expanded + + def normalize(self, query: str) -> str: + """ + Normalize query for consistent matching. 
+ + - Lowercase + - Remove extra whitespace + - Remove special characters (except alphanumeric and space) + """ + normalized = query.lower() + normalized = re.sub(r"[^\w\s]", " ", normalized) + normalized = re.sub(r"\s+", " ", normalized).strip() + return normalized + + def add_abbreviation(self, abbrev: str, expansion: str) -> None: + """Add or update an abbreviation mapping.""" + self.abbreviations[abbrev.lower()] = expansion.lower() + + def get_expansion(self, abbrev: str) -> Optional[str]: + """Get the expansion for an abbreviation, if any.""" + return self.abbreviations.get(abbrev.lower()) diff --git a/src/paperbot/application/services/venue_recommender.py b/src/paperbot/application/services/venue_recommender.py new file mode 100644 index 0000000..bc82dab --- /dev/null +++ b/src/paperbot/application/services/venue_recommender.py @@ -0,0 +1,157 @@ +# src/paperbot/application/services/venue_recommender.py +""" +Venue recommendation service. + +Recommends relevant academic venues based on search keywords. +""" + +from __future__ import annotations + +import logging +from typing import Dict, List, Optional + +logger = logging.getLogger(__name__) + + +class VenueRecommender: + """ + Recommend relevant venues based on keywords. + + Uses a static mapping from keywords/domains to top venues. + Configuration can be loaded from config file or use defaults. + """ + + # Default keyword→venue mappings + DEFAULT_MAPPINGS: Dict[str, List[str]] = { + # Security + "security": ["CCS", "S&P", "USENIX Security", "NDSS"], + "ransomware": ["CCS", "S&P", "USENIX Security", "NDSS"], + "malware": ["CCS", "S&P", "USENIX Security", "NDSS"], + "cryptography": ["CRYPTO", "EUROCRYPT", "CCS"], + "privacy": ["S&P", "PETS", "CCS", "USENIX Security"], + "vulnerability": ["CCS", "S&P", "USENIX Security", "NDSS"], + "attack": ["CCS", "S&P", "USENIX Security", "NDSS"], + "adversarial": ["CCS", "S&P", "NeurIPS", "ICML"], + # ML/AI + "machine learning": ["NeurIPS", "ICML", "ICLR"], + "deep learning": ["NeurIPS", "ICML", "ICLR", "CVPR"], + "llm": ["NeurIPS", "ICML", "ACL", "EMNLP"], + "large language model": ["NeurIPS", "ICML", "ACL", "EMNLP"], + "transformer": ["NeurIPS", "ICML", "ACL", "EMNLP"], + "gpt": ["NeurIPS", "ICML", "ACL", "EMNLP"], + "nlp": ["ACL", "EMNLP", "NAACL", "NeurIPS"], + "natural language": ["ACL", "EMNLP", "NAACL"], + "computer vision": ["CVPR", "ICCV", "ECCV", "NeurIPS"], + "image": ["CVPR", "ICCV", "ECCV"], + "neural network": ["NeurIPS", "ICML", "ICLR"], + "reinforcement learning": ["NeurIPS", "ICML", "ICLR"], + "generative": ["NeurIPS", "ICML", "ICLR", "CVPR"], + "diffusion": ["NeurIPS", "ICML", "ICLR", "CVPR"], + # Systems + "database": ["SIGMOD", "VLDB", "ICDE"], + "query": ["SIGMOD", "VLDB", "ICDE"], + "sql": ["SIGMOD", "VLDB", "ICDE"], + "systems": ["OSDI", "SOSP", "EuroSys", "ATC"], + "operating system": ["OSDI", "SOSP", "EuroSys"], + "distributed": ["OSDI", "SOSP", "EuroSys", "NSDI"], + "networking": ["SIGCOMM", "NSDI", "MobiCom"], + "network": ["SIGCOMM", "NSDI", "MobiCom"], + "cloud": ["OSDI", "SOSP", "EuroSys", "SoCC"], + # Software Engineering + "software engineering": ["ICSE", "FSE", "ASE"], + "software": ["ICSE", "FSE", "ASE"], + "testing": ["ICSE", "ISSTA", "FSE"], + "bug": ["ICSE", "FSE", "ASE", "ISSTA"], + "program analysis": ["PLDI", "POPL", "OOPSLA"], + "compiler": ["PLDI", "CGO", "CC"], + "verification": ["CAV", "PLDI", "POPL"], + # HCI + "hci": ["CHI", "UIST", "UbiComp"], + "human computer": ["CHI", "UIST", "UbiComp"], + "interaction": ["CHI", "UIST"], + "user interface": ["CHI", 
"UIST"], + # Data Mining + "data mining": ["KDD", "ICDM", "WWW"], + "knowledge graph": ["KDD", "WWW", "EMNLP"], + "recommendation": ["KDD", "RecSys", "WWW"], + # Robotics + "robotics": ["ICRA", "IROS", "RSS"], + "robot": ["ICRA", "IROS", "RSS"], + "autonomous": ["ICRA", "IROS", "CVPR"], + } + + def __init__( + self, + config_path: Optional[str] = None, + mappings: Optional[Dict[str, List[str]]] = None, + ): + self.mappings = self.DEFAULT_MAPPINGS.copy() + if mappings: + self.mappings.update(mappings) + if config_path: + self._load_config(config_path) + + def _load_config(self, config_path: str) -> None: + """Load venue mappings from YAML config file.""" + try: + import yaml + + with open(config_path, "r", encoding="utf-8") as f: + config = yaml.safe_load(f) + + if config and isinstance(config, dict): + venue_mappings = config.get("venue_mappings", {}) + if isinstance(venue_mappings, dict): + self.mappings.update(venue_mappings) + logger.info(f"Loaded {len(venue_mappings)} venue mappings from {config_path}") + except Exception as e: + logger.warning(f"Failed to load venue config from {config_path}: {e}") + + def recommend( + self, + keywords: List[str], + *, + max_venues: int = 5, + ) -> List[str]: + """ + Recommend venues based on keywords. + + Args: + keywords: List of search keywords + max_venues: Maximum number of venues to recommend + + Returns: + List of recommended venue names, ordered by relevance + """ + venue_scores: Dict[str, int] = {} + + for keyword in keywords: + keyword_lower = keyword.lower().strip() + if not keyword_lower: + continue + + # Exact match (highest priority) + if keyword_lower in self.mappings: + for venue in self.mappings[keyword_lower]: + venue_scores[venue] = venue_scores.get(venue, 0) + 3 + + # Partial match (medium priority) + for mapped_kw, venues in self.mappings.items(): + if keyword_lower in mapped_kw or mapped_kw in keyword_lower: + for venue in venues: + venue_scores[venue] = venue_scores.get(venue, 0) + 1 + + # Sort by score descending + sorted_venues = sorted(venue_scores.items(), key=lambda x: -x[1]) + result = [v[0] for v in sorted_venues[:max_venues]] + + logger.debug(f"Recommended venues for {keywords}: {result}") + return result + + def get_venues_for_domain(self, domain: str) -> List[str]: + """Get venues for a specific domain keyword.""" + return self.mappings.get(domain.lower(), []) + + def add_mapping(self, keyword: str, venues: List[str]) -> None: + """Add or update a keyword→venues mapping.""" + self.mappings[keyword.lower()] = venues diff --git a/src/paperbot/application/workflows/harvest_pipeline.py b/src/paperbot/application/workflows/harvest_pipeline.py new file mode 100644 index 0000000..983b8ad --- /dev/null +++ b/src/paperbot/application/workflows/harvest_pipeline.py @@ -0,0 +1,376 @@ +# src/paperbot/application/workflows/harvest_pipeline.py +""" +Paper Harvest Pipeline. + +Orchestrates multi-source paper harvesting with deduplication and storage. 
+""" + +from __future__ import annotations + +import asyncio +import logging +import uuid +from dataclasses import dataclass, field +from datetime import datetime, timezone +from typing import Any, AsyncGenerator, Dict, List, Optional + +from paperbot.domain.harvest import ( + HarvestedPaper, + HarvestResult, + HarvestRunResult, + HarvestSource, +) +from paperbot.application.services import ( + PaperDeduplicator, + QueryRewriter, + VenueRecommender, +) +from paperbot.application.ports.harvester_port import HarvesterPort +from paperbot.infrastructure.harvesters import ( + ArxivHarvester, + SemanticScholarHarvester, + OpenAlexHarvester, +) +from paperbot.infrastructure.stores.paper_store import PaperStore + +logger = logging.getLogger(__name__) + + +def _utcnow() -> datetime: + return datetime.now(timezone.utc) + + +@dataclass +class HarvestProgress: + """Progress update during harvesting.""" + + phase: str + message: str + details: Optional[Dict[str, Any]] = None + + +@dataclass +class HarvestConfig: + """Configuration for a harvest run.""" + + keywords: List[str] + venues: Optional[List[str]] = None + year_from: Optional[int] = None + year_to: Optional[int] = None + sources: Optional[List[str]] = None + max_results_per_source: int = 50 + expand_keywords: bool = True + recommend_venues: bool = True + + +@dataclass +class HarvestFinalResult: + """Final result of a harvest run.""" + + run_id: str + status: str # success, partial, failed + papers_found: int + papers_new: int + papers_deduplicated: int + source_results: Dict[str, Dict[str, Any]] + errors: Dict[str, str] + duration_seconds: float + + +class HarvestPipeline: + """ + Multi-source paper harvest pipeline. + + Orchestrates: + 1. Query expansion (QueryRewriter) + 2. Venue recommendation (VenueRecommender) + 3. Parallel harvesting from multiple sources + 4. In-memory deduplication (PaperDeduplicator) + 5. 
Batch storage with DB-level dedup (PaperStore) + """ + + def __init__( + self, + db_url: Optional[str] = None, + *, + venue_config_path: Optional[str] = None, + ): + self.db_url = db_url + self._venue_config_path = venue_config_path + + # Services (initialized lazily) + self._query_rewriter: Optional[QueryRewriter] = None + self._venue_recommender: Optional[VenueRecommender] = None + self._deduplicator: Optional[PaperDeduplicator] = None + self._paper_store: Optional[PaperStore] = None + + # Harvesters (initialized per-run) + self._harvesters: Dict[str, HarvesterPort] = {} + + @property + def query_rewriter(self) -> QueryRewriter: + if self._query_rewriter is None: + self._query_rewriter = QueryRewriter() + return self._query_rewriter + + @property + def venue_recommender(self) -> VenueRecommender: + if self._venue_recommender is None: + self._venue_recommender = VenueRecommender( + config_path=self._venue_config_path + ) + return self._venue_recommender + + @property + def deduplicator(self) -> PaperDeduplicator: + if self._deduplicator is None: + self._deduplicator = PaperDeduplicator() + return self._deduplicator + + @property + def paper_store(self) -> PaperStore: + if self._paper_store is None: + self._paper_store = PaperStore(self.db_url) + return self._paper_store + + def _get_harvester(self, source: str) -> Optional[HarvesterPort]: + """Get or create harvester for a source.""" + if source not in self._harvesters: + if source == HarvestSource.ARXIV.value: + self._harvesters[source] = ArxivHarvester() + elif source == HarvestSource.SEMANTIC_SCHOLAR.value: + self._harvesters[source] = SemanticScholarHarvester() + elif source == HarvestSource.OPENALEX.value: + self._harvesters[source] = OpenAlexHarvester() + else: + logger.warning(f"Unknown source: {source}") + return None + return self._harvesters[source] + + @staticmethod + def new_run_id() -> str: + """Generate a new harvest run ID.""" + timestamp = _utcnow().strftime("%Y%m%d-%H%M%S") + suffix = uuid.uuid4().hex[:8] + return f"harvest-{timestamp}-{suffix}" + + async def run( + self, + config: HarvestConfig, + *, + run_id: Optional[str] = None, + ) -> AsyncGenerator[HarvestProgress | HarvestFinalResult, None]: + """ + Execute harvest pipeline with progress updates. 
+ + Yields: + HarvestProgress for intermediate updates + HarvestFinalResult as final yield + """ + run_id = run_id or self.new_run_id() + start_time = _utcnow() + errors: Dict[str, str] = {} + source_results: Dict[str, Dict[str, Any]] = {} + + # Determine sources to use + sources = config.sources or [s.value for s in HarvestSource] + + try: + # Phase 1: Expand keywords + yield HarvestProgress( + phase="Expanding", + message="Expanding keywords...", + ) + + expanded_keywords = config.keywords.copy() + if config.expand_keywords: + expanded_keywords = self.query_rewriter.expand_all(config.keywords) + logger.info(f"Expanded keywords: {config.keywords} → {expanded_keywords}") + + # Phase 2: Recommend venues (if not specified) + venues = config.venues + if config.recommend_venues and not venues: + yield HarvestProgress( + phase="Recommending", + message="Recommending venues...", + ) + venues = self.venue_recommender.recommend( + expanded_keywords, max_venues=5 + ) + logger.info(f"Recommended venues: {venues}") + + # Phase 3: Create harvest run record + yield HarvestProgress( + phase="Initializing", + message="Creating harvest run record...", + ) + + self.paper_store.create_harvest_run( + run_id=run_id, + keywords=expanded_keywords, + venues=venues or [], + sources=sources, + max_results_per_source=config.max_results_per_source, + ) + + # Phase 4: Harvest from each source in parallel + all_papers: List[HarvestedPaper] = [] + + # Build search query from expanded keywords + search_query = " ".join(expanded_keywords) + + # Harvest from each source + for source in sources: + yield HarvestProgress( + phase="Harvesting", + message=f"Fetching from {source}...", + details={"source": source}, + ) + + harvester = self._get_harvester(source) + if harvester is None: + errors[source] = f"Unknown source: {source}" + source_results[source] = {"papers": 0, "error": errors[source]} + continue + + try: + result = await harvester.search( + query=search_query, + max_results=config.max_results_per_source, + year_from=config.year_from, + year_to=config.year_to, + venues=venues, + ) + + all_papers.extend(result.papers) + source_results[source] = { + "papers": result.total_found, + "error": result.error, + } + + if result.error: + errors[source] = result.error + logger.warning(f"Error from {source}: {result.error}") + else: + logger.info(f"Harvested {result.total_found} papers from {source}") + + except Exception as e: + error_msg = str(e) + errors[source] = error_msg + source_results[source] = {"papers": 0, "error": error_msg} + logger.exception(f"Exception harvesting from {source}") + + # Phase 5: Deduplicate + yield HarvestProgress( + phase="Deduplicating", + message=f"Removing duplicates from {len(all_papers)} papers...", + ) + + unique_papers, deduplicated_count = self.deduplicator.deduplicate(all_papers) + logger.info( + f"Deduplication: {len(all_papers)} → {len(unique_papers)} " + f"({deduplicated_count} removed)" + ) + + # Phase 6: Store papers + yield HarvestProgress( + phase="Storing", + message=f"Saving {len(unique_papers)} papers to database...", + ) + + new_count, updated_count = self.paper_store.upsert_papers_batch(unique_papers) + logger.info(f"Stored papers: {new_count} new, {updated_count} updated") + + # Phase 7: Update harvest run record + status = "success" + if errors: + status = "partial" if unique_papers else "failed" + + self.paper_store.update_harvest_run( + run_id=run_id, + status=status, + papers_found=len(all_papers), + papers_new=new_count, + papers_deduplicated=deduplicated_count, + 
errors=errors if errors else None, + ) + + # Calculate duration + end_time = _utcnow() + duration = (end_time - start_time).total_seconds() + + # Yield final result + yield HarvestFinalResult( + run_id=run_id, + status=status, + papers_found=len(all_papers), + papers_new=new_count, + papers_deduplicated=deduplicated_count, + source_results=source_results, + errors=errors, + duration_seconds=duration, + ) + + except Exception as e: + # Handle pipeline-level errors + logger.exception(f"Harvest pipeline failed: {e}") + self.paper_store.update_harvest_run( + run_id=run_id, + status="failed", + errors={"pipeline": str(e)}, + ) + + end_time = _utcnow() + duration = (end_time - start_time).total_seconds() + + yield HarvestFinalResult( + run_id=run_id, + status="failed", + papers_found=0, + papers_new=0, + papers_deduplicated=0, + source_results=source_results, + errors={"pipeline": str(e), **errors}, + duration_seconds=duration, + ) + + async def run_sync( + self, + config: HarvestConfig, + *, + run_id: Optional[str] = None, + ) -> HarvestFinalResult: + """ + Execute harvest pipeline and return only final result. + + Useful for CLI or non-streaming use cases. + """ + result: Optional[HarvestFinalResult] = None + async for item in self.run(config, run_id=run_id): + if isinstance(item, HarvestFinalResult): + result = item + + if result is None: + raise RuntimeError("Pipeline completed without final result") + return result + + async def close(self) -> None: + """Release all resources.""" + # Close harvesters + for harvester in self._harvesters.values(): + try: + await harvester.close() + except Exception: + pass + self._harvesters.clear() + + # Close paper store + if self._paper_store: + self._paper_store.close() + self._paper_store = None + + async def __aenter__(self) -> "HarvestPipeline": + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: + await self.close() diff --git a/src/paperbot/context_engine/engine.py b/src/paperbot/context_engine/engine.py index 9daa3ab..6d3004c 100644 --- a/src/paperbot/context_engine/engine.py +++ b/src/paperbot/context_engine/engine.py @@ -12,6 +12,7 @@ from paperbot.domain.paper import PaperMeta from paperbot.infrastructure.stores.memory_store import SqlAlchemyMemoryStore from paperbot.infrastructure.stores.research_store import SqlAlchemyResearchStore +from paperbot.utils.logging_config import Logger, LogFiles _TOKEN_RX = re.compile(r"[a-zA-Z0-9_+.-]+") @@ -502,6 +503,7 @@ async def build_context_pack( "rebuttal": (0.50, 0.40, 0.10), }.get(stage, (0.55, 0.30, 0.15)) + Logger.info(f"Paper search config: offline={self.config.offline}, paper_limit={self.config.paper_limit}", file=LogFiles.HARVEST) if not self.config.offline and self.config.paper_limit > 0: try: searcher = self.paper_searcher @@ -509,9 +511,12 @@ async def build_context_pack( from paperbot.utils.search import SemanticScholarSearch # local import searcher = SemanticScholarSearch() + Logger.info("Initialized SemanticScholarSearch", file=LogFiles.HARVEST) fetch_limit = max(30, int(self.config.paper_limit) * 3) + Logger.info(f"Searching papers with query='{merged_query}', limit={fetch_limit}", file=LogFiles.HARVEST) resp = await asyncio.to_thread(searcher.search_papers, merged_query, fetch_limit) + Logger.info(f"Search returned {len(getattr(resp, 'papers', []) or [])} papers", file=LogFiles.HARVEST) raw: List[Dict[str, Any]] = [] for p in getattr(resp, "papers", []) or []: @@ -578,7 +583,10 @@ async def build_context_pack( policy=policy, 
seed=f"{user_id}:{merged_query}:{stage}:{routed_track.get('id') if routed_track else ''}", ) - except Exception: + except Exception as e: + import traceback + tb = traceback.format_exc() + Logger.error(f"Error fetching papers: {e}\n{tb}", file=LogFiles.HARVEST) papers = [] routing = { diff --git a/src/paperbot/domain/harvest.py b/src/paperbot/domain/harvest.py new file mode 100644 index 0000000..64230ab --- /dev/null +++ b/src/paperbot/domain/harvest.py @@ -0,0 +1,160 @@ +# src/paperbot/domain/harvest.py +""" +Paper harvesting domain models. + +Contains data structures for paper collection from multiple sources: +- HarvestedPaper: Unified paper format from any source +- HarvestSource: Enum of supported paper sources +- HarvestResult: Result from a single harvester +- HarvestRunResult: Aggregated result from all harvesters +""" + +from __future__ import annotations + +import hashlib +import re +from dataclasses import dataclass, field +from datetime import datetime +from enum import Enum +from typing import Any, Dict, List, Optional + + +class HarvestSource(str, Enum): + """Supported paper data sources.""" + + ARXIV = "arxiv" + SEMANTIC_SCHOLAR = "semantic_scholar" + OPENALEX = "openalex" + + +@dataclass +class HarvestedPaper: + """ + Unified paper format from any harvest source. + + Required fields: title, source + All other fields are optional to handle varying API responses. + """ + + title: str + source: HarvestSource + abstract: str = "" + authors: List[str] = field(default_factory=list) + doi: Optional[str] = None + arxiv_id: Optional[str] = None + semantic_scholar_id: Optional[str] = None + openalex_id: Optional[str] = None + year: Optional[int] = None + venue: Optional[str] = None + publication_date: Optional[str] = None + citation_count: int = 0 + url: Optional[str] = None + pdf_url: Optional[str] = None + keywords: List[str] = field(default_factory=list) + fields_of_study: List[str] = field(default_factory=list) + source_rank: Optional[int] = None + + def compute_title_hash(self) -> str: + """Compute normalized title hash for deduplication.""" + normalized = self.title.lower() + normalized = re.sub(r"[^\w\s]", "", normalized) + normalized = re.sub(r"\s+", " ", normalized).strip() + return hashlib.sha256(normalized.encode()).hexdigest() + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary representation.""" + return { + "title": self.title, + "source": self.source.value, + "abstract": self.abstract, + "authors": self.authors, + "doi": self.doi, + "arxiv_id": self.arxiv_id, + "semantic_scholar_id": self.semantic_scholar_id, + "openalex_id": self.openalex_id, + "year": self.year, + "venue": self.venue, + "publication_date": self.publication_date, + "citation_count": self.citation_count, + "url": self.url, + "pdf_url": self.pdf_url, + "keywords": self.keywords, + "fields_of_study": self.fields_of_study, + "source_rank": self.source_rank, + "title_hash": self.compute_title_hash(), + } + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "HarvestedPaper": + """Create instance from dictionary.""" + source = data.get("source", "") + if isinstance(source, str): + source = HarvestSource(source) + return cls( + title=data.get("title", ""), + source=source, + abstract=data.get("abstract", ""), + authors=data.get("authors", []), + doi=data.get("doi"), + arxiv_id=data.get("arxiv_id"), + semantic_scholar_id=data.get("semantic_scholar_id"), + openalex_id=data.get("openalex_id"), + year=data.get("year"), + venue=data.get("venue"), + 
publication_date=data.get("publication_date"), + citation_count=data.get("citation_count", 0), + url=data.get("url"), + pdf_url=data.get("pdf_url"), + keywords=data.get("keywords", []), + fields_of_study=data.get("fields_of_study", []), + source_rank=data.get("source_rank"), + ) + + +@dataclass +class HarvestResult: + """Result from a single harvester.""" + + source: HarvestSource + papers: List[HarvestedPaper] + total_found: int + error: Optional[str] = None + + @property + def success(self) -> bool: + """Whether the harvest was successful.""" + return self.error is None + + +@dataclass +class HarvestRunResult: + """Aggregated result from all harvesters in a harvest run.""" + + run_id: str + status: str # running/success/partial/failed + papers_found: int + papers_new: int + papers_deduplicated: int + source_results: Dict[HarvestSource, HarvestResult] + started_at: datetime + ended_at: Optional[datetime] = None + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary representation.""" + return { + "run_id": self.run_id, + "status": self.status, + "papers_found": self.papers_found, + "papers_new": self.papers_new, + "papers_deduplicated": self.papers_deduplicated, + "sources": { + source.value: { + "papers": len(result.papers), + "total_found": result.total_found, + "error": result.error, + } + for source, result in self.source_results.items() + }, + "started_at": self.started_at.isoformat() if self.started_at else None, + "ended_at": self.ended_at.isoformat() if self.ended_at else None, + } diff --git a/src/paperbot/infrastructure/harvesters/__init__.py b/src/paperbot/infrastructure/harvesters/__init__.py new file mode 100644 index 0000000..24e9ccc --- /dev/null +++ b/src/paperbot/infrastructure/harvesters/__init__.py @@ -0,0 +1,17 @@ +# src/paperbot/infrastructure/harvesters/__init__.py +""" +Paper harvesters for multiple academic sources. + +Each harvester implements the HarvesterPort interface and normalizes +results to the HarvestedPaper format. +""" + +from .arxiv_harvester import ArxivHarvester +from .semantic_scholar_harvester import SemanticScholarHarvester +from .openalex_harvester import OpenAlexHarvester + +__all__ = [ + "ArxivHarvester", + "SemanticScholarHarvester", + "OpenAlexHarvester", +] diff --git a/src/paperbot/infrastructure/harvesters/arxiv_harvester.py b/src/paperbot/infrastructure/harvesters/arxiv_harvester.py new file mode 100644 index 0000000..6b51d1c --- /dev/null +++ b/src/paperbot/infrastructure/harvesters/arxiv_harvester.py @@ -0,0 +1,168 @@ +# src/paperbot/infrastructure/harvesters/arxiv_harvester.py +""" +arXiv paper harvester. + +Uses the arXiv Atom API for paper search. +API documentation: https://arxiv.org/help/api +""" + +from __future__ import annotations + +import asyncio +import logging +from typing import List, Optional + +import aiohttp + +from paperbot.domain.harvest import HarvestedPaper, HarvestResult, HarvestSource +from paperbot.infrastructure.connectors.arxiv_connector import ArxivConnector, ArxivRecord + +logger = logging.getLogger(__name__) + + +class ArxivHarvester: + """ + arXiv paper harvester using the Atom API. 
+ + API: https://export.arxiv.org/api/query + Rate limit: 1 request per 3 seconds (be conservative) + """ + + ARXIV_API_URL = "https://export.arxiv.org/api/query" + REQUEST_INTERVAL = 3.0 # seconds between requests + + def __init__(self, connector: Optional[ArxivConnector] = None): + self.connector = connector or ArxivConnector() + self._session: Optional[aiohttp.ClientSession] = None + self._last_request_time: float = 0 + + @property + def source(self) -> HarvestSource: + return HarvestSource.ARXIV + + async def _get_session(self) -> aiohttp.ClientSession: + if self._session is None or self._session.closed: + self._session = aiohttp.ClientSession() + return self._session + + async def _rate_limit(self) -> None: + """Enforce rate limiting between requests.""" + import time + + now = time.time() + elapsed = now - self._last_request_time + if elapsed < self.REQUEST_INTERVAL: + await asyncio.sleep(self.REQUEST_INTERVAL - elapsed) + self._last_request_time = time.time() + + def _build_query( + self, + query: str, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + ) -> str: + """Build arXiv search query with optional year filters.""" + # arXiv uses submittedDate for filtering + # Format: submittedDate:[YYYYMMDD TO YYYYMMDD] + search_query = f"all:{query}" + + if year_from or year_to: + start_date = f"{year_from}0101" if year_from else "199101" + end_date = f"{year_to}1231" if year_to else "209912" + search_query += f" AND submittedDate:[{start_date} TO {end_date}]" + + return search_query + + async def search( + self, + query: str, + *, + max_results: int = 100, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + venues: Optional[List[str]] = None, # Not supported by arXiv + ) -> HarvestResult: + """ + Search arXiv using the Atom API. + + Note: arXiv doesn't support venue filtering - all papers are preprints. 
+ """ + search_query = self._build_query(query, year_from, year_to) + + params = { + "search_query": search_query, + "start": 0, + "max_results": min(max_results, 200), # arXiv max is ~200 per request + "sortBy": "relevance", + "sortOrder": "descending", + } + + try: + await self._rate_limit() + session = await self._get_session() + + async with session.get(self.ARXIV_API_URL, params=params) as resp: + if resp.status != 200: + return HarvestResult( + source=self.source, + papers=[], + total_found=0, + error=f"arXiv API returned status {resp.status}", + ) + xml_text = await resp.text() + + records = self.connector.parse_atom(xml_text) + papers = [self._record_to_paper(r, rank=i) for i, r in enumerate(records)] + + logger.info(f"arXiv harvester found {len(papers)} papers for query: {query}") + + return HarvestResult( + source=self.source, + papers=papers, + total_found=len(papers), + ) + except Exception as e: + logger.warning(f"arXiv harvester error: {e}") + return HarvestResult( + source=self.source, + papers=[], + total_found=0, + error=str(e), + ) + + def _record_to_paper(self, record: ArxivRecord, rank: int) -> HarvestedPaper: + """Convert ArxivRecord to HarvestedPaper.""" + # Extract arxiv_id from full URL (e.g., "http://arxiv.org/abs/2301.12345v1") + arxiv_id = record.arxiv_id + if "/" in arxiv_id: + arxiv_id = arxiv_id.split("/")[-1] + # Remove version suffix (e.g., "2301.12345v1" -> "2301.12345") + if "v" in arxiv_id: + arxiv_id = arxiv_id.split("v")[0] + + # Extract year from published date + year = None + if record.published: + try: + year = int(record.published[:4]) + except (ValueError, IndexError): + pass + + return HarvestedPaper( + title=record.title.replace("\n", " ").strip(), + source=HarvestSource.ARXIV, + abstract=record.summary.replace("\n", " ").strip(), + authors=record.authors, + arxiv_id=arxiv_id, + year=year, + publication_date=record.published[:10] if record.published else None, + url=record.abs_url, + pdf_url=record.pdf_url, + source_rank=rank, + ) + + async def close(self) -> None: + """Close the HTTP session.""" + if self._session and not self._session.closed: + await self._session.close() + self._session = None diff --git a/src/paperbot/infrastructure/harvesters/openalex_harvester.py b/src/paperbot/infrastructure/harvesters/openalex_harvester.py new file mode 100644 index 0000000..4153e42 --- /dev/null +++ b/src/paperbot/infrastructure/harvesters/openalex_harvester.py @@ -0,0 +1,212 @@ +# src/paperbot/infrastructure/harvesters/openalex_harvester.py +""" +OpenAlex paper harvester. + +Uses the OpenAlex API for paper search. +API documentation: https://docs.openalex.org/ +""" + +from __future__ import annotations + +import asyncio +import logging +from typing import Any, Dict, List, Optional + +import aiohttp + +from paperbot.domain.harvest import HarvestedPaper, HarvestResult, HarvestSource + +logger = logging.getLogger(__name__) + + +class OpenAlexHarvester: + """ + OpenAlex paper harvester. 
+ + API: https://api.openalex.org/works + Rate limit: 10 req/s (polite pool with email), 100K/day + """ + + OPENALEX_API_URL = "https://api.openalex.org/works" + REQUEST_INTERVAL = 0.1 # 10 req/s + + def __init__(self, email: Optional[str] = None): + self.email = email # For polite pool + self._session: Optional[aiohttp.ClientSession] = None + self._last_request_time: float = 0 + + @property + def source(self) -> HarvestSource: + return HarvestSource.OPENALEX + + async def _get_session(self) -> aiohttp.ClientSession: + if self._session is None or self._session.closed: + self._session = aiohttp.ClientSession() + return self._session + + async def _rate_limit(self) -> None: + """Enforce rate limiting between requests.""" + import time + + now = time.time() + elapsed = now - self._last_request_time + if elapsed < self.REQUEST_INTERVAL: + await asyncio.sleep(self.REQUEST_INTERVAL - elapsed) + self._last_request_time = time.time() + + async def search( + self, + query: str, + *, + max_results: int = 100, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + venues: Optional[List[str]] = None, + ) -> HarvestResult: + """Search OpenAlex API.""" + params: Dict[str, Any] = { + "search": query, + "per_page": min(max_results, 200), # API max is 200 + "sort": "cited_by_count:desc", + } + + # Add email for polite pool + if self.email: + params["mailto"] = self.email + + # Build filter string + filters = [] + if year_from: + filters.append(f"publication_year:>={year_from}") + if year_to: + filters.append(f"publication_year:<={year_to}") + if filters: + params["filter"] = ",".join(filters) + + try: + await self._rate_limit() + session = await self._get_session() + + async with session.get(self.OPENALEX_API_URL, params=params) as resp: + if resp.status != 200: + return HarvestResult( + source=self.source, + papers=[], + total_found=0, + error=f"OpenAlex API returned status {resp.status}", + ) + data = await resp.json() + + results = data.get("results", []) + papers = [self._to_paper(r, rank=i) for i, r in enumerate(results)] + + # Filter by venue if specified + if venues: + venue_set = {v.lower() for v in venues} + papers = [ + p + for p in papers + if p.venue and any(v in p.venue.lower() for v in venue_set) + ] + + total_found = data.get("meta", {}).get("count", len(papers)) + logger.info(f"OpenAlex harvester found {len(papers)} papers for query: {query}") + + return HarvestResult( + source=self.source, + papers=papers, + total_found=total_found, + ) + except Exception as e: + logger.warning(f"OpenAlex harvester error: {e}") + return HarvestResult( + source=self.source, + papers=[], + total_found=0, + error=str(e), + ) + + def _to_paper(self, data: Dict[str, Any], rank: int) -> HarvestedPaper: + """Convert OpenAlex API response to HarvestedPaper.""" + # Extract authors + authors = [] + for authorship in data.get("authorships", []): + author = authorship.get("author", {}) + if author.get("display_name"): + authors.append(author["display_name"]) + + # Extract identifiers + ids = data.get("ids", {}) + doi = ids.get("doi", "") + if doi: + doi = doi.replace("https://doi.org/", "") + + openalex_id = ids.get("openalex", "") + if openalex_id: + openalex_id = openalex_id.replace("https://openalex.org/", "") + + # Extract venue + venue = None + if data.get("primary_location"): + source = data["primary_location"].get("source") or {} + venue = source.get("display_name") + + # Extract PDF URL + pdf_url = None + if data.get("open_access", {}).get("oa_url"): + pdf_url = data["open_access"]["oa_url"] + + 
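# --- Illustrative aside (editor's sketch, not from the diff above) ---------
# OpenAlex returns the abstract as an inverted index ({"word": [positions]});
# the _get_abstract helper further down in this class rebuilds the running
# text from it. A minimal worked example with made-up data:
sample_index = {"Transformers": [0], "for": [1], "computer": [2], "vision.": [3]}
pairs = sorted(
    (pos, word) for word, positions in sample_index.items() for pos in positions
)
assert " ".join(word for _, word in pairs) == "Transformers for computer vision."
# ----------------------------------------------------------------------------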
# Extract keywords from concepts + keywords = [ + c.get("display_name", "") + for c in data.get("keywords", [])[:10] + if c.get("display_name") + ] + + # Extract fields of study from concepts + fields_of_study = [ + c.get("display_name", "") + for c in data.get("concepts", [])[:5] + if c.get("display_name") + ] + + return HarvestedPaper( + title=data.get("title", "") or data.get("display_name", ""), + source=HarvestSource.OPENALEX, + abstract=self._get_abstract(data), + authors=authors, + doi=doi if doi else None, + openalex_id=openalex_id if openalex_id else None, + year=data.get("publication_year"), + venue=venue, + publication_date=data.get("publication_date"), + citation_count=data.get("cited_by_count", 0) or 0, + url=data.get("doi") or ids.get("openalex"), + pdf_url=pdf_url, + keywords=keywords, + fields_of_study=fields_of_study, + source_rank=rank, + ) + + def _get_abstract(self, data: Dict[str, Any]) -> str: + """Reconstruct abstract from inverted index.""" + abstract_index = data.get("abstract_inverted_index") + if not abstract_index: + return "" + + # OpenAlex stores abstract as inverted index: {"word": [positions]} + try: + words: List[tuple[int, str]] = [] + for word, positions in abstract_index.items(): + for pos in positions: + words.append((pos, word)) + words.sort(key=lambda x: x[0]) + return " ".join(w[1] for w in words) + except Exception: + return "" + + async def close(self) -> None: + """Close the HTTP session.""" + if self._session and not self._session.closed: + await self._session.close() + self._session = None diff --git a/src/paperbot/infrastructure/harvesters/semantic_scholar_harvester.py b/src/paperbot/infrastructure/harvesters/semantic_scholar_harvester.py new file mode 100644 index 0000000..c3ddae6 --- /dev/null +++ b/src/paperbot/infrastructure/harvesters/semantic_scholar_harvester.py @@ -0,0 +1,133 @@ +# src/paperbot/infrastructure/harvesters/semantic_scholar_harvester.py +""" +Semantic Scholar paper harvester. + +Uses the Semantic Scholar Academic Graph API for paper search. +API documentation: https://api.semanticscholar.org/api-docs/ +""" + +from __future__ import annotations + +import logging +from typing import Any, Dict, List, Optional + +from paperbot.domain.harvest import HarvestedPaper, HarvestResult, HarvestSource +from paperbot.infrastructure.api_clients.semantic_scholar import SemanticScholarClient + +logger = logging.getLogger(__name__) + + +class SemanticScholarHarvester: + """ + Semantic Scholar paper harvester. 
+ + API: https://api.semanticscholar.org/graph/v1/paper/search + Rate limit: 100 req/min (with API key), 5000/day without key + """ + + FIELDS = [ + "paperId", + "title", + "abstract", + "year", + "venue", + "citationCount", + "authors", + "publicationDate", + "externalIds", + "fieldsOfStudy", + "url", + "openAccessPdf", + ] + + def __init__(self, client: Optional[SemanticScholarClient] = None, api_key: Optional[str] = None): + self.client = client or SemanticScholarClient(api_key=api_key) + + @property + def source(self) -> HarvestSource: + return HarvestSource.SEMANTIC_SCHOLAR + + async def search( + self, + query: str, + *, + max_results: int = 100, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + venues: Optional[List[str]] = None, + ) -> HarvestResult: + """Search Semantic Scholar API.""" + try: + # S2 API supports year filter in query + year_filter = "" + if year_from and year_to: + year_filter = f" year:{year_from}-{year_to}" + elif year_from: + year_filter = f" year:{year_from}-" + elif year_to: + year_filter = f" year:-{year_to}" + + results = await self.client.search_papers( + query=query + year_filter, + limit=min(max_results, 100), # S2 limit per request + fields=self.FIELDS, + ) + + papers = [self._to_paper(r, rank=i) for i, r in enumerate(results)] + + # Filter by venue if specified + if venues: + venue_set = {v.lower() for v in venues} + papers = [ + p + for p in papers + if p.venue and any(v in p.venue.lower() for v in venue_set) + ] + + logger.info(f"Semantic Scholar harvester found {len(papers)} papers for query: {query}") + + return HarvestResult( + source=self.source, + papers=papers, + total_found=len(papers), + ) + except Exception as e: + logger.warning(f"Semantic Scholar harvester error: {e}") + return HarvestResult( + source=self.source, + papers=[], + total_found=0, + error=str(e), + ) + + def _to_paper(self, data: Dict[str, Any], rank: int) -> HarvestedPaper: + """Convert S2 API response to HarvestedPaper.""" + authors = [a.get("name", "") for a in data.get("authors", []) if a.get("name")] + external_ids = data.get("externalIds", {}) or {} + + pdf_url = None + if data.get("openAccessPdf"): + pdf_url = data["openAccessPdf"].get("url") + + return HarvestedPaper( + title=data.get("title", ""), + source=HarvestSource.SEMANTIC_SCHOLAR, + abstract=data.get("abstract") or "", + authors=authors, + doi=external_ids.get("DOI"), + arxiv_id=external_ids.get("ArXiv"), + semantic_scholar_id=data.get("paperId"), + year=data.get("year"), + venue=data.get("venue"), + publication_date=data.get("publicationDate"), + citation_count=data.get("citationCount", 0) or 0, + url=data.get("url"), + pdf_url=pdf_url, + fields_of_study=data.get("fieldsOfStudy") or [], + source_rank=rank, + ) + + async def close(self) -> None: + """Close the HTTP client.""" + # SemanticScholarClient manages its own session + pass diff --git a/src/paperbot/infrastructure/stores/models.py b/src/paperbot/infrastructure/stores/models.py index 726f29b..0cf476d 100644 --- a/src/paperbot/infrastructure/stores/models.py +++ b/src/paperbot/infrastructure/stores/models.py @@ -709,3 +709,130 @@ class PaperImpressionModel(Base): run = relationship("ResearchContextRunModel", back_populates="impressions") track = relationship("ResearchTrackModel") + + +class PaperModel(Base): + """Harvested paper metadata from multiple sources.""" + + __tablename__ = "papers" + + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + + # Canonical identifiers (for deduplication) + doi: 
Mapped[Optional[str]] = mapped_column(String(128), unique=True, nullable=True, index=True) + arxiv_id: Mapped[Optional[str]] = mapped_column(String(32), unique=True, nullable=True, index=True) + semantic_scholar_id: Mapped[Optional[str]] = mapped_column(String(64), unique=True, nullable=True, index=True) + openalex_id: Mapped[Optional[str]] = mapped_column(String(64), unique=True, nullable=True, index=True) + title_hash: Mapped[str] = mapped_column(String(64), index=True) # SHA256 of normalized title + + # Core metadata + title: Mapped[str] = mapped_column(Text, default="") + abstract: Mapped[str] = mapped_column(Text, default="") + authors_json: Mapped[str] = mapped_column(Text, default="[]") + year: Mapped[Optional[int]] = mapped_column(Integer, nullable=True, index=True) + venue: Mapped[Optional[str]] = mapped_column(String(256), nullable=True, index=True) + publication_date: Mapped[Optional[str]] = mapped_column(String(32), nullable=True) + citation_count: Mapped[int] = mapped_column(Integer, default=0, index=True) + + # URLs (no PDF download, just references) + url: Mapped[Optional[str]] = mapped_column(String(512), nullable=True) + pdf_url: Mapped[Optional[str]] = mapped_column(String(512), nullable=True) + + # Classification + keywords_json: Mapped[str] = mapped_column(Text, default="[]") + fields_of_study_json: Mapped[str] = mapped_column(Text, default="[]") + + # Source tracking + primary_source: Mapped[str] = mapped_column(String(32), default="") # First source that found this paper + sources_json: Mapped[str] = mapped_column(Text, default="[]") # All sources that returned this paper + + # Timestamps + created_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True, index=True) + updated_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + deleted_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) # Soft delete + + def get_authors(self) -> list: + try: + return json.loads(self.authors_json or "[]") + except Exception: + return [] + + def get_keywords(self) -> list: + try: + return json.loads(self.keywords_json or "[]") + except Exception: + return [] + + def get_fields_of_study(self) -> list: + try: + return json.loads(self.fields_of_study_json or "[]") + except Exception: + return [] + + def get_sources(self) -> list: + try: + return json.loads(self.sources_json or "[]") + except Exception: + return [] + + def set_keywords(self, keywords: list) -> None: + self.keywords_json = json.dumps(keywords or [], ensure_ascii=False) + + def set_fields_of_study(self, fields: list) -> None: + self.fields_of_study_json = json.dumps(fields or [], ensure_ascii=False) + + def set_sources(self, sources: list) -> None: + self.sources_json = json.dumps(sources or [], ensure_ascii=False) + + +class HarvestRunModel(Base): + """Harvest execution tracking.""" + + __tablename__ = "harvest_runs" + + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + run_id: Mapped[str] = mapped_column(String(64), unique=True, index=True) + + # Input + keywords_json: Mapped[str] = mapped_column(Text, default="[]") + venues_json: Mapped[str] = mapped_column(Text, default="[]") + sources_json: Mapped[str] = mapped_column(Text, default="[]") + max_results_per_source: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + + # Results + status: Mapped[Optional[str]] = mapped_column(String(32), default="running", index=True) # running/success/partial/failed + papers_found: 
Mapped[Optional[int]] = mapped_column(Integer, default=0) + papers_new: Mapped[Optional[int]] = mapped_column(Integer, default=0) + papers_deduplicated: Mapped[Optional[int]] = mapped_column(Integer, default=0) + error_json: Mapped[str] = mapped_column(Text, default="{}") + + # Timestamps + started_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True, index=True) + ended_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + + def get_keywords(self) -> list: + try: + return json.loads(self.keywords_json or "[]") + except Exception: + return [] + + def get_venues(self) -> list: + try: + return json.loads(self.venues_json or "[]") + except Exception: + return [] + + def get_sources(self) -> list: + try: + return json.loads(self.sources_json or "[]") + except Exception: + return [] + + def get_errors(self) -> dict: + try: + return json.loads(self.error_json or "{}") + except Exception: + return {} + + def set_errors(self, errors: dict) -> None: + self.error_json = json.dumps(errors or {}, ensure_ascii=False) diff --git a/src/paperbot/infrastructure/stores/paper_store.py b/src/paperbot/infrastructure/stores/paper_store.py index e4f8c87..6e9c3da 100644 --- a/src/paperbot/infrastructure/stores/paper_store.py +++ b/src/paperbot/infrastructure/stores/paper_store.py @@ -1,3 +1,4 @@ +<<<<<<< HEAD from __future__ import annotations from datetime import datetime, timezone @@ -7,6 +8,32 @@ from paperbot.domain.paper_identity import normalize_arxiv_id, normalize_doi from paperbot.infrastructure.stores.models import Base, PaperJudgeScoreModel, PaperModel +======= +# src/paperbot/infrastructure/stores/paper_store.py +""" +Paper storage repository. + +Handles persistence and retrieval of harvested papers. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from datetime import datetime, timezone +from typing import Any, Dict, List, Optional, Tuple + +from sqlalchemy import Integer, cast, func, or_, select + +from paperbot.utils.logging_config import Logger, LogFiles +from paperbot.domain.harvest import HarvestedPaper, HarvestSource +from paperbot.infrastructure.stores.models import ( + Base, + HarvestRunModel, + PaperFeedbackModel, + PaperModel, +) +>>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) from paperbot.infrastructure.stores.sqlalchemy_db import SessionProvider, get_db_url @@ -14,6 +41,7 @@ def _utcnow() -> datetime: return datetime.now(timezone.utc) +<<<<<<< HEAD def _safe_list(values: Any) -> List[str]: if not isinstance(values, list): return [] @@ -56,6 +84,28 @@ def _as_utc(value: Optional[datetime]) -> Optional[datetime]: class SqlAlchemyPaperStore: """Canonical paper registry with idempotent upsert for daily workflows.""" +======= +@dataclass +class LibraryPaper: + """Paper with library metadata (saved_at, track_id, action).""" + + paper: PaperModel + saved_at: datetime + track_id: Optional[int] + action: str + + +class PaperStore: + """ + Paper storage repository. 
+ + Handles: + - Batch upsert with DB-level deduplication + - Filter-based search with pagination + - Source tracking + - User library (saved papers) + """ +>>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) def __init__(self, db_url: Optional[str] = None, *, auto_create_schema: bool = True): self.db_url = db_url or get_db_url() @@ -63,6 +113,7 @@ def __init__(self, db_url: Optional[str] = None, *, auto_create_schema: bool = T if auto_create_schema: Base.metadata.create_all(self._provider.engine) +<<<<<<< HEAD def upsert_paper( self, *, @@ -315,3 +366,476 @@ def _paper_to_dict(row: PaperModel) -> Dict[str, Any]: "created_at": row.created_at.isoformat() if row.created_at else None, "updated_at": row.updated_at.isoformat() if row.updated_at else None, } +======= + def upsert_papers_batch( + self, + papers: List[HarvestedPaper], + ) -> Tuple[int, int]: + """ + Upsert papers with deduplication. + + Returns: + Tuple of (new_count, updated_count) + """ + Logger.info(f"Starting batch upsert for {len(papers)} papers", file=LogFiles.HARVEST) + new_count = 0 + updated_count = 0 + now = _utcnow() + + with self._provider.session() as session: + for paper in papers: + Logger.info("Checking for existing paper in database", file=LogFiles.HARVEST) + existing = self._find_existing(session, paper) + + if existing: + Logger.info("Found existing paper, updating metadata", file=LogFiles.HARVEST) + self._update_paper(existing, paper, now) + updated_count += 1 + else: + Logger.info("No existing paper found, creating new record", file=LogFiles.HARVEST) + model = self._create_model(paper, now) + session.add(model) + new_count += 1 + + Logger.info("Committing transaction to database", file=LogFiles.HARVEST) + session.commit() + + Logger.info(f"Batch upsert complete: {new_count} new, {updated_count} updated", file=LogFiles.HARVEST) + return new_count, updated_count + + def _find_existing(self, session, paper: HarvestedPaper) -> Optional[PaperModel]: + """Find existing paper by canonical identifiers.""" + # Try each identifier in priority order + if paper.doi: + result = session.execute( + select(PaperModel).where(PaperModel.doi == paper.doi) + ).scalar_one_or_none() + if result: + return result + + if paper.arxiv_id: + result = session.execute( + select(PaperModel).where(PaperModel.arxiv_id == paper.arxiv_id) + ).scalar_one_or_none() + if result: + return result + + if paper.semantic_scholar_id: + result = session.execute( + select(PaperModel).where( + PaperModel.semantic_scholar_id == paper.semantic_scholar_id + ) + ).scalar_one_or_none() + if result: + return result + + if paper.openalex_id: + result = session.execute( + select(PaperModel).where(PaperModel.openalex_id == paper.openalex_id) + ).scalar_one_or_none() + if result: + return result + + # Fallback to title hash + title_hash = paper.compute_title_hash() + result = session.execute( + select(PaperModel).where(PaperModel.title_hash == title_hash) + ).scalar_one_or_none() + return result + + def _create_model(self, paper: HarvestedPaper, now: datetime) -> PaperModel: + """Create a new PaperModel from HarvestedPaper.""" + return PaperModel( + doi=paper.doi, + arxiv_id=paper.arxiv_id, + semantic_scholar_id=paper.semantic_scholar_id, + openalex_id=paper.openalex_id, + title_hash=paper.compute_title_hash(), + title=paper.title, + abstract=paper.abstract, + authors_json=json.dumps(paper.authors, ensure_ascii=False), + year=paper.year, + venue=paper.venue, + publication_date=paper.publication_date, + citation_count=paper.citation_count, + 
url=paper.url, + pdf_url=paper.pdf_url, + keywords_json=json.dumps(paper.keywords, ensure_ascii=False), + fields_of_study_json=json.dumps(paper.fields_of_study, ensure_ascii=False), + primary_source=paper.source.value, + sources_json=json.dumps([paper.source.value], ensure_ascii=False), + created_at=now, + updated_at=now, + ) + + def _update_paper( + self, existing: PaperModel, paper: HarvestedPaper, now: datetime + ) -> None: + """Update existing paper with new data.""" + # Fill in missing identifiers + if not existing.doi and paper.doi: + existing.doi = paper.doi + if not existing.arxiv_id and paper.arxiv_id: + existing.arxiv_id = paper.arxiv_id + if not existing.semantic_scholar_id and paper.semantic_scholar_id: + existing.semantic_scholar_id = paper.semantic_scholar_id + if not existing.openalex_id and paper.openalex_id: + existing.openalex_id = paper.openalex_id + + # Prefer longer abstract + if len(paper.abstract) > len(existing.abstract or ""): + existing.abstract = paper.abstract + + # Prefer higher citation count + if paper.citation_count > (existing.citation_count or 0): + existing.citation_count = paper.citation_count + + # Fill in missing metadata + if not existing.year and paper.year: + existing.year = paper.year + if not existing.venue and paper.venue: + existing.venue = paper.venue + if not existing.publication_date and paper.publication_date: + existing.publication_date = paper.publication_date + if not existing.url and paper.url: + existing.url = paper.url + if not existing.pdf_url and paper.pdf_url: + existing.pdf_url = paper.pdf_url + + # Merge sources + sources = existing.get_sources() + if paper.source.value not in sources: + sources.append(paper.source.value) + existing.set_sources(sources) + + # Merge keywords and fields + keywords = set(existing.get_keywords() + paper.keywords) + existing.set_keywords(list(keywords)) + + fields = set(existing.get_fields_of_study() + paper.fields_of_study) + existing.set_fields_of_study(list(fields)) + + existing.updated_at = now + + def search_papers( + self, + *, + query: Optional[str] = None, + keywords: Optional[List[str]] = None, + venues: Optional[List[str]] = None, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + min_citations: Optional[int] = None, + sources: Optional[List[str]] = None, + sort_by: str = "citation_count", + sort_order: str = "desc", + limit: int = 50, + offset: int = 0, + ) -> Tuple[List[PaperModel], int]: + """ + Search papers with filters and pagination. 
+ + Returns: + Tuple of (papers, total_count) + """ + with self._provider.session() as session: + stmt = select(PaperModel).where(PaperModel.deleted_at.is_(None)) + + # Full-text search (LIKE for v1) + if query: + pattern = f"%{query}%" + stmt = stmt.where( + or_( + PaperModel.title.ilike(pattern), + PaperModel.abstract.ilike(pattern), + ) + ) + + # Year filters + if year_from: + stmt = stmt.where(PaperModel.year >= year_from) + if year_to: + stmt = stmt.where(PaperModel.year <= year_to) + + # Citation filter + if min_citations: + stmt = stmt.where(PaperModel.citation_count >= min_citations) + + # Venue filter + if venues: + venue_conditions = [PaperModel.venue.ilike(f"%{v}%") for v in venues] + stmt = stmt.where(or_(*venue_conditions)) + + # Source filter + if sources: + stmt = stmt.where(PaperModel.primary_source.in_(sources)) + + # Count total before pagination + count_stmt = select(func.count()).select_from(stmt.subquery()) + total_count = session.execute(count_stmt).scalar() or 0 + + # Sort + sort_col = getattr(PaperModel, sort_by, PaperModel.citation_count) + if sort_order.lower() == "desc": + stmt = stmt.order_by(sort_col.desc()) + else: + stmt = stmt.order_by(sort_col.asc()) + + # Pagination + stmt = stmt.offset(offset).limit(limit) + + papers = session.execute(stmt).scalars().all() + + return list(papers), total_count + + def get_paper_by_id(self, paper_id: int) -> Optional[PaperModel]: + """Get a paper by its ID.""" + with self._provider.session() as session: + return session.execute( + select(PaperModel).where( + PaperModel.id == paper_id, + PaperModel.deleted_at.is_(None), + ) + ).scalar_one_or_none() + + def get_user_library( + self, + user_id: str, + *, + track_id: Optional[int] = None, + actions: Optional[List[str]] = None, + sort_by: str = "saved_at", + sort_order: str = "desc", + limit: int = 50, + offset: int = 0, + ) -> Tuple[List[LibraryPaper], int]: + """ + Get papers in user's library (saved papers). + + Joins papers table with paper_feedback where action in actions. + """ + Logger.info("Starting to fetch user library", file=LogFiles.HARVEST) + if actions is None: + actions = ["save"] + + with self._provider.session() as session: + # Join papers with feedback, then deduplicate by paper.id + # paper_feedback.paper_id can be either: + # 1. Integer ID as string (from harvest saves): "123" -> join on papers.id + # 2. 
Semantic Scholar ID (from recommendation saves): "abc123" -> join on papers.semantic_scholar_id + + Logger.info("Executing database query to join papers with feedback", file=LogFiles.HARVEST) + # First, get all matching paper-feedback pairs + base_stmt = ( + select(PaperModel, PaperFeedbackModel) + .join( + PaperFeedbackModel, + or_( + PaperModel.id == cast(PaperFeedbackModel.paper_id, Integer), + PaperModel.semantic_scholar_id == PaperFeedbackModel.paper_id, + ), + ) + .where( + PaperFeedbackModel.user_id == user_id, + PaperFeedbackModel.action.in_(actions), + PaperModel.deleted_at.is_(None), + ) + ) + + if track_id is not None: + base_stmt = base_stmt.where(PaperFeedbackModel.track_id == track_id) + + # Execute and deduplicate in Python by paper.id (keeping latest feedback) + all_results = session.execute(base_stmt).all() + Logger.info(f"Query returned {len(all_results)} results before deduplication", file=LogFiles.HARVEST) + + # Deduplicate by paper.id, keeping the one with latest timestamp + Logger.info("Deduplicating results by paper id", file=LogFiles.HARVEST) + paper_map: Dict[int, Tuple[PaperModel, PaperFeedbackModel]] = {} + for row in all_results: + paper = row[0] + feedback = row[1] + if paper.id not in paper_map or feedback.ts > paper_map[paper.id][1].ts: + paper_map[paper.id] = (paper, feedback) + + # Convert to list and sort + unique_results = list(paper_map.values()) + Logger.info(f"After deduplication: {len(unique_results)} unique papers", file=LogFiles.HARVEST) + + # Sort + min_ts = datetime.min.replace(tzinfo=timezone.utc) + if sort_by == "saved_at": + unique_results.sort(key=lambda x: x[1].ts or min_ts, reverse=(sort_order.lower() == "desc")) + elif sort_by == "title": + unique_results.sort(key=lambda x: x[0].title or "", reverse=(sort_order.lower() == "desc")) + elif sort_by == "citation_count": + unique_results.sort(key=lambda x: x[0].citation_count or 0, reverse=(sort_order.lower() == "desc")) + elif sort_by == "year": + unique_results.sort(key=lambda x: x[0].year or 0, reverse=(sort_order.lower() == "desc")) + else: + unique_results.sort(key=lambda x: x[1].ts or min_ts, reverse=(sort_order.lower() == "desc")) + + # Get total count before pagination + total = len(unique_results) + + # Apply pagination + paginated_results = unique_results[offset:offset + limit] + + return [ + LibraryPaper( + paper=row[0], + saved_at=row[1].ts, + track_id=row[1].track_id, + action=row[1].action, + ) + for row in paginated_results + ], total + + def remove_from_library(self, user_id: str, paper_id: int) -> bool: + """Remove paper from user's library by deleting 'save' feedback.""" + with self._provider.session() as session: + stmt = ( + PaperFeedbackModel.__table__.delete().where( + PaperFeedbackModel.user_id == user_id, + PaperFeedbackModel.paper_id == str(paper_id), + PaperFeedbackModel.action == "save", + ) + ) + result = session.execute(stmt) + session.commit() + return result.rowcount > 0 + + def create_harvest_run( + self, + run_id: str, + keywords: List[str], + venues: List[str], + sources: List[str], + max_results_per_source: int, + ) -> HarvestRunModel: + """Create a new harvest run record.""" + now = _utcnow() + with self._provider.session() as session: + run = HarvestRunModel( + run_id=run_id, + keywords_json=json.dumps(keywords, ensure_ascii=False), + venues_json=json.dumps(venues, ensure_ascii=False), + sources_json=json.dumps(sources, ensure_ascii=False), + max_results_per_source=max_results_per_source, + status="running", + started_at=now, + ) + session.add(run) + 
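# --- Illustrative aside (editor's sketch, not from the diff above) ---------
# How a caller is expected to drive the harvest-run lifecycle exposed by this
# store, using only methods defined in this module; the sqlite URL, run_id,
# and the single sample paper are made-up values for demonstration.
def _demo_harvest_run_lifecycle() -> None:
    store = PaperStore("sqlite:///demo_papers.db")
    store.create_harvest_run(
        run_id="harvest-demo-00000001",
        keywords=["ransomware detection"],
        venues=["USENIX Security"],
        sources=["arxiv"],
        max_results_per_source=50,
    )
    demo_papers = [HarvestedPaper(title="A Demo Paper", source=HarvestSource.ARXIV)]
    new_count, updated_count = store.upsert_papers_batch(demo_papers)
    store.update_harvest_run(
        run_id="harvest-demo-00000001",
        status="success",
        papers_found=len(demo_papers),
        papers_new=new_count,
        papers_deduplicated=0,
    )
    print(f"stored {new_count} new / {updated_count} updated papers")
    store.close()
# ----------------------------------------------------------------------------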
session.commit() + session.refresh(run) + return run + + def update_harvest_run( + self, + run_id: str, + *, + status: Optional[str] = None, + papers_found: Optional[int] = None, + papers_new: Optional[int] = None, + papers_deduplicated: Optional[int] = None, + errors: Optional[Dict[str, Any]] = None, + ) -> Optional[HarvestRunModel]: + """Update a harvest run record.""" + now = _utcnow() + with self._provider.session() as session: + run = session.execute( + select(HarvestRunModel).where(HarvestRunModel.run_id == run_id) + ).scalar_one_or_none() + + if run is None: + return None + + if status is not None: + run.status = status + if status in ("success", "partial", "failed"): + run.ended_at = now + + if papers_found is not None: + run.papers_found = papers_found + if papers_new is not None: + run.papers_new = papers_new + if papers_deduplicated is not None: + run.papers_deduplicated = papers_deduplicated + if errors is not None: + run.set_errors(errors) + + session.commit() + session.refresh(run) + return run + + def get_harvest_run(self, run_id: str) -> Optional[HarvestRunModel]: + """Get a harvest run by its ID.""" + with self._provider.session() as session: + return session.execute( + select(HarvestRunModel).where(HarvestRunModel.run_id == run_id) + ).scalar_one_or_none() + + def list_harvest_runs( + self, + *, + status: Optional[str] = None, + limit: int = 50, + offset: int = 0, + ) -> List[HarvestRunModel]: + """List harvest runs with optional filtering.""" + with self._provider.session() as session: + stmt = select(HarvestRunModel) + + if status: + stmt = stmt.where(HarvestRunModel.status == status) + + stmt = stmt.order_by(HarvestRunModel.started_at.desc()) + stmt = stmt.offset(offset).limit(limit) + + return list(session.execute(stmt).scalars().all()) + + def get_paper_count(self) -> int: + """Get total count of papers in the store.""" + with self._provider.session() as session: + return ( + session.execute( + select(func.count()).select_from(PaperModel).where( + PaperModel.deleted_at.is_(None) + ) + ).scalar() + or 0 + ) + + def close(self) -> None: + """Close database connections.""" + try: + self._provider.engine.dispose() + except Exception: + pass + + +def paper_to_dict(paper: PaperModel) -> Dict[str, Any]: + """Convert PaperModel to dictionary for API response.""" + return { + "id": paper.id, + "doi": paper.doi, + "arxiv_id": paper.arxiv_id, + "semantic_scholar_id": paper.semantic_scholar_id, + "openalex_id": paper.openalex_id, + "title": paper.title, + "abstract": paper.abstract, + "authors": paper.get_authors(), + "year": paper.year, + "venue": paper.venue, + "publication_date": paper.publication_date, + "citation_count": paper.citation_count, + "url": paper.url, + "pdf_url": paper.pdf_url, + "keywords": paper.get_keywords(), + "fields_of_study": paper.get_fields_of_study(), + "primary_source": paper.primary_source, + "sources": paper.get_sources(), + "created_at": paper.created_at.isoformat() if paper.created_at else None, + "updated_at": paper.updated_at.isoformat() if paper.updated_at else None, + } +>>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) diff --git a/src/paperbot/infrastructure/stores/research_store.py b/src/paperbot/infrastructure/stores/research_store.py index 425724a..9549e7f 100644 --- a/src/paperbot/infrastructure/stores/research_store.py +++ b/src/paperbot/infrastructure/stores/research_store.py @@ -8,7 +8,11 @@ from sqlalchemy import desc, func, or_, select from sqlalchemy.exc import IntegrityError +<<<<<<< HEAD from 
paperbot.domain.paper_identity import normalize_arxiv_id, normalize_doi +======= +from paperbot.utils.logging_config import Logger, LogFiles +>>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) from paperbot.infrastructure.stores.models import ( Base, PaperFeedbackModel, @@ -329,6 +333,7 @@ def add_paper_feedback( weight: float = 0.0, metadata: Optional[Dict[str, Any]] = None, ) -> Optional[Dict[str, Any]]: + Logger.info("Recording paper feedback", file=LogFiles.HARVEST) now = _utcnow() metadata = dict(metadata or {}) with self._provider.session() as session: @@ -338,14 +343,19 @@ def add_paper_feedback( ) ).scalar_one_or_none() if track is None: + Logger.error("Track not found", file=LogFiles.HARVEST) return None +<<<<<<< HEAD resolved_paper_ref_id = self._resolve_paper_ref_id( session=session, paper_id=(paper_id or "").strip(), metadata=metadata, ) +======= + Logger.info("Creating new feedback record", file=LogFiles.HARVEST) +>>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) row = PaperFeedbackModel( user_id=user_id, track_id=track_id, @@ -373,6 +383,7 @@ def add_paper_feedback( session.add(track) session.commit() session.refresh(row) + Logger.info("Feedback record created successfully", file=LogFiles.HARVEST) return self._feedback_to_dict(row) def list_paper_feedback( diff --git a/tests/integration/test_harvest_pipeline.py b/tests/integration/test_harvest_pipeline.py new file mode 100644 index 0000000..18f30d5 --- /dev/null +++ b/tests/integration/test_harvest_pipeline.py @@ -0,0 +1,537 @@ +""" +HarvestPipeline integration tests. + +Tests the complete harvest pipeline with mocked harvesters. +""" + +import pytest +from unittest.mock import AsyncMock, MagicMock, patch + +from paperbot.domain.harvest import HarvestedPaper, HarvestResult, HarvestSource +from paperbot.application.workflows.harvest_pipeline import ( + HarvestConfig, + HarvestFinalResult, + HarvestPipeline, + HarvestProgress, +) + + +@pytest.fixture +def mock_harvesters(): + """Create mock harvesters with predefined responses.""" + arxiv_papers = [ + HarvestedPaper( + title="Transformer Architecture for NLP", + source=HarvestSource.ARXIV, + abstract="We propose transformers.", + arxiv_id="2301.00001", + year=2023, + ), + HarvestedPaper( + title="BERT Pre-training", + source=HarvestSource.ARXIV, + abstract="We introduce BERT.", + arxiv_id="2301.00002", + year=2023, + ), + ] + + s2_papers = [ + HarvestedPaper( + title="Transformer Architecture for NLP", # Duplicate + source=HarvestSource.SEMANTIC_SCHOLAR, + abstract="We propose transformers for various NLP tasks.", + doi="10.1234/transformer", + arxiv_id="2301.00001", + semantic_scholar_id="s2-001", + year=2023, + citation_count=500, + ), + HarvestedPaper( + title="GPT Language Models", + source=HarvestSource.SEMANTIC_SCHOLAR, + abstract="Generative pre-training for language understanding.", + doi="10.1234/gpt", + semantic_scholar_id="s2-002", + year=2023, + citation_count=1000, + ), + ] + + openalex_papers = [ + HarvestedPaper( + title="Vision Transformers", + source=HarvestSource.OPENALEX, + abstract="Transformers for computer vision.", + doi="10.1234/vit", + openalex_id="W001", + year=2024, + citation_count=300, + ), + ] + + def create_harvester(source, papers, error=None): + harvester = MagicMock() + harvester.source = source + harvester.search = AsyncMock( + return_value=HarvestResult( + source=source, + papers=papers, + total_found=len(papers), + error=error, + ) + ) + harvester.close = AsyncMock() + return harvester + + return { + "arxiv": 
create_harvester(HarvestSource.ARXIV, arxiv_papers), + "semantic_scholar": create_harvester(HarvestSource.SEMANTIC_SCHOLAR, s2_papers), + "openalex": create_harvester(HarvestSource.OPENALEX, openalex_papers), + } + + +@pytest.fixture +def pipeline(tmp_path, mock_harvesters): + """Create HarvestPipeline with mocked dependencies.""" + db_url = f"sqlite:///{tmp_path / 'test_harvest.db'}" + pipeline = HarvestPipeline(db_url=db_url) + + # Inject mock harvesters + def get_mock_harvester(source): + return mock_harvesters.get(source) + + pipeline._get_harvester = get_mock_harvester + return pipeline + + +class TestHarvestPipelineRun: + """Tests for harvest pipeline execution.""" + + @pytest.mark.asyncio + async def test_run_full_pipeline(self, pipeline): + """Run full harvest pipeline with all sources.""" + config = HarvestConfig( + keywords=["transformer", "NLP"], + max_results_per_source=50, + expand_keywords=False, # Skip expansion for predictable test + recommend_venues=False, # Skip venue recommendation + ) + + progress_messages = [] + final_result = None + + async for item in pipeline.run(config): + if isinstance(item, HarvestProgress): + progress_messages.append(item) + elif isinstance(item, HarvestFinalResult): + final_result = item + + # Verify progress messages + phases = [p.phase for p in progress_messages] + assert "Expanding" in phases + assert "Initializing" in phases + assert "Harvesting" in phases + assert "Deduplicating" in phases + assert "Storing" in phases + + # Verify final result + assert final_result is not None + assert final_result.status == "success" + assert final_result.papers_found == 5 # 2 + 2 + 1 + assert final_result.papers_deduplicated > 0 # Transformer paper is duplicate + assert final_result.duration_seconds > 0 + + @pytest.mark.asyncio + async def test_run_with_keyword_expansion(self, pipeline): + """Pipeline expands keywords when enabled.""" + config = HarvestConfig( + keywords=["LLM"], # Should expand to "large language model" + expand_keywords=True, + recommend_venues=False, + ) + + async for item in pipeline.run(config): + if isinstance(item, HarvestProgress) and item.phase == "Expanding": + assert item.message == "Expanding keywords..." 
+ + @pytest.mark.asyncio + async def test_run_with_venue_recommendation(self, pipeline): + """Pipeline recommends venues when enabled and no venues specified.""" + config = HarvestConfig( + keywords=["security"], + venues=None, # No venues specified + expand_keywords=False, + recommend_venues=True, + ) + + found_recommend_phase = False + async for item in pipeline.run(config): + if isinstance(item, HarvestProgress) and item.phase == "Recommending": + found_recommend_phase = True + + assert found_recommend_phase + + @pytest.mark.asyncio + async def test_run_with_specific_sources(self, pipeline): + """Pipeline uses only specified sources.""" + config = HarvestConfig( + keywords=["test"], + sources=["arxiv"], # Only arXiv + expand_keywords=False, + recommend_venues=False, + ) + + final_result = None + async for item in pipeline.run(config): + if isinstance(item, HarvestFinalResult): + final_result = item + + assert final_result is not None + assert "arxiv" in final_result.source_results + # Should not query other sources + # (mock harvesters would have papers, so we check papers_found) + assert final_result.papers_found == 2 # Only arXiv papers + + @pytest.mark.asyncio + async def test_run_creates_harvest_run_record(self, pipeline): + """Pipeline creates harvest run record in database.""" + config = HarvestConfig( + keywords=["test"], + expand_keywords=False, + recommend_venues=False, + ) + + final_result = None + async for item in pipeline.run(config): + if isinstance(item, HarvestFinalResult): + final_result = item + + # Verify harvest run was created + run = pipeline.paper_store.get_harvest_run(final_result.run_id) + assert run is not None + assert run.status == "success" + assert run.papers_found > 0 + + @pytest.mark.asyncio + async def test_run_stores_papers(self, pipeline): + """Pipeline stores papers in database.""" + config = HarvestConfig( + keywords=["test"], + expand_keywords=False, + recommend_venues=False, + ) + + async for item in pipeline.run(config): + pass # Just run to completion + + # Verify papers were stored + paper_count = pipeline.paper_store.get_paper_count() + assert paper_count > 0 + + @pytest.mark.asyncio + async def test_run_deduplicates_papers(self, pipeline): + """Pipeline deduplicates papers across sources.""" + config = HarvestConfig( + keywords=["test"], + sources=["arxiv", "semantic_scholar"], + expand_keywords=False, + recommend_venues=False, + ) + + final_result = None + async for item in pipeline.run(config): + if isinstance(item, HarvestFinalResult): + final_result = item + + # Transformer paper appears in both sources + assert final_result.papers_deduplicated > 0 + # Total found - new papers = deduplicated + total_raw = final_result.papers_found + stored = final_result.papers_new + # Due to deduplication, stored < total_raw + assert stored < total_raw or final_result.papers_deduplicated > 0 + + @pytest.mark.asyncio + async def test_run_with_year_filter(self, pipeline, mock_harvesters): + """Pipeline passes year filters to harvesters.""" + config = HarvestConfig( + keywords=["test"], + year_from=2023, + year_to=2024, + sources=["arxiv"], + expand_keywords=False, + recommend_venues=False, + ) + + async for item in pipeline.run(config): + pass + + # Verify harvester was called with year filters + mock_harvesters["arxiv"].search.assert_called_once() + call_kwargs = mock_harvesters["arxiv"].search.call_args[1] + assert call_kwargs["year_from"] == 2023 + assert call_kwargs["year_to"] == 2024 + + +class TestHarvestPipelineErrorHandling: + """Tests for error 
handling in harvest pipeline.""" + + @pytest.mark.asyncio + async def test_partial_failure(self, tmp_path): + """Pipeline handles partial source failures.""" + db_url = f"sqlite:///{tmp_path / 'test_partial.db'}" + pipeline = HarvestPipeline(db_url=db_url) + + # Create harvesters with one failing + def get_harvester(source): + if source == "arxiv": + harvester = MagicMock() + harvester.source = HarvestSource.ARXIV + harvester.search = AsyncMock( + return_value=HarvestResult( + source=HarvestSource.ARXIV, + papers=[ + HarvestedPaper( + title="Working Paper", + source=HarvestSource.ARXIV, + ) + ], + total_found=1, + ) + ) + harvester.close = AsyncMock() + return harvester + elif source == "semantic_scholar": + harvester = MagicMock() + harvester.source = HarvestSource.SEMANTIC_SCHOLAR + harvester.search = AsyncMock( + return_value=HarvestResult( + source=HarvestSource.SEMANTIC_SCHOLAR, + papers=[], + total_found=0, + error="Rate limit exceeded", + ) + ) + harvester.close = AsyncMock() + return harvester + return None + + pipeline._get_harvester = get_harvester + + config = HarvestConfig( + keywords=["test"], + sources=["arxiv", "semantic_scholar"], + expand_keywords=False, + recommend_venues=False, + ) + + final_result = None + async for item in pipeline.run(config): + if isinstance(item, HarvestFinalResult): + final_result = item + + assert final_result.status == "partial" # Not full success + assert "semantic_scholar" in final_result.errors + assert final_result.papers_new == 1 # From arXiv + + await pipeline.close() + + @pytest.mark.asyncio + async def test_all_sources_fail(self, tmp_path): + """Pipeline handles all sources failing.""" + db_url = f"sqlite:///{tmp_path / 'test_all_fail.db'}" + pipeline = HarvestPipeline(db_url=db_url) + + def get_failing_harvester(source): + if source == "arxiv": + harvester = MagicMock() + harvester.source = HarvestSource.ARXIV + harvester.search = AsyncMock( + return_value=HarvestResult( + source=HarvestSource.ARXIV, + papers=[], + total_found=0, + error="Connection timeout", + ) + ) + harvester.close = AsyncMock() + return harvester + return None + + pipeline._get_harvester = get_failing_harvester + + config = HarvestConfig( + keywords=["test"], + sources=["arxiv"], + expand_keywords=False, + recommend_venues=False, + ) + + final_result = None + async for item in pipeline.run(config): + if isinstance(item, HarvestFinalResult): + final_result = item + + assert final_result.status == "failed" + assert "arxiv" in final_result.errors + assert final_result.papers_new == 0 + + await pipeline.close() + + @pytest.mark.asyncio + async def test_harvester_exception(self, tmp_path): + """Pipeline handles harvester exceptions gracefully.""" + db_url = f"sqlite:///{tmp_path / 'test_exception.db'}" + pipeline = HarvestPipeline(db_url=db_url) + + def get_throwing_harvester(source): + if source == "arxiv": + harvester = MagicMock() + harvester.source = HarvestSource.ARXIV + harvester.search = AsyncMock( + side_effect=Exception("Unexpected error") + ) + harvester.close = AsyncMock() + return harvester + return None + + pipeline._get_harvester = get_throwing_harvester + + config = HarvestConfig( + keywords=["test"], + sources=["arxiv"], + expand_keywords=False, + recommend_venues=False, + ) + + final_result = None + async for item in pipeline.run(config): + if isinstance(item, HarvestFinalResult): + final_result = item + + assert final_result is not None + assert "arxiv" in final_result.errors + assert "Unexpected error" in final_result.errors["arxiv"] + + await 
pipeline.close() + + +class TestHarvestPipelineRunSync: + """Tests for synchronous pipeline execution.""" + + @pytest.mark.asyncio + async def test_run_sync_returns_final_result(self, pipeline): + """run_sync returns only the final result.""" + config = HarvestConfig( + keywords=["test"], + expand_keywords=False, + recommend_venues=False, + ) + + result = await pipeline.run_sync(config) + + assert isinstance(result, HarvestFinalResult) + assert result.status in ("success", "partial", "failed") + + +class TestHarvestPipelineContextManager: + """Tests for context manager protocol.""" + + @pytest.mark.asyncio + async def test_context_manager(self, tmp_path, mock_harvesters): + """Pipeline can be used as async context manager.""" + db_url = f"sqlite:///{tmp_path / 'test_ctx.db'}" + + async with HarvestPipeline(db_url=db_url) as pipeline: + # Inject mock harvesters + def get_mock_harvester(source): + return mock_harvesters.get(source) + + pipeline._get_harvester = get_mock_harvester + + config = HarvestConfig( + keywords=["test"], + expand_keywords=False, + recommend_venues=False, + ) + + result = await pipeline.run_sync(config) + assert result is not None + + # Pipeline should be closed after context exits + + +class TestHarvestPipelineRunId: + """Tests for run ID generation.""" + + def test_new_run_id_format(self): + """Run ID follows expected format.""" + run_id = HarvestPipeline.new_run_id() + + assert run_id.startswith("harvest-") + parts = run_id.split("-") + assert len(parts) == 4 # harvest-YYYYMMDD-HHMMSS-suffix + + def test_new_run_id_unique(self): + """Each run ID is unique.""" + ids = [HarvestPipeline.new_run_id() for _ in range(10)] + assert len(set(ids)) == 10 + + @pytest.mark.asyncio + async def test_custom_run_id(self, pipeline): + """Pipeline accepts custom run ID.""" + config = HarvestConfig( + keywords=["test"], + expand_keywords=False, + recommend_venues=False, + ) + + custom_id = "custom-run-123" + final_result = None + + async for item in pipeline.run(config, run_id=custom_id): + if isinstance(item, HarvestFinalResult): + final_result = item + + assert final_result.run_id == custom_id + + +class TestHarvestPipelineServices: + """Tests for lazy-loaded services.""" + + def test_query_rewriter_lazy_init(self, tmp_path): + """QueryRewriter is lazily initialized.""" + db_url = f"sqlite:///{tmp_path / 'test_lazy.db'}" + pipeline = HarvestPipeline(db_url=db_url) + + assert pipeline._query_rewriter is None + _ = pipeline.query_rewriter + assert pipeline._query_rewriter is not None + + def test_venue_recommender_lazy_init(self, tmp_path): + """VenueRecommender is lazily initialized.""" + db_url = f"sqlite:///{tmp_path / 'test_lazy.db'}" + pipeline = HarvestPipeline(db_url=db_url) + + assert pipeline._venue_recommender is None + _ = pipeline.venue_recommender + assert pipeline._venue_recommender is not None + + def test_deduplicator_lazy_init(self, tmp_path): + """PaperDeduplicator is lazily initialized.""" + db_url = f"sqlite:///{tmp_path / 'test_lazy.db'}" + pipeline = HarvestPipeline(db_url=db_url) + + assert pipeline._deduplicator is None + _ = pipeline.deduplicator + assert pipeline._deduplicator is not None + + def test_paper_store_lazy_init(self, tmp_path): + """PaperStore is lazily initialized.""" + db_url = f"sqlite:///{tmp_path / 'test_lazy.db'}" + pipeline = HarvestPipeline(db_url=db_url) + + assert pipeline._paper_store is None + _ = pipeline.paper_store + assert pipeline._paper_store is not None diff --git a/tests/integration/test_harvesters.py 
b/tests/integration/test_harvesters.py new file mode 100644 index 0000000..489337b --- /dev/null +++ b/tests/integration/test_harvesters.py @@ -0,0 +1,478 @@ +""" +Harvester integration tests with mocked API responses. + +Tests ArxivHarvester, SemanticScholarHarvester, and OpenAlexHarvester. +""" + +import pytest +from unittest.mock import AsyncMock, MagicMock, patch + +from paperbot.domain.harvest import HarvestSource +from paperbot.infrastructure.harvesters import ( + ArxivHarvester, + SemanticScholarHarvester, + OpenAlexHarvester, +) + + +# Sample API response data +ARXIV_ATOM_RESPONSE = """ + + + http://arxiv.org/abs/2301.12345v1 + Attention Is All You Need + We propose a new architecture called Transformer. + Ashish Vaswani + Noam Shazeer + 2023-01-15T00:00:00Z + + + + + http://arxiv.org/abs/2301.12346v1 + BERT: Pre-training of Deep Bidirectional Transformers + We introduce BERT for language understanding. + Jacob Devlin + 2023-01-16T00:00:00Z + + + + +""" + +S2_API_RESPONSE = { + "data": [ + { + "paperId": "s2-paper-001", + "title": "Deep Learning for NLP", + "abstract": "A comprehensive study on deep learning for NLP.", + "year": 2023, + "venue": "NeurIPS", + "citationCount": 150, + "authors": [{"name": "Alice Smith"}, {"name": "Bob Jones"}], + "publicationDate": "2023-12-01", + "externalIds": {"DOI": "10.1234/dl-nlp", "ArXiv": "2301.00001"}, + "fieldsOfStudy": ["Computer Science", "Machine Learning"], + "url": "https://www.semanticscholar.org/paper/abc123", + "openAccessPdf": {"url": "https://arxiv.org/pdf/2301.00001.pdf"}, + }, + { + "paperId": "s2-paper-002", + "title": "Reinforcement Learning in Robotics", + "abstract": "RL algorithms for robotic control.", + "year": 2022, + "venue": "ICRA", + "citationCount": 75, + "authors": [{"name": "Charlie Brown"}], + "publicationDate": "2022-06-15", + "externalIds": {"DOI": "10.1234/rl-robot"}, + "fieldsOfStudy": ["Robotics", "AI"], + "url": "https://www.semanticscholar.org/paper/def456", + "openAccessPdf": None, + }, + ] +} + +OPENALEX_API_RESPONSE = { + "meta": {"count": 2}, + "results": [ + { + "id": "https://openalex.org/W123456", + "title": "Computer Vision Advances", + "abstract_inverted_index": { + "Computer": [0], + "vision": [1], + "has": [2], + "advanced": [3], + "significantly": [4], + }, + "publication_year": 2024, + "cited_by_count": 200, + "authorships": [ + {"author": {"display_name": "David Wilson"}}, + {"author": {"display_name": "Eve Martinez"}}, + ], + "primary_location": {"source": {"display_name": "CVPR"}}, + "publication_date": "2024-01-10", + "ids": { + "doi": "https://doi.org/10.1234/cv-adv", + "openalex": "https://openalex.org/W123456", + }, + "open_access": {"oa_url": "https://example.com/paper.pdf"}, + "keywords": [{"display_name": "Computer Vision"}, {"display_name": "CNN"}], + "concepts": [ + {"display_name": "Computer Science"}, + {"display_name": "Image Processing"}, + ], + }, + ], +} + + +class TestArxivHarvester: + """Tests for ArxivHarvester.""" + + @pytest.fixture + def harvester(self): + """Create ArxivHarvester instance.""" + return ArxivHarvester() + + @pytest.mark.asyncio + async def test_search_success(self, harvester): + """Successful search returns papers.""" + with patch.object(harvester, "_get_session") as mock_session: + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.text = AsyncMock(return_value=ARXIV_ATOM_RESPONSE) + + mock_session.return_value.get = MagicMock( + return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response)) + ) + + result = await 
harvester.search("transformer", max_results=10) + + assert result.success + assert len(result.papers) == 2 + assert result.source == HarvestSource.ARXIV + + # Check first paper + paper1 = result.papers[0] + assert "Attention" in paper1.title + assert paper1.arxiv_id == "2301.12345" + assert paper1.source == HarvestSource.ARXIV + assert len(paper1.authors) >= 1 + + @pytest.mark.asyncio + async def test_search_api_error(self, harvester): + """API error returns error result.""" + with patch.object(harvester, "_get_session") as mock_session: + mock_response = AsyncMock() + mock_response.status = 500 + + mock_session.return_value.get = MagicMock( + return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response)) + ) + + result = await harvester.search("test") + + assert not result.success + assert result.error is not None + assert "500" in result.error + + @pytest.mark.asyncio + async def test_search_with_year_filter(self, harvester): + """Search with year filter builds correct query.""" + with patch.object(harvester, "_get_session") as mock_session: + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.text = AsyncMock(return_value=ARXIV_ATOM_RESPONSE) + + mock_get = MagicMock( + return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response)) + ) + mock_session.return_value.get = mock_get + + await harvester.search( + "deep learning", + year_from=2020, + year_to=2024, + max_results=50, + ) + + # Verify query includes year filter + call_args = mock_get.call_args + params = call_args[1]["params"] + assert "submittedDate" in params["search_query"] + assert "20200101" in params["search_query"] + assert "20241231" in params["search_query"] + + @pytest.mark.asyncio + async def test_source_property(self, harvester): + """source property returns ARXIV.""" + assert harvester.source == HarvestSource.ARXIV + + @pytest.mark.asyncio + async def test_close(self, harvester): + """close() releases resources.""" + mock_session = MagicMock() + mock_session.closed = False + mock_session.close = AsyncMock() + harvester._session = mock_session + + await harvester.close() + + mock_session.close.assert_called_once() + assert harvester._session is None + + +class TestSemanticScholarHarvester: + """Tests for SemanticScholarHarvester.""" + + @pytest.fixture + def harvester(self): + """Create SemanticScholarHarvester with mocked client.""" + mock_client = MagicMock() + mock_client.search_papers = AsyncMock(return_value=S2_API_RESPONSE["data"]) + return SemanticScholarHarvester(client=mock_client) + + @pytest.mark.asyncio + async def test_search_success(self, harvester): + """Successful search returns papers.""" + result = await harvester.search("deep learning", max_results=10) + + assert result.success + assert len(result.papers) == 2 + assert result.source == HarvestSource.SEMANTIC_SCHOLAR + + # Check first paper + paper1 = result.papers[0] + assert paper1.title == "Deep Learning for NLP" + assert paper1.semantic_scholar_id == "s2-paper-001" + assert paper1.doi == "10.1234/dl-nlp" + assert paper1.arxiv_id == "2301.00001" + assert paper1.year == 2023 + assert paper1.venue == "NeurIPS" + assert paper1.citation_count == 150 + assert len(paper1.authors) == 2 + assert paper1.pdf_url is not None + + @pytest.mark.asyncio + async def test_search_with_venue_filter(self, harvester): + """Search filters by venue.""" + # Return all papers, then filter + result = await harvester.search( + "learning", + venues=["NeurIPS"], + max_results=10, + ) + + # Only NeurIPS paper should be returned + 
assert all("NeurIPS" in (p.venue or "").lower() or "neurips" in (p.venue or "").lower() + for p in result.papers if p.venue) + + @pytest.mark.asyncio + async def test_search_client_error(self, harvester): + """Client error returns error result.""" + harvester.client.search_papers = AsyncMock( + side_effect=Exception("API connection failed") + ) + + result = await harvester.search("test") + + assert not result.success + assert "API connection failed" in result.error + + @pytest.mark.asyncio + async def test_paper_without_optional_fields(self): + """Paper handles missing optional fields.""" + mock_client = MagicMock() + mock_client.search_papers = AsyncMock( + return_value=[ + { + "paperId": "minimal-paper", + "title": "Minimal Paper", + "abstract": None, + "year": None, + "venue": None, + "citationCount": None, + "authors": [], + "externalIds": None, + "fieldsOfStudy": None, + "openAccessPdf": None, + } + ] + ) + harvester = SemanticScholarHarvester(client=mock_client) + + result = await harvester.search("test") + + assert result.success + paper = result.papers[0] + assert paper.title == "Minimal Paper" + assert paper.abstract == "" + assert paper.citation_count == 0 + assert paper.authors == [] + + @pytest.mark.asyncio + async def test_source_property(self, harvester): + """source property returns SEMANTIC_SCHOLAR.""" + assert harvester.source == HarvestSource.SEMANTIC_SCHOLAR + + +class TestOpenAlexHarvester: + """Tests for OpenAlexHarvester.""" + + @pytest.fixture + def harvester(self): + """Create OpenAlexHarvester instance.""" + return OpenAlexHarvester(email="test@example.com") + + @pytest.mark.asyncio + async def test_search_success(self, harvester): + """Successful search returns papers.""" + with patch.object(harvester, "_get_session") as mock_session: + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.json = AsyncMock(return_value=OPENALEX_API_RESPONSE) + + mock_session.return_value.get = MagicMock( + return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response)) + ) + + result = await harvester.search("computer vision", max_results=10) + + assert result.success + assert len(result.papers) == 1 + assert result.source == HarvestSource.OPENALEX + assert result.total_found == 2 # From meta.count + + # Check paper details + paper = result.papers[0] + assert paper.title == "Computer Vision Advances" + assert paper.openalex_id == "W123456" + assert paper.doi == "10.1234/cv-adv" + assert paper.year == 2024 + assert paper.venue == "CVPR" + assert paper.citation_count == 200 + assert len(paper.authors) == 2 + assert paper.pdf_url is not None + + @pytest.mark.asyncio + async def test_search_api_error(self, harvester): + """API error returns error result.""" + with patch.object(harvester, "_get_session") as mock_session: + mock_response = AsyncMock() + mock_response.status = 429 # Rate limit + + mock_session.return_value.get = MagicMock( + return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response)) + ) + + result = await harvester.search("test") + + assert not result.success + assert "429" in result.error + + @pytest.mark.asyncio + async def test_search_with_year_filter(self, harvester): + """Search with year filter includes correct params.""" + with patch.object(harvester, "_get_session") as mock_session: + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.json = AsyncMock(return_value=OPENALEX_API_RESPONSE) + + mock_get = MagicMock( + return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response)) + ) + 
mock_session.return_value.get = mock_get + + await harvester.search( + "test", + year_from=2020, + year_to=2024, + ) + + # Verify filter params + call_args = mock_get.call_args + params = call_args[1]["params"] + assert "filter" in params + assert "publication_year" in params["filter"] + + @pytest.mark.asyncio + async def test_abstract_reconstruction(self, harvester): + """Abstract is reconstructed from inverted index.""" + with patch.object(harvester, "_get_session") as mock_session: + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.json = AsyncMock(return_value=OPENALEX_API_RESPONSE) + + mock_session.return_value.get = MagicMock( + return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response)) + ) + + result = await harvester.search("test") + + paper = result.papers[0] + assert "Computer vision has advanced significantly" == paper.abstract + + @pytest.mark.asyncio + async def test_email_polite_pool(self, harvester): + """Email is included for polite pool.""" + with patch.object(harvester, "_get_session") as mock_session: + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.json = AsyncMock(return_value=OPENALEX_API_RESPONSE) + + mock_get = MagicMock( + return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response)) + ) + mock_session.return_value.get = mock_get + + await harvester.search("test") + + call_args = mock_get.call_args + params = call_args[1]["params"] + assert params.get("mailto") == "test@example.com" + + @pytest.mark.asyncio + async def test_source_property(self, harvester): + """source property returns OPENALEX.""" + assert harvester.source == HarvestSource.OPENALEX + + @pytest.mark.asyncio + async def test_close(self, harvester): + """close() releases resources.""" + mock_session = MagicMock() + mock_session.closed = False + mock_session.close = AsyncMock() + harvester._session = mock_session + + await harvester.close() + + mock_session.close.assert_called_once() + assert harvester._session is None + + +class TestHarvesterInterface: + """Tests to verify all harvesters implement the same interface.""" + + @pytest.mark.asyncio + async def test_all_harvesters_have_source_property(self): + """All harvesters have source property.""" + harvesters = [ + ArxivHarvester(), + SemanticScholarHarvester(), + OpenAlexHarvester(), + ] + + for harvester in harvesters: + assert hasattr(harvester, "source") + assert isinstance(harvester.source, HarvestSource) + + @pytest.mark.asyncio + async def test_all_harvesters_have_search_method(self): + """All harvesters have async search method.""" + harvesters = [ + ArxivHarvester(), + SemanticScholarHarvester(), + OpenAlexHarvester(), + ] + + for harvester in harvesters: + assert hasattr(harvester, "search") + import inspect + assert inspect.iscoroutinefunction(harvester.search) + + @pytest.mark.asyncio + async def test_all_harvesters_have_close_method(self): + """All harvesters have async close method.""" + harvesters = [ + ArxivHarvester(), + SemanticScholarHarvester(), + OpenAlexHarvester(), + ] + + for harvester in harvesters: + assert hasattr(harvester, "close") + import inspect + assert inspect.iscoroutinefunction(harvester.close) diff --git a/tests/integration/test_paper_store.py b/tests/integration/test_paper_store.py new file mode 100644 index 0000000..08cd029 --- /dev/null +++ b/tests/integration/test_paper_store.py @@ -0,0 +1,580 @@ +""" +PaperStore integration tests. + +Tests paper storage, deduplication, search, and library functionality. 
+""" + +import pytest +from datetime import datetime, timezone + +from paperbot.domain.harvest import HarvestedPaper, HarvestSource +from paperbot.infrastructure.stores.paper_store import PaperStore, paper_to_dict + + +@pytest.fixture +def paper_store(tmp_path): + """Create a PaperStore with a temporary SQLite database.""" + db_url = f"sqlite:///{tmp_path / 'test_papers.db'}" + store = PaperStore(db_url=db_url, auto_create_schema=True) + yield store + store.close() + + +class TestPaperStoreUpsert: + """Tests for paper upsert functionality.""" + + def test_upsert_single_paper(self, paper_store): + """Upsert a single paper.""" + paper = HarvestedPaper( + title="Test Paper", + source=HarvestSource.ARXIV, + abstract="Test abstract", + authors=["Alice", "Bob"], + doi="10.1234/test", + year=2023, + citation_count=10, + ) + + new_count, updated_count = paper_store.upsert_papers_batch([paper]) + + assert new_count == 1 + assert updated_count == 0 + + def test_upsert_multiple_papers(self, paper_store): + """Upsert multiple papers.""" + papers = [ + HarvestedPaper( + title=f"Paper {i}", + source=HarvestSource.ARXIV, + doi=f"10.1234/paper{i}", + year=2023, + ) + for i in range(5) + ] + + new_count, updated_count = paper_store.upsert_papers_batch(papers) + + assert new_count == 5 + assert updated_count == 0 + assert paper_store.get_paper_count() == 5 + + def test_upsert_deduplicates_by_doi(self, paper_store): + """Papers with same DOI are deduplicated.""" + paper1 = HarvestedPaper( + title="Original Title", + source=HarvestSource.ARXIV, + doi="10.1234/same-doi", + citation_count=10, + ) + paper2 = HarvestedPaper( + title="Different Title", + source=HarvestSource.SEMANTIC_SCHOLAR, + doi="10.1234/same-doi", + citation_count=20, + ) + + new1, _ = paper_store.upsert_papers_batch([paper1]) + new2, updated2 = paper_store.upsert_papers_batch([paper2]) + + assert new1 == 1 + assert new2 == 0 + assert updated2 == 1 + assert paper_store.get_paper_count() == 1 + + def test_upsert_deduplicates_by_arxiv_id(self, paper_store): + """Papers with same arXiv ID are deduplicated.""" + paper1 = HarvestedPaper( + title="Paper 1", + source=HarvestSource.ARXIV, + arxiv_id="2301.12345", + ) + paper2 = HarvestedPaper( + title="Paper 1 Variant", + source=HarvestSource.SEMANTIC_SCHOLAR, + arxiv_id="2301.12345", + doi="10.1234/new-doi", # New identifier + ) + + paper_store.upsert_papers_batch([paper1]) + new_count, updated_count = paper_store.upsert_papers_batch([paper2]) + + assert new_count == 0 + assert updated_count == 1 + + # DOI should be merged into existing record + papers, _ = paper_store.search_papers(query="Paper 1") + assert len(papers) == 1 + assert papers[0].doi == "10.1234/new-doi" + + def test_upsert_deduplicates_by_title_hash(self, paper_store): + """Papers with same normalized title are deduplicated.""" + paper1 = HarvestedPaper( + title="Deep Learning for NLP", + source=HarvestSource.ARXIV, + ) + paper2 = HarvestedPaper( + title="DEEP LEARNING FOR NLP", # Same title, different case + source=HarvestSource.OPENALEX, + doi="10.1234/dedup-test", + ) + + paper_store.upsert_papers_batch([paper1]) + new_count, updated_count = paper_store.upsert_papers_batch([paper2]) + + assert new_count == 0 + assert updated_count == 1 + assert paper_store.get_paper_count() == 1 + + def test_upsert_merges_metadata(self, paper_store): + """Upsert merges metadata from duplicate papers.""" + paper1 = HarvestedPaper( + title="Merge Test", + source=HarvestSource.ARXIV, + doi="10.1234/merge", + abstract="Short", + citation_count=10, + 
keywords=["ML"], + ) + paper2 = HarvestedPaper( + title="Merge Test", + source=HarvestSource.SEMANTIC_SCHOLAR, + doi="10.1234/merge", + abstract="A much longer abstract with more details", + citation_count=20, + keywords=["AI"], + semantic_scholar_id="s2-123", + ) + + paper_store.upsert_papers_batch([paper1]) + paper_store.upsert_papers_batch([paper2]) + + papers, _ = paper_store.search_papers(query="Merge Test") + assert len(papers) == 1 + paper = papers[0] + + # Longer abstract preserved + assert "longer" in paper.abstract + # Higher citation count preserved + assert paper.citation_count == 20 + # New identifier merged + assert paper.semantic_scholar_id == "s2-123" + + +class TestPaperStoreSearch: + """Tests for paper search functionality.""" + + @pytest.fixture(autouse=True) + def setup_papers(self, paper_store): + """Add test papers to the store.""" + self.store = paper_store + papers = [ + HarvestedPaper( + title="Deep Learning for Natural Language Processing", + source=HarvestSource.ARXIV, + abstract="A study on transformers and attention mechanisms", + doi="10.1234/nlp", + year=2023, + venue="NeurIPS", + citation_count=100, + ), + HarvestedPaper( + title="Computer Vision with Convolutional Networks", + source=HarvestSource.SEMANTIC_SCHOLAR, + abstract="CNN architectures for image classification", + doi="10.1234/cv", + year=2022, + venue="CVPR", + citation_count=200, + ), + HarvestedPaper( + title="Reinforcement Learning in Robotics", + source=HarvestSource.OPENALEX, + abstract="RL algorithms for robot control", + doi="10.1234/rl", + year=2024, + venue="ICRA", + citation_count=50, + ), + HarvestedPaper( + title="Security Analysis of Machine Learning Systems", + source=HarvestSource.ARXIV, + abstract="Adversarial attacks on deep learning models", + doi="10.1234/security", + year=2023, + venue="CCS", + citation_count=75, + ), + ] + paper_store.upsert_papers_batch(papers) + + def test_search_by_query(self): + """Search papers by query string.""" + papers, total = self.store.search_papers(query="deep learning") + + assert total >= 1 + assert any("Deep Learning" in p.title for p in papers) + + def test_search_by_year_range(self): + """Search papers within year range.""" + papers, total = self.store.search_papers(year_from=2023, year_to=2024) + + assert all(2023 <= p.year <= 2024 for p in papers) + assert total >= 2 + + def test_search_by_venue(self): + """Search papers by venue.""" + papers, total = self.store.search_papers(venues=["NeurIPS"]) + + assert total >= 1 + assert all("NeurIPS" in (p.venue or "") for p in papers) + + def test_search_by_min_citations(self): + """Search papers with minimum citations.""" + papers, total = self.store.search_papers(min_citations=100) + + assert all(p.citation_count >= 100 for p in papers) + assert total >= 1 + + def test_search_by_source(self): + """Search papers by source.""" + papers, total = self.store.search_papers(sources=["arxiv"]) + + assert all(p.primary_source == "arxiv" for p in papers) + + def test_search_sort_by_citations(self): + """Search results sorted by citation count.""" + papers, _ = self.store.search_papers( + sort_by="citation_count", sort_order="desc" + ) + + # Verify descending order + for i in range(len(papers) - 1): + assert (papers[i].citation_count or 0) >= (papers[i + 1].citation_count or 0) + + def test_search_sort_by_year(self): + """Search results sorted by year.""" + papers, _ = self.store.search_papers(sort_by="year", sort_order="asc") + + # Verify ascending order + for i in range(len(papers) - 1): + if papers[i].year 
and papers[i + 1].year: + assert papers[i].year <= papers[i + 1].year + + def test_search_pagination(self): + """Search with pagination.""" + all_papers, total = self.store.search_papers(limit=100) + + # Get first page + page1, _ = self.store.search_papers(limit=2, offset=0) + assert len(page1) == 2 + + # Get second page + page2, _ = self.store.search_papers(limit=2, offset=2) + + # Pages should not overlap + page1_ids = {p.id for p in page1} + page2_ids = {p.id for p in page2} + assert page1_ids.isdisjoint(page2_ids) + + def test_search_combined_filters(self): + """Search with multiple filters combined.""" + papers, total = self.store.search_papers( + query="learning", + year_from=2023, + min_citations=50, + sort_by="citation_count", + sort_order="desc", + ) + + for paper in papers: + assert paper.year >= 2023 + assert paper.citation_count >= 50 + + def test_search_no_results(self): + """Search with no matching results.""" + papers, total = self.store.search_papers(query="xyznonexistent123") + + assert papers == [] + assert total == 0 + + +class TestPaperStoreHarvestRun: + """Tests for harvest run tracking.""" + + def test_create_harvest_run(self, paper_store): + """Create a harvest run record.""" + run = paper_store.create_harvest_run( + run_id="test-run-001", + keywords=["machine learning", "deep learning"], + venues=["NeurIPS", "ICML"], + sources=["arxiv", "semantic_scholar"], + max_results_per_source=50, + ) + + assert run.run_id == "test-run-001" + assert run.status == "running" + assert run.get_keywords() == ["machine learning", "deep learning"] + assert run.get_venues() == ["NeurIPS", "ICML"] + assert run.get_sources() == ["arxiv", "semantic_scholar"] + assert run.max_results_per_source == 50 + + def test_update_harvest_run(self, paper_store): + """Update a harvest run record.""" + paper_store.create_harvest_run( + run_id="test-run-002", + keywords=["test"], + venues=[], + sources=["arxiv"], + max_results_per_source=50, + ) + + updated = paper_store.update_harvest_run( + run_id="test-run-002", + status="success", + papers_found=100, + papers_new=80, + papers_deduplicated=20, + ) + + assert updated is not None + assert updated.status == "success" + assert updated.papers_found == 100 + assert updated.papers_new == 80 + assert updated.papers_deduplicated == 20 + assert updated.ended_at is not None + + def test_update_harvest_run_with_errors(self, paper_store): + """Update harvest run with error information.""" + paper_store.create_harvest_run( + run_id="test-run-003", + keywords=["test"], + venues=[], + sources=["arxiv", "semantic_scholar"], + max_results_per_source=50, + ) + + errors = {"semantic_scholar": "Rate limit exceeded"} + updated = paper_store.update_harvest_run( + run_id="test-run-003", + status="partial", + errors=errors, + ) + + assert updated.status == "partial" + assert updated.get_errors() == errors + + def test_get_harvest_run(self, paper_store): + """Retrieve a harvest run by ID.""" + paper_store.create_harvest_run( + run_id="test-run-004", + keywords=["retrieval test"], + venues=["SIGIR"], + sources=["openalex"], + max_results_per_source=25, + ) + + run = paper_store.get_harvest_run("test-run-004") + + assert run is not None + assert run.run_id == "test-run-004" + assert run.get_keywords() == ["retrieval test"] + + def test_get_harvest_run_not_found(self, paper_store): + """Get non-existent harvest run returns None.""" + run = paper_store.get_harvest_run("nonexistent-run") + assert run is None + + def test_list_harvest_runs(self, paper_store): + """List harvest 
runs.""" + for i in range(3): + paper_store.create_harvest_run( + run_id=f"list-test-{i}", + keywords=[f"keyword{i}"], + venues=[], + sources=["arxiv"], + max_results_per_source=50, + ) + + runs = paper_store.list_harvest_runs(limit=10) + + assert len(runs) >= 3 + # Should be sorted by started_at descending + for i in range(len(runs) - 1): + if runs[i].started_at and runs[i + 1].started_at: + assert runs[i].started_at >= runs[i + 1].started_at + + def test_list_harvest_runs_by_status(self, paper_store): + """List harvest runs filtered by status.""" + paper_store.create_harvest_run( + run_id="status-test-1", + keywords=["test"], + venues=[], + sources=["arxiv"], + max_results_per_source=50, + ) + paper_store.update_harvest_run("status-test-1", status="success") + + paper_store.create_harvest_run( + run_id="status-test-2", + keywords=["test"], + venues=[], + sources=["arxiv"], + max_results_per_source=50, + ) + # Remains "running" + + success_runs = paper_store.list_harvest_runs(status="success") + running_runs = paper_store.list_harvest_runs(status="running") + + assert any(r.run_id == "status-test-1" for r in success_runs) + assert any(r.run_id == "status-test-2" for r in running_runs) + + +class TestPaperStoreLibrary: + """Tests for user library functionality.""" + + def test_get_paper_by_id(self, paper_store): + """Get paper by ID.""" + paper = HarvestedPaper( + title="Get By ID Test", + source=HarvestSource.ARXIV, + doi="10.1234/getbyid", + ) + paper_store.upsert_papers_batch([paper]) + + papers, _ = paper_store.search_papers(query="Get By ID") + assert len(papers) == 1 + + retrieved = paper_store.get_paper_by_id(papers[0].id) + assert retrieved is not None + assert retrieved.title == "Get By ID Test" + + def test_get_paper_by_id_not_found(self, paper_store): + """Get non-existent paper returns None.""" + paper = paper_store.get_paper_by_id(99999) + assert paper is None + + def test_paper_to_dict(self, paper_store): + """paper_to_dict converts model correctly.""" + paper = HarvestedPaper( + title="Dict Test", + source=HarvestSource.SEMANTIC_SCHOLAR, + abstract="Test abstract", + authors=["Alice", "Bob"], + doi="10.1234/dict", + year=2023, + venue="ICML", + citation_count=42, + keywords=["ML"], + fields_of_study=["CS"], + ) + paper_store.upsert_papers_batch([paper]) + + papers, _ = paper_store.search_papers(query="Dict Test") + result = paper_to_dict(papers[0]) + + assert result["title"] == "Dict Test" + assert result["abstract"] == "Test abstract" + assert result["authors"] == ["Alice", "Bob"] + assert result["doi"] == "10.1234/dict" + assert result["year"] == 2023 + assert result["venue"] == "ICML" + assert result["citation_count"] == 42 + assert result["primary_source"] == "semantic_scholar" + + def test_get_paper_count(self, paper_store): + """Get total paper count.""" + initial_count = paper_store.get_paper_count() + + papers = [ + HarvestedPaper( + title=f"Count Test {i}", + source=HarvestSource.ARXIV, + doi=f"10.1234/count{i}", + ) + for i in range(3) + ] + paper_store.upsert_papers_batch(papers) + + new_count = paper_store.get_paper_count() + assert new_count == initial_count + 3 + + +class TestPaperStoreEdgeCases: + """Tests for edge cases and error handling.""" + + def test_upsert_empty_list(self, paper_store): + """Upsert empty list does nothing.""" + new_count, updated_count = paper_store.upsert_papers_batch([]) + + assert new_count == 0 + assert updated_count == 0 + + def test_upsert_paper_without_identifiers(self, paper_store): + """Upsert paper with only title (uses title 
hash).""" + paper = HarvestedPaper( + title="No Identifiers Paper", + source=HarvestSource.ARXIV, + ) + + new_count, _ = paper_store.upsert_papers_batch([paper]) + assert new_count == 1 + + # Second upsert with same title should update + paper2 = HarvestedPaper( + title="No Identifiers Paper", + source=HarvestSource.SEMANTIC_SCHOLAR, + citation_count=10, + ) + + new_count, updated_count = paper_store.upsert_papers_batch([paper2]) + assert new_count == 0 + assert updated_count == 1 + + def test_search_with_special_characters(self, paper_store): + """Search handles special characters.""" + paper = HarvestedPaper( + title="Test: A Paper with Special (Characters) & Symbols!", + source=HarvestSource.ARXIV, + doi="10.1234/special", + ) + paper_store.upsert_papers_batch([paper]) + + # Search with part of title (single word matches more reliably) + papers, total = paper_store.search_papers(query="Special") + assert total >= 1 + assert any("Special" in p.title for p in papers) + + def test_upsert_paper_with_unicode(self, paper_store): + """Upsert paper with unicode characters.""" + paper = HarvestedPaper( + title="机器学习论文 - Machine Learning Paper", + source=HarvestSource.ARXIV, + abstract="This paper discusses 深度学习 (deep learning)", + authors=["张三", "李四"], + doi="10.1234/unicode", + ) + + new_count, _ = paper_store.upsert_papers_batch([paper]) + assert new_count == 1 + + papers, _ = paper_store.search_papers(query="Machine Learning") + assert len(papers) == 1 + assert "机器学习" in papers[0].title + + def test_upsert_paper_with_long_abstract(self, paper_store): + """Upsert paper with very long abstract.""" + long_abstract = "Lorem ipsum " * 1000 # ~12000 characters + + paper = HarvestedPaper( + title="Long Abstract Paper", + source=HarvestSource.ARXIV, + abstract=long_abstract, + doi="10.1234/long", + ) + + new_count, _ = paper_store.upsert_papers_batch([paper]) + assert new_count == 1 + + papers, _ = paper_store.search_papers(query="Long Abstract") + assert papers[0].abstract == long_abstract diff --git a/tests/unit/test_harvested_paper.py b/tests/unit/test_harvested_paper.py new file mode 100644 index 0000000..5ec732d --- /dev/null +++ b/tests/unit/test_harvested_paper.py @@ -0,0 +1,328 @@ +""" +HarvestedPaper domain model unit tests. 
+""" + +import pytest + +from paperbot.domain.harvest import ( + HarvestedPaper, + HarvestResult, + HarvestRunResult, + HarvestSource, +) + + +class TestHarvestedPaper: + """Tests for HarvestedPaper data model.""" + + def test_create_minimal_paper(self): + """Create paper with only required fields.""" + paper = HarvestedPaper( + title="Test Paper", + source=HarvestSource.ARXIV, + ) + assert paper.title == "Test Paper" + assert paper.source == HarvestSource.ARXIV + assert paper.abstract == "" + assert paper.authors == [] + assert paper.doi is None + assert paper.citation_count == 0 + + def test_create_full_paper(self): + """Create paper with all fields.""" + paper = HarvestedPaper( + title="Full Paper", + source=HarvestSource.SEMANTIC_SCHOLAR, + abstract="This is an abstract.", + authors=["Alice", "Bob"], + doi="10.1234/test", + arxiv_id="2301.12345", + semantic_scholar_id="s2-123", + openalex_id="W12345", + year=2023, + venue="NeurIPS", + publication_date="2023-12-01", + citation_count=100, + url="https://example.com/paper", + pdf_url="https://example.com/paper.pdf", + keywords=["ML", "AI"], + fields_of_study=["Computer Science"], + source_rank=1, + ) + + assert paper.title == "Full Paper" + assert paper.source == HarvestSource.SEMANTIC_SCHOLAR + assert paper.abstract == "This is an abstract." + assert paper.authors == ["Alice", "Bob"] + assert paper.doi == "10.1234/test" + assert paper.arxiv_id == "2301.12345" + assert paper.semantic_scholar_id == "s2-123" + assert paper.openalex_id == "W12345" + assert paper.year == 2023 + assert paper.venue == "NeurIPS" + assert paper.publication_date == "2023-12-01" + assert paper.citation_count == 100 + assert paper.url == "https://example.com/paper" + assert paper.pdf_url == "https://example.com/paper.pdf" + assert paper.keywords == ["ML", "AI"] + assert paper.fields_of_study == ["Computer Science"] + assert paper.source_rank == 1 + + def test_compute_title_hash_basic(self): + """Title hash normalizes and hashes correctly.""" + paper = HarvestedPaper( + title="Deep Learning for NLP", + source=HarvestSource.ARXIV, + ) + hash1 = paper.compute_title_hash() + + # Same title should produce same hash + paper2 = HarvestedPaper( + title="Deep Learning for NLP", + source=HarvestSource.OPENALEX, + ) + assert paper2.compute_title_hash() == hash1 + + def test_compute_title_hash_case_insensitive(self): + """Title hash is case-insensitive.""" + paper1 = HarvestedPaper(title="Deep Learning", source=HarvestSource.ARXIV) + paper2 = HarvestedPaper(title="DEEP LEARNING", source=HarvestSource.ARXIV) + paper3 = HarvestedPaper(title="deep learning", source=HarvestSource.ARXIV) + + assert paper1.compute_title_hash() == paper2.compute_title_hash() + assert paper2.compute_title_hash() == paper3.compute_title_hash() + + def test_compute_title_hash_ignores_punctuation(self): + """Title hash ignores punctuation.""" + paper1 = HarvestedPaper(title="Deep Learning", source=HarvestSource.ARXIV) + paper2 = HarvestedPaper(title="Deep, Learning!", source=HarvestSource.ARXIV) + paper3 = HarvestedPaper(title="Deep-Learning?", source=HarvestSource.ARXIV) + + # All should have same hash after removing punctuation + assert paper1.compute_title_hash() == paper2.compute_title_hash() + # Note: hyphens are removed, making it "deeplearning" vs "deep learning" + # This might differ, which is intentional for similar titles + + def test_compute_title_hash_normalizes_whitespace(self): + """Title hash normalizes whitespace.""" + paper1 = HarvestedPaper(title="Deep Learning", 
source=HarvestSource.ARXIV) + paper2 = HarvestedPaper(title="Deep Learning", source=HarvestSource.ARXIV) + paper3 = HarvestedPaper(title=" Deep Learning ", source=HarvestSource.ARXIV) + + assert paper1.compute_title_hash() == paper2.compute_title_hash() + assert paper2.compute_title_hash() == paper3.compute_title_hash() + + def test_to_dict(self): + """to_dict returns correct dictionary.""" + paper = HarvestedPaper( + title="Test", + source=HarvestSource.ARXIV, + doi="10.1234/test", + year=2023, + ) + result = paper.to_dict() + + assert result["title"] == "Test" + assert result["source"] == "arxiv" + assert result["doi"] == "10.1234/test" + assert result["year"] == 2023 + assert "title_hash" in result + + def test_from_dict(self): + """from_dict creates paper from dictionary.""" + data = { + "title": "From Dict Paper", + "source": "semantic_scholar", + "abstract": "An abstract", + "authors": ["Author1"], + "doi": "10.1234/fromdict", + "year": 2024, + "citation_count": 50, + } + + paper = HarvestedPaper.from_dict(data) + + assert paper.title == "From Dict Paper" + assert paper.source == HarvestSource.SEMANTIC_SCHOLAR + assert paper.abstract == "An abstract" + assert paper.authors == ["Author1"] + assert paper.doi == "10.1234/fromdict" + assert paper.year == 2024 + assert paper.citation_count == 50 + + def test_from_dict_with_source_enum(self): + """from_dict handles source as enum.""" + data = { + "title": "Test", + "source": HarvestSource.OPENALEX, + } + + paper = HarvestedPaper.from_dict(data) + assert paper.source == HarvestSource.OPENALEX + + def test_roundtrip_dict(self): + """to_dict and from_dict roundtrip preserves data.""" + original = HarvestedPaper( + title="Roundtrip Test", + source=HarvestSource.ARXIV, + abstract="Test abstract", + authors=["Alice", "Bob"], + doi="10.1234/roundtrip", + arxiv_id="2301.12345", + year=2023, + venue="ICML", + citation_count=42, + keywords=["ML", "Test"], + fields_of_study=["CS"], + ) + + data = original.to_dict() + restored = HarvestedPaper.from_dict(data) + + assert restored.title == original.title + assert restored.source == original.source + assert restored.abstract == original.abstract + assert restored.authors == original.authors + assert restored.doi == original.doi + assert restored.arxiv_id == original.arxiv_id + assert restored.year == original.year + assert restored.venue == original.venue + assert restored.citation_count == original.citation_count + + +class TestHarvestSource: + """Tests for HarvestSource enum.""" + + def test_source_values(self): + """Source enum has correct string values.""" + assert HarvestSource.ARXIV.value == "arxiv" + assert HarvestSource.SEMANTIC_SCHOLAR.value == "semantic_scholar" + assert HarvestSource.OPENALEX.value == "openalex" + + def test_source_is_string(self): + """Source enum inherits from str.""" + assert isinstance(HarvestSource.ARXIV, str) + assert HarvestSource.ARXIV == "arxiv" + + def test_source_from_string(self): + """Source can be created from string.""" + source = HarvestSource("arxiv") + assert source == HarvestSource.ARXIV + + +class TestHarvestResult: + """Tests for HarvestResult data model.""" + + def test_success_result(self): + """Success result has no error.""" + result = HarvestResult( + source=HarvestSource.ARXIV, + papers=[ + HarvestedPaper(title="Paper 1", source=HarvestSource.ARXIV), + ], + total_found=1, + ) + + assert result.success is True + assert result.error is None + assert len(result.papers) == 1 + assert result.total_found == 1 + + def test_error_result(self): + """Error 
result has error message.""" + result = HarvestResult( + source=HarvestSource.SEMANTIC_SCHOLAR, + papers=[], + total_found=0, + error="API rate limit exceeded", + ) + + assert result.success is False + assert result.error == "API rate limit exceeded" + assert len(result.papers) == 0 + + def test_partial_result(self): + """Partial result can have both papers and error.""" + result = HarvestResult( + source=HarvestSource.OPENALEX, + papers=[ + HarvestedPaper(title="Paper 1", source=HarvestSource.OPENALEX), + ], + total_found=100, # More papers exist but couldn't be fetched + error="Timeout after 50 papers", + ) + + assert result.success is False + assert len(result.papers) == 1 + assert result.total_found == 100 + + +class TestHarvestRunResult: + """Tests for HarvestRunResult data model.""" + + def test_create_run_result(self): + """Create a complete run result.""" + from datetime import datetime, timezone + + now = datetime.now(timezone.utc) + + result = HarvestRunResult( + run_id="harvest-20260210-abc123", + status="success", + papers_found=150, + papers_new=100, + papers_deduplicated=50, + source_results={ + HarvestSource.ARXIV: HarvestResult( + source=HarvestSource.ARXIV, + papers=[], + total_found=50, + ), + HarvestSource.SEMANTIC_SCHOLAR: HarvestResult( + source=HarvestSource.SEMANTIC_SCHOLAR, + papers=[], + total_found=60, + ), + }, + started_at=now, + ended_at=now, + ) + + assert result.run_id == "harvest-20260210-abc123" + assert result.status == "success" + assert result.papers_found == 150 + assert result.papers_new == 100 + assert result.papers_deduplicated == 50 + + def test_to_dict(self): + """to_dict returns correct structure.""" + from datetime import datetime, timezone + + now = datetime.now(timezone.utc) + + result = HarvestRunResult( + run_id="test-run", + status="partial", + papers_found=100, + papers_new=80, + papers_deduplicated=20, + source_results={ + HarvestSource.ARXIV: HarvestResult( + source=HarvestSource.ARXIV, + papers=[HarvestedPaper(title="P1", source=HarvestSource.ARXIV)], + total_found=50, + ), + }, + started_at=now, + ) + + data = result.to_dict() + + assert data["run_id"] == "test-run" + assert data["status"] == "partial" + assert data["papers_found"] == 100 + assert data["papers_new"] == 80 + assert data["papers_deduplicated"] == 20 + assert "arxiv" in data["sources"] + assert data["sources"]["arxiv"]["papers"] == 1 + assert data["sources"]["arxiv"]["total_found"] == 50 diff --git a/tests/unit/test_paper_deduplicator.py b/tests/unit/test_paper_deduplicator.py new file mode 100644 index 0000000..9d770ff --- /dev/null +++ b/tests/unit/test_paper_deduplicator.py @@ -0,0 +1,292 @@ +""" +PaperDeduplicator unit tests. 
+""" + +import pytest + +from paperbot.domain.harvest import HarvestedPaper, HarvestSource +from paperbot.application.services.paper_deduplicator import PaperDeduplicator + + +class TestPaperDeduplicator: + """PaperDeduplicator tests.""" + + def setup_method(self): + """Reset deduplicator before each test.""" + self.deduplicator = PaperDeduplicator() + + def test_deduplicate_empty_list(self): + """Empty list returns empty result.""" + unique, count = self.deduplicator.deduplicate([]) + assert unique == [] + assert count == 0 + + def test_deduplicate_single_paper(self): + """Single paper returns unchanged.""" + paper = HarvestedPaper( + title="Test Paper", + source=HarvestSource.ARXIV, + doi="10.1234/test", + ) + unique, count = self.deduplicator.deduplicate([paper]) + assert len(unique) == 1 + assert count == 0 + assert unique[0].title == "Test Paper" + + def test_deduplicate_by_doi(self): + """Papers with same DOI are deduplicated.""" + paper1 = HarvestedPaper( + title="Paper Version 1", + source=HarvestSource.ARXIV, + doi="10.1234/same-doi", + abstract="Short abstract", + ) + paper2 = HarvestedPaper( + title="Paper Version 2", + source=HarvestSource.SEMANTIC_SCHOLAR, + doi="10.1234/same-doi", + abstract="A much longer and more detailed abstract", + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + assert count == 1 + # Longer abstract should be preserved + assert "longer" in unique[0].abstract + + def test_deduplicate_by_arxiv_id(self): + """Papers with same arXiv ID are deduplicated.""" + paper1 = HarvestedPaper( + title="Paper 1", + source=HarvestSource.ARXIV, + arxiv_id="2301.12345", + ) + paper2 = HarvestedPaper( + title="Paper 1 (variant)", + source=HarvestSource.SEMANTIC_SCHOLAR, + arxiv_id="2301.12345", + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + assert count == 1 + + def test_deduplicate_by_semantic_scholar_id(self): + """Papers with same Semantic Scholar ID are deduplicated.""" + paper1 = HarvestedPaper( + title="Paper A", + source=HarvestSource.SEMANTIC_SCHOLAR, + semantic_scholar_id="abc123", + ) + paper2 = HarvestedPaper( + title="Paper A", + source=HarvestSource.OPENALEX, + semantic_scholar_id="abc123", + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + assert count == 1 + + def test_deduplicate_by_openalex_id(self): + """Papers with same OpenAlex ID are deduplicated.""" + paper1 = HarvestedPaper( + title="Paper B", + source=HarvestSource.OPENALEX, + openalex_id="W12345", + ) + paper2 = HarvestedPaper( + title="Paper B", + source=HarvestSource.ARXIV, + openalex_id="W12345", + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + assert count == 1 + + def test_deduplicate_by_title_hash(self): + """Papers with same normalized title are deduplicated.""" + paper1 = HarvestedPaper( + title="Deep Learning for NLP", + source=HarvestSource.ARXIV, + ) + paper2 = HarvestedPaper( + title="DEEP LEARNING FOR NLP", # Same title, different case + source=HarvestSource.SEMANTIC_SCHOLAR, + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + assert count == 1 + + def test_deduplicate_merges_identifiers(self): + """Deduplication merges identifiers from duplicates.""" + paper1 = HarvestedPaper( + title="Test Paper", + source=HarvestSource.ARXIV, + arxiv_id="2301.12345", + ) + paper2 = HarvestedPaper( + title="Test Paper", + 
source=HarvestSource.SEMANTIC_SCHOLAR, + doi="10.1234/test", + semantic_scholar_id="s2-123", + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + assert unique[0].arxiv_id == "2301.12345" + assert unique[0].doi == "10.1234/test" + assert unique[0].semantic_scholar_id == "s2-123" + + def test_deduplicate_prefers_higher_citations(self): + """Higher citation count is preserved during merge.""" + paper1 = HarvestedPaper( + title="Cited Paper", + source=HarvestSource.ARXIV, + doi="10.1234/cited", + citation_count=10, + ) + paper2 = HarvestedPaper( + title="Cited Paper", + source=HarvestSource.SEMANTIC_SCHOLAR, + doi="10.1234/cited", + citation_count=50, + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + assert unique[0].citation_count == 50 + + def test_deduplicate_merges_keywords(self): + """Keywords from all duplicates are merged.""" + paper1 = HarvestedPaper( + title="ML Paper", + source=HarvestSource.ARXIV, + doi="10.1234/ml", + keywords=["deep learning", "neural network"], + ) + paper2 = HarvestedPaper( + title="ML Paper", + source=HarvestSource.OPENALEX, + doi="10.1234/ml", + keywords=["machine learning", "deep learning"], + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + keywords = set(unique[0].keywords) + assert "deep learning" in keywords + assert "neural network" in keywords + assert "machine learning" in keywords + + def test_deduplicate_prefers_longer_author_list(self): + """Longer author list is preserved.""" + paper1 = HarvestedPaper( + title="Multi-author Paper", + source=HarvestSource.ARXIV, + doi="10.1234/multi", + authors=["Alice", "Bob"], + ) + paper2 = HarvestedPaper( + title="Multi-author Paper", + source=HarvestSource.SEMANTIC_SCHOLAR, + doi="10.1234/multi", + authors=["Alice", "Bob", "Charlie", "Diana"], + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + assert len(unique[0].authors) == 4 + + def test_no_duplicates_different_papers(self): + """Different papers are not deduplicated.""" + paper1 = HarvestedPaper( + title="First Paper", + source=HarvestSource.ARXIV, + doi="10.1234/first", + ) + paper2 = HarvestedPaper( + title="Second Paper", + source=HarvestSource.SEMANTIC_SCHOLAR, + doi="10.1234/second", + ) + paper3 = HarvestedPaper( + title="Third Paper", + source=HarvestSource.OPENALEX, + arxiv_id="2301.99999", + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2, paper3]) + + assert len(unique) == 3 + assert count == 0 + + def test_is_duplicate_check(self): + """is_duplicate correctly identifies duplicates.""" + paper1 = HarvestedPaper( + title="Indexed Paper", + source=HarvestSource.ARXIV, + doi="10.1234/indexed", + ) + + # First, deduplicate to build index + self.deduplicator.deduplicate([paper1]) + + # Check duplicate + paper2 = HarvestedPaper( + title="Different Title", + source=HarvestSource.SEMANTIC_SCHOLAR, + doi="10.1234/indexed", # Same DOI + ) + assert self.deduplicator.is_duplicate(paper2) is True + + # Check non-duplicate + paper3 = HarvestedPaper( + title="New Paper", + source=HarvestSource.OPENALEX, + doi="10.1234/new", + ) + assert self.deduplicator.is_duplicate(paper3) is False + + def test_reset_clears_indexes(self): + """reset() clears all indexes.""" + paper = HarvestedPaper( + title="Reset Test", + source=HarvestSource.ARXIV, + doi="10.1234/reset", + ) + + self.deduplicator.deduplicate([paper]) + assert 
self.deduplicator.is_duplicate(paper) is True + + self.deduplicator.reset() + assert self.deduplicator.is_duplicate(paper) is False + + def test_case_insensitive_matching(self): + """ID matching is case-insensitive.""" + paper1 = HarvestedPaper( + title="Case Test", + source=HarvestSource.ARXIV, + doi="10.1234/UPPERCASE", + ) + paper2 = HarvestedPaper( + title="Case Test", + source=HarvestSource.SEMANTIC_SCHOLAR, + doi="10.1234/uppercase", # lowercase + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + assert count == 1 diff --git a/tests/unit/test_query_rewriter.py b/tests/unit/test_query_rewriter.py new file mode 100644 index 0000000..da10440 --- /dev/null +++ b/tests/unit/test_query_rewriter.py @@ -0,0 +1,136 @@ +""" +QueryRewriter unit tests. +""" + +import pytest + +from paperbot.application.services.query_rewriter import QueryRewriter + + +class TestQueryRewriter: + """QueryRewriter tests.""" + + def setup_method(self): + """Create fresh rewriter for each test.""" + self.rewriter = QueryRewriter() + + def test_rewrite_no_expansion(self): + """Query without abbreviations returns single item.""" + queries = self.rewriter.rewrite("deep learning") + assert queries == ["deep learning"] + + def test_rewrite_llm_expansion(self): + """LLM expands to large language model.""" + queries = self.rewriter.rewrite("LLM security") + assert len(queries) == 2 + assert "LLM security" in queries + assert "large language model security" in queries + + def test_rewrite_multiple_abbreviations(self): + """Multiple abbreviations are expanded.""" + queries = self.rewriter.rewrite("ML and NLP") + assert len(queries) == 2 + assert "ML and NLP" in queries + assert "machine learning and natural language processing" in queries + + def test_rewrite_case_insensitive(self): + """Abbreviation matching is case-insensitive.""" + queries = self.rewriter.rewrite("llm") + assert "large language model" in queries + + queries = self.rewriter.rewrite("LLM") + assert "large language model" in queries + + def test_rewrite_punctuation_handled(self): + """Punctuation doesn't prevent matching.""" + queries = self.rewriter.rewrite("What is LLM?") + assert len(queries) == 2 + # The expanded version should have the expansion + assert any("large language model" in q for q in queries) + + def test_expand_all_basic(self): + """expand_all expands list of keywords.""" + expanded = self.rewriter.expand_all(["ML", "deep learning"]) + + # Should include originals and expansions + assert "ML" in expanded or "machine learning" in expanded + assert "deep learning" in expanded + + def test_expand_all_deduplicates(self): + """expand_all removes duplicate expansions.""" + # If both "ML" and "machine learning" are provided, + # "machine learning" shouldn't appear twice + expanded = self.rewriter.expand_all(["ML", "machine learning"]) + + # Count occurrences of "machine learning" (normalized) + ml_count = sum(1 for k in expanded if self.rewriter.normalize(k) == "machine learning") + assert ml_count == 1 + + def test_normalize_basic(self): + """normalize applies standard transformations.""" + assert self.rewriter.normalize("Hello World") == "hello world" + assert self.rewriter.normalize(" Multiple Spaces ") == "multiple spaces" + assert self.rewriter.normalize("Special!@#Characters") == "special characters" + + def test_normalize_preserves_alphanumeric(self): + """normalize preserves letters and numbers.""" + assert self.rewriter.normalize("GPT4 model") == "gpt4 model" + assert 
self.rewriter.normalize("BERT-2022") == "bert 2022" + + def test_add_abbreviation(self): + """Custom abbreviation can be added.""" + self.rewriter.add_abbreviation("XYZ", "extended yellow zebra") + queries = self.rewriter.rewrite("XYZ test") + assert "extended yellow zebra test" in queries + + def test_get_expansion(self): + """get_expansion returns expansion for known abbreviations.""" + assert self.rewriter.get_expansion("llm") == "large language model" + assert self.rewriter.get_expansion("LLM") == "large language model" + assert self.rewriter.get_expansion("unknown") is None + + def test_default_abbreviations_exist(self): + """Default abbreviations are available.""" + known_abbrevs = ["llm", "ml", "dl", "nlp", "cv", "rl", "gan", "cnn", "rnn", "bert", "gpt", "rag"] + for abbrev in known_abbrevs: + assert self.rewriter.get_expansion(abbrev) is not None + + def test_custom_abbreviations_override(self): + """Custom abbreviations override defaults.""" + custom = {"llm": "custom large model"} + rewriter = QueryRewriter(abbreviations=custom) + + assert rewriter.get_expansion("llm") == "custom large model" + + def test_empty_query_returns_empty(self): + """Empty query returns single empty string.""" + queries = self.rewriter.rewrite("") + assert queries == [""] + + def test_expand_all_empty_list(self): + """Empty list returns empty result.""" + expanded = self.rewriter.expand_all([]) + assert expanded == [] + + def test_rewrite_preserves_original(self): + """Original query is always first in result.""" + queries = self.rewriter.rewrite("LLM for NLP") + assert queries[0] == "LLM for NLP" + + def test_common_expansions(self): + """Common AI/ML abbreviations expand correctly.""" + test_cases = [ + ("CNN", "convolutional neural network"), + ("RNN", "recurrent neural network"), + ("LSTM", "long short-term memory"), + ("VAE", "variational autoencoder"), + ("GAN", "generative adversarial network"), + ("RL", "reinforcement learning"), + ("RAG", "retrieval augmented generation"), + ("NER", "named entity recognition"), + ("QA", "question answering"), + ] + + for abbrev, expected in test_cases: + queries = self.rewriter.rewrite(abbrev) + assert expected in queries, f"Expected '{expected}' in expansion of '{abbrev}'" diff --git a/tests/unit/test_venue_recommender.py b/tests/unit/test_venue_recommender.py new file mode 100644 index 0000000..2828925 --- /dev/null +++ b/tests/unit/test_venue_recommender.py @@ -0,0 +1,175 @@ +""" +VenueRecommender unit tests. 
+""" + +import pytest + +from paperbot.application.services.venue_recommender import VenueRecommender + + +class TestVenueRecommender: + """VenueRecommender tests.""" + + def setup_method(self): + """Create fresh recommender for each test.""" + self.recommender = VenueRecommender() + + def test_recommend_security_keywords(self): + """Security keywords recommend security venues.""" + venues = self.recommender.recommend(["ransomware"]) + + assert len(venues) > 0 + # Should include top security venues + security_venues = {"CCS", "S&P", "USENIX Security", "NDSS"} + assert any(v in security_venues for v in venues) + + def test_recommend_ml_keywords(self): + """ML keywords recommend ML venues.""" + venues = self.recommender.recommend(["machine learning"]) + + assert len(venues) > 0 + ml_venues = {"NeurIPS", "ICML", "ICLR"} + assert any(v in ml_venues for v in venues) + + def test_recommend_nlp_keywords(self): + """NLP keywords recommend NLP venues.""" + venues = self.recommender.recommend(["natural language"]) + + assert len(venues) > 0 + nlp_venues = {"ACL", "EMNLP", "NAACL"} + assert any(v in nlp_venues for v in venues) + + def test_recommend_database_keywords(self): + """Database keywords recommend database venues.""" + venues = self.recommender.recommend(["database", "sql"]) + + assert len(venues) > 0 + db_venues = {"SIGMOD", "VLDB", "ICDE"} + assert any(v in db_venues for v in venues) + + def test_recommend_systems_keywords(self): + """Systems keywords recommend systems venues.""" + venues = self.recommender.recommend(["distributed systems"]) + + assert len(venues) > 0 + sys_venues = {"OSDI", "SOSP", "EuroSys", "NSDI"} + assert any(v in sys_venues for v in venues) + + def test_recommend_empty_keywords(self): + """Empty keywords return empty result.""" + venues = self.recommender.recommend([]) + assert venues == [] + + def test_recommend_unknown_keywords(self): + """Unknown keywords return empty result.""" + venues = self.recommender.recommend(["xyznonexistent123"]) + assert venues == [] + + def test_recommend_max_venues(self): + """max_venues limits output count.""" + venues = self.recommender.recommend(["security", "machine learning"], max_venues=3) + assert len(venues) <= 3 + + def test_recommend_default_max_venues(self): + """Default max_venues is 5.""" + venues = self.recommender.recommend(["security", "machine learning", "deep learning"]) + assert len(venues) <= 5 + + def test_recommend_multiple_keywords_combined(self): + """Multiple keywords combine scores.""" + # Single keyword + venues_single = self.recommender.recommend(["security"]) + + # Multiple related keywords should boost same venues + venues_multi = self.recommender.recommend(["security", "malware", "ransomware"]) + + # Both should return security venues at top + assert len(venues_single) > 0 + assert len(venues_multi) > 0 + + def test_recommend_case_insensitive(self): + """Keyword matching is case-insensitive.""" + venues_lower = self.recommender.recommend(["security"]) + venues_upper = self.recommender.recommend(["SECURITY"]) + venues_mixed = self.recommender.recommend(["Security"]) + + assert venues_lower == venues_upper == venues_mixed + + def test_recommend_partial_match(self): + """Partial keyword matches contribute to scores.""" + # "learning" should partially match "machine learning", "deep learning", etc. 
+ venues = self.recommender.recommend(["learning"]) + assert len(venues) > 0 + + def test_get_venues_for_domain(self): + """get_venues_for_domain returns specific domain venues.""" + venues = self.recommender.get_venues_for_domain("security") + assert "CCS" in venues + assert "S&P" in venues + + def test_get_venues_for_unknown_domain(self): + """Unknown domain returns empty list.""" + venues = self.recommender.get_venues_for_domain("unknown_domain_xyz") + assert venues == [] + + def test_add_mapping(self): + """Custom mapping can be added.""" + self.recommender.add_mapping("custom_topic", ["Venue1", "Venue2"]) + venues = self.recommender.get_venues_for_domain("custom_topic") + assert "Venue1" in venues + assert "Venue2" in venues + + def test_add_mapping_updates_recommend(self): + """Added mapping affects recommendations.""" + self.recommender.add_mapping("quantum", ["QIP", "Quantum"]) + venues = self.recommender.recommend(["quantum"]) + assert "QIP" in venues or "Quantum" in venues + + def test_custom_mappings_in_constructor(self): + """Custom mappings can be passed in constructor.""" + custom = {"custom_key": ["CustomVenue1", "CustomVenue2"]} + recommender = VenueRecommender(mappings=custom) + + venues = recommender.get_venues_for_domain("custom_key") + assert "CustomVenue1" in venues + assert "CustomVenue2" in venues + + def test_default_mappings_preserved_with_custom(self): + """Default mappings are preserved when custom mappings are added.""" + custom = {"new_domain": ["NewVenue"]} + recommender = VenueRecommender(mappings=custom) + + # Default mapping should still work + security_venues = recommender.get_venues_for_domain("security") + assert len(security_venues) > 0 + + # Custom mapping should also work + new_venues = recommender.get_venues_for_domain("new_domain") + assert "NewVenue" in new_venues + + def test_recommend_sorted_by_relevance(self): + """Venues are sorted by relevance score.""" + # Multiple keywords all pointing to security should rank security venues higher + venues = self.recommender.recommend( + ["security", "ransomware", "malware", "attack"] + ) + + # First venue should be a security venue + if venues: + security_venues = {"CCS", "S&P", "USENIX Security", "NDSS"} + assert venues[0] in security_venues + + def test_recommend_whitespace_handling(self): + """Keywords with extra whitespace are handled.""" + venues1 = self.recommender.recommend(["security"]) + venues2 = self.recommender.recommend([" security "]) + + assert venues1 == venues2 + + def test_recommend_empty_string_keyword(self): + """Empty string keyword is ignored.""" + venues = self.recommender.recommend(["", "security", ""]) + assert len(venues) > 0 + # Should still recommend security venues + security_venues = {"CCS", "S&P", "USENIX Security", "NDSS"} + assert any(v in security_venues for v in venues) diff --git a/web/package-lock.json b/web/package-lock.json index e051351..11995c3 100644 --- a/web/package-lock.json +++ b/web/package-lock.json @@ -3604,14 +3604,6 @@ "@types/react": "^19.2.0" } }, - "node_modules/@types/trusted-types": { - "version": "2.0.7", - "resolved": "https://registry.npmjs.org/@types/trusted-types/-/trusted-types-2.0.7.tgz", - "integrity": "sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==", - "license": "MIT", - "optional": true, - "peer": true - }, "node_modules/@types/unist": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", diff --git a/web/src/app/api/papers/[paperId]/save/route.ts 
b/web/src/app/api/papers/[paperId]/save/route.ts new file mode 100644 index 0000000..ea6a24c --- /dev/null +++ b/web/src/app/api/papers/[paperId]/save/route.ts @@ -0,0 +1,20 @@ +import { apiBaseUrl, proxyJson } from "../../../research/_base" + +export async function DELETE( + req: Request, + { params }: { params: Promise<{ paperId: string }> } +) { + const { paperId } = await params + const url = new URL(req.url) + const upstream = `${apiBaseUrl()}/api/papers/${paperId}/save${url.search}` + return proxyJson(req, upstream, "DELETE") +} + +export async function POST( + req: Request, + { params }: { params: Promise<{ paperId: string }> } +) { + const { paperId } = await params + const upstream = `${apiBaseUrl()}/api/papers/${paperId}/save` + return proxyJson(req, upstream, "POST") +} diff --git a/web/src/app/api/papers/library/route.ts b/web/src/app/api/papers/library/route.ts new file mode 100644 index 0000000..c1e7cb9 --- /dev/null +++ b/web/src/app/api/papers/library/route.ts @@ -0,0 +1,7 @@ +import { apiBaseUrl, proxyJson } from "../../research/_base" + +export async function GET(req: Request) { + const url = new URL(req.url) + const upstream = `${apiBaseUrl()}/api/papers/library${url.search}` + return proxyJson(req, upstream, "GET") +} diff --git a/web/src/components/research/ResearchDashboard.tsx b/web/src/components/research/ResearchDashboard.tsx index 84f481e..6d9c559 100644 --- a/web/src/components/research/ResearchDashboard.tsx +++ b/web/src/components/research/ResearchDashboard.tsx @@ -45,6 +45,7 @@ type MemoryItem = { type Paper = { paper_id: string title: string + abstract?: string year?: number venue?: string citation_count?: number @@ -383,23 +384,34 @@ export default function ResearchDashboard() { } } - async function sendFeedback(paperId: string, action: string, rank?: number) { + async function sendFeedback(paperId: string, action: string, rank?: number, paper?: Paper) { setLoading(true) setError(null) try { const contextRunId = contextPack?.context_run_id ?? null + const body: Record = { + user_id: userId, + track_id: activeTrackId, + paper_id: paperId, + action, + weight: 0.0, + context_run_id: contextRunId, + context_rank: typeof rank === "number" ? rank : undefined, + metadata: {}, + } + // Include paper metadata for save action + if (action === "save" && paper) { + body.paper_title = paper.title + body.paper_abstract = paper.abstract || "" + body.paper_authors = paper.authors || [] + body.paper_year = paper.year + body.paper_venue = paper.venue + body.paper_citation_count = paper.citation_count + body.paper_url = paper.url + } await fetchJson(`/api/research/papers/feedback`, { method: "POST", - body: JSON.stringify({ - user_id: userId, - track_id: activeTrackId, - paper_id: paperId, - action, - weight: 0.0, - context_run_id: contextRunId, - context_rank: typeof rank === "number" ? rank : undefined, - metadata: {}, - }), + body: JSON.stringify(body), headers: { "Content-Type": "application/json" }, }) await buildContext(false) @@ -740,7 +752,7 @@ export default function ResearchDashboard() { > Like - @@ -308,7 +332,7 @@ export default function SavedPapersList() { size="sm" variant="ghost" disabled={rowUpdating} - onClick={() => updateReadingStatus(paper.id, status, false, "unsave")} + onClick={() => unsavePaper(paper.id)} > {unsaving ? 
: "Unsave"} diff --git a/web/src/lib/api.ts b/web/src/lib/api.ts index 28af73b..ac43354 100644 --- a/web/src/lib/api.ts +++ b/web/src/lib/api.ts @@ -395,33 +395,25 @@ export async function fetchWikiConcepts(): Promise { } export async function fetchPapers(): Promise { - return [ - { - id: "attention-is-all-you-need", - title: "Attention Is All You Need", - venue: "NeurIPS 2017", - authors: "Vaswani et al.", - citations: "100k+", - status: "Reproduced", - tags: ["Transformer", "NLP"] - }, - { - id: "bert-pretraining", - title: "BERT: Pre-training of Deep Bidirectional Transformers", - venue: "NAACL 2019", - authors: "Devlin et al.", - citations: "80k+", - status: "analyzing", - tags: ["NLP", "Language Model"] - }, - { - id: "resnet-deep-residual", - title: "Deep Residual Learning for Image Recognition", - venue: "CVPR 2016", - authors: "He et al.", - citations: "150k+", - status: "pending", - tags: ["CV", "ResNet"] + try { + const res = await fetch(`${API_BASE_URL}/papers/library`) + if (!res.ok) { + console.error("Failed to fetch papers library:", res.status) + return [] } - ] + const data = await res.json() + // Transform backend response to frontend Paper type + return (data.papers || []).map((item: { paper: Record; action: string }) => ({ + id: String(item.paper.id), + title: item.paper.title || "Untitled", + venue: item.paper.venue || "Unknown", + authors: Array.isArray(item.paper.authors) ? item.paper.authors.join(", ") : "Unknown", + citations: item.paper.citation_count ? `${item.paper.citation_count}` : "0", + status: item.action === "save" ? "Saved" : "pending", + tags: Array.isArray(item.paper.fields_of_study) ? item.paper.fields_of_study.slice(0, 3) : [] + })) + } catch (e) { + console.error("Error fetching papers:", e) + return [] + } } From 29a00ddd369459ed377df4057ad6bfbcfdd02b0d Mon Sep 17 00:00:00 2001 From: boyu Date: Wed, 11 Feb 2026 11:01:11 +0100 Subject: [PATCH 2/3] feat(Harvest): add -- Paper Search and Storage Closes #26 Signed-off-by: LIU BOYU --- alembic/versions/0003_paper_registry.py | 19 +- ...tables.py => 0007_paper_harvest_tables.py} | 11 +- docs/architecture_overview.md | 840 ++++++++++++++++++ src/paperbot/api/main.py | 6 - src/paperbot/infrastructure/stores/models.py | 90 +- .../infrastructure/stores/paper_store.py | 17 - .../infrastructure/stores/research_store.py | 12 +- 7 files changed, 868 insertions(+), 127 deletions(-) rename alembic/versions/{0003_paper_harvest_tables.py => 0007_paper_harvest_tables.py} (96%) create mode 100644 docs/architecture_overview.md diff --git a/alembic/versions/0003_paper_registry.py b/alembic/versions/0003_paper_registry.py index 2d04a32..e7e543d 100644 --- a/alembic/versions/0003_paper_registry.py +++ b/alembic/versions/0003_paper_registry.py @@ -51,7 +51,11 @@ def _create_index(name: str, table: str, cols: list[str]) -> None: def upgrade() -> None: + # NOTE: The papers table may also be created by 0007_paper_harvest_tables with a different schema. + # Only create this version if the table doesn't exist. 
+ created_table = False if _is_offline() or not _has_table("papers"): + created_table = True op.create_table( "papers", sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), @@ -75,14 +79,19 @@ def upgrade() -> None: sa.UniqueConstraint("doi", name="uq_papers_doi"), ) + # Only create indexes for columns that exist in this schema version + # These indexes are always safe (columns exist in both schemas): _create_index("ix_papers_arxiv_id", "papers", ["arxiv_id"]) _create_index("ix_papers_doi", "papers", ["doi"]) - _create_index("ix_papers_title", "papers", ["title"]) - _create_index("ix_papers_source", "papers", ["source"]) - _create_index("ix_papers_published_at", "papers", ["published_at"]) - _create_index("ix_papers_first_seen_at", "papers", ["first_seen_at"]) _create_index("ix_papers_created_at", "papers", ["created_at"]) - _create_index("ix_papers_updated_at", "papers", ["updated_at"]) + + # These indexes are only for this schema (not in harvest schema): + if _is_offline() or created_table: + _create_index("ix_papers_title", "papers", ["title"]) + _create_index("ix_papers_source", "papers", ["source"]) + _create_index("ix_papers_published_at", "papers", ["published_at"]) + _create_index("ix_papers_first_seen_at", "papers", ["first_seen_at"]) + _create_index("ix_papers_updated_at", "papers", ["updated_at"]) def downgrade() -> None: diff --git a/alembic/versions/0003_paper_harvest_tables.py b/alembic/versions/0007_paper_harvest_tables.py similarity index 96% rename from alembic/versions/0003_paper_harvest_tables.py rename to alembic/versions/0007_paper_harvest_tables.py index ecf3803..6677a8d 100644 --- a/alembic/versions/0003_paper_harvest_tables.py +++ b/alembic/versions/0007_paper_harvest_tables.py @@ -1,7 +1,7 @@ """paper harvest tables -Revision ID: 0003_paper_harvest_tables -Revises: 0002_research_eval_runs +Revision ID: 0007_paper_harvest_tables +Revises: 0006_newsletter_subscribers Create Date: 2026-02-06 Adds: @@ -14,8 +14,8 @@ import sqlalchemy as sa from alembic import context, op -revision = "0003_paper_harvest_tables" -down_revision = "0002_research_eval_runs" +revision = "0007_paper_harvest_tables" +down_revision = "0006_newsletter_subscribers" branch_labels = None depends_on = None @@ -52,9 +52,6 @@ def _create_index(name: str, table: str, cols: list[str]) -> None: def upgrade() -> None: - if _is_offline(): - _upgrade_create_tables() - return _upgrade_create_tables() _upgrade_create_indexes() diff --git a/docs/architecture_overview.md b/docs/architecture_overview.md new file mode 100644 index 0000000..3320565 --- /dev/null +++ b/docs/architecture_overview.md @@ -0,0 +1,840 @@ +# PaperBot System Architecture + +> **Version**: 1.0 +> **Last Updated**: 2026-02-06 +> **Author**: Claude Code + +--- + +## Table of Contents + +1. [System Overview](#1-system-overview) +2. [Layered Architecture](#2-layered-architecture) +3. [Core Components](#3-core-components) +4. [Data Flow](#4-data-flow) +5. [External Integrations](#5-external-integrations) +6. [Design Patterns](#6-design-patterns) +7. [Configuration](#7-configuration) + +--- + +## 1. System Overview + +PaperBot is a **multi-agent research workflow framework** designed for academic paper analysis, scholar tracking, and code reproduction. 
It consists of three main components: + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ PaperBot System Architecture │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ Web Dashboard │ │ Terminal CLI │ │ Python API │ │ +│ │ (Next.js 16) │ │ (Ink/React) │ │ (Direct) │ │ +│ └────────┬────────┘ └────────┬────────┘ └────────┬────────┘ │ +│ │ │ │ │ +│ └─────────────────────┼─────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────────────────────┐ │ +│ │ FastAPI Backend (Python) │ │ +│ │ ┌─────────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ SSE Streaming │ REST API │ WebSocket (future) │ │ │ +│ │ └─────────────────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ +│ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ +│ │ │ Scholar │ │ Paper │ │ Paper2Code │ │ Research │ │ │ +│ │ │ Tracking │ │ Analysis │ │ Pipeline │ │ Context │ │ │ +│ │ └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ │ │ +│ │ │ │ +│ │ ┌──────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ Multi-Agent Orchestration System │ │ │ +│ │ │ ResearchAgent │ CodeAgent │ QualityAgent │ InfluenceCalc │ ... │ │ │ +│ │ └──────────────────────────────────────────────────────────────────┘ │ │ +│ └──────────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ┌─────────────────────┼─────────────────────┐ │ +│ ▼ ▼ ▼ │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ SQLite │ │ LLM APIs │ │ External APIs │ │ +│ │ (Persistence) │ │ (Claude/OpenAI) │ │ (S2/GitHub/...) │ │ +│ └─────────────────┘ └─────────────────┘ └─────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +### Directory Structure + +``` +src/paperbot/ +├── api/ # API Layer - FastAPI routes & streaming +├── application/ # Application Layer - Business logic, workflows +├── domain/ # Domain Layer - Core models, entities +├── infrastructure/ # Infrastructure Layer - External services, DB +├── core/ # Core abstractions & patterns +├── agents/ # Multi-agent implementations +├── repro/ # Paper2Code pipeline +├── context_engine/ # Research context routing +├── memory/ # User memory extraction +├── presentation/ # UI components (reports, CLI) +└── workflows/ # Workflow orchestration +``` + +--- + +## 2. 
Layered Architecture + +PaperBot follows a **Clean Architecture** approach with clear separation of concerns: + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ PRESENTATION LAYER │ +│ ┌─────────────────────────────────────────────────────────────────────────────┐│ +│ │ Web Dashboard (Next.js) │ Terminal CLI (Ink) │ API Clients ││ +│ └─────────────────────────────────────────────────────────────────────────────┘│ +│ │ │ +│ ▼ │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ API LAYER │ +│ ┌─────────────────────────────────────────────────────────────────────────────┐│ +│ │ FastAPI Application (api/main.py) ││ +│ │ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ ││ +│ │ │ /track │ │ /analyze │ │ /gen-code │ │ /research │ │ /memory │ ││ +│ │ └───────────┘ └───────────┘ └───────────┘ └───────────┘ └───────────┘ ││ +│ │ ┌───────────────────────────────────────────────────────────────────┐ ││ +│ │ │ SSE Streaming (streaming.py) │ CORS Middleware │ Auth │ ││ +│ │ └───────────────────────────────────────────────────────────────────┘ ││ +│ └─────────────────────────────────────────────────────────────────────────────┘│ +│ │ │ +│ ▼ │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ APPLICATION LAYER │ +│ ┌─────────────────────────────────────────────────────────────────────────────┐│ +│ │ Workflows & Pipelines ││ +│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ ││ +│ │ │ ScholarPipeline │ │ Paper2Code Orch │ │ HarvestPipeline │ ││ +│ │ │ (scholar_pipeline)│ │ (orchestrator) │ │ (v1 NEW) │ ││ +│ │ └─────────────────┘ └─────────────────┘ └─────────────────┘ ││ +│ │ ││ +│ │ Services & Ports ││ +│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ ││ +│ │ │ EventLogPort │ │ SourceRegistry │ │ WorkflowRegistry│ ││ +│ │ │ (Protocol) │ │ (Data Sources) │ │ (Workflows) │ ││ +│ │ └─────────────────┘ └─────────────────┘ └─────────────────┘ ││ +│ │ ││ +│ │ Collaboration ││ +│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ ││ +│ │ │AgentCoordinator │ │ ScoreShareBus │ │ FailFastEvaluator│ ││ +│ │ │ (message bus) │ │ (cross-stage) │ │ (early stop) │ ││ +│ │ └─────────────────┘ └─────────────────┘ └─────────────────┘ ││ +│ └─────────────────────────────────────────────────────────────────────────────┘│ +│ │ │ +│ ▼ │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ DOMAIN LAYER │ +│ ┌─────────────────────────────────────────────────────────────────────────────┐│ +│ │ Core Models ││ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ││ +│ │ │ PaperMeta │ │ Scholar │ │ Influence │ │ HarvestedPaper│ ││ +│ │ │ (paper.py) │ │ (scholar.py)│ │(influence/) │ │ (v1 NEW) │ ││ +│ │ └─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘ ││ +│ │ ││ +│ │ Agents (BaseAgent → Specialized) ││ +│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ ││ +│ │ │ Research │ │ Code │ │ Quality │ │ Review │ │ Influence│ ││ +│ │ │ Agent │ │ Analysis │ │ Agent │ │ Agent │ │Calculator│ ││ +│ │ └──────────┘ └──────────┘ └──────────┘ └──────────┘ └──────────┘ ││ +│ │ ││ +│ │ Core Abstractions (core/) ││ +│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ ││ +│ │ │ Executable │ │ ExecutionResult │ │ DI Container │ ││ +│ │ │ (interface) │ │ (result type) │ │ (singleton) │ ││ +│ │ └─────────────────┘ └─────────────────┘ └─────────────────┘ ││ +│ 
└─────────────────────────────────────────────────────────────────────────────┘│ +│ │ │ +│ ▼ │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ INFRASTRUCTURE LAYER │ +│ ┌─────────────────────────────────────────────────────────────────────────────┐│ +│ │ LLM Integration ││ +│ │ ┌─────────────────────────────────────────────────────────────────┐ ││ +│ │ │ LLMClient (llm/base.py) │ ││ +│ │ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ ││ +│ │ │ │ Claude │ │ OpenAI │ │DeepSeek │ │ Custom │ │ ││ +│ │ │ │ (Anthropic)│ │(GPT-4) │ │ │ │ Endpoint│ │ ││ +│ │ │ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ ││ +│ │ └─────────────────────────────────────────────────────────────────┘ ││ +│ │ ││ +│ │ API Clients ││ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ││ +│ │ │ Semantic │ │ GitHub │ │ OpenReview │ │ arXiv │ ││ +│ │ │ Scholar │ │ API │ │ API │ │ API │ ││ +│ │ └─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘ ││ +│ │ ││ +│ │ Persistence ││ +│ │ ┌─────────────────────────────────────────────────────────────────┐ ││ +│ │ │ SQLAlchemy ORM (stores/models.py) │ ││ +│ │ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ ││ +│ │ │ │AgentRun │ │AgentEvent│ │ Memory │ │Research │ │ Papers │ │ ││ +│ │ │ │ Model │ │ Model │ │ Model │ │ Track │ │(v1 NEW) │ │ ││ +│ │ │ └─────────┘ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ ││ +│ │ └─────────────────────────────────────────────────────────────────┘ ││ +│ │ ││ +│ │ Event Logging ││ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ││ +│ │ │ Logging │ │ SQLAlchemy │ │ Composite │ ││ +│ │ │ EventLog │ │ EventLog │ │ EventLog │ ││ +│ │ └─────────────┘ └─────────────┘ └─────────────┘ ││ +│ └─────────────────────────────────────────────────────────────────────────────┘│ +│ │ │ +│ ▼ │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ EXTERNAL SYSTEMS │ +│ ┌─────────────────────────────────────────────────────────────────────────────┐│ +│ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ││ +│ │ │ SQLite │ │Anthropic│ │ OpenAI │ │Semantic │ │ GitHub │ │ Docker │ ││ +│ │ │ DB │ │ API │ │ API │ │ Scholar │ │ API │ │ / E2B │ ││ +│ │ └─────────┘ └─────────┘ └─────────┘ └─────────┘ └─────────┘ └─────────┘ ││ +│ └─────────────────────────────────────────────────────────────────────────────┘│ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +### Layer Responsibilities + +| Layer | Responsibility | Key Components | +|-------|----------------|----------------| +| **Presentation** | User interface, client applications | Web Dashboard, Terminal CLI | +| **API** | HTTP endpoints, streaming, middleware | FastAPI routes, SSE streaming | +| **Application** | Business workflows, orchestration | Pipelines, Coordinators, Services | +| **Domain** | Core business logic, entities | Models, Agents, Influence calculations | +| **Infrastructure** | External services, persistence | LLM clients, API clients, SQLAlchemy | + +--- + +## 3. 
Core Components + +### 3.1 Multi-Agent System + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Multi-Agent Orchestration │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ AgentCoordinator │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ - register(agent) - Agent registration │ │ │ +│ │ │ - broadcast(message) - Message distribution │ │ │ +│ │ │ - collect() - Result aggregation │ │ │ +│ │ │ - synthesize() - Final synthesis │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────┬─────────────────────────────────────────┘ │ +│ │ │ +│ ┌────────────────────────┼────────────────────────┐ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ +│ │ScoreShareBus│ │FailFastEval │ │ EventLog │ │ +│ │ │ │ │ │ │ │ +│ │ Cross-stage │ │ Early stop │ │ Persistence │ │ +│ │ score share │ │ on low qual │ │ & audit │ │ +│ └─────────────┘ └─────────────┘ └─────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Registered Agents │ │ +│ │ │ │ +│ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ +│ │ │ ResearchAgent│ │CodeAnalysis │ │ QualityAgent │ │ ReviewAgent │ │ │ +│ │ │ │ │ Agent │ │ │ │ │ │ │ +│ │ │ - S2 search │ │ - GitHub API │ │ - Quality │ │ - Peer review│ │ │ +│ │ │ - Enrichment │ │ - Code health│ │ scoring │ │ simulation │ │ │ +│ │ │ - Grounding │ │ - Dependencies│ │ - Method eval│ │ - Strengths │ │ │ +│ │ └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ │ │ +│ │ │ │ +│ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ +│ │ │ Influence │ │Verification │ │ Documentation│ │ │ +│ │ │ Calculator │ │ Agent │ │ Agent │ │ │ +│ │ │ │ │ │ │ │ │ │ +│ │ │ - Citation │ │ - Claim check│ │ - API docs │ │ │ +│ │ │ velocity │ │ - Method │ │ - Code docs │ │ │ +│ │ │ - Momentum │ │ validation │ │ extraction │ │ │ +│ │ └──────────────┘ └──────────────┘ └──────────────┘ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +### 3.2 BaseAgent Architecture + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ BaseAgent (Template Method) │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ execute(input: TInput) → ExecutionResult[TOutput] │ │ +│ │ │ │ +│ │ ┌─────────────────┐ │ │ +│ │ │ 1. Validate │ _validate_input(input) │ │ +│ │ │ Input │ - Check required fields │ │ +│ │ └────────┬────────┘ - Validate constraints │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ ┌─────────────────┐ │ │ +│ │ │ 2. Execute │ _execute(input) [ABSTRACT] │ │ +│ │ │ Core Logic │ - Implemented by subclass │ │ +│ │ └────────┬────────┘ - LLM calls, API calls, etc. │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ ┌─────────────────┐ │ │ +│ │ │ 3. 
Post-Process │ _post_process(result) │ │ +│ │ │ Results │ - Format output │ │ +│ │ └─────────────────┘ - Emit events │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ ExecutionResult[TOutput] │ │ +│ │ { │ │ +│ │ success: bool, │ │ +│ │ data: Optional[TOutput], │ │ +│ │ error: Optional[str], │ │ +│ │ duration_ms: Optional[float], │ │ +│ │ metadata: Dict[str, Any] │ │ +│ │ } │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Mixins │ │ +│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ │ +│ │ │SemanticScholar │ │ JSONParser │ │ TextParsing │ │ │ +│ │ │ Mixin │ │ Mixin │ │ Mixin │ │ │ +│ │ └─────────────────┘ └─────────────────┘ └─────────────────┘ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +### 3.3 Paper2Code Pipeline (ReproAgent) + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Paper2Code Pipeline │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Input: Paper Context (PDF/URL → Parsed Content) │ │ +│ └───────────────────────────────────┬─────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Stage 1: Planning Agent │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ - Blueprint distillation │ │ │ +│ │ │ - Implementation plan generation │ │ │ +│ │ │ - File structure design │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────┬─────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Stage 2: Coding Agent │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ │ +│ │ │ │ CodeMemory │ │ CodeRAG │ │ LLM Gen │ │ │ │ +│ │ │ │ │ │ │ │ │ │ │ │ +│ │ │ │Cross-file │ │ Pattern │ │ Code │ │ │ │ +│ │ │ │context │ │ retrieval │ │ generation │ │ │ │ +│ │ │ │AST indexing │ │ similarity │ │ │ │ │ │ +│ │ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────┬─────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Stage 3: Verification Agent │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ - Syntax validation │ │ │ +│ │ │ - Import checking │ │ │ +│ │ │ - Test execution (sandbox) │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────┬─────────────────────────────────────┘ │ +│ │ │ +│ ┌───────────────┴───────────────┐ │ +│ │ Pass? 
│ │ +│ └───────────────┬───────────────┘ │ +│ No │ │ Yes │ +│ ▼ │ │ +│ ┌─────────────────────────────────────┐ │ │ +│ │ Stage 4: Debugging Agent │ │ │ +│ │ ┌─────────────────────────────┐ │ │ │ +│ │ │ - Error analysis │ │ │ │ +│ │ │ - Fix generation │ │ │ │ +│ │ │ - Retry (max_repair_loops) │◄───┼────┘ │ +│ │ └─────────────────────────────┘ │ │ +│ └─────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Output: Generated Code + Execution Report │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Execution Environments │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ +│ │ │ Docker │ │ E2B │ │ Local │ │ │ +│ │ │ Executor │ │ (Cloud) │ │ (Dev only) │ │ │ +│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +### 3.4 Memory System + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Memory System │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Input Sources │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ +│ │ │ ChatGPT │ │ Gemini │ │ Claude │ │ Plain Text │ │ │ +│ │ │ Export │ │ Export │ │ Export │ │ │ │ │ +│ │ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │ │ +│ └─────────┼────────────────┼────────────────┼────────────────┼─────────────┘ │ +│ └────────────────┼────────────────┼────────────────┘ │ +│ ▼ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Memory Extractor │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ - Parse conversation format │ │ │ +│ │ │ - Extract memory candidates │ │ │ +│ │ │ - Classify by type (profile, preference, goal, fact, etc.) │ │ │ +│ │ │ - Calculate confidence scores │ │ │ +│ │ │ - Detect PII risk │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────┬─────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Memory Storage (memory_items table) │ │ +│ │ │ │ +│ │ ┌───────────────────────────────────────────────────────────────┐ │ │ +│ │ │ Memory Item │ │ │ +│ │ │ - kind: profile | preference | goal | constraint | fact | ... 
│ │ │ +│ │ │ - content: string │ │ │ +│ │ │ - confidence: 0.0 - 1.0 │ │ │ +│ │ │ - status: pending | approved | rejected | superseded │ │ │ +│ │ │ - scope: global | track | workspace │ │ │ +│ │ │ - pii_risk: 0 | 1 | 2 │ │ │ +│ │ └───────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────┬─────────────────────────────────────┘ │ +│ │ │ +│ ┌───────────────────────┼───────────────────────┐ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌───────────────────┐ ┌───────────────────┐ ┌───────────────────┐ │ +│ │ Memory Inbox │ │ Context Engine │ │ Quality Metrics │ │ +│ │ │ │ │ │ │ │ +│ │ - Pending review │ │ - Memory inject │ │ - Precision ≥85% │ │ +│ │ - Approve/Reject │ │ - Routing signal │ │ - FP rate ≤5% │ │ +│ │ - Scope mgmt │ │ - Recommendation │ │ - Hit rate ≥80% │ │ +│ └───────────────────┘ └───────────────────┘ └───────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +### 3.5 Influence Calculation System + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Influence Calculation System │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Input: Paper Metadata + External Data │ │ +│ └───────────────────────────────────┬─────────────────────────────────────┘ │ +│ │ │ +│ ┌────────────────────────────┼────────────────────────────┐ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ Academic Metrics│ │Engineering Metrics│ │ Context Analysis│ │ +│ │ │ │ │ │ │ │ +│ │ - Citation count│ │ - GitHub stars │ │ - Citation │ │ +│ │ - H-index │ │ - Forks │ │ sentiment │ │ +│ │ - Venue tier │ │ - Code health │ │ - Dynamic PIS │ │ +│ │ weighting │ │ - Doc coverage │ │ - Momentum │ │ +│ └────────┬────────┘ └────────┬────────┘ └────────┬────────┘ │ +│ │ │ │ │ +│ └────────────────────────────┼────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Influence Calculator │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ Composite Score = Σ (weight_i × metric_i) │ │ │ +│ │ │ │ │ │ +│ │ │ Weights: │ │ │ +│ │ │ - Academic (citations, venue): 0.4 │ │ │ +│ │ │ - Engineering (code, stars): 0.3 │ │ │ +│ │ │ - Momentum (velocity, trend): 0.3 │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────┬─────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Output: Paper Influence Score (PIS) + Breakdown │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 4. 
Data Flow + +### 4.1 Scholar Tracking Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Scholar Tracking Data Flow │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌────────────┐ │ +│ │ Client │ GET /api/track?scholar_id=xxx │ +│ └─────┬──────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────────────────────┐ │ +│ │ API Route (/track) │ │ +│ │ - Parse request │ │ +│ │ - Create StreamingResponse │ │ +│ └─────────────────────────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────────────────────┐ │ +│ │ ScholarPipeline.analyze_paper() │ │ +│ └─────────────────────────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────────────────────┐ │ +│ │ ScholarWorkflowCoordinator │ │ +│ │ │ │ +│ │ Stage 1: ResearchAgent │ │ +│ │ ┌─────────────────────────────────────────────────────────────┐ │ │ +│ │ │ → Semantic Scholar API (paper search, metadata) │ │ │ +│ │ │ → Paper enrichment │ │ │ +│ │ │ → Emit: StreamEvent(progress) │ │ │ +│ │ │ → ScoreShareBus.publish(research_score) │ │ │ +│ │ └─────────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ Stage 2: CodeAnalysisAgent │ │ +│ │ ┌─────────────────────────────────────────────────────────────┐ │ │ +│ │ │ → GitHub API (repo discovery) │ │ │ +│ │ │ → Code health analysis │ │ │ +│ │ │ → Emit: StreamEvent(progress) │ │ │ +│ │ │ → ScoreShareBus.publish(code_score) │ │ │ +│ │ └─────────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ Stage 3: QualityAgent → InfluenceCalculator → ReportWriter │ │ +│ │ ┌─────────────────────────────────────────────────────────────┐ │ │ +│ │ │ → Quality scoring │ │ │ +│ │ │ → Influence calculation │ │ │ +│ │ │ → Markdown report (Jinja2) │ │ │ +│ │ │ → Emit: StreamEvent(result) │ │ │ +│ │ └─────────────────────────────────────────────────────────────┘ │ │ +│ └─────────────────────────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────────────────────┐ │ +│ │ SSE Stream │ │ +│ │ ┌─────────────────────────────────────────────────────────────┐ │ │ +│ │ │ event: progress │ │ │ +│ │ │ data: {"stage": "research", "message": "Analyzing..."} │ │ │ +│ │ │ │ │ │ +│ │ │ event: progress │ │ │ +│ │ │ data: {"stage": "code", "message": "Checking GitHub..."} │ │ │ +│ │ │ │ │ │ +│ │ │ event: result │ │ │ +│ │ │ data: {"report": "...", "scores": {...}} │ │ │ +│ │ │ │ │ │ +│ │ │ event: done │ │ │ +│ │ └─────────────────────────────────────────────────────────────┘ │ │ +│ └─────────────────────────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────┐ │ +│ │ Client │ Receives SSE events, updates UI │ +│ └────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +### 4.2 Research Context Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Research Context Data Flow │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌────────────────────────────────────────────────────────────────────────┐ │ +│ │ User Query: "Find papers on LLM security" │ │ +│ └─────────────────────────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ 
┌────────────────────────────────────────────────────────────────────────┐ │ +│ │ Context Engine │ │ +│ │ │ │ +│ │ 1. Load User Memory (approved items for active track) │ │ +│ │ ┌─────────────────────────────────────────────────────────┐ │ │ +│ │ │ Memory: "I'm researching adversarial ML" │ │ │ +│ │ │ Memory: "Prefer transformer-based methods" │ │ │ +│ │ │ Memory: "Deadline: March 15" │ │ │ +│ │ └─────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ 2. Merge Query with Memory Context │ │ +│ │ ┌─────────────────────────────────────────────────────────┐ │ │ +│ │ │ Merged: "LLM security + adversarial ML + transformers" │ │ │ +│ │ └─────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ 3. Route to Paper Sources │ │ +│ │ ┌─────────────────────────────────────────────────────────┐ │ │ +│ │ │ → Semantic Scholar API │ │ │ +│ │ │ → Local Paper Pool (v1) │ │ │ +│ │ └─────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ 4. Rank & Filter Results │ │ +│ │ ┌─────────────────────────────────────────────────────────┐ │ │ +│ │ │ - Relevance scoring │ │ │ +│ │ │ - Memory-influenced ranking │ │ │ +│ │ │ - Diversity balancing │ │ │ +│ │ └─────────────────────────────────────────────────────────┘ │ │ +│ └─────────────────────────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────────────────────┐ │ +│ │ Recommendations Tab │ │ +│ │ ┌─────────────────────────────────────────────────────────────┐ │ │ +│ │ │ Paper 1: "Adversarial Attacks on LLMs..." [Like] [Save] │ │ │ +│ │ │ Paper 2: "Transformer Security..." [Like] [Save] │ │ │ +│ │ │ Paper 3: "..." [Like] [Save] │ │ │ +│ │ └─────────────────────────────────────────────────────────────┘ │ │ +│ └────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 5. 
External Integrations + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ External Integrations │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ LLM Providers │ │ +│ │ │ │ +│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ │ +│ │ │ Anthropic │ │ OpenAI │ │ DeepSeek │ │ │ +│ │ │ Claude API │ │ GPT-4 API │ │ API │ │ │ +│ │ │ │ │ │ │ │ │ │ +│ │ │ Primary for │ │ Alternative │ │ Cost-effective │ │ │ +│ │ │ agent reasoning│ │ provider │ │ option │ │ │ +│ │ └─────────────────┘ └─────────────────┘ └─────────────────┘ │ │ +│ │ │ │ +│ │ Unified via: LLMClient (llm/base.py) │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Academic Data Sources │ │ +│ │ │ │ +│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ │ +│ │ │ Semantic Scholar│ │ arXiv │ │ OpenAlex │ │ │ +│ │ │ │ │ │ │ (v1 NEW) │ │ │ +│ │ │ - Paper search │ │ - Preprint │ │ - 240M+ works │ │ │ +│ │ │ - Citations │ │ metadata │ │ - Open access │ │ │ +│ │ │ - Author data │ │ - PDF links │ │ - CS coverage │ │ │ +│ │ └─────────────────┘ └─────────────────┘ └─────────────────┘ │ │ +│ │ │ │ +│ │ ┌─────────────────┐ ┌─────────────────┐ │ │ +│ │ │ OpenReview │ │ Conference │ │ │ +│ │ │ │ │ Websites │ │ │ +│ │ │ - Submissions │ │ - S&P, CCS │ │ │ +│ │ │ - Reviews │ │ - USENIX, NDSS │ │ │ +│ │ └─────────────────┘ └─────────────────┘ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Code & Repository Services │ │ +│ │ │ │ +│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ │ +│ │ │ GitHub │ │ HuggingFace │ │ Code Execution │ │ │ +│ │ │ API │ │ │ │ │ │ │ +│ │ │ │ │ - Model cards │ │ - Docker │ │ │ +│ │ │ - Repo metadata │ │ - Checkpoints │ │ - E2B (cloud) │ │ │ +│ │ │ - Stars, forks │ │ │ │ - Local (dev) │ │ │ +│ │ │ - Code analysis │ │ │ │ │ │ │ +│ │ └─────────────────┘ └─────────────────┘ └─────────────────┘ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Persistence │ │ +│ │ │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ SQLite (Default) / PostgreSQL (Production) │ │ │ +│ │ │ │ │ │ +│ │ │ Tables: │ │ │ +│ │ │ - agent_runs, agent_events (execution tracking) │ │ │ +│ │ │ - memory_items, memory_sources (user memory) │ │ │ +│ │ │ - research_tracks, paper_feedback (research context) │ │ │ +│ │ │ - papers, harvest_runs (v1 NEW - paper pool) │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 6. 
Design Patterns + +### Patterns Used in PaperBot + +| Pattern | Location | Purpose | +|---------|----------|---------| +| **Template Method** | `agents/base.py` | Common agent execution flow with customizable steps | +| **Repository** | `application/ports/` | Abstract data access (EventLogPort) | +| **Adapter** | `infrastructure/llm/` | Unified interface for multiple LLM providers | +| **Pub/Sub** | `core/collaboration/` | AgentCoordinator message broadcasting | +| **Dependency Injection** | `core/di/container.py` | Loose coupling between components | +| **Pipeline** | `core/pipeline/` | Multi-stage processing | +| **Composite** | `infrastructure/event_log/` | Multiple event log backends | +| **Strategy** | `repro/` | Docker/E2B/Local execution strategies | +| **Factory** | `application/services/` | Object creation abstraction | + +### Dependency Direction + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Dependency Direction │ +│ │ +│ ┌─────────────────┐ │ +│ │ Presentation │ ─────────────────┐ │ +│ │ (API) │ │ │ +│ └─────────────────┘ │ │ +│ │ │ │ +│ ▼ │ │ +│ ┌─────────────────┐ │ Dependencies point INWARD │ +│ │ Application │ ─────────────────┤ toward the Domain layer │ +│ │ (Workflows) │ │ │ +│ └─────────────────┘ │ │ +│ │ │ │ +│ ▼ │ │ +│ ┌─────────────────┐ │ │ +│ │ Domain │ ◄────────────────┘ │ +│ │ (Models) │ │ +│ └─────────────────┘ │ +│ ▲ │ +│ │ │ +│ ┌─────────────────┐ │ +│ │ Infrastructure │ ─────────────────────────────────────────────────────────│ +│ │ (External) │ Implements interfaces defined in Domain/Application │ +│ └─────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 7. Configuration + +### Environment Variables + +```bash +# LLM API Keys +OPENAI_API_KEY=sk-... +ANTHROPIC_API_KEY=sk-ant-... + +# External Services +SEMANTIC_SCHOLAR_API_KEY=... # Optional, higher rate limits +GITHUB_TOKEN=ghp_... # Optional, higher rate limits +E2B_API_KEY=... # Optional, cloud sandbox + +# Database +PAPERBOT_DB_URL=sqlite:///data/paperbot.db + +# Execution Mode +PAPERBOT_EXECUTOR=auto|docker|e2b|local +``` + +### Configuration Files + +| File | Purpose | +|------|---------| +| `config/config.yaml` | Main application config (models, venues, thresholds) | +| `config/settings.py` | Pydantic settings validation | +| `config/scholar_subscriptions.yaml` | Tracked scholars list | +| `config/top_venues.yaml` | Venue tier rankings | + +### Key Configuration Options + +```yaml +# config/config.yaml (example structure) +download: + max_retries: 3 + concurrency: 5 + timeout: 30 + +analysis: + parallel: true + max_depth: 3 + +security: + ssl_verify: true + rate_limit: 100 + domain_allowlist: [...] 
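+  # (allowlist entries elided in this example; concrete domains live in config/config.yaml)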
+ +output: + formats: [markdown, html, pdf] + template_dir: templates/ + +cache: + enabled: true + ttl: 3600 + +logging: + level: INFO + format: json +``` + +--- + +## Appendix: File Reference + +### Key Implementation Files + +| Component | File Path | +|-----------|-----------| +| API Entry Point | `src/paperbot/api/main.py` | +| SSE Streaming | `src/paperbot/api/streaming.py` | +| Base Agent | `src/paperbot/agents/base.py` | +| Agent Coordinator | `src/paperbot/core/collaboration/coordinator.py` | +| LLM Client | `src/paperbot/infrastructure/llm/base.py` | +| Paper Model | `src/paperbot/domain/paper.py` | +| DB Models | `src/paperbot/infrastructure/stores/models.py` | +| Event Log | `src/paperbot/infrastructure/event_log/` | +| Paper2Code | `src/paperbot/repro/orchestrator.py` | +| Memory System | `src/paperbot/memory/` | +| Context Engine | `src/paperbot/context_engine/` | + +--- + +*Document generated by Claude Code for PaperBot project.* diff --git a/src/paperbot/api/main.py b/src/paperbot/api/main.py index 45ea821..3b2e915 100644 --- a/src/paperbot/api/main.py +++ b/src/paperbot/api/main.py @@ -20,11 +20,8 @@ memory, research, paperscool, -<<<<<<< HEAD newsletter, -======= harvest, ->>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) ) from paperbot.infrastructure.event_log.logging_event_log import LoggingEventLog from paperbot.infrastructure.event_log.composite_event_log import CompositeEventLog @@ -68,11 +65,8 @@ async def health_check(): app.include_router(memory.router, prefix="/api", tags=["Memory"]) app.include_router(research.router, prefix="/api", tags=["Research"]) app.include_router(paperscool.router, prefix="/api", tags=["PapersCool"]) -<<<<<<< HEAD app.include_router(newsletter.router, prefix="/api", tags=["Newsletter"]) -======= app.include_router(harvest.router, prefix="/api", tags=["Harvest"]) ->>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) @app.on_event("startup") diff --git a/src/paperbot/infrastructure/stores/models.py b/src/paperbot/infrastructure/stores/models.py index 0cf476d..f6f5866 100644 --- a/src/paperbot/infrastructure/stores/models.py +++ b/src/paperbot/infrastructure/stores/models.py @@ -449,91 +449,6 @@ class ResearchMilestoneModel(Base): track = relationship("ResearchTrackModel", back_populates="milestones") - -class PaperModel(Base): - """Canonical paper registry row (deduplicated across sources).""" - - __tablename__ = "papers" - __table_args__ = ( - UniqueConstraint("arxiv_id", name="uq_papers_arxiv_id"), - UniqueConstraint("doi", name="uq_papers_doi"), - ) - - id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) - - arxiv_id: Mapped[Optional[str]] = mapped_column(String(64), nullable=True, index=True) - doi: Mapped[Optional[str]] = mapped_column(String(128), nullable=True, index=True) - - title: Mapped[str] = mapped_column(Text, default="", index=True) - authors_json: Mapped[str] = mapped_column(Text, default="[]") - abstract: Mapped[str] = mapped_column(Text, default="") - - url: Mapped[str] = mapped_column(String(512), default="") - external_url: Mapped[str] = mapped_column(String(512), default="") - pdf_url: Mapped[str] = mapped_column(String(512), default="") - - source: Mapped[str] = mapped_column(String(32), default="papers_cool", index=True) - venue: Mapped[str] = mapped_column(String(256), default="") - published_at: Mapped[Optional[datetime]] = mapped_column( - DateTime(timezone=True), nullable=True, index=True - ) - first_seen_at: Mapped[datetime] = 
mapped_column(DateTime(timezone=True), index=True) - - keywords_json: Mapped[str] = mapped_column(Text, default="[]") - metadata_json: Mapped[str] = mapped_column(Text, default="{}") - - created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), index=True) - updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), index=True) - - judge_scores = relationship( - "PaperJudgeScoreModel", back_populates="paper", cascade="all, delete-orphan" - ) - feedback_rows = relationship("PaperFeedbackModel", back_populates="paper") - reading_status_rows = relationship("PaperReadingStatusModel", back_populates="paper") - - def set_authors(self, values: Optional[list[str]]) -> None: - self.authors_json = json.dumps( - [str(v) for v in (values or []) if str(v).strip()], - ensure_ascii=False, - ) - - def get_authors(self) -> list[str]: - try: - data = json.loads(self.authors_json or "[]") - if isinstance(data, list): - return [str(v) for v in data if str(v).strip()] - except Exception: - pass - return [] - - def set_keywords(self, values: Optional[list[str]]) -> None: - self.keywords_json = json.dumps( - [str(v) for v in (values or []) if str(v).strip()], - ensure_ascii=False, - ) - - def get_keywords(self) -> list[str]: - try: - data = json.loads(self.keywords_json or "[]") - if isinstance(data, list): - return [str(v) for v in data if str(v).strip()] - except Exception: - pass - return [] - - def set_metadata(self, data: Dict[str, Any]) -> None: - self.metadata_json = json.dumps(data or {}, ensure_ascii=False) - - def get_metadata(self) -> Dict[str, Any]: - try: - parsed = json.loads(self.metadata_json or "{}") - if isinstance(parsed, dict): - return parsed - except Exception: - pass - return {} - - class PaperFeedbackModel(Base): """User feedback on recommended/seen papers (track-scoped).""" @@ -751,6 +666,11 @@ class PaperModel(Base): updated_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) deleted_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) # Soft delete + # Relationships + feedback_rows = relationship("PaperFeedbackModel", back_populates="paper") + judge_scores = relationship("PaperJudgeScoreModel", back_populates="paper") + reading_status_rows = relationship("PaperReadingStatusModel", back_populates="paper") + def get_authors(self) -> list: try: return json.loads(self.authors_json or "[]") diff --git a/src/paperbot/infrastructure/stores/paper_store.py b/src/paperbot/infrastructure/stores/paper_store.py index 6e9c3da..f0abdbf 100644 --- a/src/paperbot/infrastructure/stores/paper_store.py +++ b/src/paperbot/infrastructure/stores/paper_store.py @@ -1,4 +1,3 @@ -<<<<<<< HEAD from __future__ import annotations from datetime import datetime, timezone @@ -8,15 +7,6 @@ from paperbot.domain.paper_identity import normalize_arxiv_id, normalize_doi from paperbot.infrastructure.stores.models import Base, PaperJudgeScoreModel, PaperModel -======= -# src/paperbot/infrastructure/stores/paper_store.py -""" -Paper storage repository. - -Handles persistence and retrieval of harvested papers. 
-""" - -from __future__ import annotations import json from dataclasses import dataclass @@ -33,7 +23,6 @@ PaperFeedbackModel, PaperModel, ) ->>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) from paperbot.infrastructure.stores.sqlalchemy_db import SessionProvider, get_db_url @@ -41,7 +30,6 @@ def _utcnow() -> datetime: return datetime.now(timezone.utc) -<<<<<<< HEAD def _safe_list(values: Any) -> List[str]: if not isinstance(values, list): return [] @@ -84,7 +72,6 @@ def _as_utc(value: Optional[datetime]) -> Optional[datetime]: class SqlAlchemyPaperStore: """Canonical paper registry with idempotent upsert for daily workflows.""" -======= @dataclass class LibraryPaper: """Paper with library metadata (saved_at, track_id, action).""" @@ -105,7 +92,6 @@ class PaperStore: - Source tracking - User library (saved papers) """ ->>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) def __init__(self, db_url: Optional[str] = None, *, auto_create_schema: bool = True): self.db_url = db_url or get_db_url() @@ -113,7 +99,6 @@ def __init__(self, db_url: Optional[str] = None, *, auto_create_schema: bool = T if auto_create_schema: Base.metadata.create_all(self._provider.engine) -<<<<<<< HEAD def upsert_paper( self, *, @@ -366,7 +351,6 @@ def _paper_to_dict(row: PaperModel) -> Dict[str, Any]: "created_at": row.created_at.isoformat() if row.created_at else None, "updated_at": row.updated_at.isoformat() if row.updated_at else None, } -======= def upsert_papers_batch( self, papers: List[HarvestedPaper], @@ -838,4 +822,3 @@ def paper_to_dict(paper: PaperModel) -> Dict[str, Any]: "created_at": paper.created_at.isoformat() if paper.created_at else None, "updated_at": paper.updated_at.isoformat() if paper.updated_at else None, } ->>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) diff --git a/src/paperbot/infrastructure/stores/research_store.py b/src/paperbot/infrastructure/stores/research_store.py index 9549e7f..7e654ec 100644 --- a/src/paperbot/infrastructure/stores/research_store.py +++ b/src/paperbot/infrastructure/stores/research_store.py @@ -8,11 +8,12 @@ from sqlalchemy import desc, func, or_, select from sqlalchemy.exc import IntegrityError -<<<<<<< HEAD from paperbot.domain.paper_identity import normalize_arxiv_id, normalize_doi -======= + +from paperbot.domain.paper_identity import normalize_arxiv_id, normalize_doi + from paperbot.utils.logging_config import Logger, LogFiles ->>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) + from paperbot.infrastructure.stores.models import ( Base, PaperFeedbackModel, @@ -346,16 +347,13 @@ def add_paper_feedback( Logger.error("Track not found", file=LogFiles.HARVEST) return None -<<<<<<< HEAD + resolved_paper_ref_id = self._resolve_paper_ref_id( session=session, paper_id=(paper_id or "").strip(), metadata=metadata, ) - -======= Logger.info("Creating new feedback record", file=LogFiles.HARVEST) ->>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) row = PaperFeedbackModel( user_id=user_id, track_id=track_id, From 02c765aa193cf192d5776fa007f04767d0a44fd6 Mon Sep 17 00:00:00 2001 From: boyu Date: Wed, 11 Feb 2026 12:12:03 +0100 Subject: [PATCH 3/3] [Fix] Fix security issues introduced by the harvest module Closes #33, #34, #35, #36, #37, #38 Signed-off-by: LIU BOYU --- src/paperbot/api/routes/harvest.py | 27 ++++++-- src/paperbot/api/routes/research.py | 54 +++++++++------ .../services/paper_deduplicator.py | 21 ++++-- src/paperbot/context_engine/engine.py | 14 +++- src/paperbot/domain/harvest.py | 6 +- 
.../harvesters/arxiv_harvester.py | 9 ++- .../harvesters/openalex_harvester.py | 7 +- .../infrastructure/stores/paper_store.py | 65 ++++++++++++++++--- .../app/api/papers/[paperId]/save/route.ts | 22 ++++++- 9 files changed, 176 insertions(+), 49 deletions(-) diff --git a/src/paperbot/api/routes/harvest.py b/src/paperbot/api/routes/harvest.py index 10ad62f..491134c 100644 --- a/src/paperbot/api/routes/harvest.py +++ b/src/paperbot/api/routes/harvest.py @@ -31,6 +31,7 @@ # Lazy-initialized stores _paper_store: Optional[PaperStore] = None +_research_store: Optional["SqlAlchemyResearchStore"] = None def _get_paper_store() -> PaperStore: @@ -41,6 +42,16 @@ def _get_paper_store() -> PaperStore: return _paper_store +def _get_research_store() -> "SqlAlchemyResearchStore": + """Lazy initialization of research store.""" + from paperbot.infrastructure.stores.research_store import SqlAlchemyResearchStore + + global _research_store + if _research_store is None: + _research_store = SqlAlchemyResearchStore() + return _research_store + + # ============================================================================ # Harvest Endpoints # ============================================================================ @@ -49,7 +60,7 @@ def _get_paper_store() -> PaperStore: class HarvestRequest(BaseModel): """Request body for harvest endpoint.""" - keywords: List[str] = Field(..., min_items=1, description="Search keywords") + keywords: List[str] = Field(..., min_length=1, description="Search keywords") venues: Optional[List[str]] = Field(None, description="Filter to specific venues") year_from: Optional[int] = Field(None, ge=1900, le=2100, description="Start year") year_to: Optional[int] = Field(None, ge=1900, le=2100, description="End year") @@ -150,6 +161,9 @@ class HarvestRunListResponse(BaseModel): runs: List[HarvestRunResponse] +# TODO(auth): This endpoint lists all harvest runs without user-based filtering. +# Intentional for MVP single-user setup. For multi-user production, add user_id +# filtering so users only see their own harvest runs. @router.get("/harvest/runs", response_model=HarvestRunListResponse) def list_harvest_runs( status: Optional[str] = Query(None, description="Filter by status"), @@ -320,6 +334,9 @@ class LibraryResponse(BaseModel): offset: int +# TODO(auth): user_id is accepted from client without authentication. +# This is intentional for the MVP single-user setup. For multi-user production, +# user_id should come from an authenticated session or JWT token. @router.get("/papers/library", response_model=LibraryResponse) def get_user_library( user_id: str = Query("default", description="User ID"), @@ -383,6 +400,7 @@ def get_paper(paper_id: int): class SavePaperRequest(BaseModel): """Request to save paper to library.""" + # TODO(auth): user_id from client without auth - intentional for MVP single-user setup user_id: str = Field("default", description="User ID") track_id: Optional[int] = Field(None, description="Associated track ID") @@ -394,8 +412,6 @@ def save_paper_to_library(paper_id: int, request: SavePaperRequest): Uses paper_feedback table with action='save'. 
""" - from paperbot.infrastructure.stores.research_store import SqlAlchemyResearchStore - # Verify paper exists store = _get_paper_store() paper = store.get_paper_by_id(paper_id) @@ -403,7 +419,7 @@ def save_paper_to_library(paper_id: int, request: SavePaperRequest): raise HTTPException(status_code=404, detail="Paper not found") # Use research store to record feedback - research_store = SqlAlchemyResearchStore() + research_store = _get_research_store() feedback = research_store.record_paper_feedback( user_id=request.user_id, paper_id=str(paper_id), @@ -414,6 +430,9 @@ def save_paper_to_library(paper_id: int, request: SavePaperRequest): return {"success": True, "feedback": feedback} +# TODO(auth): user_id accepted from query string without authentication. +# Intentional for MVP single-user setup. For multi-user production, user_id +# should come from authenticated session/JWT, not query parameters. @router.delete("/papers/{paper_id}/save") def remove_paper_from_library( paper_id: int, diff --git a/src/paperbot/api/routes/research.py b/src/paperbot/api/routes/research.py index 2f01503..60f8c42 100644 --- a/src/paperbot/api/routes/research.py +++ b/src/paperbot/api/routes/research.py @@ -25,6 +25,7 @@ _memory_store = SqlAlchemyMemoryStore() _track_router = TrackRouter(research_store=_research_store, memory_store=_memory_store) _metric_collector: Optional[MemoryMetricCollector] = None +_paper_store: Optional["PaperStore"] = None def _get_metric_collector() -> MemoryMetricCollector: @@ -35,6 +36,16 @@ def _get_metric_collector() -> MemoryMetricCollector: return _metric_collector +def _get_paper_store() -> "PaperStore": + """Lazy initialization of paper store.""" + from paperbot.infrastructure.stores.paper_store import PaperStore + + global _paper_store + if _paper_store is None: + _paper_store = PaperStore() + return _paper_store + + def _schedule_embedding_precompute( background_tasks: Optional[BackgroundTasks], *, @@ -633,6 +644,7 @@ class PaperFeedbackRequest(BaseModel): paper_venue: Optional[str] = None paper_citation_count: Optional[int] = None paper_url: Optional[str] = None + paper_source: Optional[str] = None # arxiv, semantic_scholar, openalex class PaperFeedbackResponse(BaseModel): @@ -661,22 +673,32 @@ def add_paper_feedback(req: PaperFeedbackRequest): meta["context_rank"] = int(req.context_rank) library_paper_id: Optional[int] = None - actual_paper_id = req.paper_id # If action is "save" and we have paper metadata, insert into papers table if req.action == "save" and req.paper_title: Logger.info("Save action detected, inserting paper into papers table", file=LogFiles.HARVEST) try: from paperbot.domain.harvest import HarvestedPaper, HarvestSource - from paperbot.infrastructure.stores.paper_store import PaperStore - paper_store = PaperStore() + paper_store = _get_paper_store() + + # Determine source from request or default to semantic_scholar + source_str = (req.paper_source or "semantic_scholar").lower() + source_map = { + "arxiv": HarvestSource.ARXIV, + "semantic_scholar": HarvestSource.SEMANTIC_SCHOLAR, + "openalex": HarvestSource.OPENALEX, + } + source = source_map.get(source_str, HarvestSource.SEMANTIC_SCHOLAR) + paper = HarvestedPaper( title=req.paper_title, - source=HarvestSource.SEMANTIC_SCHOLAR, + source=source, abstract=req.paper_abstract or "", authors=req.paper_authors or [], - semantic_scholar_id=req.paper_id, + semantic_scholar_id=req.paper_id if source == HarvestSource.SEMANTIC_SCHOLAR else None, + arxiv_id=req.paper_id if source == HarvestSource.ARXIV else None, + 
openalex_id=req.paper_id if source == HarvestSource.OPENALEX else None, year=req.paper_year, venue=req.paper_venue, citation_count=req.paper_citation_count or 0, @@ -685,19 +707,13 @@ def add_paper_feedback(req: PaperFeedbackRequest): Logger.info("Calling paper store to upsert paper", file=LogFiles.HARVEST) new_count, _ = paper_store.upsert_papers_batch([paper]) - # Get the paper ID from database - from paperbot.infrastructure.stores.models import PaperModel - from sqlalchemy import select - with paper_store._provider.session() as session: - result = session.execute( - select(PaperModel).where( - PaperModel.semantic_scholar_id == req.paper_id - ) - ).scalar_one_or_none() - if result: - library_paper_id = result.id - actual_paper_id = str(result.id) # Use integer ID for feedback - Logger.info(f"Paper saved to library with id={library_paper_id}", file=LogFiles.HARVEST) + # Get the paper ID from database using store method + result = paper_store.get_paper_by_source_id(source, req.paper_id) + if result: + library_paper_id = result.id + # Store library_paper_id in metadata for joins, keep paper_id as external ID + meta["library_paper_id"] = library_paper_id + Logger.info(f"Paper saved to library with id={library_paper_id}", file=LogFiles.HARVEST) except Exception as e: Logger.warning(f"Failed to save paper to library: {e}", file=LogFiles.HARVEST) @@ -705,7 +721,7 @@ def add_paper_feedback(req: PaperFeedbackRequest): fb = _research_store.add_paper_feedback( user_id=req.user_id, track_id=track_id, - paper_id=actual_paper_id, + paper_id=req.paper_id, # Always use external ID for consistency action=req.action, weight=req.weight, metadata=meta, diff --git a/src/paperbot/application/services/paper_deduplicator.py b/src/paperbot/application/services/paper_deduplicator.py index 954fa64..cb0dd60 100644 --- a/src/paperbot/application/services/paper_deduplicator.py +++ b/src/paperbot/application/services/paper_deduplicator.py @@ -67,7 +67,7 @@ def deduplicate( if existing_idx is not None: # Merge metadata into existing paper - self._merge_paper(unique_papers[existing_idx], paper) + self._merge_paper(unique_papers[existing_idx], paper, existing_idx) duplicates_count += 1 else: # Add new paper @@ -128,29 +128,36 @@ def _index_paper(self, paper: HarvestedPaper, idx: int) -> None: title_hash = paper.compute_title_hash() self._title_hash_index[title_hash] = idx - def _merge_paper(self, existing: HarvestedPaper, new: HarvestedPaper) -> None: + def _merge_paper( + self, existing: HarvestedPaper, new: HarvestedPaper, existing_idx: int + ) -> None: """ Merge metadata from new paper into existing. 
+ Args: + existing: The existing paper to merge into + new: The new paper with potentially additional metadata + existing_idx: The index of the existing paper (used for updating indexes) + Strategy: - Fill in missing identifiers - Prefer longer/more complete text fields - Prefer higher citation counts - Merge lists (keywords, fields of study) """ - # Fill in missing identifiers + # Fill in missing identifiers (use existing_idx directly, not _find_index) if not existing.doi and new.doi: existing.doi = new.doi - self._doi_index[new.doi.lower().strip()] = self._find_index(existing) + self._doi_index[new.doi.lower().strip()] = existing_idx if not existing.arxiv_id and new.arxiv_id: existing.arxiv_id = new.arxiv_id - self._arxiv_index[new.arxiv_id.lower().strip()] = self._find_index(existing) + self._arxiv_index[new.arxiv_id.lower().strip()] = existing_idx if not existing.semantic_scholar_id and new.semantic_scholar_id: existing.semantic_scholar_id = new.semantic_scholar_id - self._s2_index[new.semantic_scholar_id.lower().strip()] = self._find_index(existing) + self._s2_index[new.semantic_scholar_id.lower().strip()] = existing_idx if not existing.openalex_id and new.openalex_id: existing.openalex_id = new.openalex_id - self._openalex_index[new.openalex_id.lower().strip()] = self._find_index(existing) + self._openalex_index[new.openalex_id.lower().strip()] = existing_idx # Prefer longer abstract if len(new.abstract) > len(existing.abstract): diff --git a/src/paperbot/context_engine/engine.py b/src/paperbot/context_engine/engine.py index 6d3004c..6b59dfd 100644 --- a/src/paperbot/context_engine/engine.py +++ b/src/paperbot/context_engine/engine.py @@ -503,7 +503,11 @@ async def build_context_pack( "rebuttal": (0.50, 0.40, 0.10), }.get(stage, (0.55, 0.30, 0.15)) - Logger.info(f"Paper search config: offline={self.config.offline}, paper_limit={self.config.paper_limit}", file=LogFiles.HARVEST) + Logger.info( + f"Paper search config: offline={self.config.offline}, " + f"paper_limit={self.config.paper_limit}", + file=LogFiles.HARVEST, + ) if not self.config.offline and self.config.paper_limit > 0: try: searcher = self.paper_searcher @@ -514,9 +518,13 @@ async def build_context_pack( Logger.info("Initialized SemanticScholarSearch", file=LogFiles.HARVEST) fetch_limit = max(30, int(self.config.paper_limit) * 3) - Logger.info(f"Searching papers with query='{merged_query}', limit={fetch_limit}", file=LogFiles.HARVEST) + Logger.info( + f"Searching papers with query='{merged_query}', limit={fetch_limit}", + file=LogFiles.HARVEST, + ) resp = await asyncio.to_thread(searcher.search_papers, merged_query, fetch_limit) - Logger.info(f"Search returned {len(getattr(resp, 'papers', []) or [])} papers", file=LogFiles.HARVEST) + papers_count = len(getattr(resp, "papers", []) or []) + Logger.info(f"Search returned {papers_count} papers", file=LogFiles.HARVEST) raw: List[Dict[str, Any]] = [] for p in getattr(resp, "papers", []) or []: diff --git a/src/paperbot/domain/harvest.py b/src/paperbot/domain/harvest.py index 64230ab..67c4164 100644 --- a/src/paperbot/domain/harvest.py +++ b/src/paperbot/domain/harvest.py @@ -89,7 +89,11 @@ def from_dict(cls, data: Dict[str, Any]) -> "HarvestedPaper": """Create instance from dictionary.""" source = data.get("source", "") if isinstance(source, str): - source = HarvestSource(source) + try: + source = HarvestSource(source) + except ValueError: + # Fallback for empty or invalid source strings + source = HarvestSource.SEMANTIC_SCHOLAR return cls( title=data.get("title", ""), 
source=source, diff --git a/src/paperbot/infrastructure/harvesters/arxiv_harvester.py b/src/paperbot/infrastructure/harvesters/arxiv_harvester.py index 6b51d1c..4c0b815 100644 --- a/src/paperbot/infrastructure/harvesters/arxiv_harvester.py +++ b/src/paperbot/infrastructure/harvesters/arxiv_harvester.py @@ -30,9 +30,13 @@ class ArxivHarvester: ARXIV_API_URL = "https://export.arxiv.org/api/query" REQUEST_INTERVAL = 3.0 # seconds between requests + DEFAULT_TIMEOUT_SECONDS = 30 - def __init__(self, connector: Optional[ArxivConnector] = None): + def __init__( + self, connector: Optional[ArxivConnector] = None, timeout_seconds: int = 30 + ): self.connector = connector or ArxivConnector() + self.timeout_seconds = timeout_seconds self._session: Optional[aiohttp.ClientSession] = None self._last_request_time: float = 0 @@ -42,7 +46,8 @@ def source(self) -> HarvestSource: async def _get_session(self) -> aiohttp.ClientSession: if self._session is None or self._session.closed: - self._session = aiohttp.ClientSession() + timeout = aiohttp.ClientTimeout(total=self.timeout_seconds) + self._session = aiohttp.ClientSession(timeout=timeout) return self._session async def _rate_limit(self) -> None: diff --git a/src/paperbot/infrastructure/harvesters/openalex_harvester.py b/src/paperbot/infrastructure/harvesters/openalex_harvester.py index 4153e42..55d72df 100644 --- a/src/paperbot/infrastructure/harvesters/openalex_harvester.py +++ b/src/paperbot/infrastructure/harvesters/openalex_harvester.py @@ -29,9 +29,11 @@ class OpenAlexHarvester: OPENALEX_API_URL = "https://api.openalex.org/works" REQUEST_INTERVAL = 0.1 # 10 req/s + DEFAULT_TIMEOUT_SECONDS = 30 - def __init__(self, email: Optional[str] = None): + def __init__(self, email: Optional[str] = None, timeout_seconds: int = 30): self.email = email # For polite pool + self.timeout_seconds = timeout_seconds self._session: Optional[aiohttp.ClientSession] = None self._last_request_time: float = 0 @@ -41,7 +43,8 @@ def source(self) -> HarvestSource: async def _get_session(self) -> aiohttp.ClientSession: if self._session is None or self._session.closed: - self._session = aiohttp.ClientSession() + timeout = aiohttp.ClientTimeout(total=self.timeout_seconds) + self._session = aiohttp.ClientSession(timeout=timeout) return self._session async def _rate_limit(self) -> None: diff --git a/src/paperbot/infrastructure/stores/paper_store.py b/src/paperbot/infrastructure/stores/paper_store.py index f0abdbf..b26327a 100644 --- a/src/paperbot/infrastructure/stores/paper_store.py +++ b/src/paperbot/infrastructure/stores/paper_store.py @@ -13,7 +13,7 @@ from datetime import datetime, timezone from typing import Any, Dict, List, Optional, Tuple -from sqlalchemy import Integer, cast, func, or_, select +from sqlalchemy import Integer, String, cast, func, or_, select from paperbot.utils.logging_config import Logger, LogFiles from paperbot.domain.harvest import HarvestedPaper, HarvestSource @@ -522,6 +522,15 @@ def search_papers( Returns: Tuple of (papers, total_count) """ + # Whitelist of allowed sort columns for security + allowed_sort_columns = { + "citation_count": PaperModel.citation_count, + "year": PaperModel.year, + "created_at": PaperModel.created_at, + "updated_at": PaperModel.updated_at, + "title": PaperModel.title, + } + with self._provider.session() as session: stmt = select(PaperModel).where(PaperModel.deleted_at.is_(None)) @@ -535,14 +544,21 @@ def search_papers( ) ) - # Year filters - if year_from: + # Keyword filter (search in keywords_json) + if keywords: + 
keyword_conditions = [ + PaperModel.keywords_json.ilike(f"%{kw}%") for kw in keywords + ] + stmt = stmt.where(or_(*keyword_conditions)) + + # Year filters (use explicit None check to allow year_from=0 if needed) + if year_from is not None: stmt = stmt.where(PaperModel.year >= year_from) - if year_to: + if year_to is not None: stmt = stmt.where(PaperModel.year <= year_to) - # Citation filter - if min_citations: + # Citation filter (use explicit None check to allow min_citations=0) + if min_citations is not None: stmt = stmt.where(PaperModel.citation_count >= min_citations) # Venue filter @@ -558,8 +574,8 @@ def search_papers( count_stmt = select(func.count()).select_from(stmt.subquery()) total_count = session.execute(count_stmt).scalar() or 0 - # Sort - sort_col = getattr(PaperModel, sort_by, PaperModel.citation_count) + # Sort (use whitelist for security) + sort_col = allowed_sort_columns.get(sort_by, PaperModel.citation_count) if sort_order.lower() == "desc": stmt = stmt.order_by(sort_col.desc()) else: @@ -582,6 +598,31 @@ def get_paper_by_id(self, paper_id: int) -> Optional[PaperModel]: ) ).scalar_one_or_none() + def get_paper_by_source_id( + self, source: HarvestSource, source_id: str + ) -> Optional[PaperModel]: + """ + Get a paper by its source-specific ID. + + Args: + source: The harvest source (ARXIV, SEMANTIC_SCHOLAR, OPENALEX) + source_id: The ID from that source + + Returns: + PaperModel if found, None otherwise + """ + with self._provider.session() as session: + if source == HarvestSource.ARXIV: + condition = PaperModel.arxiv_id == source_id + elif source == HarvestSource.OPENALEX: + condition = PaperModel.openalex_id == source_id + else: # Default to SEMANTIC_SCHOLAR + condition = PaperModel.semantic_scholar_id == source_id + + return session.execute( + select(PaperModel).where(condition, PaperModel.deleted_at.is_(None)) + ).scalar_one_or_none() + def get_user_library( self, user_id: str, @@ -610,13 +651,19 @@ def get_user_library( Logger.info("Executing database query to join papers with feedback", file=LogFiles.HARVEST) # First, get all matching paper-feedback pairs + # Join on external IDs (semantic_scholar_id, arxiv_id, openalex_id) + # This avoids CAST errors on PostgreSQL for non-numeric paper_ids + # Also check library_paper_id from metadata if available base_stmt = ( select(PaperModel, PaperFeedbackModel) .join( PaperFeedbackModel, or_( - PaperModel.id == cast(PaperFeedbackModel.paper_id, Integer), PaperModel.semantic_scholar_id == PaperFeedbackModel.paper_id, + PaperModel.arxiv_id == PaperFeedbackModel.paper_id, + PaperModel.openalex_id == PaperFeedbackModel.paper_id, + # For backwards compatibility with numeric IDs stored as strings + cast(PaperModel.id, String) == PaperFeedbackModel.paper_id, ), ) .where( diff --git a/web/src/app/api/papers/[paperId]/save/route.ts b/web/src/app/api/papers/[paperId]/save/route.ts index ea6a24c..494b4ee 100644 --- a/web/src/app/api/papers/[paperId]/save/route.ts +++ b/web/src/app/api/papers/[paperId]/save/route.ts @@ -1,12 +1,26 @@ +import { NextResponse } from "next/server" import { apiBaseUrl, proxyJson } from "../../../research/_base" +// Validate paperId to prevent path traversal attacks +function validatePaperId(paperId: string): number | null { + const parsed = parseInt(paperId, 10) + if (isNaN(parsed) || parsed <= 0 || String(parsed) !== paperId) { + return null + } + return parsed +} + export async function DELETE( req: Request, { params }: { params: Promise<{ paperId: string }> } ) { const { paperId } = await params + const 
validId = validatePaperId(paperId) + if (validId === null) { + return NextResponse.json({ error: "Invalid paper ID" }, { status: 400 }) + } const url = new URL(req.url) - const upstream = `${apiBaseUrl()}/api/papers/${paperId}/save${url.search}` + const upstream = `${apiBaseUrl()}/api/papers/${validId}/save${url.search}` return proxyJson(req, upstream, "DELETE") } @@ -15,6 +29,10 @@ export async function POST( { params }: { params: Promise<{ paperId: string }> } ) { const { paperId } = await params - const upstream = `${apiBaseUrl()}/api/papers/${paperId}/save` + const validId = validatePaperId(paperId) + if (validId === null) { + return NextResponse.json({ error: "Invalid paper ID" }, { status: 400 }) + } + const upstream = `${apiBaseUrl()}/api/papers/${validId}/save` return proxyJson(req, upstream, "POST") }
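
For reviewers, a minimal usage sketch (not part of the patch series) of the `PaperStore.get_paper_by_source_id` helper that PATCH 3/3 introduces to replace the inline session query in `research.py`. The arXiv identifier and the SQLite URL below are illustrative assumptions, not values taken from the diff; constructor arguments and return types follow the signatures shown in the hunks above.

```python
# Illustrative sketch only -- exercises the API added in this series; the
# arXiv ID and db_url are made-up values, not taken from the patch.
from paperbot.domain.harvest import HarvestedPaper, HarvestSource
from paperbot.infrastructure.stores.paper_store import PaperStore

# Assumed local SQLite target (mirrors the PAPERBOT_DB_URL example in the design doc).
store = PaperStore(db_url="sqlite:///data/paperbot.db")

paper = HarvestedPaper(
    title="An Example Paper on Retrieval-Augmented Generation",  # hypothetical
    source=HarvestSource.ARXIV,
    arxiv_id="2401.01234",  # hypothetical identifier
    abstract="",
    authors=[],
    year=2024,
)

# Idempotent upsert; returns (new_count, updated_count) per the store API.
new_count, _ = store.upsert_papers_batch([paper])

# Look the paper up by its source-specific ID instead of querying the session
# directly -- the pattern research.py follows after PATCH 3/3.
row = store.get_paper_by_source_id(HarvestSource.ARXIV, "2401.01234")
if row is not None:
    print(row.id, row.title)
```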