From 04e13269810e239345619c8a19f43a77dd88724a Mon Sep 17 00:00:00 2001 From: boyu Date: Tue, 10 Feb 2026 19:51:03 +0100 Subject: [PATCH 1/3] feat(Harvest): add -- Paper Search and Storage Closes #26 Signed-off-by: LIU BOYU --- alembic/versions/0003_paper_harvest_tables.py | 141 ++ docs/paper_harvest_v1_design.md | 2211 +++++++++++++++++ src/paperbot/api/main.py | 8 + src/paperbot/api/routes/harvest.py | 429 ++++ src/paperbot/api/routes/research.py | 72 +- .../application/ports/harvester_port.py | 50 + src/paperbot/application/services/__init__.py | 11 +- .../services/paper_deduplicator.py | 190 ++ .../application/services/query_rewriter.py | 151 ++ .../application/services/venue_recommender.py | 157 ++ .../application/workflows/harvest_pipeline.py | 376 +++ src/paperbot/context_engine/engine.py | 10 +- src/paperbot/domain/harvest.py | 160 ++ .../infrastructure/harvesters/__init__.py | 17 + .../harvesters/arxiv_harvester.py | 168 ++ .../harvesters/openalex_harvester.py | 212 ++ .../harvesters/semantic_scholar_harvester.py | 133 + src/paperbot/infrastructure/stores/models.py | 127 + .../infrastructure/stores/paper_store.py | 524 ++++ .../infrastructure/stores/research_store.py | 11 + tests/integration/test_harvest_pipeline.py | 537 ++++ tests/integration/test_harvesters.py | 478 ++++ tests/integration/test_paper_store.py | 580 +++++ tests/unit/test_harvested_paper.py | 328 +++ tests/unit/test_paper_deduplicator.py | 292 +++ tests/unit/test_query_rewriter.py | 136 + tests/unit/test_venue_recommender.py | 175 ++ web/package-lock.json | 8 - .../app/api/papers/[paperId]/save/route.ts | 20 + web/src/app/api/papers/library/route.ts | 7 + .../components/research/ResearchDashboard.tsx | 36 +- .../components/research/SavedPapersList.tsx | 108 +- web/src/lib/api.ts | 48 +- 33 files changed, 7817 insertions(+), 94 deletions(-) create mode 100644 alembic/versions/0003_paper_harvest_tables.py create mode 100644 docs/paper_harvest_v1_design.md create mode 100644 src/paperbot/api/routes/harvest.py create mode 100644 src/paperbot/application/ports/harvester_port.py create mode 100644 src/paperbot/application/services/paper_deduplicator.py create mode 100644 src/paperbot/application/services/query_rewriter.py create mode 100644 src/paperbot/application/services/venue_recommender.py create mode 100644 src/paperbot/application/workflows/harvest_pipeline.py create mode 100644 src/paperbot/domain/harvest.py create mode 100644 src/paperbot/infrastructure/harvesters/__init__.py create mode 100644 src/paperbot/infrastructure/harvesters/arxiv_harvester.py create mode 100644 src/paperbot/infrastructure/harvesters/openalex_harvester.py create mode 100644 src/paperbot/infrastructure/harvesters/semantic_scholar_harvester.py create mode 100644 tests/integration/test_harvest_pipeline.py create mode 100644 tests/integration/test_harvesters.py create mode 100644 tests/integration/test_paper_store.py create mode 100644 tests/unit/test_harvested_paper.py create mode 100644 tests/unit/test_paper_deduplicator.py create mode 100644 tests/unit/test_query_rewriter.py create mode 100644 tests/unit/test_venue_recommender.py create mode 100644 web/src/app/api/papers/[paperId]/save/route.ts create mode 100644 web/src/app/api/papers/library/route.ts diff --git a/alembic/versions/0003_paper_harvest_tables.py b/alembic/versions/0003_paper_harvest_tables.py new file mode 100644 index 0000000..ecf3803 --- /dev/null +++ b/alembic/versions/0003_paper_harvest_tables.py @@ -0,0 +1,141 @@ +"""paper harvest tables + +Revision ID: 
0003_paper_harvest_tables +Revises: 0002_research_eval_runs +Create Date: 2026-02-06 + +Adds: +- papers: harvested paper metadata with multi-source deduplication +- harvest_runs: harvest execution tracking and audit +""" + +from __future__ import annotations + +import sqlalchemy as sa +from alembic import context, op + +revision = "0003_paper_harvest_tables" +down_revision = "0002_research_eval_runs" +branch_labels = None +depends_on = None + + +def _is_offline() -> bool: + try: + return bool(context.is_offline_mode()) + except Exception: + return False + + +def _insp(): + return sa.inspect(op.get_bind()) + + +def _has_table(name: str) -> bool: + return _insp().has_table(name) + + +def _get_indexes(table: str) -> set[str]: + idx = set() + for i in _insp().get_indexes(table): + idx.add(str(i.get("name") or "")) + return idx + + +def _create_index(name: str, table: str, cols: list[str]) -> None: + if _is_offline(): + op.create_index(name, table, cols) + return + if name in _get_indexes(table): + return + op.create_index(name, table, cols) + + +def upgrade() -> None: + if _is_offline(): + _upgrade_create_tables() + return + _upgrade_create_tables() + _upgrade_create_indexes() + + +def _upgrade_create_tables() -> None: + # Papers table - harvested paper metadata + if _is_offline() or not _has_table("papers"): + op.create_table( + "papers", + sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), + # Canonical identifiers (for deduplication) + sa.Column("doi", sa.String(length=256), nullable=True), + sa.Column("arxiv_id", sa.String(length=64), nullable=True), + sa.Column("semantic_scholar_id", sa.String(length=64), nullable=True), + sa.Column("openalex_id", sa.String(length=64), nullable=True), + sa.Column("title_hash", sa.String(length=64), nullable=False), + # Core metadata + sa.Column("title", sa.Text(), nullable=False), + sa.Column("abstract", sa.Text(), server_default="", nullable=False), + sa.Column("authors_json", sa.Text(), server_default="[]", nullable=False), + sa.Column("year", sa.Integer(), nullable=True), + sa.Column("venue", sa.String(length=256), nullable=True), + sa.Column("publication_date", sa.String(length=32), nullable=True), + sa.Column("citation_count", sa.Integer(), server_default="0", nullable=False), + # URLs + sa.Column("url", sa.String(length=1024), nullable=True), + sa.Column("pdf_url", sa.String(length=1024), nullable=True), + # Classification + sa.Column("keywords_json", sa.Text(), server_default="[]", nullable=False), + sa.Column("fields_of_study_json", sa.Text(), server_default="[]", nullable=False), + # Source tracking + sa.Column("primary_source", sa.String(length=32), nullable=False), + sa.Column("sources_json", sa.Text(), server_default="[]", nullable=False), + # Timestamps + sa.Column("created_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("updated_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("deleted_at", sa.DateTime(timezone=True), nullable=True), + ) + + # Harvest runs table - execution tracking + if _is_offline() or not _has_table("harvest_runs"): + op.create_table( + "harvest_runs", + sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), + sa.Column("run_id", sa.String(length=64), unique=True, nullable=False), + # Input parameters + sa.Column("keywords_json", sa.Text(), server_default="[]", nullable=False), + sa.Column("venues_json", sa.Text(), server_default="[]", nullable=False), + sa.Column("sources_json", sa.Text(), server_default="[]", nullable=False), + sa.Column("max_results_per_source", 
sa.Integer(), server_default="100", nullable=False), + # Results + sa.Column("status", sa.String(length=32), server_default="running", nullable=False), + sa.Column("papers_found", sa.Integer(), server_default="0", nullable=False), + sa.Column("papers_new", sa.Integer(), server_default="0", nullable=False), + sa.Column("papers_deduplicated", sa.Integer(), server_default="0", nullable=False), + sa.Column("error_json", sa.Text(), server_default="{}", nullable=False), + # Timestamps + sa.Column("started_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("ended_at", sa.DateTime(timezone=True), nullable=True), + ) + + +def _upgrade_create_indexes() -> None: + # Papers indexes + _create_index("ix_papers_doi", "papers", ["doi"]) + _create_index("ix_papers_arxiv_id", "papers", ["arxiv_id"]) + _create_index("ix_papers_semantic_scholar_id", "papers", ["semantic_scholar_id"]) + _create_index("ix_papers_openalex_id", "papers", ["openalex_id"]) + _create_index("ix_papers_title_hash", "papers", ["title_hash"]) + _create_index("ix_papers_year", "papers", ["year"]) + _create_index("ix_papers_venue", "papers", ["venue"]) + _create_index("ix_papers_citation_count", "papers", ["citation_count"]) + _create_index("ix_papers_primary_source", "papers", ["primary_source"]) + _create_index("ix_papers_created_at", "papers", ["created_at"]) + + # Harvest runs indexes + _create_index("ix_harvest_runs_run_id", "harvest_runs", ["run_id"]) + _create_index("ix_harvest_runs_status", "harvest_runs", ["status"]) + _create_index("ix_harvest_runs_started_at", "harvest_runs", ["started_at"]) + + +def downgrade() -> None: + op.drop_table("harvest_runs") + op.drop_table("papers") diff --git a/docs/paper_harvest_v1_design.md b/docs/paper_harvest_v1_design.md new file mode 100644 index 0000000..2c54602 --- /dev/null +++ b/docs/paper_harvest_v1_design.md @@ -0,0 +1,2211 @@ +# Paper Collection & Resource Pool v1 - Technical Design Document + +> **Status**: Draft +> **Author**: Claude Code +> **Date**: 2026-02-03 +> **Estimated Effort**: 5-7 days (~40h) + +--- + +## 0. Architecture Context: Where v1 Fits + +v1 spans **three layers** of the PaperBot architecture, focusing on **Paper Harvesting, Storage, and Search capabilities**. 
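+
+For orientation, the finished v1 surface is reachable over HTTP through the endpoints specified in Section 2.9. The sketch below is illustrative only: it assumes the API is running locally on port 8000 and sends only fields defined by `PaperSearchRequest`; the exact response fields depend on `paper_to_dict()`, which is not shown in this document.
+
+```python
+import requests
+
+# Assumes the service from Section 2.9 is reachable at localhost:8000.
+resp = requests.post(
+    "http://localhost:8000/api/papers/search",
+    json={"query": "ransomware", "year_from": 2020, "limit": 20},
+    timeout=30,
+)
+resp.raise_for_status()
+for paper in resp.json()["papers"]:
+    # "title" is assumed to be part of the serialized paper payload.
+    print(paper["title"])
+```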
+ +### 0.1 PaperBot Architecture with Harvest Layer + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ PaperBot Standard Architecture │ +│ (Offline Ingestion → Storage → Online Retrieval → Generation → Feedback)│ +└─────────────────────────────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Layer 1 · Ingestion (Async) - HARVEST LAYER │ +│ ┌──────────────────────────────────────────────────────────────────────────┐ │ +│ │ ╔════════════════════════════════════════════════════════════════════╗ │ │ +│ │ ║ v1: Paper Harvesters ║ │ │ +│ │ ║ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ║ │ │ +│ │ ║ │ arXiv │ │ Semantic │ │ OpenAlex │ ║ │ │ +│ │ ║ │ Harvester │ │ Scholar │ │ Harvester │ ║ │ │ +│ │ ║ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ ║ │ │ +│ │ ║ └─────────────────┼─────────────────┘ ║ │ │ +│ │ ║ ▼ ║ │ │ +│ │ ║ ┌───────────────────────┐ ║ │ │ +│ │ ║ │ PaperDeduplicator │ ║ │ │ +│ │ ║ │ (DOI/Title/ID match) │ ║ │ │ +│ │ ║ └───────────┬───────────┘ ║ │ │ +│ │ ╚═══════════════════════════╪════════════════════════════════════════╝ │ │ +│ └─────────────────────────────┼────────────────────────────────────────────┘ │ +│ ▼ │ +└────────────────────────────────┼─────────────────────────────────────────────────┘ + │ +┌────────────────────────────────┼─────────────────────────────────────────────────┐ +│ Layer 2 · Storage │ +│ ┌─────────────────────────────▼────────────────────────────────────────────┐ │ +│ │ SQL 主库 (SQLite/Postgres) │ │ +│ │ ╔═══════════════════════════════════════════════════════════════════╗ │ │ +│ │ ║ v1: papers table (NEW) ║ │ │ +│ │ ║ - doi, arxiv_id, semantic_scholar_id, openalex_id ║ │ │ +│ │ ║ - title, abstract, authors, year, venue, citations ║ │ │ +│ │ ║ - title_hash (dedup), primary_source, sources_json ║ │ │ +│ │ ╠═══════════════════════════════════════════════════════════════════╣ │ │ +│ │ ║ v1: harvest_runs table (NEW) ║ │ │ +│ │ ║ - run_id, keywords, venues, status, papers_found/new/deduped ║ │ │ +│ │ ╚═══════════════════════════════════════════════════════════════════╝ │ │ +│ │ research_tracks / tasks / paper_feedback (existing) │ │ +│ └──────────────────────────────────────────────────────────────────────────┘ │ +└────────────────────────────────┼─────────────────────────────────────────────────┘ + │ +┌────────────────────────────────┼─────────────────────────────────────────────────┐ +│ Layer 3 · Retrieval (Online) │ +│ ┌─────────────────────────────▼────────────────────────────────────────────┐ │ +│ │ ╔═══════════════════════════════════════════════════════════════════╗ │ │ +│ │ ║ v1: PaperStore.search_papers() (NEW) ║ │ │ +│ │ ║ - Full-text search in title/abstract ║ │ │ +│ │ ║ - Filter by: keywords, venues, year range, citations, sources ║ │ │ +│ │ ║ - Sort by: citation_count, year, created_at ║ │ │ +│ │ ║ - Pagination with limit/offset (TopN) ║ │ │ +│ │ ╚═══════════════════════════════════════════════════════════════════╝ │ │ +│ │ ContextEngine / TrackRouter / Paper Searcher (existing) │ │ +│ └──────────────────────────────────────────────────────────────────────────┘ │ +└────────────────────────────────┼─────────────────────────────────────────────────┘ + │ +┌────────────────────────────────┼─────────────────────────────────────────────────┐ +│ Layer 4-5 · Generation & Feedback (Existing - No Changes) │ +│ ┌─────────────────────────────▼────────────────────────────────────────────┐ │ +│ │ PromptComposer → LLM → 
Output Parser → Paper Feedback │ │ +│ └──────────────────────────────────────────────────────────────────────────┘ │ +└──────────────────────────────────────────────────────────────────────────────────┘ + +Legend: + ╔═══╗ v1 Focus Area (Paper Harvest & Storage) + ╚═══╝ + ───▶ Data Flow +``` + +### 0.2 v1 Components Mapped to Architecture Layers + +| Layer | Component | v1 Deliverable | +|-------|-----------|----------------| +| **Layer 1: Ingestion** | Harvesters | ArxivHarvester, SemanticScholarHarvester, OpenAlexHarvester | +| | Query Services | VenueRecommender, QueryRewriter | +| | Deduplication | PaperDeduplicator (multi-strategy) | +| **Layer 2: Storage** | `papers` table | Paper metadata with multi-source IDs | +| | `harvest_runs` table | Harvest execution tracking | +| | PaperStore | SQLAlchemy repository implementation | +| **Layer 3: Retrieval** | Search API | Filter-based TopN retrieval | + +### 0.3 v1 Focus: Harvest Pipeline + +``` + ┌─────────────────────────────────────────┐ + │ v1: Harvest Pipeline │ + └─────────────────────────────────────────┘ + │ + ┌───────────────────────────┼───────────────────────────┐ + │ │ │ + ▼ ▼ ▼ +┌─────────────────────┐ ┌─────────────────────┐ ┌─────────────────────┐ +│ Query Services │ │ Harvesters │ │ Storage & Search │ +│ │ │ │ │ │ +│ - VenueRecommender │ │ - ArxivHarvester │ │ - PaperStore │ +│ keyword→venues │ │ - S2Harvester │ │ upsert/search │ +│ - QueryRewriter │ │ - OpenAlexHarvester│ │ - Deduplication │ +│ expand/synonyms │ │ │ │ DOI/title/ID │ +│ │ │ │ │ │ +└─────────────────────┘ └─────────────────────┘ └─────────────────────┘ + │ │ │ + └───────────────────────────┼───────────────────────────┘ + │ + ▼ + ┌─────────────────────────────────────────┐ + │ Implementation Artifacts │ + │ │ + │ src/paperbot/domain/harvest.py │ + │ src/paperbot/infrastructure/harvesters/│ + │ src/paperbot/infrastructure/stores/ │ + │ src/paperbot/application/services/ │ + │ src/paperbot/api/routes/harvest.py │ + └─────────────────────────────────────────┘ +``` + +### 0.4 Data Flow with v1 Touch Points + +``` + USER INPUT + │ + ┌─────────────────────────────┼─────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────┐ │ + │ │ POST /api/harvest │ │ + │ │ keywords: ["ransomware", "machine learning"] │ │ + │ │ venues: ["USENIX Security", "CCS"] (optional) │ │ + │ │ year_from: 2020, year_to: 2024 (optional) │ │ + │ └─────────────────────────────────────────────────────┘ │ + │ │ │ + └─────────────────────────────┼─────────────────────────────┘ + │ + QUERY SERVICES + │ + ┌─────────────────────────────┼─────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────┐ │ + │ │ VenueRecommender.recommend() │ │ + │ │ ╔═══════════════════════════════════════════════╗ │ │ + │ │ ║ v1: keyword→venue mapping from config ║ │ │ + │ │ ║ "ransomware" → security: [CCS, S&P, USENIX] ║ │ │ + │ │ ╚═══════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────┘ │ + │ │ │ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────┐ │ + │ │ QueryRewriter.rewrite() │ │ + │ │ ╔═══════════════════════════════════════════════╗ │ │ + │ │ ║ v1: abbreviation expansion + synonyms ║ │ │ + │ │ ║ "ML" → "machine learning" ║ │ │ + │ │ ║ "LLM" → "large language model" ║ │ │ + │ │ ╚═══════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────┘ │ + │ │ + └─────────────────────────────┼─────────────────────────────┘ + │ + 
PARALLEL HARVEST + │ + ┌─────────────────────────────┼─────────────────────────────┐ + │ ┌──────────────┼──────────────┐ │ + │ ▼ ▼ ▼ │ + │ ┌────────────────┐ ┌────────────────┐ ┌────────────────┐ │ + │ │ ArxivHarvester │ │ S2Harvester │ │OpenAlexHarvest │ │ + │ │ ╔════════════╗ │ │ ╔════════════╗ │ │ ╔════════════╗ │ │ + │ │ ║ v1: Atom ║ │ │ ║ v1: REST ║ │ │ ║ v1: REST ║ │ │ + │ │ ║ XML API ║ │ │ ║ API wrap ║ │ │ ║ API (new) ║ │ │ + │ │ ╚════════════╝ │ │ ╚════════════╝ │ │ ╚════════════╝ │ │ + │ └───────┬────────┘ └───────┬────────┘ └───────┬────────┘ │ + │ └──────────────────┼──────────────────┘ │ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────┐ │ + │ │ List[HarvestedPaper] (unified format) │ │ + │ └─────────────────────────────────────────────────────┘ │ + │ │ + └─────────────────────────────┼─────────────────────────────┘ + │ + DEDUPLICATION + │ + ┌─────────────────────────────┼─────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────┐ │ + │ │ PaperDeduplicator.deduplicate() │ │ + │ │ ╔═══════════════════════════════════════════════╗ │ │ + │ │ ║ v1: Multi-strategy matching (priority order): ║ │ │ + │ │ ║ 1. DOI (canonical, most reliable) ║ │ │ + │ │ ║ 2. arXiv ID ║ │ │ + │ │ ║ 3. Semantic Scholar ID ║ │ │ + │ │ ║ 4. OpenAlex ID ║ │ │ + │ │ ║ 5. Normalized title hash (fallback) ║ │ │ + │ │ ╚═══════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────┘ │ + │ │ + └─────────────────────────────┼─────────────────────────────┘ + │ + STORAGE + │ + ┌─────────────────────────────┼─────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────┐ │ + │ │ PaperStore.upsert_papers_batch() │ │ + │ │ ╔═══════════════════════════════════════════════╗ │ │ + │ │ ║ v1: Atomic upsert with dedup at DB level ║ │ │ + │ │ ║ - Unique constraints on DOI, arxiv_id, etc. ║ │ │ + │ │ ║ - Merge metadata from duplicates ║ │ │ + │ │ ║ - Track sources that returned each paper ║ │ │ + │ │ ╚═══════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────┘ │ + │ │ + └─────────────────────────────┼─────────────────────────────┘ + │ + RETRIEVAL + │ + ┌─────────────────────────────┼─────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────┐ │ + │ │ POST /api/papers/search │ │ + │ │ ╔═══════════════════════════════════════════════╗ │ │ + │ │ ║ v1: Filter-based search with TopN ║ │ │ + │ │ ║ - Full-text: title LIKE '%query%' ║ │ │ + │ │ ║ - Filters: year, venue, citations, source ║ │ │ + │ │ ║ - Sort: citation_count DESC (default) ║ │ │ + │ │ ║ - Pagination: limit=50, offset=0 ║ │ │ + │ │ ╚═══════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────┘ │ + │ │ + └───────────────────────────────────────────────────────────┘ +``` + +--- + +## 1. Executive Summary + +**Objective**: Build a stable pipeline for "keywords → recommend venues → pull papers → store → search", enabling paper collection from 3 open sources with deduplication and filter-based retrieval. + +**Current State**: +- ArxivConnector exists (XML parsing only, no search) +- SemanticScholarClient exists (async API wrapper) +- No unified harvester interface +- No persistent paper storage +- No deduplication across sources + +**Scope**: This document covers the v1 deliverables: +1. Unified harvester interface and 3 implementations +2. Paper storage with multi-strategy deduplication +3. 
Query services (VenueRecommender, QueryRewriter) +4. API endpoints for harvest and search + +**Non-Goals (deferred to v2)**: +- PDF downloading and parsing +- Full-text search (FTS5/Elasticsearch) +- Embedding-based semantic search +- Authenticated sources (IEEE, ACM) + +--- + +## 2. Technical Solution Design + +### 2.1 Domain Models + +#### 2.1.1 HarvestedPaper (Unified Format) + +**File**: `src/paperbot/domain/harvest.py` + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `title` | str | Yes | Paper title | +| `abstract` | str | No | Paper abstract | +| `authors` | List[str] | No | Author names | +| `doi` | str | No | Digital Object Identifier | +| `arxiv_id` | str | No | arXiv identifier (e.g., 2301.12345) | +| `semantic_scholar_id` | str | No | S2 paper ID | +| `openalex_id` | str | No | OpenAlex work ID | +| `year` | int | No | Publication year | +| `venue` | str | No | Conference/journal name | +| `publication_date` | str | No | ISO date string | +| `citation_count` | int | No | Number of citations | +| `url` | str | No | Paper URL | +| `pdf_url` | str | No | PDF URL (metadata only, no download) | +| `keywords` | List[str] | No | Author keywords | +| `fields_of_study` | List[str] | No | Research fields | +| `source` | HarvestSource | Yes | Which harvester found this | +| `source_rank` | int | No | Position in source results | + +```python +@dataclass +class HarvestedPaper: + title: str + source: HarvestSource + abstract: str = "" + authors: List[str] = field(default_factory=list) + doi: Optional[str] = None + arxiv_id: Optional[str] = None + semantic_scholar_id: Optional[str] = None + openalex_id: Optional[str] = None + year: Optional[int] = None + venue: Optional[str] = None + publication_date: Optional[str] = None + citation_count: int = 0 + url: Optional[str] = None + pdf_url: Optional[str] = None + keywords: List[str] = field(default_factory=list) + fields_of_study: List[str] = field(default_factory=list) + source_rank: Optional[int] = None +``` + +#### 2.1.2 HarvestSource Enum + +```python +class HarvestSource(str, Enum): + ARXIV = "arxiv" + SEMANTIC_SCHOLAR = "semantic_scholar" + OPENALEX = "openalex" +``` + +#### 2.1.3 HarvestResult + +```python +@dataclass +class HarvestResult: + """Result from a single harvester.""" + source: HarvestSource + papers: List[HarvestedPaper] + total_found: int + error: Optional[str] = None + +@dataclass +class HarvestRunResult: + """Aggregated result from all harvesters.""" + run_id: str + status: str # running/success/partial/failed + papers_found: int + papers_new: int + papers_deduplicated: int + source_results: Dict[HarvestSource, HarvestResult] + started_at: datetime + ended_at: Optional[datetime] = None +``` + +### 2.2 Database Schema + +#### 2.2.1 papers Table (NEW) + +**File**: `alembic/versions/0003_paper_harvest_tables.py` + +```sql +CREATE TABLE papers ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + + -- Canonical identifiers (for deduplication) + doi TEXT UNIQUE, + arxiv_id TEXT UNIQUE, + semantic_scholar_id TEXT UNIQUE, + openalex_id TEXT UNIQUE, + title_hash TEXT NOT NULL, -- SHA256 of normalized title + + -- Core metadata + title TEXT NOT NULL, + abstract TEXT DEFAULT '', + authors_json TEXT DEFAULT '[]', + year INTEGER, + venue TEXT, + publication_date TEXT, + citation_count INTEGER DEFAULT 0, + + -- URLs (no PDF download, just references) + url TEXT, + pdf_url TEXT, + + -- Classification + keywords_json TEXT DEFAULT '[]', + fields_of_study_json TEXT DEFAULT '[]', + + -- Source tracking + 
primary_source TEXT NOT NULL, -- First source that found this paper + sources_json TEXT DEFAULT '[]', -- All sources that returned this paper + + -- Timestamps + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + deleted_at TIMESTAMP -- Soft delete +); + +-- Indexes +CREATE INDEX idx_papers_doi ON papers(doi); +CREATE INDEX idx_papers_arxiv_id ON papers(arxiv_id); +CREATE INDEX idx_papers_title_hash ON papers(title_hash); +CREATE INDEX idx_papers_year ON papers(year); +CREATE INDEX idx_papers_venue ON papers(venue); +CREATE INDEX idx_papers_citation_count ON papers(citation_count); +CREATE INDEX idx_papers_created_at ON papers(created_at); +``` + +#### 2.2.2 harvest_runs Table (NEW) + +```sql +CREATE TABLE harvest_runs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id TEXT UNIQUE NOT NULL, + + -- Input + keywords_json TEXT DEFAULT '[]', + venues_json TEXT DEFAULT '[]', + sources_json TEXT DEFAULT '[]', + max_results_per_source INTEGER DEFAULT 100, + + -- Results + status TEXT DEFAULT 'running', -- running/success/partial/failed + papers_found INTEGER DEFAULT 0, + papers_new INTEGER DEFAULT 0, + papers_deduplicated INTEGER DEFAULT 0, + error_json TEXT DEFAULT '{}', + + -- Timestamps + started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + ended_at TIMESTAMP +); + +CREATE INDEX idx_harvest_runs_run_id ON harvest_runs(run_id); +CREATE INDEX idx_harvest_runs_status ON harvest_runs(status); +CREATE INDEX idx_harvest_runs_started_at ON harvest_runs(started_at); +``` + +#### 2.2.3 SQLAlchemy Models + +**File**: `src/paperbot/infrastructure/stores/models.py` (additions) + +```python +class PaperModel(Base): + __tablename__ = "papers" + + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + + # Canonical identifiers + doi: Mapped[Optional[str]] = mapped_column(String(128), unique=True, nullable=True, index=True) + arxiv_id: Mapped[Optional[str]] = mapped_column(String(64), unique=True, nullable=True, index=True) + semantic_scholar_id: Mapped[Optional[str]] = mapped_column(String(64), unique=True, nullable=True, index=True) + openalex_id: Mapped[Optional[str]] = mapped_column(String(64), unique=True, nullable=True, index=True) + title_hash: Mapped[str] = mapped_column(String(64), nullable=False, index=True) + + # Core metadata + title: Mapped[str] = mapped_column(Text, nullable=False) + abstract: Mapped[str] = mapped_column(Text, default="") + authors_json: Mapped[str] = mapped_column(Text, default="[]") + year: Mapped[Optional[int]] = mapped_column(Integer, nullable=True, index=True) + venue: Mapped[Optional[str]] = mapped_column(String(256), nullable=True, index=True) + publication_date: Mapped[Optional[str]] = mapped_column(String(32), nullable=True) + citation_count: Mapped[int] = mapped_column(Integer, default=0, index=True) + + # URLs + url: Mapped[Optional[str]] = mapped_column(String(512), nullable=True) + pdf_url: Mapped[Optional[str]] = mapped_column(String(512), nullable=True) + + # Classification + keywords_json: Mapped[str] = mapped_column(Text, default="[]") + fields_of_study_json: Mapped[str] = mapped_column(Text, default="[]") + + # Source tracking + primary_source: Mapped[str] = mapped_column(String(32), nullable=False) + sources_json: Mapped[str] = mapped_column(Text, default="[]") + + # Timestamps + created_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True, index=True) + updated_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + 
deleted_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + + +class HarvestRunModel(Base): + __tablename__ = "harvest_runs" + + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + run_id: Mapped[str] = mapped_column(String(64), unique=True, nullable=False, index=True) + + # Input + keywords_json: Mapped[str] = mapped_column(Text, default="[]") + venues_json: Mapped[str] = mapped_column(Text, default="[]") + sources_json: Mapped[str] = mapped_column(Text, default="[]") + max_results_per_source: Mapped[int] = mapped_column(Integer, default=100) + + # Results + status: Mapped[str] = mapped_column(String(32), default="running", index=True) + papers_found: Mapped[int] = mapped_column(Integer, default=0) + papers_new: Mapped[int] = mapped_column(Integer, default=0) + papers_deduplicated: Mapped[int] = mapped_column(Integer, default=0) + error_json: Mapped[str] = mapped_column(Text, default="{}") + + # Timestamps + started_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True, index=True) + ended_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) +``` + +### 2.3 Harvester Interface + +**File**: `src/paperbot/application/ports/harvester_port.py` + +```python +from typing import Protocol, runtime_checkable, Optional, List +from paperbot.domain.harvest import HarvestSource, HarvestResult + +@runtime_checkable +class HarvesterPort(Protocol): + """Abstract interface for all paper harvesters.""" + + @property + def source(self) -> HarvestSource: + """Return the harvest source identifier.""" + ... + + async def search( + self, + query: str, + *, + max_results: int = 100, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + venues: Optional[List[str]] = None, + ) -> HarvestResult: + """ + Search for papers matching the query. + + Args: + query: Search query string + max_results: Maximum number of results to return + year_from: Filter papers published on or after this year + year_to: Filter papers published on or before this year + venues: Filter papers from these venues (if supported) + + Returns: + HarvestResult with papers and metadata + """ + ... + + async def close(self) -> None: + """Release resources (HTTP sessions, etc.).""" + ... +``` + +### 2.4 Harvester Implementations + +#### 2.4.1 ArxivHarvester + +**File**: `src/paperbot/infrastructure/harvesters/arxiv_harvester.py` + +```python +class ArxivHarvester: + """ + arXiv paper harvester using the Atom API. + + API: https://export.arxiv.org/api/query + Rate limit: 1 request per 3 seconds (be conservative) + """ + + ARXIV_API_URL = "https://export.arxiv.org/api/query" + REQUEST_INTERVAL = 3.0 # seconds between requests + + def __init__(self, connector: ArxivConnector): + self.connector = connector + self._session: Optional[aiohttp.ClientSession] = None + + @property + def source(self) -> HarvestSource: + return HarvestSource.ARXIV + + async def search( + self, + query: str, + *, + max_results: int = 100, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + venues: Optional[List[str]] = None, # Not supported by arXiv + ) -> HarvestResult: + """ + Search arXiv using the Atom API. 
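+
+        Results are requested in relevance order. The arXiv API has no venue
+        filter, so the ``venues`` argument is accepted for interface parity
+        but ignored.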
+ + Query syntax: https://arxiv.org/help/api/user-manual#query_details + """ + # Build query with year filters if provided + search_query = self._build_query(query, year_from, year_to) + + params = { + "search_query": search_query, + "start": 0, + "max_results": max_results, + "sortBy": "relevance", + "sortOrder": "descending", + } + + try: + async with self._get_session().get(self.ARXIV_API_URL, params=params) as resp: + xml_text = await resp.text() + + records = self.connector.parse_atom(xml_text) + papers = [self._record_to_paper(r, rank=i) for i, r in enumerate(records)] + + return HarvestResult( + source=self.source, + papers=papers, + total_found=len(papers), + ) + except Exception as e: + return HarvestResult( + source=self.source, + papers=[], + total_found=0, + error=str(e), + ) + + def _record_to_paper(self, record: ArxivRecord, rank: int) -> HarvestedPaper: + """Convert ArxivRecord to HarvestedPaper.""" + # Extract arxiv_id from full URL (e.g., "http://arxiv.org/abs/2301.12345v1") + arxiv_id = record.arxiv_id.split("/")[-1].split("v")[0] + + # Extract year from published date + year = None + if record.published: + try: + year = int(record.published[:4]) + except ValueError: + pass + + return HarvestedPaper( + title=record.title, + source=HarvestSource.ARXIV, + abstract=record.summary, + authors=record.authors, + arxiv_id=arxiv_id, + year=year, + publication_date=record.published, + url=record.abs_url, + pdf_url=record.pdf_url, + source_rank=rank, + ) +``` + +#### 2.4.2 SemanticScholarHarvester + +**File**: `src/paperbot/infrastructure/harvesters/semantic_scholar_harvester.py` + +```python +class SemanticScholarHarvester: + """ + Semantic Scholar paper harvester. + + API: https://api.semanticscholar.org/graph/v1/paper/search + Rate limit: 100 req/min (with API key), 5000/day without key + """ + + FIELDS = [ + "paperId", "title", "abstract", "year", "venue", + "citationCount", "authors", "publicationDate", + "externalIds", "fieldsOfStudy", "url", "openAccessPdf" + ] + + def __init__(self, client: SemanticScholarClient): + self.client = client + + @property + def source(self) -> HarvestSource: + return HarvestSource.SEMANTIC_SCHOLAR + + async def search( + self, + query: str, + *, + max_results: int = 100, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + venues: Optional[List[str]] = None, + ) -> HarvestResult: + """Search Semantic Scholar API.""" + try: + # S2 API supports year filter in query + year_filter = "" + if year_from and year_to: + year_filter = f" year:{year_from}-{year_to}" + elif year_from: + year_filter = f" year:{year_from}-" + elif year_to: + year_filter = f" year:-{year_to}" + + results = await self.client.search_papers( + query=query + year_filter, + limit=max_results, + fields=self.FIELDS, + ) + + papers = [self._to_paper(r, rank=i) for i, r in enumerate(results)] + + # Filter by venue if specified + if venues: + venue_set = {v.lower() for v in venues} + papers = [p for p in papers if p.venue and p.venue.lower() in venue_set] + + return HarvestResult( + source=self.source, + papers=papers, + total_found=len(papers), + ) + except Exception as e: + return HarvestResult( + source=self.source, + papers=[], + total_found=0, + error=str(e), + ) + + def _to_paper(self, data: Dict[str, Any], rank: int) -> HarvestedPaper: + """Convert S2 API response to HarvestedPaper.""" + authors = [a.get("name", "") for a in data.get("authors", [])] + external_ids = data.get("externalIds", {}) or {} + + pdf_url = None + if data.get("openAccessPdf"): + pdf_url = 
data["openAccessPdf"].get("url") + + return HarvestedPaper( + title=data.get("title", ""), + source=HarvestSource.SEMANTIC_SCHOLAR, + abstract=data.get("abstract") or "", + authors=authors, + doi=external_ids.get("DOI"), + arxiv_id=external_ids.get("ArXiv"), + semantic_scholar_id=data.get("paperId"), + year=data.get("year"), + venue=data.get("venue"), + publication_date=data.get("publicationDate"), + citation_count=data.get("citationCount", 0), + url=data.get("url"), + pdf_url=pdf_url, + fields_of_study=data.get("fieldsOfStudy") or [], + source_rank=rank, + ) +``` + +#### 2.4.3 OpenAlexHarvester + +**File**: `src/paperbot/infrastructure/harvesters/openalex_harvester.py` + +```python +class OpenAlexHarvester: + """ + OpenAlex paper harvester. + + API: https://docs.openalex.org/api-entities/works + Rate limit: 10 req/s (polite pool with email), 100K/day + """ + + OPENALEX_API_URL = "https://api.openalex.org/works" + REQUEST_INTERVAL = 0.1 # 10 req/s + + def __init__(self, email: Optional[str] = None): + self.email = email # For polite pool + self._session: Optional[aiohttp.ClientSession] = None + + @property + def source(self) -> HarvestSource: + return HarvestSource.OPENALEX + + async def search( + self, + query: str, + *, + max_results: int = 100, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + venues: Optional[List[str]] = None, + ) -> HarvestResult: + """Search OpenAlex API.""" + params = { + "search": query, + "per_page": min(max_results, 200), # API max is 200 + "sort": "cited_by_count:desc", + } + + # Add email for polite pool + if self.email: + params["mailto"] = self.email + + # Build filter string + filters = [] + if year_from: + filters.append(f"publication_year:>={year_from}") + if year_to: + filters.append(f"publication_year:<={year_to}") + if filters: + params["filter"] = ",".join(filters) + + try: + async with self._get_session().get(self.OPENALEX_API_URL, params=params) as resp: + data = await resp.json() + + results = data.get("results", []) + papers = [self._to_paper(r, rank=i) for i, r in enumerate(results)] + + # Filter by venue if specified + if venues: + venue_set = {v.lower() for v in venues} + papers = [p for p in papers if p.venue and p.venue.lower() in venue_set] + + return HarvestResult( + source=self.source, + papers=papers, + total_found=data.get("meta", {}).get("count", len(papers)), + ) + except Exception as e: + return HarvestResult( + source=self.source, + papers=[], + total_found=0, + error=str(e), + ) + + def _to_paper(self, data: Dict[str, Any], rank: int) -> HarvestedPaper: + """Convert OpenAlex API response to HarvestedPaper.""" + # Extract authors + authors = [] + for authorship in data.get("authorships", []): + author = authorship.get("author", {}) + if author.get("display_name"): + authors.append(author["display_name"]) + + # Extract identifiers + ids = data.get("ids", {}) + doi = ids.get("doi", "").replace("https://doi.org/", "") if ids.get("doi") else None + openalex_id = ids.get("openalex", "").replace("https://openalex.org/", "") + + # Extract venue + venue = None + if data.get("primary_location"): + source = data["primary_location"].get("source") or {} + venue = source.get("display_name") + + # Extract PDF URL + pdf_url = None + if data.get("open_access", {}).get("oa_url"): + pdf_url = data["open_access"]["oa_url"] + + return HarvestedPaper( + title=data.get("title", ""), + source=HarvestSource.OPENALEX, + abstract=self._get_abstract(data), + authors=authors, + doi=doi, + openalex_id=openalex_id, + 
year=data.get("publication_year"), + venue=venue, + publication_date=data.get("publication_date"), + citation_count=data.get("cited_by_count", 0), + url=data.get("doi") or ids.get("openalex"), + pdf_url=pdf_url, + keywords=self._extract_keywords(data), + fields_of_study=[c.get("display_name", "") for c in data.get("concepts", [])[:5]], + source_rank=rank, + ) + + def _get_abstract(self, data: Dict[str, Any]) -> str: + """Reconstruct abstract from inverted index.""" + abstract_index = data.get("abstract_inverted_index") + if not abstract_index: + return "" + + # OpenAlex stores abstract as inverted index: {"word": [positions]} + words = [] + for word, positions in abstract_index.items(): + for pos in positions: + words.append((pos, word)) + words.sort(key=lambda x: x[0]) + return " ".join(w[1] for w in words) +``` + +### 2.5 Query Services + +#### 2.5.1 VenueRecommender + +**File**: `src/paperbot/application/services/venue_recommender.py` + +```python +class VenueRecommender: + """ + Recommend relevant venues based on keywords. + + Uses a static mapping from keywords/domains to top venues. + Configuration loaded from config/venue_mappings.yaml. + """ + + # Default keyword→venue mappings (can be overridden by config) + DEFAULT_MAPPINGS = { + # Security + "security": ["CCS", "S&P", "USENIX Security", "NDSS"], + "ransomware": ["CCS", "S&P", "USENIX Security", "NDSS"], + "malware": ["CCS", "S&P", "USENIX Security", "NDSS"], + "cryptography": ["CRYPTO", "EUROCRYPT", "CCS"], + "privacy": ["S&P", "PETS", "CCS", "USENIX Security"], + + # ML/AI + "machine learning": ["NeurIPS", "ICML", "ICLR"], + "deep learning": ["NeurIPS", "ICML", "ICLR", "CVPR"], + "llm": ["NeurIPS", "ICML", "ACL", "EMNLP"], + "large language model": ["NeurIPS", "ICML", "ACL", "EMNLP"], + "transformer": ["NeurIPS", "ICML", "ACL", "EMNLP"], + "nlp": ["ACL", "EMNLP", "NAACL", "NeurIPS"], + "computer vision": ["CVPR", "ICCV", "ECCV", "NeurIPS"], + + # Systems + "database": ["SIGMOD", "VLDB", "ICDE"], + "systems": ["OSDI", "SOSP", "EuroSys", "ATC"], + "networking": ["SIGCOMM", "NSDI", "MobiCom"], + + # Software Engineering + "software engineering": ["ICSE", "FSE", "ASE"], + "testing": ["ICSE", "ISSTA", "FSE"], + "program analysis": ["PLDI", "POPL", "OOPSLA"], + } + + def __init__(self, config_path: Optional[str] = None): + self.mappings = self.DEFAULT_MAPPINGS.copy() + if config_path: + self._load_config(config_path) + + def recommend( + self, + keywords: List[str], + *, + max_venues: int = 5, + ) -> List[str]: + """ + Recommend venues based on keywords. + + Args: + keywords: List of search keywords + max_venues: Maximum number of venues to recommend + + Returns: + List of recommended venue names, ordered by relevance + """ + venue_scores: Dict[str, int] = {} + + for keyword in keywords: + keyword_lower = keyword.lower() + + # Exact match + if keyword_lower in self.mappings: + for venue in self.mappings[keyword_lower]: + venue_scores[venue] = venue_scores.get(venue, 0) + 2 + + # Partial match + for mapped_kw, venues in self.mappings.items(): + if keyword_lower in mapped_kw or mapped_kw in keyword_lower: + for venue in venues: + venue_scores[venue] = venue_scores.get(venue, 0) + 1 + + # Sort by score descending + sorted_venues = sorted(venue_scores.items(), key=lambda x: -x[1]) + return [v[0] for v in sorted_venues[:max_venues]] +``` + +#### 2.5.2 QueryRewriter + +**File**: `src/paperbot/application/services/query_rewriter.py` + +```python +class QueryRewriter: + """ + Expand and rewrite queries for better search coverage. 
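+
+    Example:
+        rewrite("LLM security")
+        → ["LLM security", "large language model security"]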
+ + Handles: + - Abbreviation expansion (LLM → large language model) + - Synonym addition (ML → machine learning) + - Query normalization + """ + + # Abbreviation → full form mappings + ABBREVIATIONS = { + "llm": "large language model", + "llms": "large language models", + "ml": "machine learning", + "dl": "deep learning", + "nlp": "natural language processing", + "cv": "computer vision", + "rl": "reinforcement learning", + "gan": "generative adversarial network", + "gans": "generative adversarial networks", + "cnn": "convolutional neural network", + "cnns": "convolutional neural networks", + "rnn": "recurrent neural network", + "rnns": "recurrent neural networks", + "lstm": "long short-term memory", + "bert": "bidirectional encoder representations from transformers", + "gpt": "generative pre-trained transformer", + "rag": "retrieval augmented generation", + "vae": "variational autoencoder", + "asr": "automatic speech recognition", + "tts": "text to speech", + "ocr": "optical character recognition", + "sql": "structured query language", + "api": "application programming interface", + } + + def __init__(self, abbreviations: Optional[Dict[str, str]] = None): + self.abbreviations = {**self.ABBREVIATIONS} + if abbreviations: + self.abbreviations.update(abbreviations) + + def rewrite(self, query: str) -> List[str]: + """ + Rewrite query to produce expanded variations. + + Args: + query: Original search query + + Returns: + List of query variations (original + expanded) + """ + queries = [query] + + # Tokenize and expand abbreviations + words = query.lower().split() + expanded_words = [] + has_expansion = False + + for word in words: + # Remove punctuation for matching + clean_word = word.strip(".,;:!?()[]{}\"'") + + if clean_word in self.abbreviations: + expanded_words.append(self.abbreviations[clean_word]) + has_expansion = True + else: + expanded_words.append(word) + + if has_expansion: + expanded_query = " ".join(expanded_words) + if expanded_query != query.lower(): + queries.append(expanded_query) + + return queries + + def normalize(self, query: str) -> str: + """ + Normalize query for consistent matching. + + - Lowercase + - Remove extra whitespace + - Remove special characters (except alphanumeric and space) + """ + import re + normalized = query.lower() + normalized = re.sub(r"[^\w\s]", " ", normalized) + normalized = re.sub(r"\s+", " ", normalized).strip() + return normalized +``` + +### 2.6 Deduplication Service + +**File**: `src/paperbot/application/services/paper_deduplicator.py` + +```python +class PaperDeduplicator: + """ + Multi-strategy paper deduplication. + + Priority order: + 1. DOI (most reliable) + 2. arXiv ID + 3. Semantic Scholar ID + 4. OpenAlex ID + 5. Normalized title hash (fallback) + """ + + def __init__(self): + self._doi_index: Dict[str, int] = {} + self._arxiv_index: Dict[str, int] = {} + self._s2_index: Dict[str, int] = {} + self._openalex_index: Dict[str, int] = {} + self._title_hash_index: Dict[str, int] = {} + + def deduplicate( + self, + papers: List[HarvestedPaper], + ) -> Tuple[List[HarvestedPaper], int]: + """ + Deduplicate papers in-memory. 
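+
+        Duplicates are merged into the first occurrence: missing identifiers
+        are filled in, the longer abstract and higher citation count are
+        kept, and keyword/field lists are unioned.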
+ + Args: + papers: List of papers from all sources + + Returns: + Tuple of (deduplicated papers, count of duplicates removed) + """ + unique_papers: List[HarvestedPaper] = [] + duplicates_count = 0 + + for paper in papers: + existing_idx = self._find_duplicate(paper) + + if existing_idx is not None: + # Merge metadata into existing paper + self._merge_paper(unique_papers[existing_idx], paper) + duplicates_count += 1 + else: + # Add new paper + idx = len(unique_papers) + self._index_paper(paper, idx) + unique_papers.append(paper) + + return unique_papers, duplicates_count + + def _find_duplicate(self, paper: HarvestedPaper) -> Optional[int]: + """Find existing paper index if duplicate exists.""" + # 1. DOI match + if paper.doi: + doi_lower = paper.doi.lower() + if doi_lower in self._doi_index: + return self._doi_index[doi_lower] + + # 2. arXiv ID match + if paper.arxiv_id: + arxiv_lower = paper.arxiv_id.lower() + if arxiv_lower in self._arxiv_index: + return self._arxiv_index[arxiv_lower] + + # 3. Semantic Scholar ID match + if paper.semantic_scholar_id: + s2_lower = paper.semantic_scholar_id.lower() + if s2_lower in self._s2_index: + return self._s2_index[s2_lower] + + # 4. OpenAlex ID match + if paper.openalex_id: + openalex_lower = paper.openalex_id.lower() + if openalex_lower in self._openalex_index: + return self._openalex_index[openalex_lower] + + # 5. Title hash match (fallback) + title_hash = self._compute_title_hash(paper.title) + if title_hash in self._title_hash_index: + return self._title_hash_index[title_hash] + + return None + + def _index_paper(self, paper: HarvestedPaper, idx: int) -> None: + """Add paper to all relevant indexes.""" + if paper.doi: + self._doi_index[paper.doi.lower()] = idx + if paper.arxiv_id: + self._arxiv_index[paper.arxiv_id.lower()] = idx + if paper.semantic_scholar_id: + self._s2_index[paper.semantic_scholar_id.lower()] = idx + if paper.openalex_id: + self._openalex_index[paper.openalex_id.lower()] = idx + + title_hash = self._compute_title_hash(paper.title) + self._title_hash_index[title_hash] = idx + + def _merge_paper(self, existing: HarvestedPaper, new: HarvestedPaper) -> None: + """Merge metadata from new paper into existing.""" + # Fill in missing identifiers + if not existing.doi and new.doi: + existing.doi = new.doi + if not existing.arxiv_id and new.arxiv_id: + existing.arxiv_id = new.arxiv_id + if not existing.semantic_scholar_id and new.semantic_scholar_id: + existing.semantic_scholar_id = new.semantic_scholar_id + if not existing.openalex_id and new.openalex_id: + existing.openalex_id = new.openalex_id + + # Prefer longer abstract + if len(new.abstract) > len(existing.abstract): + existing.abstract = new.abstract + + # Prefer higher citation count + if new.citation_count > existing.citation_count: + existing.citation_count = new.citation_count + + # Merge keywords and fields + existing.keywords = list(set(existing.keywords + new.keywords)) + existing.fields_of_study = list(set(existing.fields_of_study + new.fields_of_study)) + + @staticmethod + def _compute_title_hash(title: str) -> str: + """Compute normalized title hash for deduplication.""" + import hashlib + import re + + # Normalize: lowercase, remove punctuation, collapse whitespace + normalized = title.lower() + normalized = re.sub(r"[^\w\s]", "", normalized) + normalized = re.sub(r"\s+", " ", normalized).strip() + + return hashlib.sha256(normalized.encode()).hexdigest() +``` + +### 2.7 PaperStore Repository + +**File**: `src/paperbot/infrastructure/stores/paper_store.py` + 
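+Intended call pattern (a sketch only): it assumes a `session_provider` from the
+existing infrastructure bootstrap and a list of harvested papers produced by the
+pipeline in Section 2.8; both methods are coroutines, so callers await them.
+
+```python
+async def example_usage(session_provider, unique_papers):
+    """Illustrative only; arguments are supplied by the surrounding app code."""
+    store = PaperStore(session_provider)
+    new_count, updated_count = await store.upsert_papers_batch(unique_papers)
+    papers, total = await store.search_papers(
+        query="ransomware", year_from=2020, min_citations=10, limit=20
+    )
+    return papers, total
+```
+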
+```python +class PaperStore: + """ + Paper storage repository. + + Handles: + - Batch upsert with DB-level deduplication + - Filter-based search with pagination + - Source tracking + """ + + def __init__(self, session_provider: SessionProvider): + self.session_provider = session_provider + + async def upsert_papers_batch( + self, + papers: List[HarvestedPaper], + ) -> Tuple[int, int]: + """ + Upsert papers with deduplication. + + Returns: + Tuple of (new_count, updated_count) + """ + new_count = 0 + updated_count = 0 + + with self.session_provider() as session: + for paper in papers: + existing = self._find_existing(session, paper) + + if existing: + self._update_paper(existing, paper) + updated_count += 1 + else: + model = self._create_model(paper) + session.add(model) + new_count += 1 + + session.commit() + + return new_count, updated_count + + async def search_papers( + self, + *, + query: Optional[str] = None, + keywords: Optional[List[str]] = None, + venues: Optional[List[str]] = None, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + min_citations: Optional[int] = None, + sources: Optional[List[str]] = None, + sort_by: str = "citation_count", + sort_order: str = "desc", + limit: int = 50, + offset: int = 0, + ) -> Tuple[List[PaperModel], int]: + """ + Search papers with filters and pagination. + + Returns: + Tuple of (papers, total_count) + """ + with self.session_provider() as session: + stmt = select(PaperModel).where(PaperModel.deleted_at.is_(None)) + + # Full-text search (LIKE for v1) + if query: + pattern = f"%{query}%" + stmt = stmt.where( + or_( + PaperModel.title.ilike(pattern), + PaperModel.abstract.ilike(pattern), + ) + ) + + # Filters + if year_from: + stmt = stmt.where(PaperModel.year >= year_from) + if year_to: + stmt = stmt.where(PaperModel.year <= year_to) + if min_citations: + stmt = stmt.where(PaperModel.citation_count >= min_citations) + if venues: + stmt = stmt.where(PaperModel.venue.in_(venues)) + if sources: + stmt = stmt.where(PaperModel.primary_source.in_(sources)) + + # Count total + count_stmt = select(func.count()).select_from(stmt.subquery()) + total_count = session.execute(count_stmt).scalar() or 0 + + # Sort + sort_col = getattr(PaperModel, sort_by, PaperModel.citation_count) + if sort_order == "desc": + stmt = stmt.order_by(sort_col.desc()) + else: + stmt = stmt.order_by(sort_col.asc()) + + # Pagination + stmt = stmt.offset(offset).limit(limit) + + papers = session.execute(stmt).scalars().all() + + return list(papers), total_count +``` + +### 2.8 Harvest Pipeline Orchestrator + +**File**: `src/paperbot/application/workflows/harvest_pipeline.py` + +```python +class HarvestPipeline: + """ + Orchestrates the paper harvest pipeline. + + Stages: + 1. Query expansion (QueryRewriter) + 2. Venue recommendation (VenueRecommender) + 3. Parallel harvesting (all harvesters) + 4. Deduplication (PaperDeduplicator) + 5. 
Storage (PaperStore) + """ + + def __init__( + self, + harvesters: List[HarvesterPort], + paper_store: PaperStore, + query_rewriter: QueryRewriter, + venue_recommender: VenueRecommender, + deduplicator: PaperDeduplicator, + ): + self.harvesters = harvesters + self.paper_store = paper_store + self.query_rewriter = query_rewriter + self.venue_recommender = venue_recommender + self.deduplicator = deduplicator + + async def run( + self, + keywords: List[str], + *, + venues: Optional[List[str]] = None, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + max_results_per_source: int = 50, + sources: Optional[List[str]] = None, + progress_callback: Optional[Callable[[str, str], None]] = None, + ) -> HarvestRunResult: + """ + Run the full harvest pipeline. + + Args: + keywords: Search keywords + venues: Venue filter (optional, will recommend if not provided) + year_from: Publication year lower bound + year_to: Publication year upper bound + max_results_per_source: Max papers per source + sources: Which sources to use (default: all) + progress_callback: Optional callback for progress updates + + Returns: + HarvestRunResult with all papers and statistics + """ + run_id = f"harvest-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{uuid4().hex[:6]}" + started_at = datetime.now(timezone.utc) + + def emit(phase: str, message: str): + if progress_callback: + progress_callback(phase, message) + + # Stage 1: Query expansion + emit("Expanding", "Expanding keywords...") + expanded_queries = [] + for kw in keywords: + expanded_queries.extend(self.query_rewriter.rewrite(kw)) + combined_query = " ".join(expanded_queries) + + # Stage 2: Venue recommendation + if not venues: + emit("Recommending", "Recommending venues...") + venues = self.venue_recommender.recommend(keywords) + + # Stage 3: Parallel harvesting + emit("Harvesting", "Fetching from sources...") + + selected_harvesters = self.harvesters + if sources: + source_set = {HarvestSource(s) for s in sources} + selected_harvesters = [h for h in self.harvesters if h.source in source_set] + + # Run all harvesters in parallel + tasks = [ + h.search( + combined_query, + max_results=max_results_per_source, + year_from=year_from, + year_to=year_to, + venues=venues, + ) + for h in selected_harvesters + ] + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Collect results + source_results: Dict[HarvestSource, HarvestResult] = {} + all_papers: List[HarvestedPaper] = [] + + for harvester, result in zip(selected_harvesters, results): + if isinstance(result, Exception): + source_results[harvester.source] = HarvestResult( + source=harvester.source, + papers=[], + total_found=0, + error=str(result), + ) + else: + source_results[harvester.source] = result + all_papers.extend(result.papers) + emit("Harvesting", f"Found {result.total_found} from {harvester.source.value}") + + papers_found = len(all_papers) + + # Stage 4: Deduplication + emit("Deduplicating", "Removing duplicates...") + unique_papers, duplicates_count = self.deduplicator.deduplicate(all_papers) + + # Stage 5: Storage + emit("Storing", "Saving to database...") + new_count, updated_count = await self.paper_store.upsert_papers_batch(unique_papers) + + # Determine final status + has_errors = any(r.error for r in source_results.values()) + has_results = any(r.papers for r in source_results.values()) + + if has_errors and not has_results: + status = "failed" + elif has_errors: + status = "partial" + else: + status = "success" + + return HarvestRunResult( + run_id=run_id, + 
status=status, + papers_found=papers_found, + papers_new=new_count, + papers_deduplicated=duplicates_count, + source_results=source_results, + started_at=started_at, + ended_at=datetime.now(timezone.utc), + ) +``` + +### 2.9 API Endpoints + +**File**: `src/paperbot/api/routes/harvest.py` + +```python +router = APIRouter(prefix="/api", tags=["harvest"]) + + +class HarvestRequest(BaseModel): + keywords: List[str] + venues: Optional[List[str]] = None + year_from: Optional[int] = None + year_to: Optional[int] = None + max_results_per_source: int = Field(default=50, ge=1, le=200) + sources: Optional[List[str]] = None + + +class PaperSearchRequest(BaseModel): + query: Optional[str] = None + keywords: Optional[List[str]] = None + venues: Optional[List[str]] = None + year_from: Optional[int] = None + year_to: Optional[int] = None + min_citations: Optional[int] = None + sources: Optional[List[str]] = None + sort_by: str = Field(default="citation_count") + sort_order: str = Field(default="desc") + limit: int = Field(default=50, ge=1, le=500) + offset: int = Field(default=0, ge=0) + + +@router.post("/harvest") +async def harvest_papers(request: HarvestRequest): + """ + Start paper harvesting pipeline. + + Returns SSE stream with progress updates and final result. + """ + async def generate(): + pipeline = get_harvest_pipeline() # From DI container + + async def on_progress(phase: str, message: str): + yield sse_event("progress", {"phase": phase, "message": message}) + + result = await pipeline.run( + keywords=request.keywords, + venues=request.venues, + year_from=request.year_from, + year_to=request.year_to, + max_results_per_source=request.max_results_per_source, + sources=request.sources, + progress_callback=on_progress, + ) + + yield sse_event("result", { + "run_id": result.run_id, + "status": result.status, + "papers_found": result.papers_found, + "papers_new": result.papers_new, + "papers_deduplicated": result.papers_deduplicated, + "sources": { + source.value: { + "papers": len(r.papers), + "error": r.error, + } + for source, r in result.source_results.items() + }, + }) + yield sse_event("done", {}) + + return StreamingResponse( + generate(), + media_type="text/event-stream", + ) + + +@router.post("/papers/search") +async def search_papers(request: PaperSearchRequest): + """ + Search harvested papers with filters. + """ + store = get_paper_store() # From DI container + + papers, total = await store.search_papers( + query=request.query, + venues=request.venues, + year_from=request.year_from, + year_to=request.year_to, + min_citations=request.min_citations, + sources=request.sources, + sort_by=request.sort_by, + sort_order=request.sort_order, + limit=request.limit, + offset=request.offset, + ) + + return { + "papers": [paper_to_dict(p) for p in papers], + "total": total, + "limit": request.limit, + "offset": request.offset, + } +``` + +### 2.10 Papers Library Integration + +The **Papers Library** (web UI at `/papers`) displays the user's personal paper collection. When a user clicks "Save" on a paper from search results or recommendations, that paper should appear in their Papers Library. 
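+
+A minimal sketch of the query this flow implies, using SQLAlchemy and the
+`PaperModel` from Section 2.2.3. `PaperFeedbackModel` is an assumed ORM mapping
+of the existing `paper_feedback` table (its real name and import path live in
+the research store and may differ); the join mirrors the SQL shown in the
+diagram below.
+
+```python
+from sqlalchemy import Integer, cast, select
+
+from paperbot.infrastructure.stores.models import PaperModel
+
+# PaperFeedbackModel is an assumed ORM mapping of the existing paper_feedback
+# table (user_id, paper_id stored as text, action, ts); import path omitted.
+
+
+def library_query(user_id: str):
+    """Statement returning a user's saved papers, newest save first."""
+    return (
+        select(PaperModel, PaperFeedbackModel.ts.label("saved_at"))
+        .join(
+            PaperFeedbackModel,
+            PaperModel.id == cast(PaperFeedbackModel.paper_id, Integer),
+        )
+        .where(
+            PaperFeedbackModel.user_id == user_id,
+            PaperFeedbackModel.action == "save",
+            PaperModel.deleted_at.is_(None),
+        )
+        .order_by(PaperFeedbackModel.ts.desc())
+    )
+```
+
+Sections 2.10.3 and 2.10.4 wrap this statement in `GET /api/papers/library`
+with pagination applied.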
+ +#### 2.10.1 Data Flow: Save → Papers Library + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Save Action → Papers Library Flow │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌──────────────────────────────────────────────────────────────────────────┐ │ +│ │ Research Page / Recommendations │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ Paper: "Attention Is All You Need" │ │ │ +│ │ │ [Like] [Save] [Dislike] │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────┬──────────────────────────────────────┘ │ +│ │ User clicks "Save" │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────────────────────┐ │ +│ │ POST /api/research/feedback │ │ +│ │ { │ │ +│ │ "user_id": "user123", │ │ +│ │ "track_id": 1, │ │ +│ │ "paper_id": 42, ← papers.id from paper_store │ │ +│ │ "action": "save" │ │ +│ │ } │ │ +│ └───────────────────────────────────┬──────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────────────────────┐ │ +│ │ paper_feedback table (existing in research_store) │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ id: 1 │ │ │ +│ │ │ user_id: "user123" │ │ │ +│ │ │ track_id: 1 │ │ │ +│ │ │ paper_id: "42" ← Reference to papers.id │ │ │ +│ │ │ action: "save" │ │ │ +│ │ │ ts: 2026-02-06T10:30:00Z │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────┬──────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────────────────────┐ │ +│ │ GET /api/papers/library │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ SELECT p.*, pf.action, pf.ts AS saved_at │ │ │ +│ │ │ FROM papers p │ │ │ +│ │ │ JOIN paper_feedback pf ON p.id = CAST(pf.paper_id AS INTEGER) │ │ │ +│ │ │ WHERE pf.user_id = ? AND pf.action = 'save' │ │ │ +│ │ │ ORDER BY pf.ts DESC │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────┬──────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────────────────────┐ │ +│ │ Papers Library Page (/papers) │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ 📄 Attention Is All You Need [Transformer] [NLP] │ │ │ +│ │ │ NeurIPS 2017 · Vaswani et al. 
· 100k+ citations │ │ │ +│ │ │ Saved: Feb 6, 2026 [Analyze] [Remove]│ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └──────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +#### 2.10.2 Key Design Decisions + +| Decision | Rationale | +|----------|-----------| +| **Use `paper_feedback.paper_id` to reference `papers.id`** | Links user actions to the local paper pool | +| **Papers Library = papers WHERE action='save'** | Simple query, no new table needed | +| **Store `papers.id` (integer) not external IDs** | Consistent internal reference, supports papers from any source | +| **Keep track_id in feedback** | Papers can be saved in context of a research track | + +#### 2.10.3 New API Endpoint: GET /api/papers/library + +**File**: `src/paperbot/api/routes/harvest.py` (addition) + +```python +class PaperLibraryRequest(BaseModel): + user_id: str + track_id: Optional[int] = None # Filter by track, or all if None + include_actions: List[str] = Field(default=["save"]) # "save", "like", "cite" + sort_by: str = Field(default="saved_at") # saved_at, title, citation_count + sort_order: str = Field(default="desc") + limit: int = Field(default=50, ge=1, le=500) + offset: int = Field(default=0, ge=0) + + +@router.get("/papers/library") +async def get_user_library( + user_id: str, + track_id: Optional[int] = None, + sort_by: str = "saved_at", + limit: int = 50, + offset: int = 0, +): + """ + Get user's saved papers (Papers Library). + + Joins paper_feedback (action='save') with papers table to return + full paper metadata for the user's personal collection. + """ + store = get_paper_store() + + papers, total = await store.get_user_library( + user_id=user_id, + track_id=track_id, + actions=["save"], + sort_by=sort_by, + limit=limit, + offset=offset, + ) + + return { + "papers": [ + { + **paper_to_dict(p.paper), + "saved_at": p.saved_at.isoformat() if p.saved_at else None, + "track_id": p.track_id, + "action": p.action, + } + for p in papers + ], + "total": total, + "limit": limit, + "offset": offset, + } + + +@router.delete("/papers/library/{paper_id}") +async def remove_from_library(paper_id: int, user_id: str): + """ + Remove a paper from user's library (soft-delete the 'save' feedback). + """ + store = get_paper_store() + success = await store.remove_from_library(user_id=user_id, paper_id=paper_id) + return {"success": success} +``` + +#### 2.10.4 PaperStore Addition: get_user_library() + +**File**: `src/paperbot/infrastructure/stores/paper_store.py` (addition) + +```python +@dataclass +class LibraryPaper: + """Paper with library metadata.""" + paper: PaperModel + saved_at: datetime + track_id: Optional[int] + action: str + + +class PaperStore: + # ... existing methods ... + + async def get_user_library( + self, + user_id: str, + *, + track_id: Optional[int] = None, + actions: List[str] = ["save"], + sort_by: str = "saved_at", + limit: int = 50, + offset: int = 0, + ) -> Tuple[List[LibraryPaper], int]: + """ + Get papers in user's library (saved papers). + + Joins papers table with paper_feedback where action in actions. 
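+
+        Illustrative usage (user id and limit are example values, not part
+        of the design)::
+
+            papers, total = await store.get_user_library(
+                user_id="user123", actions=["save"], limit=20
+            )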
+ """ + with self.session_provider() as session: + # Build query joining papers with paper_feedback + stmt = ( + select(PaperModel, PaperFeedbackModel) + .join( + PaperFeedbackModel, + PaperModel.id == cast(PaperFeedbackModel.paper_id, Integer) + ) + .where( + PaperFeedbackModel.user_id == user_id, + PaperFeedbackModel.action.in_(actions), + PaperModel.deleted_at.is_(None), + ) + ) + + if track_id is not None: + stmt = stmt.where(PaperFeedbackModel.track_id == track_id) + + # Count total + count_stmt = select(func.count()).select_from(stmt.subquery()) + total = session.execute(count_stmt).scalar() or 0 + + # Sort + if sort_by == "saved_at": + stmt = stmt.order_by(PaperFeedbackModel.ts.desc()) + elif sort_by == "title": + stmt = stmt.order_by(PaperModel.title.asc()) + elif sort_by == "citation_count": + stmt = stmt.order_by(PaperModel.citation_count.desc()) + else: + stmt = stmt.order_by(PaperFeedbackModel.ts.desc()) + + # Pagination + stmt = stmt.offset(offset).limit(limit) + + results = session.execute(stmt).all() + + return [ + LibraryPaper( + paper=row[0], + saved_at=row[1].ts, + track_id=row[1].track_id, + action=row[1].action, + ) + for row in results + ], total + + async def remove_from_library( + self, + user_id: str, + paper_id: int, + ) -> bool: + """Remove paper from user's library by deleting 'save' feedback.""" + with self.session_provider() as session: + stmt = ( + PaperFeedbackModel.__table__.delete() + .where( + PaperFeedbackModel.user_id == user_id, + PaperFeedbackModel.paper_id == str(paper_id), + PaperFeedbackModel.action == "save", + ) + ) + result = session.execute(stmt) + session.commit() + return result.rowcount > 0 +``` + +#### 2.10.5 Frontend Update: Connect Papers Library to API + +**File**: `web/src/lib/api.ts` (update) + +```typescript +// Replace mock fetchPapers with real API call +export async function fetchPapers(userId: string): Promise { + const res = await fetch(`${API_BASE}/api/papers/library?user_id=${userId}`); + if (!res.ok) { + throw new Error('Failed to fetch papers library'); + } + const data = await res.json(); + return data.papers.map((p: any) => ({ + id: p.id.toString(), + title: p.title, + venue: p.venue || 'Unknown', + authors: p.authors?.join(', ') || 'Unknown', + citations: p.citation_count?.toString() || '0', + status: p.status || 'Saved', // Could track analysis status separately + tags: p.keywords || p.fields_of_study || [], + savedAt: p.saved_at, + })); +} +``` + +#### 2.10.6 Relationship Diagram + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Papers Library Data Relationships │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────┐ ┌─────────────────────┐ │ +│ │ papers (v1 NEW) │ │ paper_feedback │ │ +│ │ │ │ (existing) │ │ +│ ├─────────────────────┤ ├─────────────────────┤ │ +│ │ id (PK) │◄────────│ paper_id (FK) │ │ +│ │ doi │ │ user_id │ │ +│ │ arxiv_id │ │ track_id (FK) │──────┐ │ +│ │ title │ │ action │ │ │ +│ │ abstract │ │ ts │ │ │ +│ │ authors_json │ │ weight │ │ │ +│ │ year │ └─────────────────────┘ │ │ +│ │ venue │ │ │ +│ │ citation_count │ ┌─────────────────────┐ │ │ +│ │ ... 
│ │ research_tracks │ │ │ +│ └─────────────────────┘ │ (existing) │◄─────┘ │ +│ ├─────────────────────┤ │ +│ │ id (PK) │ │ +│ │ user_id │ │ +│ │ name │ │ +│ │ keywords_json │ │ +│ │ venues_json │ │ +│ └─────────────────────┘ │ +│ │ +│ Query: Papers Library for User │ +│ ─────────────────────────────── │ +│ SELECT p.*, pf.ts AS saved_at, pf.track_id │ +│ FROM papers p │ +│ JOIN paper_feedback pf ON p.id = CAST(pf.paper_id AS INTEGER) │ +│ WHERE pf.user_id = :user_id AND pf.action = 'save' │ +│ ORDER BY pf.ts DESC │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 3. Implementation Principles + +### 3.1 Core Design Principles + +| Principle | Description | Implementation | +|-----------|-------------|----------------| +| **Open Sources First** | Prioritize free, no-auth APIs | arXiv, S2, OpenAlex (no IEEE/ACM) | +| **Metadata Only** | No PDF download or parsing | Store URLs only, defer PDF to v2 | +| **Graceful Degradation** | Partial results if some sources fail | Continue pipeline, report errors | +| **Idempotent Upserts** | Same paper → same record | Multi-strategy deduplication | +| **Audit Trail** | Track all harvest runs | harvest_runs table with timing/counts | + +### 3.2 Deduplication Strategy (Priority Order) + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Paper Arrives from Source │ +└─────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 1. DOI Match? (most reliable) │ +│ doi.lower() in doi_index → DUPLICATE │ +└─────────────────────────────────────────────────────────────┘ + │ No + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 2. arXiv ID Match? │ +│ arxiv_id.lower() in arxiv_index → DUPLICATE │ +└─────────────────────────────────────────────────────────────┘ + │ No + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 3. Semantic Scholar ID Match? │ +│ s2_id.lower() in s2_index → DUPLICATE │ +└─────────────────────────────────────────────────────────────┘ + │ No + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 4. OpenAlex ID Match? │ +│ openalex_id.lower() in openalex_index → DUPLICATE │ +└─────────────────────────────────────────────────────────────┘ + │ No + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 5. Title Hash Match? (fallback) │ +│ sha256(normalize(title)) in title_hash_index → DUPLICATE│ +└─────────────────────────────────────────────────────────────┘ + │ No + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ NEW PAPER → Insert │ +└─────────────────────────────────────────────────────────────┘ +``` + +### 3.3 Component Responsibilities + +| Component | Responsibility | Should NOT Do | +|-----------|----------------|---------------| +| **Harvester** | Fetch papers from source, normalize to HarvestedPaper | Deduplicate, store, apply business rules | +| **QueryRewriter** | Expand/transform keywords | Fetch papers, access database | +| **VenueRecommender** | Map keywords to venues | Fetch papers, access database | +| **Deduplicator** | Find duplicates in memory, merge metadata | Access database, make API calls | +| **PaperStore** | Persist papers, DB-level dedup, search | Fetch from external APIs | +| **HarvestPipeline** | Orchestrate all stages | Implement stage logic | + +--- + +## 4. 
Technology Selection Rationale + +### 4.1 Third Source: OpenAlex + +| Criterion | OpenAlex | CrossRef | PubMed | +|-----------|----------|----------|--------| +| Coverage | 240M+ works | 140M+ | 35M+ (biomedical only) | +| API Cost | Free | Free | Free | +| Rate Limit | 10 req/s | 50 req/s | 3 req/s | +| Auth Required | No | No (polite pool) | No | +| DOI Support | Yes | Yes | Limited | +| CS Coverage | Excellent | Good | Poor | + +**Decision**: OpenAlex (best coverage, generous rate limit, no auth) + +### 4.2 Storage: SQLite + +| Criterion | SQLite | PostgreSQL | +|-----------|--------|------------| +| Consistency with stack | Same DB | New infra | +| Deployment simplicity | Single file | Server required | +| Full-text search | FTS5 (v2) | pg_trgm | +| Scale limit | ~10M rows | Unlimited | + +**Decision**: SQLite (consistent with existing stack, sufficient for v1) + +### 4.3 Search: LIKE Queries (v1) + +| Criterion | LIKE | FTS5 | Elasticsearch | +|-----------|------|------|---------------| +| Setup complexity | None | Index creation | New infra | +| Query speed | Slow | Fast | Fastest | +| Relevance ranking | None | BM25 | Full control | + +**Decision**: LIKE queries for v1 (simple, sufficient for TopN), defer FTS5 to v2 + +--- + +## 5. Best Practices and References + +### 5.1 API Documentation + +| Source | API Docs | Key Endpoints | +|--------|----------|---------------| +| **arXiv** | https://arxiv.org/help/api | `export.arxiv.org/api/query` | +| **Semantic Scholar** | https://api.semanticscholar.org/api-docs/ | `/graph/v1/paper/search` | +| **OpenAlex** | https://docs.openalex.org/ | `/works?search=...` | + +### 5.2 Open Source References + +| Project | Relevance | +|---------|-----------| +| **semanticscholar** (PyPI) | Python client for S2 API | +| **arxiv-sanity-lite** | Query handling patterns | +| **paperetl** | Metadata extraction + dedup patterns | + +### 5.3 Internal Documents + +| Document | Content | +|----------|---------| +| `config/top_venues.yaml` | Venue tier rankings | +| `src/paperbot/infrastructure/connectors/arxiv_connector.py` | Existing arXiv XML parsing | +| `src/paperbot/infrastructure/api_clients/semantic_scholar.py` | Existing S2 client | + +--- + +## 6. Risks and Mitigations + +### 6.1 Technical Risks + +| Risk | Likelihood | Impact | Mitigation | +|------|------------|--------|------------| +| API rate limiting | High | Medium | Respect rate limits, exponential backoff | +| Source API changes | Low | High | Version harvesters, monitor for changes | +| Dedup misses duplicates | Medium | Low | Multiple strategies, title hash fallback | +| Large result sets slow DB | Medium | Medium | Pagination, indexes, defer FTS to v2 | + +### 6.2 Operational Risks + +| Risk | Likelihood | Impact | Mitigation | +|------|------------|--------|------------| +| OpenAlex API unreliable | Low | Medium | Continue with other sources | +| Stale venue mappings | Medium | Low | Config-driven, easy to update | +| Disk space from paper storage | Low | Low | Metadata only, no PDFs | + +--- + +## 7. 
Workload Estimation + +### 7.1 Task Breakdown + +| Task | Effort | Dependencies | +|------|--------|--------------| +| **Infrastructure** | | | +| Domain models (`domain/harvest.py`) | 2h | None | +| Database migration (papers, harvest_runs) | 2h | Models | +| PaperStore implementation | 4h | Migration | +| **Harvesters** | | | +| HarvesterPort interface | 1h | Models | +| ArxivHarvester | 3h | Interface | +| SemanticScholarHarvester | 2h | Interface | +| OpenAlexHarvester | 4h | Interface | +| **Services** | | | +| VenueRecommender | 2h | Config | +| QueryRewriter | 2h | None | +| PaperDeduplicator | 3h | Models | +| **Pipeline & API** | | | +| HarvestPipeline orchestrator | 4h | All above | +| API routes (harvest, search) | 3h | Pipeline, Store | +| **Papers Library Integration** | | | +| PaperStore.get_user_library() | 2h | PaperStore | +| API route (/api/papers/library) | 1h | PaperStore | +| Frontend update (web/src/lib/api.ts) | 1h | API | +| **Testing** | | | +| Unit tests (dedup, rewriter, recommender) | 3h | Services | +| Integration tests (harvesters, store) | 3h | Harvesters | +| E2E test (full pipeline) | 2h | API | + +### 7.2 Summary + +| Category | Hours | +|----------|-------| +| Infrastructure | 8h | +| Harvesters | 10h | +| Services | 7h | +| Pipeline & API | 7h | +| Papers Library Integration | 4h | +| Testing | 8h | +| **Total** | **44h (~6-7 days)** | + +### 7.3 Suggested Timeline + +``` +Day 1: Infrastructure + - Domain models + - Database migration + - PaperStore (partial) + +Day 2: Infrastructure + Harvesters + - PaperStore completion + - HarvesterPort interface + - ArxivHarvester + +Day 3: Harvesters + - SemanticScholarHarvester + - OpenAlexHarvester + - Unit tests for harvesters + +Day 4: Services + - VenueRecommender + - QueryRewriter + - PaperDeduplicator + - Unit tests + +Day 5: Pipeline & API + - HarvestPipeline orchestrator + - API routes + - Integration tests + +Day 6: Testing & Polish + - E2E tests + - Error handling improvements + - Documentation + +Day 7: Buffer / Review + - Code review + - Bug fixes + - Update docs +``` + +--- + +## 8. 
Deliverables Checklist + +### 8.1 Domain Models +- [ ] `src/paperbot/domain/harvest.py` - HarvestedPaper, HarvestSource, HarvestResult + +### 8.2 Database +- [ ] `alembic/versions/0003_paper_harvest_tables.py` - Migration +- [ ] `src/paperbot/infrastructure/stores/models.py` - PaperModel, HarvestRunModel + +### 8.3 Harvesters +- [ ] `src/paperbot/application/ports/harvester_port.py` - HarvesterPort interface +- [ ] `src/paperbot/infrastructure/harvesters/__init__.py` +- [ ] `src/paperbot/infrastructure/harvesters/arxiv_harvester.py` +- [ ] `src/paperbot/infrastructure/harvesters/semantic_scholar_harvester.py` +- [ ] `src/paperbot/infrastructure/harvesters/openalex_harvester.py` + +### 8.4 Services +- [ ] `src/paperbot/application/services/venue_recommender.py` +- [ ] `src/paperbot/application/services/query_rewriter.py` +- [ ] `src/paperbot/application/services/paper_deduplicator.py` + +### 8.5 Pipeline & Storage +- [ ] `src/paperbot/application/workflows/harvest_pipeline.py` +- [ ] `src/paperbot/infrastructure/stores/paper_store.py` + +### 8.6 API +- [ ] `src/paperbot/api/routes/harvest.py` - POST /api/harvest, POST /api/papers/search, GET /api/papers/library +- [ ] `src/paperbot/api/main.py` - Register router + +### 8.7 Papers Library Integration +- [ ] `src/paperbot/infrastructure/stores/paper_store.py` - Add `get_user_library()`, `remove_from_library()` methods +- [ ] `web/src/lib/api.ts` - Update `fetchPapers()` to call real API +- [ ] `web/src/app/papers/page.tsx` - Connect to `/api/papers/library` endpoint + +### 8.8 Tests +- [ ] `tests/unit/test_paper_deduplicator.py` +- [ ] `tests/unit/test_query_rewriter.py` +- [ ] `tests/unit/test_venue_recommender.py` +- [ ] `tests/integration/test_paper_store.py` +- [ ] `tests/integration/test_harvesters.py` +- [ ] `tests/e2e/test_harvest_api.py` +- [ ] `tests/e2e/test_papers_library.py` - Papers Library integration test + +### 8.9 Documentation +- [ ] `docs/paper_harvest_v1.md` - User guide + +--- + +## 9. Open Questions + +The following questions require user input before implementation: + +1. **Venue configuration format**: Should VenueRecommender use existing `config/top_venues.yaml` or a separate config file with keyword→venue mappings? + +2. **Rate limiting strategy**: Should we implement global rate limiting across all harvesters, or per-harvester limits? + +3. **Search scope**: Should `/api/papers/search` search only harvested papers, or also query external APIs in real-time? + +4. **Frontend integration**: Should harvest progress be shown on a new page, or integrated into the existing Research page? + +5. **Retention policy**: Should old harvest_runs records be automatically cleaned up after N days? + +--- + +## Appendix A: Existing Implementation Summary + +### A.1 Existing Connectors + +| Connector | Status | Reusable? 
|
+|-----------|--------|-----------|
+| ArxivConnector | XML parsing only | Use for response parsing |
+| SemanticScholarClient | Async API wrapper | Wrap with harvester |
+| RedditConnector | RSS parsing | Not relevant |
+
+### A.2 Existing Infrastructure
+
+| Component | Status |
+|-----------|--------|
+| SessionProvider | Ready |
+| SQLAlchemy Base | Ready |
+| Alembic migrations | Ready |
+| FastAPI streaming | Ready |
+| EventLogPort | Ready |
+
+### A.3 API Patterns to Follow
+
+| Pattern | Example File |
+|---------|--------------|
+| SSE streaming | `src/paperbot/api/routes/track.py` |
+| Pydantic models | `src/paperbot/api/routes/research.py` |
+| Store initialization | `src/paperbot/infrastructure/stores/research_store.py` |
diff --git a/src/paperbot/api/main.py b/src/paperbot/api/main.py
index 8a41f07..45ea821 100644
--- a/src/paperbot/api/main.py
+++ b/src/paperbot/api/main.py
@@ -20,7 +20,8 @@
     memory,
     research,
     paperscool,
     newsletter,
+    harvest,
 )
 from paperbot.infrastructure.event_log.logging_event_log import LoggingEventLog
 from paperbot.infrastructure.event_log.composite_event_log import CompositeEventLog
@@ -64,7 +65,8 @@ async def health_check():
 app.include_router(memory.router, prefix="/api", tags=["Memory"])
 app.include_router(research.router, prefix="/api", tags=["Research"])
 app.include_router(paperscool.router, prefix="/api", tags=["PapersCool"])
 app.include_router(newsletter.router, prefix="/api", tags=["Newsletter"])
+app.include_router(harvest.router, prefix="/api", tags=["Harvest"])
 
 
 @app.on_event("startup")
diff --git a/src/paperbot/api/routes/harvest.py b/src/paperbot/api/routes/harvest.py
new file mode 100644
index 0000000..10ad62f
--- /dev/null
+++ b/src/paperbot/api/routes/harvest.py
@@ -0,0 +1,429 @@
+# src/paperbot/api/routes/harvest.py
+"""
+Paper Harvest API Routes.
+ +Provides endpoints for: +- Paper harvesting from multiple sources +- Paper search and retrieval +- User's paper library management +- Harvest run history +""" + +from __future__ import annotations + +from typing import Any, Dict, List, Optional + +from fastapi import APIRouter, HTTPException, Query, Request +from fastapi.responses import StreamingResponse +from pydantic import BaseModel, Field + +from paperbot.api.streaming import StreamEvent, wrap_generator +from paperbot.application.workflows.harvest_pipeline import ( + HarvestConfig, + HarvestFinalResult, + HarvestPipeline, + HarvestProgress, +) +from paperbot.utils.logging_config import Logger, LogFiles, set_trace_id, clear_trace_id +from paperbot.infrastructure.stores.paper_store import PaperStore, paper_to_dict + +router = APIRouter() + +# Lazy-initialized stores +_paper_store: Optional[PaperStore] = None + + +def _get_paper_store() -> PaperStore: + """Lazy initialization of paper store.""" + global _paper_store + if _paper_store is None: + _paper_store = PaperStore() + return _paper_store + + +# ============================================================================ +# Harvest Endpoints +# ============================================================================ + + +class HarvestRequest(BaseModel): + """Request body for harvest endpoint.""" + + keywords: List[str] = Field(..., min_items=1, description="Search keywords") + venues: Optional[List[str]] = Field(None, description="Filter to specific venues") + year_from: Optional[int] = Field(None, ge=1900, le=2100, description="Start year") + year_to: Optional[int] = Field(None, ge=1900, le=2100, description="End year") + max_results_per_source: int = Field( + 50, ge=1, le=200, description="Max papers per source" + ) + sources: Optional[List[str]] = Field( + None, description="Sources to harvest (arxiv, semantic_scholar, openalex)" + ) + expand_keywords: bool = Field(True, description="Expand abbreviations") + recommend_venues: bool = Field(True, description="Auto-recommend venues if not specified") + + +async def harvest_stream(request: HarvestRequest): + """Stream harvest progress via SSE.""" + config = HarvestConfig( + keywords=request.keywords, + venues=request.venues, + year_from=request.year_from, + year_to=request.year_to, + sources=request.sources, + max_results_per_source=request.max_results_per_source, + expand_keywords=request.expand_keywords, + recommend_venues=request.recommend_venues, + ) + + pipeline = HarvestPipeline() + try: + async for item in pipeline.run(config): + if isinstance(item, HarvestProgress): + yield StreamEvent( + type="progress", + data={ + "phase": item.phase, + "message": item.message, + "details": item.details, + }, + ) + elif isinstance(item, HarvestFinalResult): + yield StreamEvent( + type="result", + data={ + "run_id": item.run_id, + "status": item.status, + "papers_found": item.papers_found, + "papers_new": item.papers_new, + "papers_deduplicated": item.papers_deduplicated, + "sources": item.source_results, + "errors": item.errors, + "duration_seconds": item.duration_seconds, + }, + ) + except Exception as e: + yield StreamEvent(type="error", message=str(e)) + finally: + await pipeline.close() + + +@router.post("/harvest") +async def harvest_papers(request: HarvestRequest): + """ + Harvest papers from multiple sources. + + Returns Server-Sent Events with progress updates. 
+ """ + trace_id = set_trace_id() + Logger.info(f"Starting harvest request: keywords={request.keywords}", file=LogFiles.HARVEST) + return StreamingResponse( + wrap_generator(harvest_stream(request)), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + }, + ) + + +class HarvestRunResponse(BaseModel): + """Response for harvest run details.""" + + run_id: str + keywords: List[str] + venues: List[str] + sources: List[str] + max_results_per_source: int + status: str + papers_found: int + papers_new: int + papers_deduplicated: int + errors: Dict[str, Any] + started_at: Optional[str] + ended_at: Optional[str] + + +class HarvestRunListResponse(BaseModel): + """Response for list of harvest runs.""" + + runs: List[HarvestRunResponse] + + +@router.get("/harvest/runs", response_model=HarvestRunListResponse) +def list_harvest_runs( + status: Optional[str] = Query(None, description="Filter by status"), + limit: int = Query(50, ge=1, le=500), + offset: int = Query(0, ge=0), +): + """List harvest runs with optional filtering.""" + store = _get_paper_store() + runs = store.list_harvest_runs(status=status, limit=limit, offset=offset) + + return HarvestRunListResponse( + runs=[ + HarvestRunResponse( + run_id=run.run_id, + keywords=run.get_keywords(), + venues=run.get_venues(), + sources=run.get_sources(), + max_results_per_source=run.max_results_per_source or 50, + status=run.status or "unknown", + papers_found=run.papers_found or 0, + papers_new=run.papers_new or 0, + papers_deduplicated=run.papers_deduplicated or 0, + errors=run.get_errors(), + started_at=run.started_at.isoformat() if run.started_at else None, + ended_at=run.ended_at.isoformat() if run.ended_at else None, + ) + for run in runs + ] + ) + + +@router.get("/harvest/runs/{run_id}", response_model=HarvestRunResponse) +def get_harvest_run(run_id: str): + """Get details of a specific harvest run.""" + store = _get_paper_store() + run = store.get_harvest_run(run_id) + + if not run: + raise HTTPException(status_code=404, detail="Harvest run not found") + + return HarvestRunResponse( + run_id=run.run_id, + keywords=run.get_keywords(), + venues=run.get_venues(), + sources=run.get_sources(), + max_results_per_source=run.max_results_per_source or 50, + status=run.status or "unknown", + papers_found=run.papers_found or 0, + papers_new=run.papers_new or 0, + papers_deduplicated=run.papers_deduplicated or 0, + errors=run.get_errors(), + started_at=run.started_at.isoformat() if run.started_at else None, + ended_at=run.ended_at.isoformat() if run.ended_at else None, + ) + + +# ============================================================================ +# Paper Search Endpoints +# ============================================================================ + + +class PaperSearchRequest(BaseModel): + """Request body for paper search.""" + + query: Optional[str] = Field(None, description="Full-text search query") + keywords: Optional[List[str]] = Field(None, description="Keyword filters") + venues: Optional[List[str]] = Field(None, description="Venue filters") + year_from: Optional[int] = Field(None, ge=1900, le=2100) + year_to: Optional[int] = Field(None, ge=1900, le=2100) + min_citations: Optional[int] = Field(None, ge=0) + sources: Optional[List[str]] = Field(None, description="Source filters") + sort_by: str = Field("citation_count", description="Sort field") + sort_order: str = Field("desc", description="Sort order (asc/desc)") + limit: int = Field(50, ge=1, le=500) + offset: int = Field(0, ge=0) + 
+ +class PaperResponse(BaseModel): + """Single paper response.""" + + id: int + doi: Optional[str] + arxiv_id: Optional[str] + semantic_scholar_id: Optional[str] + openalex_id: Optional[str] + title: str + abstract: str + authors: List[str] + year: Optional[int] + venue: Optional[str] + publication_date: Optional[str] + citation_count: int + url: Optional[str] + pdf_url: Optional[str] + keywords: List[str] + fields_of_study: List[str] + primary_source: str + sources: List[str] + created_at: Optional[str] + updated_at: Optional[str] + + +class PaperSearchResponse(BaseModel): + """Response for paper search.""" + + papers: List[Dict[str, Any]] + total: int + limit: int + offset: int + + +@router.post("/papers/search", response_model=PaperSearchResponse) +def search_papers(request: PaperSearchRequest): + """Search papers with filters and pagination.""" + set_trace_id() # Initialize trace_id for this request + Logger.info(f"Searching papers: query={request.query}", file=LogFiles.HARVEST) + store = _get_paper_store() + + papers, total = store.search_papers( + query=request.query, + keywords=request.keywords, + venues=request.venues, + year_from=request.year_from, + year_to=request.year_to, + min_citations=request.min_citations, + sources=request.sources, + sort_by=request.sort_by, + sort_order=request.sort_order, + limit=request.limit, + offset=request.offset, + ) + + return PaperSearchResponse( + papers=[paper_to_dict(p) for p in papers], + total=total, + limit=request.limit, + offset=request.offset, + ) + + +@router.get("/papers/stats") +def get_paper_stats(): + """Get paper collection statistics.""" + store = _get_paper_store() + return {"total_papers": store.get_paper_count()} + + +# ============================================================================ +# User Library Endpoints +# ============================================================================ + + +class LibraryPaperResponse(BaseModel): + """Paper in user's library.""" + + paper: Dict[str, Any] + saved_at: str + track_id: Optional[int] + action: str + + +class LibraryResponse(BaseModel): + """Response for user library.""" + + papers: List[LibraryPaperResponse] + total: int + limit: int + offset: int + + +@router.get("/papers/library", response_model=LibraryResponse) +def get_user_library( + user_id: str = Query("default", description="User ID"), + track_id: Optional[int] = Query(None, description="Filter by track"), + actions: Optional[str] = Query(None, description="Filter by actions (comma-separated)"), + sort_by: str = Query("saved_at", description="Sort field"), + sort_order: str = Query("desc", description="Sort order"), + limit: int = Query(50, ge=1, le=500), + offset: int = Query(0, ge=0), +): + """Get user's paper library (saved papers).""" + set_trace_id() # Initialize trace_id for this request + Logger.info("Received request to get user library", file=LogFiles.HARVEST) + store = _get_paper_store() + + action_list = None + if actions: + action_list = [a.strip() for a in actions.split(",") if a.strip()] + + Logger.info("Fetching papers from library store", file=LogFiles.HARVEST) + library_papers, total = store.get_user_library( + user_id=user_id, + track_id=track_id, + actions=action_list, + sort_by=sort_by, + sort_order=sort_order, + limit=limit, + offset=offset, + ) + + Logger.info(f"Retrieved {len(library_papers)} papers from library, total={total}", file=LogFiles.HARVEST) + return LibraryResponse( + papers=[ + LibraryPaperResponse( + paper=paper_to_dict(lp.paper), + saved_at=lp.saved_at.isoformat() if 
lp.saved_at else "", + track_id=lp.track_id, + action=lp.action, + ) + for lp in library_papers + ], + total=total, + limit=limit, + offset=offset, + ) + + +# NOTE: Parameterized routes must come AFTER specific routes like /papers/stats and /papers/library +@router.get("/papers/{paper_id}") +def get_paper(paper_id: int): + """Get a paper by ID.""" + store = _get_paper_store() + paper = store.get_paper_by_id(paper_id) + + if not paper: + raise HTTPException(status_code=404, detail="Paper not found") + + return {"paper": paper_to_dict(paper)} + + +class SavePaperRequest(BaseModel): + """Request to save paper to library.""" + + user_id: str = Field("default", description="User ID") + track_id: Optional[int] = Field(None, description="Associated track ID") + + +@router.post("/papers/{paper_id}/save") +def save_paper_to_library(paper_id: int, request: SavePaperRequest): + """ + Save a paper to user's library. + + Uses paper_feedback table with action='save'. + """ + from paperbot.infrastructure.stores.research_store import SqlAlchemyResearchStore + + # Verify paper exists + store = _get_paper_store() + paper = store.get_paper_by_id(paper_id) + if not paper: + raise HTTPException(status_code=404, detail="Paper not found") + + # Use research store to record feedback + research_store = SqlAlchemyResearchStore() + feedback = research_store.record_paper_feedback( + user_id=request.user_id, + paper_id=str(paper_id), + action="save", + track_id=request.track_id, + ) + + return {"success": True, "feedback": feedback} + + +@router.delete("/papers/{paper_id}/save") +def remove_paper_from_library( + paper_id: int, + user_id: str = Query("default", description="User ID"), +): + """Remove a paper from user's library.""" + store = _get_paper_store() + removed = store.remove_from_library(user_id, paper_id) + + if not removed: + raise HTTPException(status_code=404, detail="Paper not in library") + + return {"success": True} diff --git a/src/paperbot/api/routes/research.py b/src/paperbot/api/routes/research.py index abb50f4..2f01503 100644 --- a/src/paperbot/api/routes/research.py +++ b/src/paperbot/api/routes/research.py @@ -11,6 +11,7 @@ from paperbot.context_engine import ContextEngine, ContextEngineConfig from paperbot.context_engine.track_router import TrackRouter +from paperbot.utils.logging_config import Logger, LogFiles, set_trace_id from paperbot.infrastructure.stores.memory_store import SqlAlchemyMemoryStore from paperbot.infrastructure.api_clients.semantic_scholar import SemanticScholarClient from paperbot.infrastructure.stores.research_store import SqlAlchemyResearchStore @@ -624,18 +625,32 @@ class PaperFeedbackRequest(BaseModel): metadata: Dict[str, Any] = {} context_run_id: Optional[int] = None context_rank: Optional[int] = None + # Paper metadata (optional, used when saving to library) + paper_title: Optional[str] = None + paper_abstract: Optional[str] = None + paper_authors: Optional[List[str]] = None + paper_year: Optional[int] = None + paper_venue: Optional[str] = None + paper_citation_count: Optional[int] = None + paper_url: Optional[str] = None class PaperFeedbackResponse(BaseModel): feedback: Dict[str, Any] + library_paper_id: Optional[int] = None # ID in papers table if saved @router.post("/research/papers/feedback", response_model=PaperFeedbackResponse) def add_paper_feedback(req: PaperFeedbackRequest): + set_trace_id() # Initialize trace_id for this request + Logger.info(f"Received paper feedback request, action={req.action}", file=LogFiles.HARVEST) + track_id = req.track_id if track_id 
is None: + Logger.info("No track specified, getting active track", file=LogFiles.HARVEST) active = _research_store.get_active_track(user_id=req.user_id) if not active: + Logger.error("No active track found", file=LogFiles.HARVEST) raise HTTPException(status_code=400, detail="track_id missing and no active track") track_id = int(active["id"]) @@ -645,17 +660,61 @@ def add_paper_feedback(req: PaperFeedbackRequest): if req.context_rank is not None: meta["context_rank"] = int(req.context_rank) + library_paper_id: Optional[int] = None + actual_paper_id = req.paper_id + + # If action is "save" and we have paper metadata, insert into papers table + if req.action == "save" and req.paper_title: + Logger.info("Save action detected, inserting paper into papers table", file=LogFiles.HARVEST) + try: + from paperbot.domain.harvest import HarvestedPaper, HarvestSource + from paperbot.infrastructure.stores.paper_store import PaperStore + + paper_store = PaperStore() + paper = HarvestedPaper( + title=req.paper_title, + source=HarvestSource.SEMANTIC_SCHOLAR, + abstract=req.paper_abstract or "", + authors=req.paper_authors or [], + semantic_scholar_id=req.paper_id, + year=req.paper_year, + venue=req.paper_venue, + citation_count=req.paper_citation_count or 0, + url=req.paper_url, + ) + Logger.info("Calling paper store to upsert paper", file=LogFiles.HARVEST) + new_count, _ = paper_store.upsert_papers_batch([paper]) + + # Get the paper ID from database + from paperbot.infrastructure.stores.models import PaperModel + from sqlalchemy import select + with paper_store._provider.session() as session: + result = session.execute( + select(PaperModel).where( + PaperModel.semantic_scholar_id == req.paper_id + ) + ).scalar_one_or_none() + if result: + library_paper_id = result.id + actual_paper_id = str(result.id) # Use integer ID for feedback + Logger.info(f"Paper saved to library with id={library_paper_id}", file=LogFiles.HARVEST) + except Exception as e: + Logger.warning(f"Failed to save paper to library: {e}", file=LogFiles.HARVEST) + + Logger.info("Recording paper feedback to research store", file=LogFiles.HARVEST) fb = _research_store.add_paper_feedback( user_id=req.user_id, track_id=track_id, - paper_id=req.paper_id, + paper_id=actual_paper_id, action=req.action, weight=req.weight, metadata=meta, ) if not fb: + Logger.error("Failed to record feedback - track not found", file=LogFiles.HARVEST) raise HTTPException(status_code=404, detail="Track not found") - return PaperFeedbackResponse(feedback=fb) + Logger.info("Paper feedback recorded successfully", file=LogFiles.HARVEST) + return PaperFeedbackResponse(feedback=fb, library_paper_id=library_paper_id) class PaperFeedbackListResponse(BaseModel): @@ -769,13 +828,19 @@ class ContextResponse(BaseModel): @router.post("/research/context", response_model=ContextResponse) async def build_context(req: ContextRequest): + set_trace_id() # Initialize trace_id for this request + Logger.info("Received build context request", file=LogFiles.HARVEST) + if req.activate_track_id is not None: + Logger.info("Activating research track", file=LogFiles.HARVEST) activated = _research_store.activate_track( user_id=req.user_id, track_id=req.activate_track_id ) if not activated: + Logger.error("Research track not found", file=LogFiles.HARVEST) raise HTTPException(status_code=404, detail="Track not found") + Logger.info("Initializing context engine", file=LogFiles.HARVEST) engine = ContextEngine( research_store=_research_store, memory_store=_memory_store, @@ -794,12 +859,15 @@ async def 
build_context(req: ContextRequest): ), ) try: + Logger.info("Building context pack with paper recommendations", file=LogFiles.HARVEST) pack = await engine.build_context_pack( user_id=req.user_id, query=req.query, track_id=req.track_id, include_cross_track=req.include_cross_track, ) + paper_count = len(pack.get("paper_recommendations", [])) + Logger.info(f"Context pack built successfully, found {paper_count} papers", file=LogFiles.HARVEST) return ContextResponse(context_pack=pack) finally: await engine.close() diff --git a/src/paperbot/application/ports/harvester_port.py b/src/paperbot/application/ports/harvester_port.py new file mode 100644 index 0000000..5716c45 --- /dev/null +++ b/src/paperbot/application/ports/harvester_port.py @@ -0,0 +1,50 @@ +# src/paperbot/application/ports/harvester_port.py +""" +Harvester port interface. + +Defines the abstract interface for all paper harvesters. +""" + +from __future__ import annotations + +from typing import List, Optional, Protocol, runtime_checkable + +from paperbot.domain.harvest import HarvestResult, HarvestSource + + +@runtime_checkable +class HarvesterPort(Protocol): + """Abstract interface for all paper harvesters.""" + + @property + def source(self) -> HarvestSource: + """Return the harvest source identifier.""" + ... + + async def search( + self, + query: str, + *, + max_results: int = 100, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + venues: Optional[List[str]] = None, + ) -> HarvestResult: + """ + Search for papers matching the query. + + Args: + query: Search query string + max_results: Maximum number of results to return + year_from: Filter papers published on or after this year + year_to: Filter papers published on or before this year + venues: Filter papers from these venues (if supported by source) + + Returns: + HarvestResult with papers and metadata + """ + ... + + async def close(self) -> None: + """Release resources (HTTP sessions, etc.).""" + ... diff --git a/src/paperbot/application/services/__init__.py b/src/paperbot/application/services/__init__.py index 423829e..a319fd2 100644 --- a/src/paperbot/application/services/__init__.py +++ b/src/paperbot/application/services/__init__.py @@ -1,3 +1,12 @@ from paperbot.application.services.llm_service import LLMService, get_llm_service +from paperbot.application.services.paper_deduplicator import PaperDeduplicator +from paperbot.application.services.query_rewriter import QueryRewriter +from paperbot.application.services.venue_recommender import VenueRecommender -__all__ = ["LLMService", "get_llm_service"] +__all__ = [ + "LLMService", + "get_llm_service", + "PaperDeduplicator", + "QueryRewriter", + "VenueRecommender", +] diff --git a/src/paperbot/application/services/paper_deduplicator.py b/src/paperbot/application/services/paper_deduplicator.py new file mode 100644 index 0000000..954fa64 --- /dev/null +++ b/src/paperbot/application/services/paper_deduplicator.py @@ -0,0 +1,190 @@ +# src/paperbot/application/services/paper_deduplicator.py +""" +Paper deduplication service. + +Multi-strategy deduplication for papers from multiple sources. +""" + +from __future__ import annotations + +import logging +from typing import Dict, List, Optional, Tuple + +from paperbot.domain.harvest import HarvestedPaper + +logger = logging.getLogger(__name__) + + +class PaperDeduplicator: + """ + Multi-strategy paper deduplication. + + Priority order: + 1. DOI (most reliable) + 2. arXiv ID + 3. Semantic Scholar ID + 4. OpenAlex ID + 5. 
Normalized title hash (fallback) + + When duplicates are found, metadata is merged to preserve + the most complete information from all sources. + """ + + def __init__(self): + self._doi_index: Dict[str, int] = {} + self._arxiv_index: Dict[str, int] = {} + self._s2_index: Dict[str, int] = {} + self._openalex_index: Dict[str, int] = {} + self._title_hash_index: Dict[str, int] = {} + + def reset(self) -> None: + """Clear all indexes for a fresh deduplication run.""" + self._doi_index.clear() + self._arxiv_index.clear() + self._s2_index.clear() + self._openalex_index.clear() + self._title_hash_index.clear() + + def deduplicate( + self, + papers: List[HarvestedPaper], + ) -> Tuple[List[HarvestedPaper], int]: + """ + Deduplicate papers in-memory. + + Args: + papers: List of papers from all sources + + Returns: + Tuple of (deduplicated papers, count of duplicates removed) + """ + self.reset() + unique_papers: List[HarvestedPaper] = [] + duplicates_count = 0 + + for paper in papers: + existing_idx = self._find_duplicate(paper) + + if existing_idx is not None: + # Merge metadata into existing paper + self._merge_paper(unique_papers[existing_idx], paper) + duplicates_count += 1 + else: + # Add new paper + idx = len(unique_papers) + self._index_paper(paper, idx) + unique_papers.append(paper) + + logger.info( + f"Deduplication complete: {len(papers)} → {len(unique_papers)} " + f"({duplicates_count} duplicates removed)" + ) + return unique_papers, duplicates_count + + def _find_duplicate(self, paper: HarvestedPaper) -> Optional[int]: + """Find existing paper index if duplicate exists.""" + # 1. DOI match (most reliable) + if paper.doi: + doi_lower = paper.doi.lower().strip() + if doi_lower in self._doi_index: + return self._doi_index[doi_lower] + + # 2. arXiv ID match + if paper.arxiv_id: + arxiv_lower = paper.arxiv_id.lower().strip() + if arxiv_lower in self._arxiv_index: + return self._arxiv_index[arxiv_lower] + + # 3. Semantic Scholar ID match + if paper.semantic_scholar_id: + s2_lower = paper.semantic_scholar_id.lower().strip() + if s2_lower in self._s2_index: + return self._s2_index[s2_lower] + + # 4. OpenAlex ID match + if paper.openalex_id: + openalex_lower = paper.openalex_id.lower().strip() + if openalex_lower in self._openalex_index: + return self._openalex_index[openalex_lower] + + # 5. Title hash match (fallback) + title_hash = paper.compute_title_hash() + if title_hash in self._title_hash_index: + return self._title_hash_index[title_hash] + + return None + + def _index_paper(self, paper: HarvestedPaper, idx: int) -> None: + """Add paper to all relevant indexes.""" + if paper.doi: + self._doi_index[paper.doi.lower().strip()] = idx + if paper.arxiv_id: + self._arxiv_index[paper.arxiv_id.lower().strip()] = idx + if paper.semantic_scholar_id: + self._s2_index[paper.semantic_scholar_id.lower().strip()] = idx + if paper.openalex_id: + self._openalex_index[paper.openalex_id.lower().strip()] = idx + + title_hash = paper.compute_title_hash() + self._title_hash_index[title_hash] = idx + + def _merge_paper(self, existing: HarvestedPaper, new: HarvestedPaper) -> None: + """ + Merge metadata from new paper into existing. 
+ + Strategy: + - Fill in missing identifiers + - Prefer longer/more complete text fields + - Prefer higher citation counts + - Merge lists (keywords, fields of study) + """ + # Fill in missing identifiers + if not existing.doi and new.doi: + existing.doi = new.doi + self._doi_index[new.doi.lower().strip()] = self._find_index(existing) + if not existing.arxiv_id and new.arxiv_id: + existing.arxiv_id = new.arxiv_id + self._arxiv_index[new.arxiv_id.lower().strip()] = self._find_index(existing) + if not existing.semantic_scholar_id and new.semantic_scholar_id: + existing.semantic_scholar_id = new.semantic_scholar_id + self._s2_index[new.semantic_scholar_id.lower().strip()] = self._find_index(existing) + if not existing.openalex_id and new.openalex_id: + existing.openalex_id = new.openalex_id + self._openalex_index[new.openalex_id.lower().strip()] = self._find_index(existing) + + # Prefer longer abstract + if len(new.abstract) > len(existing.abstract): + existing.abstract = new.abstract + + # Prefer higher citation count + if new.citation_count > existing.citation_count: + existing.citation_count = new.citation_count + + # Fill in missing metadata + if not existing.year and new.year: + existing.year = new.year + if not existing.venue and new.venue: + existing.venue = new.venue + if not existing.publication_date and new.publication_date: + existing.publication_date = new.publication_date + if not existing.url and new.url: + existing.url = new.url + if not existing.pdf_url and new.pdf_url: + existing.pdf_url = new.pdf_url + + # Prefer more complete author list + if len(new.authors) > len(existing.authors): + existing.authors = new.authors + + # Merge keywords and fields (deduplicate) + existing.keywords = list(set(existing.keywords + new.keywords)) + existing.fields_of_study = list(set(existing.fields_of_study + new.fields_of_study)) + + def _find_index(self, paper: HarvestedPaper) -> int: + """Find the index of a paper in the title hash index.""" + title_hash = paper.compute_title_hash() + return self._title_hash_index.get(title_hash, -1) + + def is_duplicate(self, paper: HarvestedPaper) -> bool: + """Check if a paper would be considered a duplicate.""" + return self._find_duplicate(paper) is not None diff --git a/src/paperbot/application/services/query_rewriter.py b/src/paperbot/application/services/query_rewriter.py new file mode 100644 index 0000000..c87306f --- /dev/null +++ b/src/paperbot/application/services/query_rewriter.py @@ -0,0 +1,151 @@ +# src/paperbot/application/services/query_rewriter.py +""" +Query rewriting service. + +Expands and rewrites search queries for better coverage. +""" + +from __future__ import annotations + +import logging +import re +from typing import Dict, List, Optional + +logger = logging.getLogger(__name__) + + +class QueryRewriter: + """ + Expand and rewrite queries for better search coverage. 
+ + Handles: + - Abbreviation expansion (LLM → large language model) + - Synonym addition + - Query normalization + """ + + # Abbreviation → full form mappings + DEFAULT_ABBREVIATIONS: Dict[str, str] = { + "llm": "large language model", + "llms": "large language models", + "ml": "machine learning", + "dl": "deep learning", + "nlp": "natural language processing", + "cv": "computer vision", + "rl": "reinforcement learning", + "gan": "generative adversarial network", + "gans": "generative adversarial networks", + "cnn": "convolutional neural network", + "cnns": "convolutional neural networks", + "rnn": "recurrent neural network", + "rnns": "recurrent neural networks", + "lstm": "long short-term memory", + "bert": "bidirectional encoder representations from transformers", + "gpt": "generative pre-trained transformer", + "rag": "retrieval augmented generation", + "vae": "variational autoencoder", + "asr": "automatic speech recognition", + "tts": "text to speech", + "ocr": "optical character recognition", + "sql": "structured query language", + "api": "application programming interface", + "ai": "artificial intelligence", + "nn": "neural network", + "dnn": "deep neural network", + "mlp": "multilayer perceptron", + "svm": "support vector machine", + "knn": "k-nearest neighbors", + "pca": "principal component analysis", + "ssl": "self-supervised learning", + "ner": "named entity recognition", + "qa": "question answering", + "ir": "information retrieval", + "kg": "knowledge graph", + "gcn": "graph convolutional network", + "gnn": "graph neural network", + "vit": "vision transformer", + "clip": "contrastive language-image pre-training", + } + + def __init__(self, abbreviations: Optional[Dict[str, str]] = None): + self.abbreviations = {**self.DEFAULT_ABBREVIATIONS} + if abbreviations: + self.abbreviations.update(abbreviations) + + def rewrite(self, query: str) -> List[str]: + """ + Rewrite query to produce expanded variations. + + Args: + query: Original search query + + Returns: + List of query variations (original + expanded) + """ + queries = [query] + + # Tokenize and expand abbreviations + words = query.lower().split() + expanded_words = [] + has_expansion = False + + for word in words: + # Remove punctuation for matching + clean_word = re.sub(r"[^\w]", "", word) + + if clean_word in self.abbreviations: + expanded_words.append(self.abbreviations[clean_word]) + has_expansion = True + else: + expanded_words.append(word) + + if has_expansion: + expanded_query = " ".join(expanded_words) + if expanded_query != query.lower(): + queries.append(expanded_query) + + logger.debug(f"Query rewrite: '{query}' → {queries}") + return queries + + def expand_all(self, keywords: List[str]) -> List[str]: + """ + Expand all keywords, returning unique expanded terms. + + Args: + keywords: List of search keywords + + Returns: + List of unique expanded keywords + """ + expanded: List[str] = [] + seen: set[str] = set() + + for keyword in keywords: + for variation in self.rewrite(keyword): + normalized = self.normalize(variation) + if normalized and normalized not in seen: + seen.add(normalized) + expanded.append(variation) + + return expanded + + def normalize(self, query: str) -> str: + """ + Normalize query for consistent matching. 
+ + - Lowercase + - Remove extra whitespace + - Remove special characters (except alphanumeric and space) + """ + normalized = query.lower() + normalized = re.sub(r"[^\w\s]", " ", normalized) + normalized = re.sub(r"\s+", " ", normalized).strip() + return normalized + + def add_abbreviation(self, abbrev: str, expansion: str) -> None: + """Add or update an abbreviation mapping.""" + self.abbreviations[abbrev.lower()] = expansion.lower() + + def get_expansion(self, abbrev: str) -> Optional[str]: + """Get the expansion for an abbreviation, if any.""" + return self.abbreviations.get(abbrev.lower()) diff --git a/src/paperbot/application/services/venue_recommender.py b/src/paperbot/application/services/venue_recommender.py new file mode 100644 index 0000000..bc82dab --- /dev/null +++ b/src/paperbot/application/services/venue_recommender.py @@ -0,0 +1,157 @@ +# src/paperbot/application/services/venue_recommender.py +""" +Venue recommendation service. + +Recommends relevant academic venues based on search keywords. +""" + +from __future__ import annotations + +import logging +from typing import Dict, List, Optional + +logger = logging.getLogger(__name__) + + +class VenueRecommender: + """ + Recommend relevant venues based on keywords. + + Uses a static mapping from keywords/domains to top venues. + Configuration can be loaded from config file or use defaults. + """ + + # Default keyword→venue mappings + DEFAULT_MAPPINGS: Dict[str, List[str]] = { + # Security + "security": ["CCS", "S&P", "USENIX Security", "NDSS"], + "ransomware": ["CCS", "S&P", "USENIX Security", "NDSS"], + "malware": ["CCS", "S&P", "USENIX Security", "NDSS"], + "cryptography": ["CRYPTO", "EUROCRYPT", "CCS"], + "privacy": ["S&P", "PETS", "CCS", "USENIX Security"], + "vulnerability": ["CCS", "S&P", "USENIX Security", "NDSS"], + "attack": ["CCS", "S&P", "USENIX Security", "NDSS"], + "adversarial": ["CCS", "S&P", "NeurIPS", "ICML"], + # ML/AI + "machine learning": ["NeurIPS", "ICML", "ICLR"], + "deep learning": ["NeurIPS", "ICML", "ICLR", "CVPR"], + "llm": ["NeurIPS", "ICML", "ACL", "EMNLP"], + "large language model": ["NeurIPS", "ICML", "ACL", "EMNLP"], + "transformer": ["NeurIPS", "ICML", "ACL", "EMNLP"], + "gpt": ["NeurIPS", "ICML", "ACL", "EMNLP"], + "nlp": ["ACL", "EMNLP", "NAACL", "NeurIPS"], + "natural language": ["ACL", "EMNLP", "NAACL"], + "computer vision": ["CVPR", "ICCV", "ECCV", "NeurIPS"], + "image": ["CVPR", "ICCV", "ECCV"], + "neural network": ["NeurIPS", "ICML", "ICLR"], + "reinforcement learning": ["NeurIPS", "ICML", "ICLR"], + "generative": ["NeurIPS", "ICML", "ICLR", "CVPR"], + "diffusion": ["NeurIPS", "ICML", "ICLR", "CVPR"], + # Systems + "database": ["SIGMOD", "VLDB", "ICDE"], + "query": ["SIGMOD", "VLDB", "ICDE"], + "sql": ["SIGMOD", "VLDB", "ICDE"], + "systems": ["OSDI", "SOSP", "EuroSys", "ATC"], + "operating system": ["OSDI", "SOSP", "EuroSys"], + "distributed": ["OSDI", "SOSP", "EuroSys", "NSDI"], + "networking": ["SIGCOMM", "NSDI", "MobiCom"], + "network": ["SIGCOMM", "NSDI", "MobiCom"], + "cloud": ["OSDI", "SOSP", "EuroSys", "SoCC"], + # Software Engineering + "software engineering": ["ICSE", "FSE", "ASE"], + "software": ["ICSE", "FSE", "ASE"], + "testing": ["ICSE", "ISSTA", "FSE"], + "bug": ["ICSE", "FSE", "ASE", "ISSTA"], + "program analysis": ["PLDI", "POPL", "OOPSLA"], + "compiler": ["PLDI", "CGO", "CC"], + "verification": ["CAV", "PLDI", "POPL"], + # HCI + "hci": ["CHI", "UIST", "UbiComp"], + "human computer": ["CHI", "UIST", "UbiComp"], + "interaction": ["CHI", "UIST"], + "user interface": ["CHI", 
"UIST"], + # Data Mining + "data mining": ["KDD", "ICDM", "WWW"], + "knowledge graph": ["KDD", "WWW", "EMNLP"], + "recommendation": ["KDD", "RecSys", "WWW"], + # Robotics + "robotics": ["ICRA", "IROS", "RSS"], + "robot": ["ICRA", "IROS", "RSS"], + "autonomous": ["ICRA", "IROS", "CVPR"], + } + + def __init__( + self, + config_path: Optional[str] = None, + mappings: Optional[Dict[str, List[str]]] = None, + ): + self.mappings = self.DEFAULT_MAPPINGS.copy() + if mappings: + self.mappings.update(mappings) + if config_path: + self._load_config(config_path) + + def _load_config(self, config_path: str) -> None: + """Load venue mappings from YAML config file.""" + try: + import yaml + + with open(config_path, "r", encoding="utf-8") as f: + config = yaml.safe_load(f) + + if config and isinstance(config, dict): + venue_mappings = config.get("venue_mappings", {}) + if isinstance(venue_mappings, dict): + self.mappings.update(venue_mappings) + logger.info(f"Loaded {len(venue_mappings)} venue mappings from {config_path}") + except Exception as e: + logger.warning(f"Failed to load venue config from {config_path}: {e}") + + def recommend( + self, + keywords: List[str], + *, + max_venues: int = 5, + ) -> List[str]: + """ + Recommend venues based on keywords. + + Args: + keywords: List of search keywords + max_venues: Maximum number of venues to recommend + + Returns: + List of recommended venue names, ordered by relevance + """ + venue_scores: Dict[str, int] = {} + + for keyword in keywords: + keyword_lower = keyword.lower().strip() + if not keyword_lower: + continue + + # Exact match (highest priority) + if keyword_lower in self.mappings: + for venue in self.mappings[keyword_lower]: + venue_scores[venue] = venue_scores.get(venue, 0) + 3 + + # Partial match (medium priority) + for mapped_kw, venues in self.mappings.items(): + if keyword_lower in mapped_kw or mapped_kw in keyword_lower: + for venue in venues: + venue_scores[venue] = venue_scores.get(venue, 0) + 1 + + # Sort by score descending + sorted_venues = sorted(venue_scores.items(), key=lambda x: -x[1]) + result = [v[0] for v in sorted_venues[:max_venues]] + + logger.debug(f"Recommended venues for {keywords}: {result}") + return result + + def get_venues_for_domain(self, domain: str) -> List[str]: + """Get venues for a specific domain keyword.""" + return self.mappings.get(domain.lower(), []) + + def add_mapping(self, keyword: str, venues: List[str]) -> None: + """Add or update a keyword→venues mapping.""" + self.mappings[keyword.lower()] = venues diff --git a/src/paperbot/application/workflows/harvest_pipeline.py b/src/paperbot/application/workflows/harvest_pipeline.py new file mode 100644 index 0000000..983b8ad --- /dev/null +++ b/src/paperbot/application/workflows/harvest_pipeline.py @@ -0,0 +1,376 @@ +# src/paperbot/application/workflows/harvest_pipeline.py +""" +Paper Harvest Pipeline. + +Orchestrates multi-source paper harvesting with deduplication and storage. 
+""" + +from __future__ import annotations + +import asyncio +import logging +import uuid +from dataclasses import dataclass, field +from datetime import datetime, timezone +from typing import Any, AsyncGenerator, Dict, List, Optional + +from paperbot.domain.harvest import ( + HarvestedPaper, + HarvestResult, + HarvestRunResult, + HarvestSource, +) +from paperbot.application.services import ( + PaperDeduplicator, + QueryRewriter, + VenueRecommender, +) +from paperbot.application.ports.harvester_port import HarvesterPort +from paperbot.infrastructure.harvesters import ( + ArxivHarvester, + SemanticScholarHarvester, + OpenAlexHarvester, +) +from paperbot.infrastructure.stores.paper_store import PaperStore + +logger = logging.getLogger(__name__) + + +def _utcnow() -> datetime: + return datetime.now(timezone.utc) + + +@dataclass +class HarvestProgress: + """Progress update during harvesting.""" + + phase: str + message: str + details: Optional[Dict[str, Any]] = None + + +@dataclass +class HarvestConfig: + """Configuration for a harvest run.""" + + keywords: List[str] + venues: Optional[List[str]] = None + year_from: Optional[int] = None + year_to: Optional[int] = None + sources: Optional[List[str]] = None + max_results_per_source: int = 50 + expand_keywords: bool = True + recommend_venues: bool = True + + +@dataclass +class HarvestFinalResult: + """Final result of a harvest run.""" + + run_id: str + status: str # success, partial, failed + papers_found: int + papers_new: int + papers_deduplicated: int + source_results: Dict[str, Dict[str, Any]] + errors: Dict[str, str] + duration_seconds: float + + +class HarvestPipeline: + """ + Multi-source paper harvest pipeline. + + Orchestrates: + 1. Query expansion (QueryRewriter) + 2. Venue recommendation (VenueRecommender) + 3. Parallel harvesting from multiple sources + 4. In-memory deduplication (PaperDeduplicator) + 5. 
Batch storage with DB-level dedup (PaperStore) + """ + + def __init__( + self, + db_url: Optional[str] = None, + *, + venue_config_path: Optional[str] = None, + ): + self.db_url = db_url + self._venue_config_path = venue_config_path + + # Services (initialized lazily) + self._query_rewriter: Optional[QueryRewriter] = None + self._venue_recommender: Optional[VenueRecommender] = None + self._deduplicator: Optional[PaperDeduplicator] = None + self._paper_store: Optional[PaperStore] = None + + # Harvesters (initialized per-run) + self._harvesters: Dict[str, HarvesterPort] = {} + + @property + def query_rewriter(self) -> QueryRewriter: + if self._query_rewriter is None: + self._query_rewriter = QueryRewriter() + return self._query_rewriter + + @property + def venue_recommender(self) -> VenueRecommender: + if self._venue_recommender is None: + self._venue_recommender = VenueRecommender( + config_path=self._venue_config_path + ) + return self._venue_recommender + + @property + def deduplicator(self) -> PaperDeduplicator: + if self._deduplicator is None: + self._deduplicator = PaperDeduplicator() + return self._deduplicator + + @property + def paper_store(self) -> PaperStore: + if self._paper_store is None: + self._paper_store = PaperStore(self.db_url) + return self._paper_store + + def _get_harvester(self, source: str) -> Optional[HarvesterPort]: + """Get or create harvester for a source.""" + if source not in self._harvesters: + if source == HarvestSource.ARXIV.value: + self._harvesters[source] = ArxivHarvester() + elif source == HarvestSource.SEMANTIC_SCHOLAR.value: + self._harvesters[source] = SemanticScholarHarvester() + elif source == HarvestSource.OPENALEX.value: + self._harvesters[source] = OpenAlexHarvester() + else: + logger.warning(f"Unknown source: {source}") + return None + return self._harvesters[source] + + @staticmethod + def new_run_id() -> str: + """Generate a new harvest run ID.""" + timestamp = _utcnow().strftime("%Y%m%d-%H%M%S") + suffix = uuid.uuid4().hex[:8] + return f"harvest-{timestamp}-{suffix}" + + async def run( + self, + config: HarvestConfig, + *, + run_id: Optional[str] = None, + ) -> AsyncGenerator[HarvestProgress | HarvestFinalResult, None]: + """ + Execute harvest pipeline with progress updates. 
+ + Yields: + HarvestProgress for intermediate updates + HarvestFinalResult as final yield + """ + run_id = run_id or self.new_run_id() + start_time = _utcnow() + errors: Dict[str, str] = {} + source_results: Dict[str, Dict[str, Any]] = {} + + # Determine sources to use + sources = config.sources or [s.value for s in HarvestSource] + + try: + # Phase 1: Expand keywords + yield HarvestProgress( + phase="Expanding", + message="Expanding keywords...", + ) + + expanded_keywords = config.keywords.copy() + if config.expand_keywords: + expanded_keywords = self.query_rewriter.expand_all(config.keywords) + logger.info(f"Expanded keywords: {config.keywords} → {expanded_keywords}") + + # Phase 2: Recommend venues (if not specified) + venues = config.venues + if config.recommend_venues and not venues: + yield HarvestProgress( + phase="Recommending", + message="Recommending venues...", + ) + venues = self.venue_recommender.recommend( + expanded_keywords, max_venues=5 + ) + logger.info(f"Recommended venues: {venues}") + + # Phase 3: Create harvest run record + yield HarvestProgress( + phase="Initializing", + message="Creating harvest run record...", + ) + + self.paper_store.create_harvest_run( + run_id=run_id, + keywords=expanded_keywords, + venues=venues or [], + sources=sources, + max_results_per_source=config.max_results_per_source, + ) + + # Phase 4: Harvest from each source in parallel + all_papers: List[HarvestedPaper] = [] + + # Build search query from expanded keywords + search_query = " ".join(expanded_keywords) + + # Harvest from each source + for source in sources: + yield HarvestProgress( + phase="Harvesting", + message=f"Fetching from {source}...", + details={"source": source}, + ) + + harvester = self._get_harvester(source) + if harvester is None: + errors[source] = f"Unknown source: {source}" + source_results[source] = {"papers": 0, "error": errors[source]} + continue + + try: + result = await harvester.search( + query=search_query, + max_results=config.max_results_per_source, + year_from=config.year_from, + year_to=config.year_to, + venues=venues, + ) + + all_papers.extend(result.papers) + source_results[source] = { + "papers": result.total_found, + "error": result.error, + } + + if result.error: + errors[source] = result.error + logger.warning(f"Error from {source}: {result.error}") + else: + logger.info(f"Harvested {result.total_found} papers from {source}") + + except Exception as e: + error_msg = str(e) + errors[source] = error_msg + source_results[source] = {"papers": 0, "error": error_msg} + logger.exception(f"Exception harvesting from {source}") + + # Phase 5: Deduplicate + yield HarvestProgress( + phase="Deduplicating", + message=f"Removing duplicates from {len(all_papers)} papers...", + ) + + unique_papers, deduplicated_count = self.deduplicator.deduplicate(all_papers) + logger.info( + f"Deduplication: {len(all_papers)} → {len(unique_papers)} " + f"({deduplicated_count} removed)" + ) + + # Phase 6: Store papers + yield HarvestProgress( + phase="Storing", + message=f"Saving {len(unique_papers)} papers to database...", + ) + + new_count, updated_count = self.paper_store.upsert_papers_batch(unique_papers) + logger.info(f"Stored papers: {new_count} new, {updated_count} updated") + + # Phase 7: Update harvest run record + status = "success" + if errors: + status = "partial" if unique_papers else "failed" + + self.paper_store.update_harvest_run( + run_id=run_id, + status=status, + papers_found=len(all_papers), + papers_new=new_count, + papers_deduplicated=deduplicated_count, + 
errors=errors if errors else None, + ) + + # Calculate duration + end_time = _utcnow() + duration = (end_time - start_time).total_seconds() + + # Yield final result + yield HarvestFinalResult( + run_id=run_id, + status=status, + papers_found=len(all_papers), + papers_new=new_count, + papers_deduplicated=deduplicated_count, + source_results=source_results, + errors=errors, + duration_seconds=duration, + ) + + except Exception as e: + # Handle pipeline-level errors + logger.exception(f"Harvest pipeline failed: {e}") + self.paper_store.update_harvest_run( + run_id=run_id, + status="failed", + errors={"pipeline": str(e)}, + ) + + end_time = _utcnow() + duration = (end_time - start_time).total_seconds() + + yield HarvestFinalResult( + run_id=run_id, + status="failed", + papers_found=0, + papers_new=0, + papers_deduplicated=0, + source_results=source_results, + errors={"pipeline": str(e), **errors}, + duration_seconds=duration, + ) + + async def run_sync( + self, + config: HarvestConfig, + *, + run_id: Optional[str] = None, + ) -> HarvestFinalResult: + """ + Execute harvest pipeline and return only final result. + + Useful for CLI or non-streaming use cases. + """ + result: Optional[HarvestFinalResult] = None + async for item in self.run(config, run_id=run_id): + if isinstance(item, HarvestFinalResult): + result = item + + if result is None: + raise RuntimeError("Pipeline completed without final result") + return result + + async def close(self) -> None: + """Release all resources.""" + # Close harvesters + for harvester in self._harvesters.values(): + try: + await harvester.close() + except Exception: + pass + self._harvesters.clear() + + # Close paper store + if self._paper_store: + self._paper_store.close() + self._paper_store = None + + async def __aenter__(self) -> "HarvestPipeline": + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: + await self.close() diff --git a/src/paperbot/context_engine/engine.py b/src/paperbot/context_engine/engine.py index 9daa3ab..6d3004c 100644 --- a/src/paperbot/context_engine/engine.py +++ b/src/paperbot/context_engine/engine.py @@ -12,6 +12,7 @@ from paperbot.domain.paper import PaperMeta from paperbot.infrastructure.stores.memory_store import SqlAlchemyMemoryStore from paperbot.infrastructure.stores.research_store import SqlAlchemyResearchStore +from paperbot.utils.logging_config import Logger, LogFiles _TOKEN_RX = re.compile(r"[a-zA-Z0-9_+.-]+") @@ -502,6 +503,7 @@ async def build_context_pack( "rebuttal": (0.50, 0.40, 0.10), }.get(stage, (0.55, 0.30, 0.15)) + Logger.info(f"Paper search config: offline={self.config.offline}, paper_limit={self.config.paper_limit}", file=LogFiles.HARVEST) if not self.config.offline and self.config.paper_limit > 0: try: searcher = self.paper_searcher @@ -509,9 +511,12 @@ async def build_context_pack( from paperbot.utils.search import SemanticScholarSearch # local import searcher = SemanticScholarSearch() + Logger.info("Initialized SemanticScholarSearch", file=LogFiles.HARVEST) fetch_limit = max(30, int(self.config.paper_limit) * 3) + Logger.info(f"Searching papers with query='{merged_query}', limit={fetch_limit}", file=LogFiles.HARVEST) resp = await asyncio.to_thread(searcher.search_papers, merged_query, fetch_limit) + Logger.info(f"Search returned {len(getattr(resp, 'papers', []) or [])} papers", file=LogFiles.HARVEST) raw: List[Dict[str, Any]] = [] for p in getattr(resp, "papers", []) or []: @@ -578,7 +583,10 @@ async def build_context_pack( policy=policy, 
seed=f"{user_id}:{merged_query}:{stage}:{routed_track.get('id') if routed_track else ''}", ) - except Exception: + except Exception as e: + import traceback + tb = traceback.format_exc() + Logger.error(f"Error fetching papers: {e}\n{tb}", file=LogFiles.HARVEST) papers = [] routing = { diff --git a/src/paperbot/domain/harvest.py b/src/paperbot/domain/harvest.py new file mode 100644 index 0000000..64230ab --- /dev/null +++ b/src/paperbot/domain/harvest.py @@ -0,0 +1,160 @@ +# src/paperbot/domain/harvest.py +""" +Paper harvesting domain models. + +Contains data structures for paper collection from multiple sources: +- HarvestedPaper: Unified paper format from any source +- HarvestSource: Enum of supported paper sources +- HarvestResult: Result from a single harvester +- HarvestRunResult: Aggregated result from all harvesters +""" + +from __future__ import annotations + +import hashlib +import re +from dataclasses import dataclass, field +from datetime import datetime +from enum import Enum +from typing import Any, Dict, List, Optional + + +class HarvestSource(str, Enum): + """Supported paper data sources.""" + + ARXIV = "arxiv" + SEMANTIC_SCHOLAR = "semantic_scholar" + OPENALEX = "openalex" + + +@dataclass +class HarvestedPaper: + """ + Unified paper format from any harvest source. + + Required fields: title, source + All other fields are optional to handle varying API responses. + """ + + title: str + source: HarvestSource + abstract: str = "" + authors: List[str] = field(default_factory=list) + doi: Optional[str] = None + arxiv_id: Optional[str] = None + semantic_scholar_id: Optional[str] = None + openalex_id: Optional[str] = None + year: Optional[int] = None + venue: Optional[str] = None + publication_date: Optional[str] = None + citation_count: int = 0 + url: Optional[str] = None + pdf_url: Optional[str] = None + keywords: List[str] = field(default_factory=list) + fields_of_study: List[str] = field(default_factory=list) + source_rank: Optional[int] = None + + def compute_title_hash(self) -> str: + """Compute normalized title hash for deduplication.""" + normalized = self.title.lower() + normalized = re.sub(r"[^\w\s]", "", normalized) + normalized = re.sub(r"\s+", " ", normalized).strip() + return hashlib.sha256(normalized.encode()).hexdigest() + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary representation.""" + return { + "title": self.title, + "source": self.source.value, + "abstract": self.abstract, + "authors": self.authors, + "doi": self.doi, + "arxiv_id": self.arxiv_id, + "semantic_scholar_id": self.semantic_scholar_id, + "openalex_id": self.openalex_id, + "year": self.year, + "venue": self.venue, + "publication_date": self.publication_date, + "citation_count": self.citation_count, + "url": self.url, + "pdf_url": self.pdf_url, + "keywords": self.keywords, + "fields_of_study": self.fields_of_study, + "source_rank": self.source_rank, + "title_hash": self.compute_title_hash(), + } + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "HarvestedPaper": + """Create instance from dictionary.""" + source = data.get("source", "") + if isinstance(source, str): + source = HarvestSource(source) + return cls( + title=data.get("title", ""), + source=source, + abstract=data.get("abstract", ""), + authors=data.get("authors", []), + doi=data.get("doi"), + arxiv_id=data.get("arxiv_id"), + semantic_scholar_id=data.get("semantic_scholar_id"), + openalex_id=data.get("openalex_id"), + year=data.get("year"), + venue=data.get("venue"), + 
publication_date=data.get("publication_date"), + citation_count=data.get("citation_count", 0), + url=data.get("url"), + pdf_url=data.get("pdf_url"), + keywords=data.get("keywords", []), + fields_of_study=data.get("fields_of_study", []), + source_rank=data.get("source_rank"), + ) + + +@dataclass +class HarvestResult: + """Result from a single harvester.""" + + source: HarvestSource + papers: List[HarvestedPaper] + total_found: int + error: Optional[str] = None + + @property + def success(self) -> bool: + """Whether the harvest was successful.""" + return self.error is None + + +@dataclass +class HarvestRunResult: + """Aggregated result from all harvesters in a harvest run.""" + + run_id: str + status: str # running/success/partial/failed + papers_found: int + papers_new: int + papers_deduplicated: int + source_results: Dict[HarvestSource, HarvestResult] + started_at: datetime + ended_at: Optional[datetime] = None + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary representation.""" + return { + "run_id": self.run_id, + "status": self.status, + "papers_found": self.papers_found, + "papers_new": self.papers_new, + "papers_deduplicated": self.papers_deduplicated, + "sources": { + source.value: { + "papers": len(result.papers), + "total_found": result.total_found, + "error": result.error, + } + for source, result in self.source_results.items() + }, + "started_at": self.started_at.isoformat() if self.started_at else None, + "ended_at": self.ended_at.isoformat() if self.ended_at else None, + } diff --git a/src/paperbot/infrastructure/harvesters/__init__.py b/src/paperbot/infrastructure/harvesters/__init__.py new file mode 100644 index 0000000..24e9ccc --- /dev/null +++ b/src/paperbot/infrastructure/harvesters/__init__.py @@ -0,0 +1,17 @@ +# src/paperbot/infrastructure/harvesters/__init__.py +""" +Paper harvesters for multiple academic sources. + +Each harvester implements the HarvesterPort interface and normalizes +results to the HarvestedPaper format. +""" + +from .arxiv_harvester import ArxivHarvester +from .semantic_scholar_harvester import SemanticScholarHarvester +from .openalex_harvester import OpenAlexHarvester + +__all__ = [ + "ArxivHarvester", + "SemanticScholarHarvester", + "OpenAlexHarvester", +] diff --git a/src/paperbot/infrastructure/harvesters/arxiv_harvester.py b/src/paperbot/infrastructure/harvesters/arxiv_harvester.py new file mode 100644 index 0000000..6b51d1c --- /dev/null +++ b/src/paperbot/infrastructure/harvesters/arxiv_harvester.py @@ -0,0 +1,168 @@ +# src/paperbot/infrastructure/harvesters/arxiv_harvester.py +""" +arXiv paper harvester. + +Uses the arXiv Atom API for paper search. +API documentation: https://arxiv.org/help/api +""" + +from __future__ import annotations + +import asyncio +import logging +from typing import List, Optional + +import aiohttp + +from paperbot.domain.harvest import HarvestedPaper, HarvestResult, HarvestSource +from paperbot.infrastructure.connectors.arxiv_connector import ArxivConnector, ArxivRecord + +logger = logging.getLogger(__name__) + + +class ArxivHarvester: + """ + arXiv paper harvester using the Atom API. 
+ + API: https://export.arxiv.org/api/query + Rate limit: 1 request per 3 seconds (be conservative) + """ + + ARXIV_API_URL = "https://export.arxiv.org/api/query" + REQUEST_INTERVAL = 3.0 # seconds between requests + + def __init__(self, connector: Optional[ArxivConnector] = None): + self.connector = connector or ArxivConnector() + self._session: Optional[aiohttp.ClientSession] = None + self._last_request_time: float = 0 + + @property + def source(self) -> HarvestSource: + return HarvestSource.ARXIV + + async def _get_session(self) -> aiohttp.ClientSession: + if self._session is None or self._session.closed: + self._session = aiohttp.ClientSession() + return self._session + + async def _rate_limit(self) -> None: + """Enforce rate limiting between requests.""" + import time + + now = time.time() + elapsed = now - self._last_request_time + if elapsed < self.REQUEST_INTERVAL: + await asyncio.sleep(self.REQUEST_INTERVAL - elapsed) + self._last_request_time = time.time() + + def _build_query( + self, + query: str, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + ) -> str: + """Build arXiv search query with optional year filters.""" + # arXiv uses submittedDate for filtering + # Format: submittedDate:[YYYYMMDD TO YYYYMMDD] + search_query = f"all:{query}" + + if year_from or year_to: + start_date = f"{year_from}0101" if year_from else "199101" + end_date = f"{year_to}1231" if year_to else "209912" + search_query += f" AND submittedDate:[{start_date} TO {end_date}]" + + return search_query + + async def search( + self, + query: str, + *, + max_results: int = 100, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + venues: Optional[List[str]] = None, # Not supported by arXiv + ) -> HarvestResult: + """ + Search arXiv using the Atom API. + + Note: arXiv doesn't support venue filtering - all papers are preprints. 
+ """ + search_query = self._build_query(query, year_from, year_to) + + params = { + "search_query": search_query, + "start": 0, + "max_results": min(max_results, 200), # arXiv max is ~200 per request + "sortBy": "relevance", + "sortOrder": "descending", + } + + try: + await self._rate_limit() + session = await self._get_session() + + async with session.get(self.ARXIV_API_URL, params=params) as resp: + if resp.status != 200: + return HarvestResult( + source=self.source, + papers=[], + total_found=0, + error=f"arXiv API returned status {resp.status}", + ) + xml_text = await resp.text() + + records = self.connector.parse_atom(xml_text) + papers = [self._record_to_paper(r, rank=i) for i, r in enumerate(records)] + + logger.info(f"arXiv harvester found {len(papers)} papers for query: {query}") + + return HarvestResult( + source=self.source, + papers=papers, + total_found=len(papers), + ) + except Exception as e: + logger.warning(f"arXiv harvester error: {e}") + return HarvestResult( + source=self.source, + papers=[], + total_found=0, + error=str(e), + ) + + def _record_to_paper(self, record: ArxivRecord, rank: int) -> HarvestedPaper: + """Convert ArxivRecord to HarvestedPaper.""" + # Extract arxiv_id from full URL (e.g., "http://arxiv.org/abs/2301.12345v1") + arxiv_id = record.arxiv_id + if "/" in arxiv_id: + arxiv_id = arxiv_id.split("/")[-1] + # Remove version suffix (e.g., "2301.12345v1" -> "2301.12345") + if "v" in arxiv_id: + arxiv_id = arxiv_id.split("v")[0] + + # Extract year from published date + year = None + if record.published: + try: + year = int(record.published[:4]) + except (ValueError, IndexError): + pass + + return HarvestedPaper( + title=record.title.replace("\n", " ").strip(), + source=HarvestSource.ARXIV, + abstract=record.summary.replace("\n", " ").strip(), + authors=record.authors, + arxiv_id=arxiv_id, + year=year, + publication_date=record.published[:10] if record.published else None, + url=record.abs_url, + pdf_url=record.pdf_url, + source_rank=rank, + ) + + async def close(self) -> None: + """Close the HTTP session.""" + if self._session and not self._session.closed: + await self._session.close() + self._session = None diff --git a/src/paperbot/infrastructure/harvesters/openalex_harvester.py b/src/paperbot/infrastructure/harvesters/openalex_harvester.py new file mode 100644 index 0000000..4153e42 --- /dev/null +++ b/src/paperbot/infrastructure/harvesters/openalex_harvester.py @@ -0,0 +1,212 @@ +# src/paperbot/infrastructure/harvesters/openalex_harvester.py +""" +OpenAlex paper harvester. + +Uses the OpenAlex API for paper search. +API documentation: https://docs.openalex.org/ +""" + +from __future__ import annotations + +import asyncio +import logging +from typing import Any, Dict, List, Optional + +import aiohttp + +from paperbot.domain.harvest import HarvestedPaper, HarvestResult, HarvestSource + +logger = logging.getLogger(__name__) + + +class OpenAlexHarvester: + """ + OpenAlex paper harvester. 
+ + API: https://api.openalex.org/works + Rate limit: 10 req/s (polite pool with email), 100K/day + """ + + OPENALEX_API_URL = "https://api.openalex.org/works" + REQUEST_INTERVAL = 0.1 # 10 req/s + + def __init__(self, email: Optional[str] = None): + self.email = email # For polite pool + self._session: Optional[aiohttp.ClientSession] = None + self._last_request_time: float = 0 + + @property + def source(self) -> HarvestSource: + return HarvestSource.OPENALEX + + async def _get_session(self) -> aiohttp.ClientSession: + if self._session is None or self._session.closed: + self._session = aiohttp.ClientSession() + return self._session + + async def _rate_limit(self) -> None: + """Enforce rate limiting between requests.""" + import time + + now = time.time() + elapsed = now - self._last_request_time + if elapsed < self.REQUEST_INTERVAL: + await asyncio.sleep(self.REQUEST_INTERVAL - elapsed) + self._last_request_time = time.time() + + async def search( + self, + query: str, + *, + max_results: int = 100, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + venues: Optional[List[str]] = None, + ) -> HarvestResult: + """Search OpenAlex API.""" + params: Dict[str, Any] = { + "search": query, + "per_page": min(max_results, 200), # API max is 200 + "sort": "cited_by_count:desc", + } + + # Add email for polite pool + if self.email: + params["mailto"] = self.email + + # Build filter string + filters = [] + if year_from: + filters.append(f"publication_year:>={year_from}") + if year_to: + filters.append(f"publication_year:<={year_to}") + if filters: + params["filter"] = ",".join(filters) + + try: + await self._rate_limit() + session = await self._get_session() + + async with session.get(self.OPENALEX_API_URL, params=params) as resp: + if resp.status != 200: + return HarvestResult( + source=self.source, + papers=[], + total_found=0, + error=f"OpenAlex API returned status {resp.status}", + ) + data = await resp.json() + + results = data.get("results", []) + papers = [self._to_paper(r, rank=i) for i, r in enumerate(results)] + + # Filter by venue if specified + if venues: + venue_set = {v.lower() for v in venues} + papers = [ + p + for p in papers + if p.venue and any(v in p.venue.lower() for v in venue_set) + ] + + total_found = data.get("meta", {}).get("count", len(papers)) + logger.info(f"OpenAlex harvester found {len(papers)} papers for query: {query}") + + return HarvestResult( + source=self.source, + papers=papers, + total_found=total_found, + ) + except Exception as e: + logger.warning(f"OpenAlex harvester error: {e}") + return HarvestResult( + source=self.source, + papers=[], + total_found=0, + error=str(e), + ) + + def _to_paper(self, data: Dict[str, Any], rank: int) -> HarvestedPaper: + """Convert OpenAlex API response to HarvestedPaper.""" + # Extract authors + authors = [] + for authorship in data.get("authorships", []): + author = authorship.get("author", {}) + if author.get("display_name"): + authors.append(author["display_name"]) + + # Extract identifiers + ids = data.get("ids", {}) + doi = ids.get("doi", "") + if doi: + doi = doi.replace("https://doi.org/", "") + + openalex_id = ids.get("openalex", "") + if openalex_id: + openalex_id = openalex_id.replace("https://openalex.org/", "") + + # Extract venue + venue = None + if data.get("primary_location"): + source = data["primary_location"].get("source") or {} + venue = source.get("display_name") + + # Extract PDF URL + pdf_url = None + if data.get("open_access", {}).get("oa_url"): + pdf_url = data["open_access"]["oa_url"] + + 
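# --- Illustrative aside (editor's sketch, not from the diff above) ---------
# OpenAlex returns the abstract as an inverted index ({"word": [positions]});
# the _get_abstract helper further down in this class rebuilds the running
# text from it. A minimal worked example with made-up data:
sample_index = {"Transformers": [0], "for": [1], "computer": [2], "vision.": [3]}
pairs = sorted(
    (pos, word) for word, positions in sample_index.items() for pos in positions
)
assert " ".join(word for _, word in pairs) == "Transformers for computer vision."
# ----------------------------------------------------------------------------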
# Extract keywords from concepts + keywords = [ + c.get("display_name", "") + for c in data.get("keywords", [])[:10] + if c.get("display_name") + ] + + # Extract fields of study from concepts + fields_of_study = [ + c.get("display_name", "") + for c in data.get("concepts", [])[:5] + if c.get("display_name") + ] + + return HarvestedPaper( + title=data.get("title", "") or data.get("display_name", ""), + source=HarvestSource.OPENALEX, + abstract=self._get_abstract(data), + authors=authors, + doi=doi if doi else None, + openalex_id=openalex_id if openalex_id else None, + year=data.get("publication_year"), + venue=venue, + publication_date=data.get("publication_date"), + citation_count=data.get("cited_by_count", 0) or 0, + url=data.get("doi") or ids.get("openalex"), + pdf_url=pdf_url, + keywords=keywords, + fields_of_study=fields_of_study, + source_rank=rank, + ) + + def _get_abstract(self, data: Dict[str, Any]) -> str: + """Reconstruct abstract from inverted index.""" + abstract_index = data.get("abstract_inverted_index") + if not abstract_index: + return "" + + # OpenAlex stores abstract as inverted index: {"word": [positions]} + try: + words: List[tuple[int, str]] = [] + for word, positions in abstract_index.items(): + for pos in positions: + words.append((pos, word)) + words.sort(key=lambda x: x[0]) + return " ".join(w[1] for w in words) + except Exception: + return "" + + async def close(self) -> None: + """Close the HTTP session.""" + if self._session and not self._session.closed: + await self._session.close() + self._session = None diff --git a/src/paperbot/infrastructure/harvesters/semantic_scholar_harvester.py b/src/paperbot/infrastructure/harvesters/semantic_scholar_harvester.py new file mode 100644 index 0000000..c3ddae6 --- /dev/null +++ b/src/paperbot/infrastructure/harvesters/semantic_scholar_harvester.py @@ -0,0 +1,133 @@ +# src/paperbot/infrastructure/harvesters/semantic_scholar_harvester.py +""" +Semantic Scholar paper harvester. + +Uses the Semantic Scholar Academic Graph API for paper search. +API documentation: https://api.semanticscholar.org/api-docs/ +""" + +from __future__ import annotations + +import logging +from typing import Any, Dict, List, Optional + +from paperbot.domain.harvest import HarvestedPaper, HarvestResult, HarvestSource +from paperbot.infrastructure.api_clients.semantic_scholar import SemanticScholarClient + +logger = logging.getLogger(__name__) + + +class SemanticScholarHarvester: + """ + Semantic Scholar paper harvester. 
+ + API: https://api.semanticscholar.org/graph/v1/paper/search + Rate limit: 100 req/min (with API key), 5000/day without key + """ + + FIELDS = [ + "paperId", + "title", + "abstract", + "year", + "venue", + "citationCount", + "authors", + "publicationDate", + "externalIds", + "fieldsOfStudy", + "url", + "openAccessPdf", + ] + + def __init__(self, client: Optional[SemanticScholarClient] = None, api_key: Optional[str] = None): + self.client = client or SemanticScholarClient(api_key=api_key) + + @property + def source(self) -> HarvestSource: + return HarvestSource.SEMANTIC_SCHOLAR + + async def search( + self, + query: str, + *, + max_results: int = 100, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + venues: Optional[List[str]] = None, + ) -> HarvestResult: + """Search Semantic Scholar API.""" + try: + # S2 API supports year filter in query + year_filter = "" + if year_from and year_to: + year_filter = f" year:{year_from}-{year_to}" + elif year_from: + year_filter = f" year:{year_from}-" + elif year_to: + year_filter = f" year:-{year_to}" + + results = await self.client.search_papers( + query=query + year_filter, + limit=min(max_results, 100), # S2 limit per request + fields=self.FIELDS, + ) + + papers = [self._to_paper(r, rank=i) for i, r in enumerate(results)] + + # Filter by venue if specified + if venues: + venue_set = {v.lower() for v in venues} + papers = [ + p + for p in papers + if p.venue and any(v in p.venue.lower() for v in venue_set) + ] + + logger.info(f"Semantic Scholar harvester found {len(papers)} papers for query: {query}") + + return HarvestResult( + source=self.source, + papers=papers, + total_found=len(papers), + ) + except Exception as e: + logger.warning(f"Semantic Scholar harvester error: {e}") + return HarvestResult( + source=self.source, + papers=[], + total_found=0, + error=str(e), + ) + + def _to_paper(self, data: Dict[str, Any], rank: int) -> HarvestedPaper: + """Convert S2 API response to HarvestedPaper.""" + authors = [a.get("name", "") for a in data.get("authors", []) if a.get("name")] + external_ids = data.get("externalIds", {}) or {} + + pdf_url = None + if data.get("openAccessPdf"): + pdf_url = data["openAccessPdf"].get("url") + + return HarvestedPaper( + title=data.get("title", ""), + source=HarvestSource.SEMANTIC_SCHOLAR, + abstract=data.get("abstract") or "", + authors=authors, + doi=external_ids.get("DOI"), + arxiv_id=external_ids.get("ArXiv"), + semantic_scholar_id=data.get("paperId"), + year=data.get("year"), + venue=data.get("venue"), + publication_date=data.get("publicationDate"), + citation_count=data.get("citationCount", 0) or 0, + url=data.get("url"), + pdf_url=pdf_url, + fields_of_study=data.get("fieldsOfStudy") or [], + source_rank=rank, + ) + + async def close(self) -> None: + """Close the HTTP client.""" + # SemanticScholarClient manages its own session + pass diff --git a/src/paperbot/infrastructure/stores/models.py b/src/paperbot/infrastructure/stores/models.py index 726f29b..0cf476d 100644 --- a/src/paperbot/infrastructure/stores/models.py +++ b/src/paperbot/infrastructure/stores/models.py @@ -709,3 +709,130 @@ class PaperImpressionModel(Base): run = relationship("ResearchContextRunModel", back_populates="impressions") track = relationship("ResearchTrackModel") + + +class PaperModel(Base): + """Harvested paper metadata from multiple sources.""" + + __tablename__ = "papers" + + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + + # Canonical identifiers (for deduplication) + doi: 
Mapped[Optional[str]] = mapped_column(String(128), unique=True, nullable=True, index=True) + arxiv_id: Mapped[Optional[str]] = mapped_column(String(32), unique=True, nullable=True, index=True) + semantic_scholar_id: Mapped[Optional[str]] = mapped_column(String(64), unique=True, nullable=True, index=True) + openalex_id: Mapped[Optional[str]] = mapped_column(String(64), unique=True, nullable=True, index=True) + title_hash: Mapped[str] = mapped_column(String(64), index=True) # SHA256 of normalized title + + # Core metadata + title: Mapped[str] = mapped_column(Text, default="") + abstract: Mapped[str] = mapped_column(Text, default="") + authors_json: Mapped[str] = mapped_column(Text, default="[]") + year: Mapped[Optional[int]] = mapped_column(Integer, nullable=True, index=True) + venue: Mapped[Optional[str]] = mapped_column(String(256), nullable=True, index=True) + publication_date: Mapped[Optional[str]] = mapped_column(String(32), nullable=True) + citation_count: Mapped[int] = mapped_column(Integer, default=0, index=True) + + # URLs (no PDF download, just references) + url: Mapped[Optional[str]] = mapped_column(String(512), nullable=True) + pdf_url: Mapped[Optional[str]] = mapped_column(String(512), nullable=True) + + # Classification + keywords_json: Mapped[str] = mapped_column(Text, default="[]") + fields_of_study_json: Mapped[str] = mapped_column(Text, default="[]") + + # Source tracking + primary_source: Mapped[str] = mapped_column(String(32), default="") # First source that found this paper + sources_json: Mapped[str] = mapped_column(Text, default="[]") # All sources that returned this paper + + # Timestamps + created_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True, index=True) + updated_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + deleted_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) # Soft delete + + def get_authors(self) -> list: + try: + return json.loads(self.authors_json or "[]") + except Exception: + return [] + + def get_keywords(self) -> list: + try: + return json.loads(self.keywords_json or "[]") + except Exception: + return [] + + def get_fields_of_study(self) -> list: + try: + return json.loads(self.fields_of_study_json or "[]") + except Exception: + return [] + + def get_sources(self) -> list: + try: + return json.loads(self.sources_json or "[]") + except Exception: + return [] + + def set_keywords(self, keywords: list) -> None: + self.keywords_json = json.dumps(keywords or [], ensure_ascii=False) + + def set_fields_of_study(self, fields: list) -> None: + self.fields_of_study_json = json.dumps(fields or [], ensure_ascii=False) + + def set_sources(self, sources: list) -> None: + self.sources_json = json.dumps(sources or [], ensure_ascii=False) + + +class HarvestRunModel(Base): + """Harvest execution tracking.""" + + __tablename__ = "harvest_runs" + + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + run_id: Mapped[str] = mapped_column(String(64), unique=True, index=True) + + # Input + keywords_json: Mapped[str] = mapped_column(Text, default="[]") + venues_json: Mapped[str] = mapped_column(Text, default="[]") + sources_json: Mapped[str] = mapped_column(Text, default="[]") + max_results_per_source: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + + # Results + status: Mapped[Optional[str]] = mapped_column(String(32), default="running", index=True) # running/success/partial/failed + papers_found: 
Mapped[Optional[int]] = mapped_column(Integer, default=0) + papers_new: Mapped[Optional[int]] = mapped_column(Integer, default=0) + papers_deduplicated: Mapped[Optional[int]] = mapped_column(Integer, default=0) + error_json: Mapped[str] = mapped_column(Text, default="{}") + + # Timestamps + started_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True, index=True) + ended_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + + def get_keywords(self) -> list: + try: + return json.loads(self.keywords_json or "[]") + except Exception: + return [] + + def get_venues(self) -> list: + try: + return json.loads(self.venues_json or "[]") + except Exception: + return [] + + def get_sources(self) -> list: + try: + return json.loads(self.sources_json or "[]") + except Exception: + return [] + + def get_errors(self) -> dict: + try: + return json.loads(self.error_json or "{}") + except Exception: + return {} + + def set_errors(self, errors: dict) -> None: + self.error_json = json.dumps(errors or {}, ensure_ascii=False) diff --git a/src/paperbot/infrastructure/stores/paper_store.py b/src/paperbot/infrastructure/stores/paper_store.py index e4f8c87..6e9c3da 100644 --- a/src/paperbot/infrastructure/stores/paper_store.py +++ b/src/paperbot/infrastructure/stores/paper_store.py @@ -1,3 +1,4 @@ +<<<<<<< HEAD from __future__ import annotations from datetime import datetime, timezone @@ -7,6 +8,32 @@ from paperbot.domain.paper_identity import normalize_arxiv_id, normalize_doi from paperbot.infrastructure.stores.models import Base, PaperJudgeScoreModel, PaperModel +======= +# src/paperbot/infrastructure/stores/paper_store.py +""" +Paper storage repository. + +Handles persistence and retrieval of harvested papers. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from datetime import datetime, timezone +from typing import Any, Dict, List, Optional, Tuple + +from sqlalchemy import Integer, cast, func, or_, select + +from paperbot.utils.logging_config import Logger, LogFiles +from paperbot.domain.harvest import HarvestedPaper, HarvestSource +from paperbot.infrastructure.stores.models import ( + Base, + HarvestRunModel, + PaperFeedbackModel, + PaperModel, +) +>>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) from paperbot.infrastructure.stores.sqlalchemy_db import SessionProvider, get_db_url @@ -14,6 +41,7 @@ def _utcnow() -> datetime: return datetime.now(timezone.utc) +<<<<<<< HEAD def _safe_list(values: Any) -> List[str]: if not isinstance(values, list): return [] @@ -56,6 +84,28 @@ def _as_utc(value: Optional[datetime]) -> Optional[datetime]: class SqlAlchemyPaperStore: """Canonical paper registry with idempotent upsert for daily workflows.""" +======= +@dataclass +class LibraryPaper: + """Paper with library metadata (saved_at, track_id, action).""" + + paper: PaperModel + saved_at: datetime + track_id: Optional[int] + action: str + + +class PaperStore: + """ + Paper storage repository. 
+ + Handles: + - Batch upsert with DB-level deduplication + - Filter-based search with pagination + - Source tracking + - User library (saved papers) + """ +>>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) def __init__(self, db_url: Optional[str] = None, *, auto_create_schema: bool = True): self.db_url = db_url or get_db_url() @@ -63,6 +113,7 @@ def __init__(self, db_url: Optional[str] = None, *, auto_create_schema: bool = T if auto_create_schema: Base.metadata.create_all(self._provider.engine) +<<<<<<< HEAD def upsert_paper( self, *, @@ -315,3 +366,476 @@ def _paper_to_dict(row: PaperModel) -> Dict[str, Any]: "created_at": row.created_at.isoformat() if row.created_at else None, "updated_at": row.updated_at.isoformat() if row.updated_at else None, } +======= + def upsert_papers_batch( + self, + papers: List[HarvestedPaper], + ) -> Tuple[int, int]: + """ + Upsert papers with deduplication. + + Returns: + Tuple of (new_count, updated_count) + """ + Logger.info(f"Starting batch upsert for {len(papers)} papers", file=LogFiles.HARVEST) + new_count = 0 + updated_count = 0 + now = _utcnow() + + with self._provider.session() as session: + for paper in papers: + Logger.info("Checking for existing paper in database", file=LogFiles.HARVEST) + existing = self._find_existing(session, paper) + + if existing: + Logger.info("Found existing paper, updating metadata", file=LogFiles.HARVEST) + self._update_paper(existing, paper, now) + updated_count += 1 + else: + Logger.info("No existing paper found, creating new record", file=LogFiles.HARVEST) + model = self._create_model(paper, now) + session.add(model) + new_count += 1 + + Logger.info("Committing transaction to database", file=LogFiles.HARVEST) + session.commit() + + Logger.info(f"Batch upsert complete: {new_count} new, {updated_count} updated", file=LogFiles.HARVEST) + return new_count, updated_count + + def _find_existing(self, session, paper: HarvestedPaper) -> Optional[PaperModel]: + """Find existing paper by canonical identifiers.""" + # Try each identifier in priority order + if paper.doi: + result = session.execute( + select(PaperModel).where(PaperModel.doi == paper.doi) + ).scalar_one_or_none() + if result: + return result + + if paper.arxiv_id: + result = session.execute( + select(PaperModel).where(PaperModel.arxiv_id == paper.arxiv_id) + ).scalar_one_or_none() + if result: + return result + + if paper.semantic_scholar_id: + result = session.execute( + select(PaperModel).where( + PaperModel.semantic_scholar_id == paper.semantic_scholar_id + ) + ).scalar_one_or_none() + if result: + return result + + if paper.openalex_id: + result = session.execute( + select(PaperModel).where(PaperModel.openalex_id == paper.openalex_id) + ).scalar_one_or_none() + if result: + return result + + # Fallback to title hash + title_hash = paper.compute_title_hash() + result = session.execute( + select(PaperModel).where(PaperModel.title_hash == title_hash) + ).scalar_one_or_none() + return result + + def _create_model(self, paper: HarvestedPaper, now: datetime) -> PaperModel: + """Create a new PaperModel from HarvestedPaper.""" + return PaperModel( + doi=paper.doi, + arxiv_id=paper.arxiv_id, + semantic_scholar_id=paper.semantic_scholar_id, + openalex_id=paper.openalex_id, + title_hash=paper.compute_title_hash(), + title=paper.title, + abstract=paper.abstract, + authors_json=json.dumps(paper.authors, ensure_ascii=False), + year=paper.year, + venue=paper.venue, + publication_date=paper.publication_date, + citation_count=paper.citation_count, + 
url=paper.url, + pdf_url=paper.pdf_url, + keywords_json=json.dumps(paper.keywords, ensure_ascii=False), + fields_of_study_json=json.dumps(paper.fields_of_study, ensure_ascii=False), + primary_source=paper.source.value, + sources_json=json.dumps([paper.source.value], ensure_ascii=False), + created_at=now, + updated_at=now, + ) + + def _update_paper( + self, existing: PaperModel, paper: HarvestedPaper, now: datetime + ) -> None: + """Update existing paper with new data.""" + # Fill in missing identifiers + if not existing.doi and paper.doi: + existing.doi = paper.doi + if not existing.arxiv_id and paper.arxiv_id: + existing.arxiv_id = paper.arxiv_id + if not existing.semantic_scholar_id and paper.semantic_scholar_id: + existing.semantic_scholar_id = paper.semantic_scholar_id + if not existing.openalex_id and paper.openalex_id: + existing.openalex_id = paper.openalex_id + + # Prefer longer abstract + if len(paper.abstract) > len(existing.abstract or ""): + existing.abstract = paper.abstract + + # Prefer higher citation count + if paper.citation_count > (existing.citation_count or 0): + existing.citation_count = paper.citation_count + + # Fill in missing metadata + if not existing.year and paper.year: + existing.year = paper.year + if not existing.venue and paper.venue: + existing.venue = paper.venue + if not existing.publication_date and paper.publication_date: + existing.publication_date = paper.publication_date + if not existing.url and paper.url: + existing.url = paper.url + if not existing.pdf_url and paper.pdf_url: + existing.pdf_url = paper.pdf_url + + # Merge sources + sources = existing.get_sources() + if paper.source.value not in sources: + sources.append(paper.source.value) + existing.set_sources(sources) + + # Merge keywords and fields + keywords = set(existing.get_keywords() + paper.keywords) + existing.set_keywords(list(keywords)) + + fields = set(existing.get_fields_of_study() + paper.fields_of_study) + existing.set_fields_of_study(list(fields)) + + existing.updated_at = now + + def search_papers( + self, + *, + query: Optional[str] = None, + keywords: Optional[List[str]] = None, + venues: Optional[List[str]] = None, + year_from: Optional[int] = None, + year_to: Optional[int] = None, + min_citations: Optional[int] = None, + sources: Optional[List[str]] = None, + sort_by: str = "citation_count", + sort_order: str = "desc", + limit: int = 50, + offset: int = 0, + ) -> Tuple[List[PaperModel], int]: + """ + Search papers with filters and pagination. 
+ + Returns: + Tuple of (papers, total_count) + """ + with self._provider.session() as session: + stmt = select(PaperModel).where(PaperModel.deleted_at.is_(None)) + + # Full-text search (LIKE for v1) + if query: + pattern = f"%{query}%" + stmt = stmt.where( + or_( + PaperModel.title.ilike(pattern), + PaperModel.abstract.ilike(pattern), + ) + ) + + # Year filters + if year_from: + stmt = stmt.where(PaperModel.year >= year_from) + if year_to: + stmt = stmt.where(PaperModel.year <= year_to) + + # Citation filter + if min_citations: + stmt = stmt.where(PaperModel.citation_count >= min_citations) + + # Venue filter + if venues: + venue_conditions = [PaperModel.venue.ilike(f"%{v}%") for v in venues] + stmt = stmt.where(or_(*venue_conditions)) + + # Source filter + if sources: + stmt = stmt.where(PaperModel.primary_source.in_(sources)) + + # Count total before pagination + count_stmt = select(func.count()).select_from(stmt.subquery()) + total_count = session.execute(count_stmt).scalar() or 0 + + # Sort + sort_col = getattr(PaperModel, sort_by, PaperModel.citation_count) + if sort_order.lower() == "desc": + stmt = stmt.order_by(sort_col.desc()) + else: + stmt = stmt.order_by(sort_col.asc()) + + # Pagination + stmt = stmt.offset(offset).limit(limit) + + papers = session.execute(stmt).scalars().all() + + return list(papers), total_count + + def get_paper_by_id(self, paper_id: int) -> Optional[PaperModel]: + """Get a paper by its ID.""" + with self._provider.session() as session: + return session.execute( + select(PaperModel).where( + PaperModel.id == paper_id, + PaperModel.deleted_at.is_(None), + ) + ).scalar_one_or_none() + + def get_user_library( + self, + user_id: str, + *, + track_id: Optional[int] = None, + actions: Optional[List[str]] = None, + sort_by: str = "saved_at", + sort_order: str = "desc", + limit: int = 50, + offset: int = 0, + ) -> Tuple[List[LibraryPaper], int]: + """ + Get papers in user's library (saved papers). + + Joins papers table with paper_feedback where action in actions. + """ + Logger.info("Starting to fetch user library", file=LogFiles.HARVEST) + if actions is None: + actions = ["save"] + + with self._provider.session() as session: + # Join papers with feedback, then deduplicate by paper.id + # paper_feedback.paper_id can be either: + # 1. Integer ID as string (from harvest saves): "123" -> join on papers.id + # 2. 
Semantic Scholar ID (from recommendation saves): "abc123" -> join on papers.semantic_scholar_id + + Logger.info("Executing database query to join papers with feedback", file=LogFiles.HARVEST) + # First, get all matching paper-feedback pairs + base_stmt = ( + select(PaperModel, PaperFeedbackModel) + .join( + PaperFeedbackModel, + or_( + PaperModel.id == cast(PaperFeedbackModel.paper_id, Integer), + PaperModel.semantic_scholar_id == PaperFeedbackModel.paper_id, + ), + ) + .where( + PaperFeedbackModel.user_id == user_id, + PaperFeedbackModel.action.in_(actions), + PaperModel.deleted_at.is_(None), + ) + ) + + if track_id is not None: + base_stmt = base_stmt.where(PaperFeedbackModel.track_id == track_id) + + # Execute and deduplicate in Python by paper.id (keeping latest feedback) + all_results = session.execute(base_stmt).all() + Logger.info(f"Query returned {len(all_results)} results before deduplication", file=LogFiles.HARVEST) + + # Deduplicate by paper.id, keeping the one with latest timestamp + Logger.info("Deduplicating results by paper id", file=LogFiles.HARVEST) + paper_map: Dict[int, Tuple[PaperModel, PaperFeedbackModel]] = {} + for row in all_results: + paper = row[0] + feedback = row[1] + if paper.id not in paper_map or feedback.ts > paper_map[paper.id][1].ts: + paper_map[paper.id] = (paper, feedback) + + # Convert to list and sort + unique_results = list(paper_map.values()) + Logger.info(f"After deduplication: {len(unique_results)} unique papers", file=LogFiles.HARVEST) + + # Sort + min_ts = datetime.min.replace(tzinfo=timezone.utc) + if sort_by == "saved_at": + unique_results.sort(key=lambda x: x[1].ts or min_ts, reverse=(sort_order.lower() == "desc")) + elif sort_by == "title": + unique_results.sort(key=lambda x: x[0].title or "", reverse=(sort_order.lower() == "desc")) + elif sort_by == "citation_count": + unique_results.sort(key=lambda x: x[0].citation_count or 0, reverse=(sort_order.lower() == "desc")) + elif sort_by == "year": + unique_results.sort(key=lambda x: x[0].year or 0, reverse=(sort_order.lower() == "desc")) + else: + unique_results.sort(key=lambda x: x[1].ts or min_ts, reverse=(sort_order.lower() == "desc")) + + # Get total count before pagination + total = len(unique_results) + + # Apply pagination + paginated_results = unique_results[offset:offset + limit] + + return [ + LibraryPaper( + paper=row[0], + saved_at=row[1].ts, + track_id=row[1].track_id, + action=row[1].action, + ) + for row in paginated_results + ], total + + def remove_from_library(self, user_id: str, paper_id: int) -> bool: + """Remove paper from user's library by deleting 'save' feedback.""" + with self._provider.session() as session: + stmt = ( + PaperFeedbackModel.__table__.delete().where( + PaperFeedbackModel.user_id == user_id, + PaperFeedbackModel.paper_id == str(paper_id), + PaperFeedbackModel.action == "save", + ) + ) + result = session.execute(stmt) + session.commit() + return result.rowcount > 0 + + def create_harvest_run( + self, + run_id: str, + keywords: List[str], + venues: List[str], + sources: List[str], + max_results_per_source: int, + ) -> HarvestRunModel: + """Create a new harvest run record.""" + now = _utcnow() + with self._provider.session() as session: + run = HarvestRunModel( + run_id=run_id, + keywords_json=json.dumps(keywords, ensure_ascii=False), + venues_json=json.dumps(venues, ensure_ascii=False), + sources_json=json.dumps(sources, ensure_ascii=False), + max_results_per_source=max_results_per_source, + status="running", + started_at=now, + ) + session.add(run) + 
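# --- Illustrative aside (editor's sketch, not from the diff above) ---------
# How a caller is expected to drive the harvest-run lifecycle exposed by this
# store, using only methods defined in this module; the sqlite URL, run_id,
# and the single sample paper are made-up values for demonstration.
def _demo_harvest_run_lifecycle() -> None:
    store = PaperStore("sqlite:///demo_papers.db")
    store.create_harvest_run(
        run_id="harvest-demo-00000001",
        keywords=["ransomware detection"],
        venues=["USENIX Security"],
        sources=["arxiv"],
        max_results_per_source=50,
    )
    demo_papers = [HarvestedPaper(title="A Demo Paper", source=HarvestSource.ARXIV)]
    new_count, updated_count = store.upsert_papers_batch(demo_papers)
    store.update_harvest_run(
        run_id="harvest-demo-00000001",
        status="success",
        papers_found=len(demo_papers),
        papers_new=new_count,
        papers_deduplicated=0,
    )
    print(f"stored {new_count} new / {updated_count} updated papers")
    store.close()
# ----------------------------------------------------------------------------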
session.commit() + session.refresh(run) + return run + + def update_harvest_run( + self, + run_id: str, + *, + status: Optional[str] = None, + papers_found: Optional[int] = None, + papers_new: Optional[int] = None, + papers_deduplicated: Optional[int] = None, + errors: Optional[Dict[str, Any]] = None, + ) -> Optional[HarvestRunModel]: + """Update a harvest run record.""" + now = _utcnow() + with self._provider.session() as session: + run = session.execute( + select(HarvestRunModel).where(HarvestRunModel.run_id == run_id) + ).scalar_one_or_none() + + if run is None: + return None + + if status is not None: + run.status = status + if status in ("success", "partial", "failed"): + run.ended_at = now + + if papers_found is not None: + run.papers_found = papers_found + if papers_new is not None: + run.papers_new = papers_new + if papers_deduplicated is not None: + run.papers_deduplicated = papers_deduplicated + if errors is not None: + run.set_errors(errors) + + session.commit() + session.refresh(run) + return run + + def get_harvest_run(self, run_id: str) -> Optional[HarvestRunModel]: + """Get a harvest run by its ID.""" + with self._provider.session() as session: + return session.execute( + select(HarvestRunModel).where(HarvestRunModel.run_id == run_id) + ).scalar_one_or_none() + + def list_harvest_runs( + self, + *, + status: Optional[str] = None, + limit: int = 50, + offset: int = 0, + ) -> List[HarvestRunModel]: + """List harvest runs with optional filtering.""" + with self._provider.session() as session: + stmt = select(HarvestRunModel) + + if status: + stmt = stmt.where(HarvestRunModel.status == status) + + stmt = stmt.order_by(HarvestRunModel.started_at.desc()) + stmt = stmt.offset(offset).limit(limit) + + return list(session.execute(stmt).scalars().all()) + + def get_paper_count(self) -> int: + """Get total count of papers in the store.""" + with self._provider.session() as session: + return ( + session.execute( + select(func.count()).select_from(PaperModel).where( + PaperModel.deleted_at.is_(None) + ) + ).scalar() + or 0 + ) + + def close(self) -> None: + """Close database connections.""" + try: + self._provider.engine.dispose() + except Exception: + pass + + +def paper_to_dict(paper: PaperModel) -> Dict[str, Any]: + """Convert PaperModel to dictionary for API response.""" + return { + "id": paper.id, + "doi": paper.doi, + "arxiv_id": paper.arxiv_id, + "semantic_scholar_id": paper.semantic_scholar_id, + "openalex_id": paper.openalex_id, + "title": paper.title, + "abstract": paper.abstract, + "authors": paper.get_authors(), + "year": paper.year, + "venue": paper.venue, + "publication_date": paper.publication_date, + "citation_count": paper.citation_count, + "url": paper.url, + "pdf_url": paper.pdf_url, + "keywords": paper.get_keywords(), + "fields_of_study": paper.get_fields_of_study(), + "primary_source": paper.primary_source, + "sources": paper.get_sources(), + "created_at": paper.created_at.isoformat() if paper.created_at else None, + "updated_at": paper.updated_at.isoformat() if paper.updated_at else None, + } +>>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) diff --git a/src/paperbot/infrastructure/stores/research_store.py b/src/paperbot/infrastructure/stores/research_store.py index 425724a..9549e7f 100644 --- a/src/paperbot/infrastructure/stores/research_store.py +++ b/src/paperbot/infrastructure/stores/research_store.py @@ -8,7 +8,11 @@ from sqlalchemy import desc, func, or_, select from sqlalchemy.exc import IntegrityError +<<<<<<< HEAD from 
paperbot.domain.paper_identity import normalize_arxiv_id, normalize_doi +======= +from paperbot.utils.logging_config import Logger, LogFiles +>>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) from paperbot.infrastructure.stores.models import ( Base, PaperFeedbackModel, @@ -329,6 +333,7 @@ def add_paper_feedback( weight: float = 0.0, metadata: Optional[Dict[str, Any]] = None, ) -> Optional[Dict[str, Any]]: + Logger.info("Recording paper feedback", file=LogFiles.HARVEST) now = _utcnow() metadata = dict(metadata or {}) with self._provider.session() as session: @@ -338,14 +343,19 @@ def add_paper_feedback( ) ).scalar_one_or_none() if track is None: + Logger.error("Track not found", file=LogFiles.HARVEST) return None +<<<<<<< HEAD resolved_paper_ref_id = self._resolve_paper_ref_id( session=session, paper_id=(paper_id or "").strip(), metadata=metadata, ) +======= + Logger.info("Creating new feedback record", file=LogFiles.HARVEST) +>>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) row = PaperFeedbackModel( user_id=user_id, track_id=track_id, @@ -373,6 +383,7 @@ def add_paper_feedback( session.add(track) session.commit() session.refresh(row) + Logger.info("Feedback record created successfully", file=LogFiles.HARVEST) return self._feedback_to_dict(row) def list_paper_feedback( diff --git a/tests/integration/test_harvest_pipeline.py b/tests/integration/test_harvest_pipeline.py new file mode 100644 index 0000000..18f30d5 --- /dev/null +++ b/tests/integration/test_harvest_pipeline.py @@ -0,0 +1,537 @@ +""" +HarvestPipeline integration tests. + +Tests the complete harvest pipeline with mocked harvesters. +""" + +import pytest +from unittest.mock import AsyncMock, MagicMock, patch + +from paperbot.domain.harvest import HarvestedPaper, HarvestResult, HarvestSource +from paperbot.application.workflows.harvest_pipeline import ( + HarvestConfig, + HarvestFinalResult, + HarvestPipeline, + HarvestProgress, +) + + +@pytest.fixture +def mock_harvesters(): + """Create mock harvesters with predefined responses.""" + arxiv_papers = [ + HarvestedPaper( + title="Transformer Architecture for NLP", + source=HarvestSource.ARXIV, + abstract="We propose transformers.", + arxiv_id="2301.00001", + year=2023, + ), + HarvestedPaper( + title="BERT Pre-training", + source=HarvestSource.ARXIV, + abstract="We introduce BERT.", + arxiv_id="2301.00002", + year=2023, + ), + ] + + s2_papers = [ + HarvestedPaper( + title="Transformer Architecture for NLP", # Duplicate + source=HarvestSource.SEMANTIC_SCHOLAR, + abstract="We propose transformers for various NLP tasks.", + doi="10.1234/transformer", + arxiv_id="2301.00001", + semantic_scholar_id="s2-001", + year=2023, + citation_count=500, + ), + HarvestedPaper( + title="GPT Language Models", + source=HarvestSource.SEMANTIC_SCHOLAR, + abstract="Generative pre-training for language understanding.", + doi="10.1234/gpt", + semantic_scholar_id="s2-002", + year=2023, + citation_count=1000, + ), + ] + + openalex_papers = [ + HarvestedPaper( + title="Vision Transformers", + source=HarvestSource.OPENALEX, + abstract="Transformers for computer vision.", + doi="10.1234/vit", + openalex_id="W001", + year=2024, + citation_count=300, + ), + ] + + def create_harvester(source, papers, error=None): + harvester = MagicMock() + harvester.source = source + harvester.search = AsyncMock( + return_value=HarvestResult( + source=source, + papers=papers, + total_found=len(papers), + error=error, + ) + ) + harvester.close = AsyncMock() + return harvester + + return { + "arxiv": 
create_harvester(HarvestSource.ARXIV, arxiv_papers), + "semantic_scholar": create_harvester(HarvestSource.SEMANTIC_SCHOLAR, s2_papers), + "openalex": create_harvester(HarvestSource.OPENALEX, openalex_papers), + } + + +@pytest.fixture +def pipeline(tmp_path, mock_harvesters): + """Create HarvestPipeline with mocked dependencies.""" + db_url = f"sqlite:///{tmp_path / 'test_harvest.db'}" + pipeline = HarvestPipeline(db_url=db_url) + + # Inject mock harvesters + def get_mock_harvester(source): + return mock_harvesters.get(source) + + pipeline._get_harvester = get_mock_harvester + return pipeline + + +class TestHarvestPipelineRun: + """Tests for harvest pipeline execution.""" + + @pytest.mark.asyncio + async def test_run_full_pipeline(self, pipeline): + """Run full harvest pipeline with all sources.""" + config = HarvestConfig( + keywords=["transformer", "NLP"], + max_results_per_source=50, + expand_keywords=False, # Skip expansion for predictable test + recommend_venues=False, # Skip venue recommendation + ) + + progress_messages = [] + final_result = None + + async for item in pipeline.run(config): + if isinstance(item, HarvestProgress): + progress_messages.append(item) + elif isinstance(item, HarvestFinalResult): + final_result = item + + # Verify progress messages + phases = [p.phase for p in progress_messages] + assert "Expanding" in phases + assert "Initializing" in phases + assert "Harvesting" in phases + assert "Deduplicating" in phases + assert "Storing" in phases + + # Verify final result + assert final_result is not None + assert final_result.status == "success" + assert final_result.papers_found == 5 # 2 + 2 + 1 + assert final_result.papers_deduplicated > 0 # Transformer paper is duplicate + assert final_result.duration_seconds > 0 + + @pytest.mark.asyncio + async def test_run_with_keyword_expansion(self, pipeline): + """Pipeline expands keywords when enabled.""" + config = HarvestConfig( + keywords=["LLM"], # Should expand to "large language model" + expand_keywords=True, + recommend_venues=False, + ) + + async for item in pipeline.run(config): + if isinstance(item, HarvestProgress) and item.phase == "Expanding": + assert item.message == "Expanding keywords..." 
+ + @pytest.mark.asyncio + async def test_run_with_venue_recommendation(self, pipeline): + """Pipeline recommends venues when enabled and no venues specified.""" + config = HarvestConfig( + keywords=["security"], + venues=None, # No venues specified + expand_keywords=False, + recommend_venues=True, + ) + + found_recommend_phase = False + async for item in pipeline.run(config): + if isinstance(item, HarvestProgress) and item.phase == "Recommending": + found_recommend_phase = True + + assert found_recommend_phase + + @pytest.mark.asyncio + async def test_run_with_specific_sources(self, pipeline): + """Pipeline uses only specified sources.""" + config = HarvestConfig( + keywords=["test"], + sources=["arxiv"], # Only arXiv + expand_keywords=False, + recommend_venues=False, + ) + + final_result = None + async for item in pipeline.run(config): + if isinstance(item, HarvestFinalResult): + final_result = item + + assert final_result is not None + assert "arxiv" in final_result.source_results + # Should not query other sources + # (mock harvesters would have papers, so we check papers_found) + assert final_result.papers_found == 2 # Only arXiv papers + + @pytest.mark.asyncio + async def test_run_creates_harvest_run_record(self, pipeline): + """Pipeline creates harvest run record in database.""" + config = HarvestConfig( + keywords=["test"], + expand_keywords=False, + recommend_venues=False, + ) + + final_result = None + async for item in pipeline.run(config): + if isinstance(item, HarvestFinalResult): + final_result = item + + # Verify harvest run was created + run = pipeline.paper_store.get_harvest_run(final_result.run_id) + assert run is not None + assert run.status == "success" + assert run.papers_found > 0 + + @pytest.mark.asyncio + async def test_run_stores_papers(self, pipeline): + """Pipeline stores papers in database.""" + config = HarvestConfig( + keywords=["test"], + expand_keywords=False, + recommend_venues=False, + ) + + async for item in pipeline.run(config): + pass # Just run to completion + + # Verify papers were stored + paper_count = pipeline.paper_store.get_paper_count() + assert paper_count > 0 + + @pytest.mark.asyncio + async def test_run_deduplicates_papers(self, pipeline): + """Pipeline deduplicates papers across sources.""" + config = HarvestConfig( + keywords=["test"], + sources=["arxiv", "semantic_scholar"], + expand_keywords=False, + recommend_venues=False, + ) + + final_result = None + async for item in pipeline.run(config): + if isinstance(item, HarvestFinalResult): + final_result = item + + # Transformer paper appears in both sources + assert final_result.papers_deduplicated > 0 + # Total found - new papers = deduplicated + total_raw = final_result.papers_found + stored = final_result.papers_new + # Due to deduplication, stored < total_raw + assert stored < total_raw or final_result.papers_deduplicated > 0 + + @pytest.mark.asyncio + async def test_run_with_year_filter(self, pipeline, mock_harvesters): + """Pipeline passes year filters to harvesters.""" + config = HarvestConfig( + keywords=["test"], + year_from=2023, + year_to=2024, + sources=["arxiv"], + expand_keywords=False, + recommend_venues=False, + ) + + async for item in pipeline.run(config): + pass + + # Verify harvester was called with year filters + mock_harvesters["arxiv"].search.assert_called_once() + call_kwargs = mock_harvesters["arxiv"].search.call_args[1] + assert call_kwargs["year_from"] == 2023 + assert call_kwargs["year_to"] == 2024 + + +class TestHarvestPipelineErrorHandling: + """Tests for error 
handling in harvest pipeline.""" + + @pytest.mark.asyncio + async def test_partial_failure(self, tmp_path): + """Pipeline handles partial source failures.""" + db_url = f"sqlite:///{tmp_path / 'test_partial.db'}" + pipeline = HarvestPipeline(db_url=db_url) + + # Create harvesters with one failing + def get_harvester(source): + if source == "arxiv": + harvester = MagicMock() + harvester.source = HarvestSource.ARXIV + harvester.search = AsyncMock( + return_value=HarvestResult( + source=HarvestSource.ARXIV, + papers=[ + HarvestedPaper( + title="Working Paper", + source=HarvestSource.ARXIV, + ) + ], + total_found=1, + ) + ) + harvester.close = AsyncMock() + return harvester + elif source == "semantic_scholar": + harvester = MagicMock() + harvester.source = HarvestSource.SEMANTIC_SCHOLAR + harvester.search = AsyncMock( + return_value=HarvestResult( + source=HarvestSource.SEMANTIC_SCHOLAR, + papers=[], + total_found=0, + error="Rate limit exceeded", + ) + ) + harvester.close = AsyncMock() + return harvester + return None + + pipeline._get_harvester = get_harvester + + config = HarvestConfig( + keywords=["test"], + sources=["arxiv", "semantic_scholar"], + expand_keywords=False, + recommend_venues=False, + ) + + final_result = None + async for item in pipeline.run(config): + if isinstance(item, HarvestFinalResult): + final_result = item + + assert final_result.status == "partial" # Not full success + assert "semantic_scholar" in final_result.errors + assert final_result.papers_new == 1 # From arXiv + + await pipeline.close() + + @pytest.mark.asyncio + async def test_all_sources_fail(self, tmp_path): + """Pipeline handles all sources failing.""" + db_url = f"sqlite:///{tmp_path / 'test_all_fail.db'}" + pipeline = HarvestPipeline(db_url=db_url) + + def get_failing_harvester(source): + if source == "arxiv": + harvester = MagicMock() + harvester.source = HarvestSource.ARXIV + harvester.search = AsyncMock( + return_value=HarvestResult( + source=HarvestSource.ARXIV, + papers=[], + total_found=0, + error="Connection timeout", + ) + ) + harvester.close = AsyncMock() + return harvester + return None + + pipeline._get_harvester = get_failing_harvester + + config = HarvestConfig( + keywords=["test"], + sources=["arxiv"], + expand_keywords=False, + recommend_venues=False, + ) + + final_result = None + async for item in pipeline.run(config): + if isinstance(item, HarvestFinalResult): + final_result = item + + assert final_result.status == "failed" + assert "arxiv" in final_result.errors + assert final_result.papers_new == 0 + + await pipeline.close() + + @pytest.mark.asyncio + async def test_harvester_exception(self, tmp_path): + """Pipeline handles harvester exceptions gracefully.""" + db_url = f"sqlite:///{tmp_path / 'test_exception.db'}" + pipeline = HarvestPipeline(db_url=db_url) + + def get_throwing_harvester(source): + if source == "arxiv": + harvester = MagicMock() + harvester.source = HarvestSource.ARXIV + harvester.search = AsyncMock( + side_effect=Exception("Unexpected error") + ) + harvester.close = AsyncMock() + return harvester + return None + + pipeline._get_harvester = get_throwing_harvester + + config = HarvestConfig( + keywords=["test"], + sources=["arxiv"], + expand_keywords=False, + recommend_venues=False, + ) + + final_result = None + async for item in pipeline.run(config): + if isinstance(item, HarvestFinalResult): + final_result = item + + assert final_result is not None + assert "arxiv" in final_result.errors + assert "Unexpected error" in final_result.errors["arxiv"] + + await 
pipeline.close() + + +class TestHarvestPipelineRunSync: + """Tests for synchronous pipeline execution.""" + + @pytest.mark.asyncio + async def test_run_sync_returns_final_result(self, pipeline): + """run_sync returns only the final result.""" + config = HarvestConfig( + keywords=["test"], + expand_keywords=False, + recommend_venues=False, + ) + + result = await pipeline.run_sync(config) + + assert isinstance(result, HarvestFinalResult) + assert result.status in ("success", "partial", "failed") + + +class TestHarvestPipelineContextManager: + """Tests for context manager protocol.""" + + @pytest.mark.asyncio + async def test_context_manager(self, tmp_path, mock_harvesters): + """Pipeline can be used as async context manager.""" + db_url = f"sqlite:///{tmp_path / 'test_ctx.db'}" + + async with HarvestPipeline(db_url=db_url) as pipeline: + # Inject mock harvesters + def get_mock_harvester(source): + return mock_harvesters.get(source) + + pipeline._get_harvester = get_mock_harvester + + config = HarvestConfig( + keywords=["test"], + expand_keywords=False, + recommend_venues=False, + ) + + result = await pipeline.run_sync(config) + assert result is not None + + # Pipeline should be closed after context exits + + +class TestHarvestPipelineRunId: + """Tests for run ID generation.""" + + def test_new_run_id_format(self): + """Run ID follows expected format.""" + run_id = HarvestPipeline.new_run_id() + + assert run_id.startswith("harvest-") + parts = run_id.split("-") + assert len(parts) == 4 # harvest-YYYYMMDD-HHMMSS-suffix + + def test_new_run_id_unique(self): + """Each run ID is unique.""" + ids = [HarvestPipeline.new_run_id() for _ in range(10)] + assert len(set(ids)) == 10 + + @pytest.mark.asyncio + async def test_custom_run_id(self, pipeline): + """Pipeline accepts custom run ID.""" + config = HarvestConfig( + keywords=["test"], + expand_keywords=False, + recommend_venues=False, + ) + + custom_id = "custom-run-123" + final_result = None + + async for item in pipeline.run(config, run_id=custom_id): + if isinstance(item, HarvestFinalResult): + final_result = item + + assert final_result.run_id == custom_id + + +class TestHarvestPipelineServices: + """Tests for lazy-loaded services.""" + + def test_query_rewriter_lazy_init(self, tmp_path): + """QueryRewriter is lazily initialized.""" + db_url = f"sqlite:///{tmp_path / 'test_lazy.db'}" + pipeline = HarvestPipeline(db_url=db_url) + + assert pipeline._query_rewriter is None + _ = pipeline.query_rewriter + assert pipeline._query_rewriter is not None + + def test_venue_recommender_lazy_init(self, tmp_path): + """VenueRecommender is lazily initialized.""" + db_url = f"sqlite:///{tmp_path / 'test_lazy.db'}" + pipeline = HarvestPipeline(db_url=db_url) + + assert pipeline._venue_recommender is None + _ = pipeline.venue_recommender + assert pipeline._venue_recommender is not None + + def test_deduplicator_lazy_init(self, tmp_path): + """PaperDeduplicator is lazily initialized.""" + db_url = f"sqlite:///{tmp_path / 'test_lazy.db'}" + pipeline = HarvestPipeline(db_url=db_url) + + assert pipeline._deduplicator is None + _ = pipeline.deduplicator + assert pipeline._deduplicator is not None + + def test_paper_store_lazy_init(self, tmp_path): + """PaperStore is lazily initialized.""" + db_url = f"sqlite:///{tmp_path / 'test_lazy.db'}" + pipeline = HarvestPipeline(db_url=db_url) + + assert pipeline._paper_store is None + _ = pipeline.paper_store + assert pipeline._paper_store is not None diff --git a/tests/integration/test_harvesters.py 
b/tests/integration/test_harvesters.py new file mode 100644 index 0000000..489337b --- /dev/null +++ b/tests/integration/test_harvesters.py @@ -0,0 +1,478 @@ +""" +Harvester integration tests with mocked API responses. + +Tests ArxivHarvester, SemanticScholarHarvester, and OpenAlexHarvester. +""" + +import pytest +from unittest.mock import AsyncMock, MagicMock, patch + +from paperbot.domain.harvest import HarvestSource +from paperbot.infrastructure.harvesters import ( + ArxivHarvester, + SemanticScholarHarvester, + OpenAlexHarvester, +) + + +# Sample API response data +ARXIV_ATOM_RESPONSE = """ + + + http://arxiv.org/abs/2301.12345v1 + Attention Is All You Need + We propose a new architecture called Transformer. + Ashish Vaswani + Noam Shazeer + 2023-01-15T00:00:00Z + + + + + http://arxiv.org/abs/2301.12346v1 + BERT: Pre-training of Deep Bidirectional Transformers + We introduce BERT for language understanding. + Jacob Devlin + 2023-01-16T00:00:00Z + + + + +""" + +S2_API_RESPONSE = { + "data": [ + { + "paperId": "s2-paper-001", + "title": "Deep Learning for NLP", + "abstract": "A comprehensive study on deep learning for NLP.", + "year": 2023, + "venue": "NeurIPS", + "citationCount": 150, + "authors": [{"name": "Alice Smith"}, {"name": "Bob Jones"}], + "publicationDate": "2023-12-01", + "externalIds": {"DOI": "10.1234/dl-nlp", "ArXiv": "2301.00001"}, + "fieldsOfStudy": ["Computer Science", "Machine Learning"], + "url": "https://www.semanticscholar.org/paper/abc123", + "openAccessPdf": {"url": "https://arxiv.org/pdf/2301.00001.pdf"}, + }, + { + "paperId": "s2-paper-002", + "title": "Reinforcement Learning in Robotics", + "abstract": "RL algorithms for robotic control.", + "year": 2022, + "venue": "ICRA", + "citationCount": 75, + "authors": [{"name": "Charlie Brown"}], + "publicationDate": "2022-06-15", + "externalIds": {"DOI": "10.1234/rl-robot"}, + "fieldsOfStudy": ["Robotics", "AI"], + "url": "https://www.semanticscholar.org/paper/def456", + "openAccessPdf": None, + }, + ] +} + +OPENALEX_API_RESPONSE = { + "meta": {"count": 2}, + "results": [ + { + "id": "https://openalex.org/W123456", + "title": "Computer Vision Advances", + "abstract_inverted_index": { + "Computer": [0], + "vision": [1], + "has": [2], + "advanced": [3], + "significantly": [4], + }, + "publication_year": 2024, + "cited_by_count": 200, + "authorships": [ + {"author": {"display_name": "David Wilson"}}, + {"author": {"display_name": "Eve Martinez"}}, + ], + "primary_location": {"source": {"display_name": "CVPR"}}, + "publication_date": "2024-01-10", + "ids": { + "doi": "https://doi.org/10.1234/cv-adv", + "openalex": "https://openalex.org/W123456", + }, + "open_access": {"oa_url": "https://example.com/paper.pdf"}, + "keywords": [{"display_name": "Computer Vision"}, {"display_name": "CNN"}], + "concepts": [ + {"display_name": "Computer Science"}, + {"display_name": "Image Processing"}, + ], + }, + ], +} + + +class TestArxivHarvester: + """Tests for ArxivHarvester.""" + + @pytest.fixture + def harvester(self): + """Create ArxivHarvester instance.""" + return ArxivHarvester() + + @pytest.mark.asyncio + async def test_search_success(self, harvester): + """Successful search returns papers.""" + with patch.object(harvester, "_get_session") as mock_session: + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.text = AsyncMock(return_value=ARXIV_ATOM_RESPONSE) + + mock_session.return_value.get = MagicMock( + return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response)) + ) + + result = await 
harvester.search("transformer", max_results=10) + + assert result.success + assert len(result.papers) == 2 + assert result.source == HarvestSource.ARXIV + + # Check first paper + paper1 = result.papers[0] + assert "Attention" in paper1.title + assert paper1.arxiv_id == "2301.12345" + assert paper1.source == HarvestSource.ARXIV + assert len(paper1.authors) >= 1 + + @pytest.mark.asyncio + async def test_search_api_error(self, harvester): + """API error returns error result.""" + with patch.object(harvester, "_get_session") as mock_session: + mock_response = AsyncMock() + mock_response.status = 500 + + mock_session.return_value.get = MagicMock( + return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response)) + ) + + result = await harvester.search("test") + + assert not result.success + assert result.error is not None + assert "500" in result.error + + @pytest.mark.asyncio + async def test_search_with_year_filter(self, harvester): + """Search with year filter builds correct query.""" + with patch.object(harvester, "_get_session") as mock_session: + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.text = AsyncMock(return_value=ARXIV_ATOM_RESPONSE) + + mock_get = MagicMock( + return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response)) + ) + mock_session.return_value.get = mock_get + + await harvester.search( + "deep learning", + year_from=2020, + year_to=2024, + max_results=50, + ) + + # Verify query includes year filter + call_args = mock_get.call_args + params = call_args[1]["params"] + assert "submittedDate" in params["search_query"] + assert "20200101" in params["search_query"] + assert "20241231" in params["search_query"] + + @pytest.mark.asyncio + async def test_source_property(self, harvester): + """source property returns ARXIV.""" + assert harvester.source == HarvestSource.ARXIV + + @pytest.mark.asyncio + async def test_close(self, harvester): + """close() releases resources.""" + mock_session = MagicMock() + mock_session.closed = False + mock_session.close = AsyncMock() + harvester._session = mock_session + + await harvester.close() + + mock_session.close.assert_called_once() + assert harvester._session is None + + +class TestSemanticScholarHarvester: + """Tests for SemanticScholarHarvester.""" + + @pytest.fixture + def harvester(self): + """Create SemanticScholarHarvester with mocked client.""" + mock_client = MagicMock() + mock_client.search_papers = AsyncMock(return_value=S2_API_RESPONSE["data"]) + return SemanticScholarHarvester(client=mock_client) + + @pytest.mark.asyncio + async def test_search_success(self, harvester): + """Successful search returns papers.""" + result = await harvester.search("deep learning", max_results=10) + + assert result.success + assert len(result.papers) == 2 + assert result.source == HarvestSource.SEMANTIC_SCHOLAR + + # Check first paper + paper1 = result.papers[0] + assert paper1.title == "Deep Learning for NLP" + assert paper1.semantic_scholar_id == "s2-paper-001" + assert paper1.doi == "10.1234/dl-nlp" + assert paper1.arxiv_id == "2301.00001" + assert paper1.year == 2023 + assert paper1.venue == "NeurIPS" + assert paper1.citation_count == 150 + assert len(paper1.authors) == 2 + assert paper1.pdf_url is not None + + @pytest.mark.asyncio + async def test_search_with_venue_filter(self, harvester): + """Search filters by venue.""" + # Return all papers, then filter + result = await harvester.search( + "learning", + venues=["NeurIPS"], + max_results=10, + ) + + # Only NeurIPS paper should be returned + 
assert all("NeurIPS" in (p.venue or "").lower() or "neurips" in (p.venue or "").lower() + for p in result.papers if p.venue) + + @pytest.mark.asyncio + async def test_search_client_error(self, harvester): + """Client error returns error result.""" + harvester.client.search_papers = AsyncMock( + side_effect=Exception("API connection failed") + ) + + result = await harvester.search("test") + + assert not result.success + assert "API connection failed" in result.error + + @pytest.mark.asyncio + async def test_paper_without_optional_fields(self): + """Paper handles missing optional fields.""" + mock_client = MagicMock() + mock_client.search_papers = AsyncMock( + return_value=[ + { + "paperId": "minimal-paper", + "title": "Minimal Paper", + "abstract": None, + "year": None, + "venue": None, + "citationCount": None, + "authors": [], + "externalIds": None, + "fieldsOfStudy": None, + "openAccessPdf": None, + } + ] + ) + harvester = SemanticScholarHarvester(client=mock_client) + + result = await harvester.search("test") + + assert result.success + paper = result.papers[0] + assert paper.title == "Minimal Paper" + assert paper.abstract == "" + assert paper.citation_count == 0 + assert paper.authors == [] + + @pytest.mark.asyncio + async def test_source_property(self, harvester): + """source property returns SEMANTIC_SCHOLAR.""" + assert harvester.source == HarvestSource.SEMANTIC_SCHOLAR + + +class TestOpenAlexHarvester: + """Tests for OpenAlexHarvester.""" + + @pytest.fixture + def harvester(self): + """Create OpenAlexHarvester instance.""" + return OpenAlexHarvester(email="test@example.com") + + @pytest.mark.asyncio + async def test_search_success(self, harvester): + """Successful search returns papers.""" + with patch.object(harvester, "_get_session") as mock_session: + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.json = AsyncMock(return_value=OPENALEX_API_RESPONSE) + + mock_session.return_value.get = MagicMock( + return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response)) + ) + + result = await harvester.search("computer vision", max_results=10) + + assert result.success + assert len(result.papers) == 1 + assert result.source == HarvestSource.OPENALEX + assert result.total_found == 2 # From meta.count + + # Check paper details + paper = result.papers[0] + assert paper.title == "Computer Vision Advances" + assert paper.openalex_id == "W123456" + assert paper.doi == "10.1234/cv-adv" + assert paper.year == 2024 + assert paper.venue == "CVPR" + assert paper.citation_count == 200 + assert len(paper.authors) == 2 + assert paper.pdf_url is not None + + @pytest.mark.asyncio + async def test_search_api_error(self, harvester): + """API error returns error result.""" + with patch.object(harvester, "_get_session") as mock_session: + mock_response = AsyncMock() + mock_response.status = 429 # Rate limit + + mock_session.return_value.get = MagicMock( + return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response)) + ) + + result = await harvester.search("test") + + assert not result.success + assert "429" in result.error + + @pytest.mark.asyncio + async def test_search_with_year_filter(self, harvester): + """Search with year filter includes correct params.""" + with patch.object(harvester, "_get_session") as mock_session: + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.json = AsyncMock(return_value=OPENALEX_API_RESPONSE) + + mock_get = MagicMock( + return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response)) + ) + 
mock_session.return_value.get = mock_get + + await harvester.search( + "test", + year_from=2020, + year_to=2024, + ) + + # Verify filter params + call_args = mock_get.call_args + params = call_args[1]["params"] + assert "filter" in params + assert "publication_year" in params["filter"] + + @pytest.mark.asyncio + async def test_abstract_reconstruction(self, harvester): + """Abstract is reconstructed from inverted index.""" + with patch.object(harvester, "_get_session") as mock_session: + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.json = AsyncMock(return_value=OPENALEX_API_RESPONSE) + + mock_session.return_value.get = MagicMock( + return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response)) + ) + + result = await harvester.search("test") + + paper = result.papers[0] + assert "Computer vision has advanced significantly" == paper.abstract + + @pytest.mark.asyncio + async def test_email_polite_pool(self, harvester): + """Email is included for polite pool.""" + with patch.object(harvester, "_get_session") as mock_session: + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.json = AsyncMock(return_value=OPENALEX_API_RESPONSE) + + mock_get = MagicMock( + return_value=AsyncMock(__aenter__=AsyncMock(return_value=mock_response)) + ) + mock_session.return_value.get = mock_get + + await harvester.search("test") + + call_args = mock_get.call_args + params = call_args[1]["params"] + assert params.get("mailto") == "test@example.com" + + @pytest.mark.asyncio + async def test_source_property(self, harvester): + """source property returns OPENALEX.""" + assert harvester.source == HarvestSource.OPENALEX + + @pytest.mark.asyncio + async def test_close(self, harvester): + """close() releases resources.""" + mock_session = MagicMock() + mock_session.closed = False + mock_session.close = AsyncMock() + harvester._session = mock_session + + await harvester.close() + + mock_session.close.assert_called_once() + assert harvester._session is None + + +class TestHarvesterInterface: + """Tests to verify all harvesters implement the same interface.""" + + @pytest.mark.asyncio + async def test_all_harvesters_have_source_property(self): + """All harvesters have source property.""" + harvesters = [ + ArxivHarvester(), + SemanticScholarHarvester(), + OpenAlexHarvester(), + ] + + for harvester in harvesters: + assert hasattr(harvester, "source") + assert isinstance(harvester.source, HarvestSource) + + @pytest.mark.asyncio + async def test_all_harvesters_have_search_method(self): + """All harvesters have async search method.""" + harvesters = [ + ArxivHarvester(), + SemanticScholarHarvester(), + OpenAlexHarvester(), + ] + + for harvester in harvesters: + assert hasattr(harvester, "search") + import inspect + assert inspect.iscoroutinefunction(harvester.search) + + @pytest.mark.asyncio + async def test_all_harvesters_have_close_method(self): + """All harvesters have async close method.""" + harvesters = [ + ArxivHarvester(), + SemanticScholarHarvester(), + OpenAlexHarvester(), + ] + + for harvester in harvesters: + assert hasattr(harvester, "close") + import inspect + assert inspect.iscoroutinefunction(harvester.close) diff --git a/tests/integration/test_paper_store.py b/tests/integration/test_paper_store.py new file mode 100644 index 0000000..08cd029 --- /dev/null +++ b/tests/integration/test_paper_store.py @@ -0,0 +1,580 @@ +""" +PaperStore integration tests. + +Tests paper storage, deduplication, search, and library functionality. 
+""" + +import pytest +from datetime import datetime, timezone + +from paperbot.domain.harvest import HarvestedPaper, HarvestSource +from paperbot.infrastructure.stores.paper_store import PaperStore, paper_to_dict + + +@pytest.fixture +def paper_store(tmp_path): + """Create a PaperStore with a temporary SQLite database.""" + db_url = f"sqlite:///{tmp_path / 'test_papers.db'}" + store = PaperStore(db_url=db_url, auto_create_schema=True) + yield store + store.close() + + +class TestPaperStoreUpsert: + """Tests for paper upsert functionality.""" + + def test_upsert_single_paper(self, paper_store): + """Upsert a single paper.""" + paper = HarvestedPaper( + title="Test Paper", + source=HarvestSource.ARXIV, + abstract="Test abstract", + authors=["Alice", "Bob"], + doi="10.1234/test", + year=2023, + citation_count=10, + ) + + new_count, updated_count = paper_store.upsert_papers_batch([paper]) + + assert new_count == 1 + assert updated_count == 0 + + def test_upsert_multiple_papers(self, paper_store): + """Upsert multiple papers.""" + papers = [ + HarvestedPaper( + title=f"Paper {i}", + source=HarvestSource.ARXIV, + doi=f"10.1234/paper{i}", + year=2023, + ) + for i in range(5) + ] + + new_count, updated_count = paper_store.upsert_papers_batch(papers) + + assert new_count == 5 + assert updated_count == 0 + assert paper_store.get_paper_count() == 5 + + def test_upsert_deduplicates_by_doi(self, paper_store): + """Papers with same DOI are deduplicated.""" + paper1 = HarvestedPaper( + title="Original Title", + source=HarvestSource.ARXIV, + doi="10.1234/same-doi", + citation_count=10, + ) + paper2 = HarvestedPaper( + title="Different Title", + source=HarvestSource.SEMANTIC_SCHOLAR, + doi="10.1234/same-doi", + citation_count=20, + ) + + new1, _ = paper_store.upsert_papers_batch([paper1]) + new2, updated2 = paper_store.upsert_papers_batch([paper2]) + + assert new1 == 1 + assert new2 == 0 + assert updated2 == 1 + assert paper_store.get_paper_count() == 1 + + def test_upsert_deduplicates_by_arxiv_id(self, paper_store): + """Papers with same arXiv ID are deduplicated.""" + paper1 = HarvestedPaper( + title="Paper 1", + source=HarvestSource.ARXIV, + arxiv_id="2301.12345", + ) + paper2 = HarvestedPaper( + title="Paper 1 Variant", + source=HarvestSource.SEMANTIC_SCHOLAR, + arxiv_id="2301.12345", + doi="10.1234/new-doi", # New identifier + ) + + paper_store.upsert_papers_batch([paper1]) + new_count, updated_count = paper_store.upsert_papers_batch([paper2]) + + assert new_count == 0 + assert updated_count == 1 + + # DOI should be merged into existing record + papers, _ = paper_store.search_papers(query="Paper 1") + assert len(papers) == 1 + assert papers[0].doi == "10.1234/new-doi" + + def test_upsert_deduplicates_by_title_hash(self, paper_store): + """Papers with same normalized title are deduplicated.""" + paper1 = HarvestedPaper( + title="Deep Learning for NLP", + source=HarvestSource.ARXIV, + ) + paper2 = HarvestedPaper( + title="DEEP LEARNING FOR NLP", # Same title, different case + source=HarvestSource.OPENALEX, + doi="10.1234/dedup-test", + ) + + paper_store.upsert_papers_batch([paper1]) + new_count, updated_count = paper_store.upsert_papers_batch([paper2]) + + assert new_count == 0 + assert updated_count == 1 + assert paper_store.get_paper_count() == 1 + + def test_upsert_merges_metadata(self, paper_store): + """Upsert merges metadata from duplicate papers.""" + paper1 = HarvestedPaper( + title="Merge Test", + source=HarvestSource.ARXIV, + doi="10.1234/merge", + abstract="Short", + citation_count=10, + 
keywords=["ML"], + ) + paper2 = HarvestedPaper( + title="Merge Test", + source=HarvestSource.SEMANTIC_SCHOLAR, + doi="10.1234/merge", + abstract="A much longer abstract with more details", + citation_count=20, + keywords=["AI"], + semantic_scholar_id="s2-123", + ) + + paper_store.upsert_papers_batch([paper1]) + paper_store.upsert_papers_batch([paper2]) + + papers, _ = paper_store.search_papers(query="Merge Test") + assert len(papers) == 1 + paper = papers[0] + + # Longer abstract preserved + assert "longer" in paper.abstract + # Higher citation count preserved + assert paper.citation_count == 20 + # New identifier merged + assert paper.semantic_scholar_id == "s2-123" + + +class TestPaperStoreSearch: + """Tests for paper search functionality.""" + + @pytest.fixture(autouse=True) + def setup_papers(self, paper_store): + """Add test papers to the store.""" + self.store = paper_store + papers = [ + HarvestedPaper( + title="Deep Learning for Natural Language Processing", + source=HarvestSource.ARXIV, + abstract="A study on transformers and attention mechanisms", + doi="10.1234/nlp", + year=2023, + venue="NeurIPS", + citation_count=100, + ), + HarvestedPaper( + title="Computer Vision with Convolutional Networks", + source=HarvestSource.SEMANTIC_SCHOLAR, + abstract="CNN architectures for image classification", + doi="10.1234/cv", + year=2022, + venue="CVPR", + citation_count=200, + ), + HarvestedPaper( + title="Reinforcement Learning in Robotics", + source=HarvestSource.OPENALEX, + abstract="RL algorithms for robot control", + doi="10.1234/rl", + year=2024, + venue="ICRA", + citation_count=50, + ), + HarvestedPaper( + title="Security Analysis of Machine Learning Systems", + source=HarvestSource.ARXIV, + abstract="Adversarial attacks on deep learning models", + doi="10.1234/security", + year=2023, + venue="CCS", + citation_count=75, + ), + ] + paper_store.upsert_papers_batch(papers) + + def test_search_by_query(self): + """Search papers by query string.""" + papers, total = self.store.search_papers(query="deep learning") + + assert total >= 1 + assert any("Deep Learning" in p.title for p in papers) + + def test_search_by_year_range(self): + """Search papers within year range.""" + papers, total = self.store.search_papers(year_from=2023, year_to=2024) + + assert all(2023 <= p.year <= 2024 for p in papers) + assert total >= 2 + + def test_search_by_venue(self): + """Search papers by venue.""" + papers, total = self.store.search_papers(venues=["NeurIPS"]) + + assert total >= 1 + assert all("NeurIPS" in (p.venue or "") for p in papers) + + def test_search_by_min_citations(self): + """Search papers with minimum citations.""" + papers, total = self.store.search_papers(min_citations=100) + + assert all(p.citation_count >= 100 for p in papers) + assert total >= 1 + + def test_search_by_source(self): + """Search papers by source.""" + papers, total = self.store.search_papers(sources=["arxiv"]) + + assert all(p.primary_source == "arxiv" for p in papers) + + def test_search_sort_by_citations(self): + """Search results sorted by citation count.""" + papers, _ = self.store.search_papers( + sort_by="citation_count", sort_order="desc" + ) + + # Verify descending order + for i in range(len(papers) - 1): + assert (papers[i].citation_count or 0) >= (papers[i + 1].citation_count or 0) + + def test_search_sort_by_year(self): + """Search results sorted by year.""" + papers, _ = self.store.search_papers(sort_by="year", sort_order="asc") + + # Verify ascending order + for i in range(len(papers) - 1): + if papers[i].year 
and papers[i + 1].year: + assert papers[i].year <= papers[i + 1].year + + def test_search_pagination(self): + """Search with pagination.""" + all_papers, total = self.store.search_papers(limit=100) + + # Get first page + page1, _ = self.store.search_papers(limit=2, offset=0) + assert len(page1) == 2 + + # Get second page + page2, _ = self.store.search_papers(limit=2, offset=2) + + # Pages should not overlap + page1_ids = {p.id for p in page1} + page2_ids = {p.id for p in page2} + assert page1_ids.isdisjoint(page2_ids) + + def test_search_combined_filters(self): + """Search with multiple filters combined.""" + papers, total = self.store.search_papers( + query="learning", + year_from=2023, + min_citations=50, + sort_by="citation_count", + sort_order="desc", + ) + + for paper in papers: + assert paper.year >= 2023 + assert paper.citation_count >= 50 + + def test_search_no_results(self): + """Search with no matching results.""" + papers, total = self.store.search_papers(query="xyznonexistent123") + + assert papers == [] + assert total == 0 + + +class TestPaperStoreHarvestRun: + """Tests for harvest run tracking.""" + + def test_create_harvest_run(self, paper_store): + """Create a harvest run record.""" + run = paper_store.create_harvest_run( + run_id="test-run-001", + keywords=["machine learning", "deep learning"], + venues=["NeurIPS", "ICML"], + sources=["arxiv", "semantic_scholar"], + max_results_per_source=50, + ) + + assert run.run_id == "test-run-001" + assert run.status == "running" + assert run.get_keywords() == ["machine learning", "deep learning"] + assert run.get_venues() == ["NeurIPS", "ICML"] + assert run.get_sources() == ["arxiv", "semantic_scholar"] + assert run.max_results_per_source == 50 + + def test_update_harvest_run(self, paper_store): + """Update a harvest run record.""" + paper_store.create_harvest_run( + run_id="test-run-002", + keywords=["test"], + venues=[], + sources=["arxiv"], + max_results_per_source=50, + ) + + updated = paper_store.update_harvest_run( + run_id="test-run-002", + status="success", + papers_found=100, + papers_new=80, + papers_deduplicated=20, + ) + + assert updated is not None + assert updated.status == "success" + assert updated.papers_found == 100 + assert updated.papers_new == 80 + assert updated.papers_deduplicated == 20 + assert updated.ended_at is not None + + def test_update_harvest_run_with_errors(self, paper_store): + """Update harvest run with error information.""" + paper_store.create_harvest_run( + run_id="test-run-003", + keywords=["test"], + venues=[], + sources=["arxiv", "semantic_scholar"], + max_results_per_source=50, + ) + + errors = {"semantic_scholar": "Rate limit exceeded"} + updated = paper_store.update_harvest_run( + run_id="test-run-003", + status="partial", + errors=errors, + ) + + assert updated.status == "partial" + assert updated.get_errors() == errors + + def test_get_harvest_run(self, paper_store): + """Retrieve a harvest run by ID.""" + paper_store.create_harvest_run( + run_id="test-run-004", + keywords=["retrieval test"], + venues=["SIGIR"], + sources=["openalex"], + max_results_per_source=25, + ) + + run = paper_store.get_harvest_run("test-run-004") + + assert run is not None + assert run.run_id == "test-run-004" + assert run.get_keywords() == ["retrieval test"] + + def test_get_harvest_run_not_found(self, paper_store): + """Get non-existent harvest run returns None.""" + run = paper_store.get_harvest_run("nonexistent-run") + assert run is None + + def test_list_harvest_runs(self, paper_store): + """List harvest 
runs.""" + for i in range(3): + paper_store.create_harvest_run( + run_id=f"list-test-{i}", + keywords=[f"keyword{i}"], + venues=[], + sources=["arxiv"], + max_results_per_source=50, + ) + + runs = paper_store.list_harvest_runs(limit=10) + + assert len(runs) >= 3 + # Should be sorted by started_at descending + for i in range(len(runs) - 1): + if runs[i].started_at and runs[i + 1].started_at: + assert runs[i].started_at >= runs[i + 1].started_at + + def test_list_harvest_runs_by_status(self, paper_store): + """List harvest runs filtered by status.""" + paper_store.create_harvest_run( + run_id="status-test-1", + keywords=["test"], + venues=[], + sources=["arxiv"], + max_results_per_source=50, + ) + paper_store.update_harvest_run("status-test-1", status="success") + + paper_store.create_harvest_run( + run_id="status-test-2", + keywords=["test"], + venues=[], + sources=["arxiv"], + max_results_per_source=50, + ) + # Remains "running" + + success_runs = paper_store.list_harvest_runs(status="success") + running_runs = paper_store.list_harvest_runs(status="running") + + assert any(r.run_id == "status-test-1" for r in success_runs) + assert any(r.run_id == "status-test-2" for r in running_runs) + + +class TestPaperStoreLibrary: + """Tests for user library functionality.""" + + def test_get_paper_by_id(self, paper_store): + """Get paper by ID.""" + paper = HarvestedPaper( + title="Get By ID Test", + source=HarvestSource.ARXIV, + doi="10.1234/getbyid", + ) + paper_store.upsert_papers_batch([paper]) + + papers, _ = paper_store.search_papers(query="Get By ID") + assert len(papers) == 1 + + retrieved = paper_store.get_paper_by_id(papers[0].id) + assert retrieved is not None + assert retrieved.title == "Get By ID Test" + + def test_get_paper_by_id_not_found(self, paper_store): + """Get non-existent paper returns None.""" + paper = paper_store.get_paper_by_id(99999) + assert paper is None + + def test_paper_to_dict(self, paper_store): + """paper_to_dict converts model correctly.""" + paper = HarvestedPaper( + title="Dict Test", + source=HarvestSource.SEMANTIC_SCHOLAR, + abstract="Test abstract", + authors=["Alice", "Bob"], + doi="10.1234/dict", + year=2023, + venue="ICML", + citation_count=42, + keywords=["ML"], + fields_of_study=["CS"], + ) + paper_store.upsert_papers_batch([paper]) + + papers, _ = paper_store.search_papers(query="Dict Test") + result = paper_to_dict(papers[0]) + + assert result["title"] == "Dict Test" + assert result["abstract"] == "Test abstract" + assert result["authors"] == ["Alice", "Bob"] + assert result["doi"] == "10.1234/dict" + assert result["year"] == 2023 + assert result["venue"] == "ICML" + assert result["citation_count"] == 42 + assert result["primary_source"] == "semantic_scholar" + + def test_get_paper_count(self, paper_store): + """Get total paper count.""" + initial_count = paper_store.get_paper_count() + + papers = [ + HarvestedPaper( + title=f"Count Test {i}", + source=HarvestSource.ARXIV, + doi=f"10.1234/count{i}", + ) + for i in range(3) + ] + paper_store.upsert_papers_batch(papers) + + new_count = paper_store.get_paper_count() + assert new_count == initial_count + 3 + + +class TestPaperStoreEdgeCases: + """Tests for edge cases and error handling.""" + + def test_upsert_empty_list(self, paper_store): + """Upsert empty list does nothing.""" + new_count, updated_count = paper_store.upsert_papers_batch([]) + + assert new_count == 0 + assert updated_count == 0 + + def test_upsert_paper_without_identifiers(self, paper_store): + """Upsert paper with only title (uses title 
hash).""" + paper = HarvestedPaper( + title="No Identifiers Paper", + source=HarvestSource.ARXIV, + ) + + new_count, _ = paper_store.upsert_papers_batch([paper]) + assert new_count == 1 + + # Second upsert with same title should update + paper2 = HarvestedPaper( + title="No Identifiers Paper", + source=HarvestSource.SEMANTIC_SCHOLAR, + citation_count=10, + ) + + new_count, updated_count = paper_store.upsert_papers_batch([paper2]) + assert new_count == 0 + assert updated_count == 1 + + def test_search_with_special_characters(self, paper_store): + """Search handles special characters.""" + paper = HarvestedPaper( + title="Test: A Paper with Special (Characters) & Symbols!", + source=HarvestSource.ARXIV, + doi="10.1234/special", + ) + paper_store.upsert_papers_batch([paper]) + + # Search with part of title (single word matches more reliably) + papers, total = paper_store.search_papers(query="Special") + assert total >= 1 + assert any("Special" in p.title for p in papers) + + def test_upsert_paper_with_unicode(self, paper_store): + """Upsert paper with unicode characters.""" + paper = HarvestedPaper( + title="机器学习论文 - Machine Learning Paper", + source=HarvestSource.ARXIV, + abstract="This paper discusses 深度学习 (deep learning)", + authors=["张三", "李四"], + doi="10.1234/unicode", + ) + + new_count, _ = paper_store.upsert_papers_batch([paper]) + assert new_count == 1 + + papers, _ = paper_store.search_papers(query="Machine Learning") + assert len(papers) == 1 + assert "机器学习" in papers[0].title + + def test_upsert_paper_with_long_abstract(self, paper_store): + """Upsert paper with very long abstract.""" + long_abstract = "Lorem ipsum " * 1000 # ~12000 characters + + paper = HarvestedPaper( + title="Long Abstract Paper", + source=HarvestSource.ARXIV, + abstract=long_abstract, + doi="10.1234/long", + ) + + new_count, _ = paper_store.upsert_papers_batch([paper]) + assert new_count == 1 + + papers, _ = paper_store.search_papers(query="Long Abstract") + assert papers[0].abstract == long_abstract diff --git a/tests/unit/test_harvested_paper.py b/tests/unit/test_harvested_paper.py new file mode 100644 index 0000000..5ec732d --- /dev/null +++ b/tests/unit/test_harvested_paper.py @@ -0,0 +1,328 @@ +""" +HarvestedPaper domain model unit tests. 
+""" + +import pytest + +from paperbot.domain.harvest import ( + HarvestedPaper, + HarvestResult, + HarvestRunResult, + HarvestSource, +) + + +class TestHarvestedPaper: + """Tests for HarvestedPaper data model.""" + + def test_create_minimal_paper(self): + """Create paper with only required fields.""" + paper = HarvestedPaper( + title="Test Paper", + source=HarvestSource.ARXIV, + ) + assert paper.title == "Test Paper" + assert paper.source == HarvestSource.ARXIV + assert paper.abstract == "" + assert paper.authors == [] + assert paper.doi is None + assert paper.citation_count == 0 + + def test_create_full_paper(self): + """Create paper with all fields.""" + paper = HarvestedPaper( + title="Full Paper", + source=HarvestSource.SEMANTIC_SCHOLAR, + abstract="This is an abstract.", + authors=["Alice", "Bob"], + doi="10.1234/test", + arxiv_id="2301.12345", + semantic_scholar_id="s2-123", + openalex_id="W12345", + year=2023, + venue="NeurIPS", + publication_date="2023-12-01", + citation_count=100, + url="https://example.com/paper", + pdf_url="https://example.com/paper.pdf", + keywords=["ML", "AI"], + fields_of_study=["Computer Science"], + source_rank=1, + ) + + assert paper.title == "Full Paper" + assert paper.source == HarvestSource.SEMANTIC_SCHOLAR + assert paper.abstract == "This is an abstract." + assert paper.authors == ["Alice", "Bob"] + assert paper.doi == "10.1234/test" + assert paper.arxiv_id == "2301.12345" + assert paper.semantic_scholar_id == "s2-123" + assert paper.openalex_id == "W12345" + assert paper.year == 2023 + assert paper.venue == "NeurIPS" + assert paper.publication_date == "2023-12-01" + assert paper.citation_count == 100 + assert paper.url == "https://example.com/paper" + assert paper.pdf_url == "https://example.com/paper.pdf" + assert paper.keywords == ["ML", "AI"] + assert paper.fields_of_study == ["Computer Science"] + assert paper.source_rank == 1 + + def test_compute_title_hash_basic(self): + """Title hash normalizes and hashes correctly.""" + paper = HarvestedPaper( + title="Deep Learning for NLP", + source=HarvestSource.ARXIV, + ) + hash1 = paper.compute_title_hash() + + # Same title should produce same hash + paper2 = HarvestedPaper( + title="Deep Learning for NLP", + source=HarvestSource.OPENALEX, + ) + assert paper2.compute_title_hash() == hash1 + + def test_compute_title_hash_case_insensitive(self): + """Title hash is case-insensitive.""" + paper1 = HarvestedPaper(title="Deep Learning", source=HarvestSource.ARXIV) + paper2 = HarvestedPaper(title="DEEP LEARNING", source=HarvestSource.ARXIV) + paper3 = HarvestedPaper(title="deep learning", source=HarvestSource.ARXIV) + + assert paper1.compute_title_hash() == paper2.compute_title_hash() + assert paper2.compute_title_hash() == paper3.compute_title_hash() + + def test_compute_title_hash_ignores_punctuation(self): + """Title hash ignores punctuation.""" + paper1 = HarvestedPaper(title="Deep Learning", source=HarvestSource.ARXIV) + paper2 = HarvestedPaper(title="Deep, Learning!", source=HarvestSource.ARXIV) + paper3 = HarvestedPaper(title="Deep-Learning?", source=HarvestSource.ARXIV) + + # All should have same hash after removing punctuation + assert paper1.compute_title_hash() == paper2.compute_title_hash() + # Note: hyphens are removed, making it "deeplearning" vs "deep learning" + # This might differ, which is intentional for similar titles + + def test_compute_title_hash_normalizes_whitespace(self): + """Title hash normalizes whitespace.""" + paper1 = HarvestedPaper(title="Deep Learning", 
source=HarvestSource.ARXIV) + paper2 = HarvestedPaper(title="Deep Learning", source=HarvestSource.ARXIV) + paper3 = HarvestedPaper(title=" Deep Learning ", source=HarvestSource.ARXIV) + + assert paper1.compute_title_hash() == paper2.compute_title_hash() + assert paper2.compute_title_hash() == paper3.compute_title_hash() + + def test_to_dict(self): + """to_dict returns correct dictionary.""" + paper = HarvestedPaper( + title="Test", + source=HarvestSource.ARXIV, + doi="10.1234/test", + year=2023, + ) + result = paper.to_dict() + + assert result["title"] == "Test" + assert result["source"] == "arxiv" + assert result["doi"] == "10.1234/test" + assert result["year"] == 2023 + assert "title_hash" in result + + def test_from_dict(self): + """from_dict creates paper from dictionary.""" + data = { + "title": "From Dict Paper", + "source": "semantic_scholar", + "abstract": "An abstract", + "authors": ["Author1"], + "doi": "10.1234/fromdict", + "year": 2024, + "citation_count": 50, + } + + paper = HarvestedPaper.from_dict(data) + + assert paper.title == "From Dict Paper" + assert paper.source == HarvestSource.SEMANTIC_SCHOLAR + assert paper.abstract == "An abstract" + assert paper.authors == ["Author1"] + assert paper.doi == "10.1234/fromdict" + assert paper.year == 2024 + assert paper.citation_count == 50 + + def test_from_dict_with_source_enum(self): + """from_dict handles source as enum.""" + data = { + "title": "Test", + "source": HarvestSource.OPENALEX, + } + + paper = HarvestedPaper.from_dict(data) + assert paper.source == HarvestSource.OPENALEX + + def test_roundtrip_dict(self): + """to_dict and from_dict roundtrip preserves data.""" + original = HarvestedPaper( + title="Roundtrip Test", + source=HarvestSource.ARXIV, + abstract="Test abstract", + authors=["Alice", "Bob"], + doi="10.1234/roundtrip", + arxiv_id="2301.12345", + year=2023, + venue="ICML", + citation_count=42, + keywords=["ML", "Test"], + fields_of_study=["CS"], + ) + + data = original.to_dict() + restored = HarvestedPaper.from_dict(data) + + assert restored.title == original.title + assert restored.source == original.source + assert restored.abstract == original.abstract + assert restored.authors == original.authors + assert restored.doi == original.doi + assert restored.arxiv_id == original.arxiv_id + assert restored.year == original.year + assert restored.venue == original.venue + assert restored.citation_count == original.citation_count + + +class TestHarvestSource: + """Tests for HarvestSource enum.""" + + def test_source_values(self): + """Source enum has correct string values.""" + assert HarvestSource.ARXIV.value == "arxiv" + assert HarvestSource.SEMANTIC_SCHOLAR.value == "semantic_scholar" + assert HarvestSource.OPENALEX.value == "openalex" + + def test_source_is_string(self): + """Source enum inherits from str.""" + assert isinstance(HarvestSource.ARXIV, str) + assert HarvestSource.ARXIV == "arxiv" + + def test_source_from_string(self): + """Source can be created from string.""" + source = HarvestSource("arxiv") + assert source == HarvestSource.ARXIV + + +class TestHarvestResult: + """Tests for HarvestResult data model.""" + + def test_success_result(self): + """Success result has no error.""" + result = HarvestResult( + source=HarvestSource.ARXIV, + papers=[ + HarvestedPaper(title="Paper 1", source=HarvestSource.ARXIV), + ], + total_found=1, + ) + + assert result.success is True + assert result.error is None + assert len(result.papers) == 1 + assert result.total_found == 1 + + def test_error_result(self): + """Error 
result has error message.""" + result = HarvestResult( + source=HarvestSource.SEMANTIC_SCHOLAR, + papers=[], + total_found=0, + error="API rate limit exceeded", + ) + + assert result.success is False + assert result.error == "API rate limit exceeded" + assert len(result.papers) == 0 + + def test_partial_result(self): + """Partial result can have both papers and error.""" + result = HarvestResult( + source=HarvestSource.OPENALEX, + papers=[ + HarvestedPaper(title="Paper 1", source=HarvestSource.OPENALEX), + ], + total_found=100, # More papers exist but couldn't be fetched + error="Timeout after 50 papers", + ) + + assert result.success is False + assert len(result.papers) == 1 + assert result.total_found == 100 + + +class TestHarvestRunResult: + """Tests for HarvestRunResult data model.""" + + def test_create_run_result(self): + """Create a complete run result.""" + from datetime import datetime, timezone + + now = datetime.now(timezone.utc) + + result = HarvestRunResult( + run_id="harvest-20260210-abc123", + status="success", + papers_found=150, + papers_new=100, + papers_deduplicated=50, + source_results={ + HarvestSource.ARXIV: HarvestResult( + source=HarvestSource.ARXIV, + papers=[], + total_found=50, + ), + HarvestSource.SEMANTIC_SCHOLAR: HarvestResult( + source=HarvestSource.SEMANTIC_SCHOLAR, + papers=[], + total_found=60, + ), + }, + started_at=now, + ended_at=now, + ) + + assert result.run_id == "harvest-20260210-abc123" + assert result.status == "success" + assert result.papers_found == 150 + assert result.papers_new == 100 + assert result.papers_deduplicated == 50 + + def test_to_dict(self): + """to_dict returns correct structure.""" + from datetime import datetime, timezone + + now = datetime.now(timezone.utc) + + result = HarvestRunResult( + run_id="test-run", + status="partial", + papers_found=100, + papers_new=80, + papers_deduplicated=20, + source_results={ + HarvestSource.ARXIV: HarvestResult( + source=HarvestSource.ARXIV, + papers=[HarvestedPaper(title="P1", source=HarvestSource.ARXIV)], + total_found=50, + ), + }, + started_at=now, + ) + + data = result.to_dict() + + assert data["run_id"] == "test-run" + assert data["status"] == "partial" + assert data["papers_found"] == 100 + assert data["papers_new"] == 80 + assert data["papers_deduplicated"] == 20 + assert "arxiv" in data["sources"] + assert data["sources"]["arxiv"]["papers"] == 1 + assert data["sources"]["arxiv"]["total_found"] == 50 diff --git a/tests/unit/test_paper_deduplicator.py b/tests/unit/test_paper_deduplicator.py new file mode 100644 index 0000000..9d770ff --- /dev/null +++ b/tests/unit/test_paper_deduplicator.py @@ -0,0 +1,292 @@ +""" +PaperDeduplicator unit tests. 
+""" + +import pytest + +from paperbot.domain.harvest import HarvestedPaper, HarvestSource +from paperbot.application.services.paper_deduplicator import PaperDeduplicator + + +class TestPaperDeduplicator: + """PaperDeduplicator tests.""" + + def setup_method(self): + """Reset deduplicator before each test.""" + self.deduplicator = PaperDeduplicator() + + def test_deduplicate_empty_list(self): + """Empty list returns empty result.""" + unique, count = self.deduplicator.deduplicate([]) + assert unique == [] + assert count == 0 + + def test_deduplicate_single_paper(self): + """Single paper returns unchanged.""" + paper = HarvestedPaper( + title="Test Paper", + source=HarvestSource.ARXIV, + doi="10.1234/test", + ) + unique, count = self.deduplicator.deduplicate([paper]) + assert len(unique) == 1 + assert count == 0 + assert unique[0].title == "Test Paper" + + def test_deduplicate_by_doi(self): + """Papers with same DOI are deduplicated.""" + paper1 = HarvestedPaper( + title="Paper Version 1", + source=HarvestSource.ARXIV, + doi="10.1234/same-doi", + abstract="Short abstract", + ) + paper2 = HarvestedPaper( + title="Paper Version 2", + source=HarvestSource.SEMANTIC_SCHOLAR, + doi="10.1234/same-doi", + abstract="A much longer and more detailed abstract", + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + assert count == 1 + # Longer abstract should be preserved + assert "longer" in unique[0].abstract + + def test_deduplicate_by_arxiv_id(self): + """Papers with same arXiv ID are deduplicated.""" + paper1 = HarvestedPaper( + title="Paper 1", + source=HarvestSource.ARXIV, + arxiv_id="2301.12345", + ) + paper2 = HarvestedPaper( + title="Paper 1 (variant)", + source=HarvestSource.SEMANTIC_SCHOLAR, + arxiv_id="2301.12345", + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + assert count == 1 + + def test_deduplicate_by_semantic_scholar_id(self): + """Papers with same Semantic Scholar ID are deduplicated.""" + paper1 = HarvestedPaper( + title="Paper A", + source=HarvestSource.SEMANTIC_SCHOLAR, + semantic_scholar_id="abc123", + ) + paper2 = HarvestedPaper( + title="Paper A", + source=HarvestSource.OPENALEX, + semantic_scholar_id="abc123", + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + assert count == 1 + + def test_deduplicate_by_openalex_id(self): + """Papers with same OpenAlex ID are deduplicated.""" + paper1 = HarvestedPaper( + title="Paper B", + source=HarvestSource.OPENALEX, + openalex_id="W12345", + ) + paper2 = HarvestedPaper( + title="Paper B", + source=HarvestSource.ARXIV, + openalex_id="W12345", + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + assert count == 1 + + def test_deduplicate_by_title_hash(self): + """Papers with same normalized title are deduplicated.""" + paper1 = HarvestedPaper( + title="Deep Learning for NLP", + source=HarvestSource.ARXIV, + ) + paper2 = HarvestedPaper( + title="DEEP LEARNING FOR NLP", # Same title, different case + source=HarvestSource.SEMANTIC_SCHOLAR, + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + assert count == 1 + + def test_deduplicate_merges_identifiers(self): + """Deduplication merges identifiers from duplicates.""" + paper1 = HarvestedPaper( + title="Test Paper", + source=HarvestSource.ARXIV, + arxiv_id="2301.12345", + ) + paper2 = HarvestedPaper( + title="Test Paper", + 
source=HarvestSource.SEMANTIC_SCHOLAR, + doi="10.1234/test", + semantic_scholar_id="s2-123", + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + assert unique[0].arxiv_id == "2301.12345" + assert unique[0].doi == "10.1234/test" + assert unique[0].semantic_scholar_id == "s2-123" + + def test_deduplicate_prefers_higher_citations(self): + """Higher citation count is preserved during merge.""" + paper1 = HarvestedPaper( + title="Cited Paper", + source=HarvestSource.ARXIV, + doi="10.1234/cited", + citation_count=10, + ) + paper2 = HarvestedPaper( + title="Cited Paper", + source=HarvestSource.SEMANTIC_SCHOLAR, + doi="10.1234/cited", + citation_count=50, + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + assert unique[0].citation_count == 50 + + def test_deduplicate_merges_keywords(self): + """Keywords from all duplicates are merged.""" + paper1 = HarvestedPaper( + title="ML Paper", + source=HarvestSource.ARXIV, + doi="10.1234/ml", + keywords=["deep learning", "neural network"], + ) + paper2 = HarvestedPaper( + title="ML Paper", + source=HarvestSource.OPENALEX, + doi="10.1234/ml", + keywords=["machine learning", "deep learning"], + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + keywords = set(unique[0].keywords) + assert "deep learning" in keywords + assert "neural network" in keywords + assert "machine learning" in keywords + + def test_deduplicate_prefers_longer_author_list(self): + """Longer author list is preserved.""" + paper1 = HarvestedPaper( + title="Multi-author Paper", + source=HarvestSource.ARXIV, + doi="10.1234/multi", + authors=["Alice", "Bob"], + ) + paper2 = HarvestedPaper( + title="Multi-author Paper", + source=HarvestSource.SEMANTIC_SCHOLAR, + doi="10.1234/multi", + authors=["Alice", "Bob", "Charlie", "Diana"], + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + assert len(unique[0].authors) == 4 + + def test_no_duplicates_different_papers(self): + """Different papers are not deduplicated.""" + paper1 = HarvestedPaper( + title="First Paper", + source=HarvestSource.ARXIV, + doi="10.1234/first", + ) + paper2 = HarvestedPaper( + title="Second Paper", + source=HarvestSource.SEMANTIC_SCHOLAR, + doi="10.1234/second", + ) + paper3 = HarvestedPaper( + title="Third Paper", + source=HarvestSource.OPENALEX, + arxiv_id="2301.99999", + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2, paper3]) + + assert len(unique) == 3 + assert count == 0 + + def test_is_duplicate_check(self): + """is_duplicate correctly identifies duplicates.""" + paper1 = HarvestedPaper( + title="Indexed Paper", + source=HarvestSource.ARXIV, + doi="10.1234/indexed", + ) + + # First, deduplicate to build index + self.deduplicator.deduplicate([paper1]) + + # Check duplicate + paper2 = HarvestedPaper( + title="Different Title", + source=HarvestSource.SEMANTIC_SCHOLAR, + doi="10.1234/indexed", # Same DOI + ) + assert self.deduplicator.is_duplicate(paper2) is True + + # Check non-duplicate + paper3 = HarvestedPaper( + title="New Paper", + source=HarvestSource.OPENALEX, + doi="10.1234/new", + ) + assert self.deduplicator.is_duplicate(paper3) is False + + def test_reset_clears_indexes(self): + """reset() clears all indexes.""" + paper = HarvestedPaper( + title="Reset Test", + source=HarvestSource.ARXIV, + doi="10.1234/reset", + ) + + self.deduplicator.deduplicate([paper]) + assert 
self.deduplicator.is_duplicate(paper) is True + + self.deduplicator.reset() + assert self.deduplicator.is_duplicate(paper) is False + + def test_case_insensitive_matching(self): + """ID matching is case-insensitive.""" + paper1 = HarvestedPaper( + title="Case Test", + source=HarvestSource.ARXIV, + doi="10.1234/UPPERCASE", + ) + paper2 = HarvestedPaper( + title="Case Test", + source=HarvestSource.SEMANTIC_SCHOLAR, + doi="10.1234/uppercase", # lowercase + ) + + unique, count = self.deduplicator.deduplicate([paper1, paper2]) + + assert len(unique) == 1 + assert count == 1 diff --git a/tests/unit/test_query_rewriter.py b/tests/unit/test_query_rewriter.py new file mode 100644 index 0000000..da10440 --- /dev/null +++ b/tests/unit/test_query_rewriter.py @@ -0,0 +1,136 @@ +""" +QueryRewriter unit tests. +""" + +import pytest + +from paperbot.application.services.query_rewriter import QueryRewriter + + +class TestQueryRewriter: + """QueryRewriter tests.""" + + def setup_method(self): + """Create fresh rewriter for each test.""" + self.rewriter = QueryRewriter() + + def test_rewrite_no_expansion(self): + """Query without abbreviations returns single item.""" + queries = self.rewriter.rewrite("deep learning") + assert queries == ["deep learning"] + + def test_rewrite_llm_expansion(self): + """LLM expands to large language model.""" + queries = self.rewriter.rewrite("LLM security") + assert len(queries) == 2 + assert "LLM security" in queries + assert "large language model security" in queries + + def test_rewrite_multiple_abbreviations(self): + """Multiple abbreviations are expanded.""" + queries = self.rewriter.rewrite("ML and NLP") + assert len(queries) == 2 + assert "ML and NLP" in queries + assert "machine learning and natural language processing" in queries + + def test_rewrite_case_insensitive(self): + """Abbreviation matching is case-insensitive.""" + queries = self.rewriter.rewrite("llm") + assert "large language model" in queries + + queries = self.rewriter.rewrite("LLM") + assert "large language model" in queries + + def test_rewrite_punctuation_handled(self): + """Punctuation doesn't prevent matching.""" + queries = self.rewriter.rewrite("What is LLM?") + assert len(queries) == 2 + # The expanded version should have the expansion + assert any("large language model" in q for q in queries) + + def test_expand_all_basic(self): + """expand_all expands list of keywords.""" + expanded = self.rewriter.expand_all(["ML", "deep learning"]) + + # Should include originals and expansions + assert "ML" in expanded or "machine learning" in expanded + assert "deep learning" in expanded + + def test_expand_all_deduplicates(self): + """expand_all removes duplicate expansions.""" + # If both "ML" and "machine learning" are provided, + # "machine learning" shouldn't appear twice + expanded = self.rewriter.expand_all(["ML", "machine learning"]) + + # Count occurrences of "machine learning" (normalized) + ml_count = sum(1 for k in expanded if self.rewriter.normalize(k) == "machine learning") + assert ml_count == 1 + + def test_normalize_basic(self): + """normalize applies standard transformations.""" + assert self.rewriter.normalize("Hello World") == "hello world" + assert self.rewriter.normalize(" Multiple Spaces ") == "multiple spaces" + assert self.rewriter.normalize("Special!@#Characters") == "special characters" + + def test_normalize_preserves_alphanumeric(self): + """normalize preserves letters and numbers.""" + assert self.rewriter.normalize("GPT4 model") == "gpt4 model" + assert 
self.rewriter.normalize("BERT-2022") == "bert 2022" + + def test_add_abbreviation(self): + """Custom abbreviation can be added.""" + self.rewriter.add_abbreviation("XYZ", "extended yellow zebra") + queries = self.rewriter.rewrite("XYZ test") + assert "extended yellow zebra test" in queries + + def test_get_expansion(self): + """get_expansion returns expansion for known abbreviations.""" + assert self.rewriter.get_expansion("llm") == "large language model" + assert self.rewriter.get_expansion("LLM") == "large language model" + assert self.rewriter.get_expansion("unknown") is None + + def test_default_abbreviations_exist(self): + """Default abbreviations are available.""" + known_abbrevs = ["llm", "ml", "dl", "nlp", "cv", "rl", "gan", "cnn", "rnn", "bert", "gpt", "rag"] + for abbrev in known_abbrevs: + assert self.rewriter.get_expansion(abbrev) is not None + + def test_custom_abbreviations_override(self): + """Custom abbreviations override defaults.""" + custom = {"llm": "custom large model"} + rewriter = QueryRewriter(abbreviations=custom) + + assert rewriter.get_expansion("llm") == "custom large model" + + def test_empty_query_returns_empty(self): + """Empty query returns single empty string.""" + queries = self.rewriter.rewrite("") + assert queries == [""] + + def test_expand_all_empty_list(self): + """Empty list returns empty result.""" + expanded = self.rewriter.expand_all([]) + assert expanded == [] + + def test_rewrite_preserves_original(self): + """Original query is always first in result.""" + queries = self.rewriter.rewrite("LLM for NLP") + assert queries[0] == "LLM for NLP" + + def test_common_expansions(self): + """Common AI/ML abbreviations expand correctly.""" + test_cases = [ + ("CNN", "convolutional neural network"), + ("RNN", "recurrent neural network"), + ("LSTM", "long short-term memory"), + ("VAE", "variational autoencoder"), + ("GAN", "generative adversarial network"), + ("RL", "reinforcement learning"), + ("RAG", "retrieval augmented generation"), + ("NER", "named entity recognition"), + ("QA", "question answering"), + ] + + for abbrev, expected in test_cases: + queries = self.rewriter.rewrite(abbrev) + assert expected in queries, f"Expected '{expected}' in expansion of '{abbrev}'" diff --git a/tests/unit/test_venue_recommender.py b/tests/unit/test_venue_recommender.py new file mode 100644 index 0000000..2828925 --- /dev/null +++ b/tests/unit/test_venue_recommender.py @@ -0,0 +1,175 @@ +""" +VenueRecommender unit tests. 
+""" + +import pytest + +from paperbot.application.services.venue_recommender import VenueRecommender + + +class TestVenueRecommender: + """VenueRecommender tests.""" + + def setup_method(self): + """Create fresh recommender for each test.""" + self.recommender = VenueRecommender() + + def test_recommend_security_keywords(self): + """Security keywords recommend security venues.""" + venues = self.recommender.recommend(["ransomware"]) + + assert len(venues) > 0 + # Should include top security venues + security_venues = {"CCS", "S&P", "USENIX Security", "NDSS"} + assert any(v in security_venues for v in venues) + + def test_recommend_ml_keywords(self): + """ML keywords recommend ML venues.""" + venues = self.recommender.recommend(["machine learning"]) + + assert len(venues) > 0 + ml_venues = {"NeurIPS", "ICML", "ICLR"} + assert any(v in ml_venues for v in venues) + + def test_recommend_nlp_keywords(self): + """NLP keywords recommend NLP venues.""" + venues = self.recommender.recommend(["natural language"]) + + assert len(venues) > 0 + nlp_venues = {"ACL", "EMNLP", "NAACL"} + assert any(v in nlp_venues for v in venues) + + def test_recommend_database_keywords(self): + """Database keywords recommend database venues.""" + venues = self.recommender.recommend(["database", "sql"]) + + assert len(venues) > 0 + db_venues = {"SIGMOD", "VLDB", "ICDE"} + assert any(v in db_venues for v in venues) + + def test_recommend_systems_keywords(self): + """Systems keywords recommend systems venues.""" + venues = self.recommender.recommend(["distributed systems"]) + + assert len(venues) > 0 + sys_venues = {"OSDI", "SOSP", "EuroSys", "NSDI"} + assert any(v in sys_venues for v in venues) + + def test_recommend_empty_keywords(self): + """Empty keywords return empty result.""" + venues = self.recommender.recommend([]) + assert venues == [] + + def test_recommend_unknown_keywords(self): + """Unknown keywords return empty result.""" + venues = self.recommender.recommend(["xyznonexistent123"]) + assert venues == [] + + def test_recommend_max_venues(self): + """max_venues limits output count.""" + venues = self.recommender.recommend(["security", "machine learning"], max_venues=3) + assert len(venues) <= 3 + + def test_recommend_default_max_venues(self): + """Default max_venues is 5.""" + venues = self.recommender.recommend(["security", "machine learning", "deep learning"]) + assert len(venues) <= 5 + + def test_recommend_multiple_keywords_combined(self): + """Multiple keywords combine scores.""" + # Single keyword + venues_single = self.recommender.recommend(["security"]) + + # Multiple related keywords should boost same venues + venues_multi = self.recommender.recommend(["security", "malware", "ransomware"]) + + # Both should return security venues at top + assert len(venues_single) > 0 + assert len(venues_multi) > 0 + + def test_recommend_case_insensitive(self): + """Keyword matching is case-insensitive.""" + venues_lower = self.recommender.recommend(["security"]) + venues_upper = self.recommender.recommend(["SECURITY"]) + venues_mixed = self.recommender.recommend(["Security"]) + + assert venues_lower == venues_upper == venues_mixed + + def test_recommend_partial_match(self): + """Partial keyword matches contribute to scores.""" + # "learning" should partially match "machine learning", "deep learning", etc. 
+ venues = self.recommender.recommend(["learning"]) + assert len(venues) > 0 + + def test_get_venues_for_domain(self): + """get_venues_for_domain returns specific domain venues.""" + venues = self.recommender.get_venues_for_domain("security") + assert "CCS" in venues + assert "S&P" in venues + + def test_get_venues_for_unknown_domain(self): + """Unknown domain returns empty list.""" + venues = self.recommender.get_venues_for_domain("unknown_domain_xyz") + assert venues == [] + + def test_add_mapping(self): + """Custom mapping can be added.""" + self.recommender.add_mapping("custom_topic", ["Venue1", "Venue2"]) + venues = self.recommender.get_venues_for_domain("custom_topic") + assert "Venue1" in venues + assert "Venue2" in venues + + def test_add_mapping_updates_recommend(self): + """Added mapping affects recommendations.""" + self.recommender.add_mapping("quantum", ["QIP", "Quantum"]) + venues = self.recommender.recommend(["quantum"]) + assert "QIP" in venues or "Quantum" in venues + + def test_custom_mappings_in_constructor(self): + """Custom mappings can be passed in constructor.""" + custom = {"custom_key": ["CustomVenue1", "CustomVenue2"]} + recommender = VenueRecommender(mappings=custom) + + venues = recommender.get_venues_for_domain("custom_key") + assert "CustomVenue1" in venues + assert "CustomVenue2" in venues + + def test_default_mappings_preserved_with_custom(self): + """Default mappings are preserved when custom mappings are added.""" + custom = {"new_domain": ["NewVenue"]} + recommender = VenueRecommender(mappings=custom) + + # Default mapping should still work + security_venues = recommender.get_venues_for_domain("security") + assert len(security_venues) > 0 + + # Custom mapping should also work + new_venues = recommender.get_venues_for_domain("new_domain") + assert "NewVenue" in new_venues + + def test_recommend_sorted_by_relevance(self): + """Venues are sorted by relevance score.""" + # Multiple keywords all pointing to security should rank security venues higher + venues = self.recommender.recommend( + ["security", "ransomware", "malware", "attack"] + ) + + # First venue should be a security venue + if venues: + security_venues = {"CCS", "S&P", "USENIX Security", "NDSS"} + assert venues[0] in security_venues + + def test_recommend_whitespace_handling(self): + """Keywords with extra whitespace are handled.""" + venues1 = self.recommender.recommend(["security"]) + venues2 = self.recommender.recommend([" security "]) + + assert venues1 == venues2 + + def test_recommend_empty_string_keyword(self): + """Empty string keyword is ignored.""" + venues = self.recommender.recommend(["", "security", ""]) + assert len(venues) > 0 + # Should still recommend security venues + security_venues = {"CCS", "S&P", "USENIX Security", "NDSS"} + assert any(v in security_venues for v in venues) diff --git a/web/package-lock.json b/web/package-lock.json index e051351..11995c3 100644 --- a/web/package-lock.json +++ b/web/package-lock.json @@ -3604,14 +3604,6 @@ "@types/react": "^19.2.0" } }, - "node_modules/@types/trusted-types": { - "version": "2.0.7", - "resolved": "https://registry.npmjs.org/@types/trusted-types/-/trusted-types-2.0.7.tgz", - "integrity": "sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==", - "license": "MIT", - "optional": true, - "peer": true - }, "node_modules/@types/unist": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", diff --git a/web/src/app/api/papers/[paperId]/save/route.ts 
b/web/src/app/api/papers/[paperId]/save/route.ts new file mode 100644 index 0000000..ea6a24c --- /dev/null +++ b/web/src/app/api/papers/[paperId]/save/route.ts @@ -0,0 +1,20 @@ +import { apiBaseUrl, proxyJson } from "../../../research/_base" + +export async function DELETE( + req: Request, + { params }: { params: Promise<{ paperId: string }> } +) { + const { paperId } = await params + const url = new URL(req.url) + const upstream = `${apiBaseUrl()}/api/papers/${paperId}/save${url.search}` + return proxyJson(req, upstream, "DELETE") +} + +export async function POST( + req: Request, + { params }: { params: Promise<{ paperId: string }> } +) { + const { paperId } = await params + const upstream = `${apiBaseUrl()}/api/papers/${paperId}/save` + return proxyJson(req, upstream, "POST") +} diff --git a/web/src/app/api/papers/library/route.ts b/web/src/app/api/papers/library/route.ts new file mode 100644 index 0000000..c1e7cb9 --- /dev/null +++ b/web/src/app/api/papers/library/route.ts @@ -0,0 +1,7 @@ +import { apiBaseUrl, proxyJson } from "../../research/_base" + +export async function GET(req: Request) { + const url = new URL(req.url) + const upstream = `${apiBaseUrl()}/api/papers/library${url.search}` + return proxyJson(req, upstream, "GET") +} diff --git a/web/src/components/research/ResearchDashboard.tsx b/web/src/components/research/ResearchDashboard.tsx index 84f481e..6d9c559 100644 --- a/web/src/components/research/ResearchDashboard.tsx +++ b/web/src/components/research/ResearchDashboard.tsx @@ -45,6 +45,7 @@ type MemoryItem = { type Paper = { paper_id: string title: string + abstract?: string year?: number venue?: string citation_count?: number @@ -383,23 +384,34 @@ export default function ResearchDashboard() { } } - async function sendFeedback(paperId: string, action: string, rank?: number) { + async function sendFeedback(paperId: string, action: string, rank?: number, paper?: Paper) { setLoading(true) setError(null) try { const contextRunId = contextPack?.context_run_id ?? null + const body: Record = { + user_id: userId, + track_id: activeTrackId, + paper_id: paperId, + action, + weight: 0.0, + context_run_id: contextRunId, + context_rank: typeof rank === "number" ? rank : undefined, + metadata: {}, + } + // Include paper metadata for save action + if (action === "save" && paper) { + body.paper_title = paper.title + body.paper_abstract = paper.abstract || "" + body.paper_authors = paper.authors || [] + body.paper_year = paper.year + body.paper_venue = paper.venue + body.paper_citation_count = paper.citation_count + body.paper_url = paper.url + } await fetchJson(`/api/research/papers/feedback`, { method: "POST", - body: JSON.stringify({ - user_id: userId, - track_id: activeTrackId, - paper_id: paperId, - action, - weight: 0.0, - context_run_id: contextRunId, - context_rank: typeof rank === "number" ? rank : undefined, - metadata: {}, - }), + body: JSON.stringify(body), headers: { "Content-Type": "application/json" }, }) await buildContext(false) @@ -740,7 +752,7 @@ export default function ResearchDashboard() { > Like - @@ -308,7 +332,7 @@ export default function SavedPapersList() { size="sm" variant="ghost" disabled={rowUpdating} - onClick={() => updateReadingStatus(paper.id, status, false, "unsave")} + onClick={() => unsavePaper(paper.id)} > {unsaving ? 
: "Unsave"} diff --git a/web/src/lib/api.ts b/web/src/lib/api.ts index 28af73b..ac43354 100644 --- a/web/src/lib/api.ts +++ b/web/src/lib/api.ts @@ -395,33 +395,25 @@ export async function fetchWikiConcepts(): Promise { } export async function fetchPapers(): Promise { - return [ - { - id: "attention-is-all-you-need", - title: "Attention Is All You Need", - venue: "NeurIPS 2017", - authors: "Vaswani et al.", - citations: "100k+", - status: "Reproduced", - tags: ["Transformer", "NLP"] - }, - { - id: "bert-pretraining", - title: "BERT: Pre-training of Deep Bidirectional Transformers", - venue: "NAACL 2019", - authors: "Devlin et al.", - citations: "80k+", - status: "analyzing", - tags: ["NLP", "Language Model"] - }, - { - id: "resnet-deep-residual", - title: "Deep Residual Learning for Image Recognition", - venue: "CVPR 2016", - authors: "He et al.", - citations: "150k+", - status: "pending", - tags: ["CV", "ResNet"] + try { + const res = await fetch(`${API_BASE_URL}/papers/library`) + if (!res.ok) { + console.error("Failed to fetch papers library:", res.status) + return [] } - ] + const data = await res.json() + // Transform backend response to frontend Paper type + return (data.papers || []).map((item: { paper: Record; action: string }) => ({ + id: String(item.paper.id), + title: item.paper.title || "Untitled", + venue: item.paper.venue || "Unknown", + authors: Array.isArray(item.paper.authors) ? item.paper.authors.join(", ") : "Unknown", + citations: item.paper.citation_count ? `${item.paper.citation_count}` : "0", + status: item.action === "save" ? "Saved" : "pending", + tags: Array.isArray(item.paper.fields_of_study) ? item.paper.fields_of_study.slice(0, 3) : [] + })) + } catch (e) { + console.error("Error fetching papers:", e) + return [] + } } From 29a00ddd369459ed377df4057ad6bfbcfdd02b0d Mon Sep 17 00:00:00 2001 From: boyu Date: Wed, 11 Feb 2026 11:01:11 +0100 Subject: [PATCH 2/3] feat(Harvest): add -- Paper Search and Storage Closes #26 Signed-off-by: LIU BOYU --- alembic/versions/0003_paper_registry.py | 19 +- ...tables.py => 0007_paper_harvest_tables.py} | 11 +- docs/architecture_overview.md | 840 ++++++++++++++++++ src/paperbot/api/main.py | 6 - src/paperbot/infrastructure/stores/models.py | 90 +- .../infrastructure/stores/paper_store.py | 17 - .../infrastructure/stores/research_store.py | 12 +- 7 files changed, 868 insertions(+), 127 deletions(-) rename alembic/versions/{0003_paper_harvest_tables.py => 0007_paper_harvest_tables.py} (96%) create mode 100644 docs/architecture_overview.md diff --git a/alembic/versions/0003_paper_registry.py b/alembic/versions/0003_paper_registry.py index 2d04a32..e7e543d 100644 --- a/alembic/versions/0003_paper_registry.py +++ b/alembic/versions/0003_paper_registry.py @@ -51,7 +51,11 @@ def _create_index(name: str, table: str, cols: list[str]) -> None: def upgrade() -> None: + # NOTE: The papers table may also be created by 0007_paper_harvest_tables with a different schema. + # Only create this version if the table doesn't exist. 
+ created_table = False if _is_offline() or not _has_table("papers"): + created_table = True op.create_table( "papers", sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), @@ -75,14 +79,19 @@ def upgrade() -> None: sa.UniqueConstraint("doi", name="uq_papers_doi"), ) + # Only create indexes for columns that exist in this schema version + # These indexes are always safe (columns exist in both schemas): _create_index("ix_papers_arxiv_id", "papers", ["arxiv_id"]) _create_index("ix_papers_doi", "papers", ["doi"]) - _create_index("ix_papers_title", "papers", ["title"]) - _create_index("ix_papers_source", "papers", ["source"]) - _create_index("ix_papers_published_at", "papers", ["published_at"]) - _create_index("ix_papers_first_seen_at", "papers", ["first_seen_at"]) _create_index("ix_papers_created_at", "papers", ["created_at"]) - _create_index("ix_papers_updated_at", "papers", ["updated_at"]) + + # These indexes are only for this schema (not in harvest schema): + if _is_offline() or created_table: + _create_index("ix_papers_title", "papers", ["title"]) + _create_index("ix_papers_source", "papers", ["source"]) + _create_index("ix_papers_published_at", "papers", ["published_at"]) + _create_index("ix_papers_first_seen_at", "papers", ["first_seen_at"]) + _create_index("ix_papers_updated_at", "papers", ["updated_at"]) def downgrade() -> None: diff --git a/alembic/versions/0003_paper_harvest_tables.py b/alembic/versions/0007_paper_harvest_tables.py similarity index 96% rename from alembic/versions/0003_paper_harvest_tables.py rename to alembic/versions/0007_paper_harvest_tables.py index ecf3803..6677a8d 100644 --- a/alembic/versions/0003_paper_harvest_tables.py +++ b/alembic/versions/0007_paper_harvest_tables.py @@ -1,7 +1,7 @@ """paper harvest tables -Revision ID: 0003_paper_harvest_tables -Revises: 0002_research_eval_runs +Revision ID: 0007_paper_harvest_tables +Revises: 0006_newsletter_subscribers Create Date: 2026-02-06 Adds: @@ -14,8 +14,8 @@ import sqlalchemy as sa from alembic import context, op -revision = "0003_paper_harvest_tables" -down_revision = "0002_research_eval_runs" +revision = "0007_paper_harvest_tables" +down_revision = "0006_newsletter_subscribers" branch_labels = None depends_on = None @@ -52,9 +52,6 @@ def _create_index(name: str, table: str, cols: list[str]) -> None: def upgrade() -> None: - if _is_offline(): - _upgrade_create_tables() - return _upgrade_create_tables() _upgrade_create_indexes() diff --git a/docs/architecture_overview.md b/docs/architecture_overview.md new file mode 100644 index 0000000..3320565 --- /dev/null +++ b/docs/architecture_overview.md @@ -0,0 +1,840 @@ +# PaperBot System Architecture + +> **Version**: 1.0 +> **Last Updated**: 2026-02-06 +> **Author**: Claude Code + +--- + +## Table of Contents + +1. [System Overview](#1-system-overview) +2. [Layered Architecture](#2-layered-architecture) +3. [Core Components](#3-core-components) +4. [Data Flow](#4-data-flow) +5. [External Integrations](#5-external-integrations) +6. [Design Patterns](#6-design-patterns) +7. [Configuration](#7-configuration) + +--- + +## 1. System Overview + +PaperBot is a **multi-agent research workflow framework** designed for academic paper analysis, scholar tracking, and code reproduction. 
It consists of three main components: + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ PaperBot System Architecture │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ Web Dashboard │ │ Terminal CLI │ │ Python API │ │ +│ │ (Next.js 16) │ │ (Ink/React) │ │ (Direct) │ │ +│ └────────┬────────┘ └────────┬────────┘ └────────┬────────┘ │ +│ │ │ │ │ +│ └─────────────────────┼─────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────────────────────┐ │ +│ │ FastAPI Backend (Python) │ │ +│ │ ┌─────────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ SSE Streaming │ REST API │ WebSocket (future) │ │ │ +│ │ └─────────────────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ +│ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ +│ │ │ Scholar │ │ Paper │ │ Paper2Code │ │ Research │ │ │ +│ │ │ Tracking │ │ Analysis │ │ Pipeline │ │ Context │ │ │ +│ │ └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ │ │ +│ │ │ │ +│ │ ┌──────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ Multi-Agent Orchestration System │ │ │ +│ │ │ ResearchAgent │ CodeAgent │ QualityAgent │ InfluenceCalc │ ... │ │ │ +│ │ └──────────────────────────────────────────────────────────────────┘ │ │ +│ └──────────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ┌─────────────────────┼─────────────────────┐ │ +│ ▼ ▼ ▼ │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ SQLite │ │ LLM APIs │ │ External APIs │ │ +│ │ (Persistence) │ │ (Claude/OpenAI) │ │ (S2/GitHub/...) │ │ +│ └─────────────────┘ └─────────────────┘ └─────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +### Directory Structure + +``` +src/paperbot/ +├── api/ # API Layer - FastAPI routes & streaming +├── application/ # Application Layer - Business logic, workflows +├── domain/ # Domain Layer - Core models, entities +├── infrastructure/ # Infrastructure Layer - External services, DB +├── core/ # Core abstractions & patterns +├── agents/ # Multi-agent implementations +├── repro/ # Paper2Code pipeline +├── context_engine/ # Research context routing +├── memory/ # User memory extraction +├── presentation/ # UI components (reports, CLI) +└── workflows/ # Workflow orchestration +``` + +--- + +## 2. 
Layered Architecture + +PaperBot follows a **Clean Architecture** approach with clear separation of concerns: + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ PRESENTATION LAYER │ +│ ┌─────────────────────────────────────────────────────────────────────────────┐│ +│ │ Web Dashboard (Next.js) │ Terminal CLI (Ink) │ API Clients ││ +│ └─────────────────────────────────────────────────────────────────────────────┘│ +│ │ │ +│ ▼ │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ API LAYER │ +│ ┌─────────────────────────────────────────────────────────────────────────────┐│ +│ │ FastAPI Application (api/main.py) ││ +│ │ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ ││ +│ │ │ /track │ │ /analyze │ │ /gen-code │ │ /research │ │ /memory │ ││ +│ │ └───────────┘ └───────────┘ └───────────┘ └───────────┘ └───────────┘ ││ +│ │ ┌───────────────────────────────────────────────────────────────────┐ ││ +│ │ │ SSE Streaming (streaming.py) │ CORS Middleware │ Auth │ ││ +│ │ └───────────────────────────────────────────────────────────────────┘ ││ +│ └─────────────────────────────────────────────────────────────────────────────┘│ +│ │ │ +│ ▼ │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ APPLICATION LAYER │ +│ ┌─────────────────────────────────────────────────────────────────────────────┐│ +│ │ Workflows & Pipelines ││ +│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ ││ +│ │ │ ScholarPipeline │ │ Paper2Code Orch │ │ HarvestPipeline │ ││ +│ │ │ (scholar_pipeline)│ │ (orchestrator) │ │ (v1 NEW) │ ││ +│ │ └─────────────────┘ └─────────────────┘ └─────────────────┘ ││ +│ │ ││ +│ │ Services & Ports ││ +│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ ││ +│ │ │ EventLogPort │ │ SourceRegistry │ │ WorkflowRegistry│ ││ +│ │ │ (Protocol) │ │ (Data Sources) │ │ (Workflows) │ ││ +│ │ └─────────────────┘ └─────────────────┘ └─────────────────┘ ││ +│ │ ││ +│ │ Collaboration ││ +│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ ││ +│ │ │AgentCoordinator │ │ ScoreShareBus │ │ FailFastEvaluator│ ││ +│ │ │ (message bus) │ │ (cross-stage) │ │ (early stop) │ ││ +│ │ └─────────────────┘ └─────────────────┘ └─────────────────┘ ││ +│ └─────────────────────────────────────────────────────────────────────────────┘│ +│ │ │ +│ ▼ │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ DOMAIN LAYER │ +│ ┌─────────────────────────────────────────────────────────────────────────────┐│ +│ │ Core Models ││ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ││ +│ │ │ PaperMeta │ │ Scholar │ │ Influence │ │ HarvestedPaper│ ││ +│ │ │ (paper.py) │ │ (scholar.py)│ │(influence/) │ │ (v1 NEW) │ ││ +│ │ └─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘ ││ +│ │ ││ +│ │ Agents (BaseAgent → Specialized) ││ +│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ ││ +│ │ │ Research │ │ Code │ │ Quality │ │ Review │ │ Influence│ ││ +│ │ │ Agent │ │ Analysis │ │ Agent │ │ Agent │ │Calculator│ ││ +│ │ └──────────┘ └──────────┘ └──────────┘ └──────────┘ └──────────┘ ││ +│ │ ││ +│ │ Core Abstractions (core/) ││ +│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ ││ +│ │ │ Executable │ │ ExecutionResult │ │ DI Container │ ││ +│ │ │ (interface) │ │ (result type) │ │ (singleton) │ ││ +│ │ └─────────────────┘ └─────────────────┘ └─────────────────┘ ││ +│ 
└─────────────────────────────────────────────────────────────────────────────┘│ +│ │ │ +│ ▼ │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ INFRASTRUCTURE LAYER │ +│ ┌─────────────────────────────────────────────────────────────────────────────┐│ +│ │ LLM Integration ││ +│ │ ┌─────────────────────────────────────────────────────────────────┐ ││ +│ │ │ LLMClient (llm/base.py) │ ││ +│ │ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ ││ +│ │ │ │ Claude │ │ OpenAI │ │DeepSeek │ │ Custom │ │ ││ +│ │ │ │ (Anthropic)│ │(GPT-4) │ │ │ │ Endpoint│ │ ││ +│ │ │ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ ││ +│ │ └─────────────────────────────────────────────────────────────────┘ ││ +│ │ ││ +│ │ API Clients ││ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ││ +│ │ │ Semantic │ │ GitHub │ │ OpenReview │ │ arXiv │ ││ +│ │ │ Scholar │ │ API │ │ API │ │ API │ ││ +│ │ └─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘ ││ +│ │ ││ +│ │ Persistence ││ +│ │ ┌─────────────────────────────────────────────────────────────────┐ ││ +│ │ │ SQLAlchemy ORM (stores/models.py) │ ││ +│ │ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ ││ +│ │ │ │AgentRun │ │AgentEvent│ │ Memory │ │Research │ │ Papers │ │ ││ +│ │ │ │ Model │ │ Model │ │ Model │ │ Track │ │(v1 NEW) │ │ ││ +│ │ │ └─────────┘ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ ││ +│ │ └─────────────────────────────────────────────────────────────────┘ ││ +│ │ ││ +│ │ Event Logging ││ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ││ +│ │ │ Logging │ │ SQLAlchemy │ │ Composite │ ││ +│ │ │ EventLog │ │ EventLog │ │ EventLog │ ││ +│ │ └─────────────┘ └─────────────┘ └─────────────┘ ││ +│ └─────────────────────────────────────────────────────────────────────────────┘│ +│ │ │ +│ ▼ │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ EXTERNAL SYSTEMS │ +│ ┌─────────────────────────────────────────────────────────────────────────────┐│ +│ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ││ +│ │ │ SQLite │ │Anthropic│ │ OpenAI │ │Semantic │ │ GitHub │ │ Docker │ ││ +│ │ │ DB │ │ API │ │ API │ │ Scholar │ │ API │ │ / E2B │ ││ +│ │ └─────────┘ └─────────┘ └─────────┘ └─────────┘ └─────────┘ └─────────┘ ││ +│ └─────────────────────────────────────────────────────────────────────────────┘│ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +### Layer Responsibilities + +| Layer | Responsibility | Key Components | +|-------|----------------|----------------| +| **Presentation** | User interface, client applications | Web Dashboard, Terminal CLI | +| **API** | HTTP endpoints, streaming, middleware | FastAPI routes, SSE streaming | +| **Application** | Business workflows, orchestration | Pipelines, Coordinators, Services | +| **Domain** | Core business logic, entities | Models, Agents, Influence calculations | +| **Infrastructure** | External services, persistence | LLM clients, API clients, SQLAlchemy | + +--- + +## 3. 
Core Components + +### 3.1 Multi-Agent System + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Multi-Agent Orchestration │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ AgentCoordinator │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ - register(agent) - Agent registration │ │ │ +│ │ │ - broadcast(message) - Message distribution │ │ │ +│ │ │ - collect() - Result aggregation │ │ │ +│ │ │ - synthesize() - Final synthesis │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────┬─────────────────────────────────────────┘ │ +│ │ │ +│ ┌────────────────────────┼────────────────────────┐ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ +│ │ScoreShareBus│ │FailFastEval │ │ EventLog │ │ +│ │ │ │ │ │ │ │ +│ │ Cross-stage │ │ Early stop │ │ Persistence │ │ +│ │ score share │ │ on low qual │ │ & audit │ │ +│ └─────────────┘ └─────────────┘ └─────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Registered Agents │ │ +│ │ │ │ +│ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ +│ │ │ ResearchAgent│ │CodeAnalysis │ │ QualityAgent │ │ ReviewAgent │ │ │ +│ │ │ │ │ Agent │ │ │ │ │ │ │ +│ │ │ - S2 search │ │ - GitHub API │ │ - Quality │ │ - Peer review│ │ │ +│ │ │ - Enrichment │ │ - Code health│ │ scoring │ │ simulation │ │ │ +│ │ │ - Grounding │ │ - Dependencies│ │ - Method eval│ │ - Strengths │ │ │ +│ │ └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ │ │ +│ │ │ │ +│ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ +│ │ │ Influence │ │Verification │ │ Documentation│ │ │ +│ │ │ Calculator │ │ Agent │ │ Agent │ │ │ +│ │ │ │ │ │ │ │ │ │ +│ │ │ - Citation │ │ - Claim check│ │ - API docs │ │ │ +│ │ │ velocity │ │ - Method │ │ - Code docs │ │ │ +│ │ │ - Momentum │ │ validation │ │ extraction │ │ │ +│ │ └──────────────┘ └──────────────┘ └──────────────┘ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +### 3.2 BaseAgent Architecture + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ BaseAgent (Template Method) │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ execute(input: TInput) → ExecutionResult[TOutput] │ │ +│ │ │ │ +│ │ ┌─────────────────┐ │ │ +│ │ │ 1. Validate │ _validate_input(input) │ │ +│ │ │ Input │ - Check required fields │ │ +│ │ └────────┬────────┘ - Validate constraints │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ ┌─────────────────┐ │ │ +│ │ │ 2. Execute │ _execute(input) [ABSTRACT] │ │ +│ │ │ Core Logic │ - Implemented by subclass │ │ +│ │ └────────┬────────┘ - LLM calls, API calls, etc. │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ ┌─────────────────┐ │ │ +│ │ │ 3. 
Post-Process │ _post_process(result) │ │ +│ │ │ Results │ - Format output │ │ +│ │ └─────────────────┘ - Emit events │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ ExecutionResult[TOutput] │ │ +│ │ { │ │ +│ │ success: bool, │ │ +│ │ data: Optional[TOutput], │ │ +│ │ error: Optional[str], │ │ +│ │ duration_ms: Optional[float], │ │ +│ │ metadata: Dict[str, Any] │ │ +│ │ } │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Mixins │ │ +│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ │ +│ │ │SemanticScholar │ │ JSONParser │ │ TextParsing │ │ │ +│ │ │ Mixin │ │ Mixin │ │ Mixin │ │ │ +│ │ └─────────────────┘ └─────────────────┘ └─────────────────┘ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +### 3.3 Paper2Code Pipeline (ReproAgent) + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Paper2Code Pipeline │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Input: Paper Context (PDF/URL → Parsed Content) │ │ +│ └───────────────────────────────────┬─────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Stage 1: Planning Agent │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ - Blueprint distillation │ │ │ +│ │ │ - Implementation plan generation │ │ │ +│ │ │ - File structure design │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────┬─────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Stage 2: Coding Agent │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ │ +│ │ │ │ CodeMemory │ │ CodeRAG │ │ LLM Gen │ │ │ │ +│ │ │ │ │ │ │ │ │ │ │ │ +│ │ │ │Cross-file │ │ Pattern │ │ Code │ │ │ │ +│ │ │ │context │ │ retrieval │ │ generation │ │ │ │ +│ │ │ │AST indexing │ │ similarity │ │ │ │ │ │ +│ │ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────┬─────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Stage 3: Verification Agent │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ - Syntax validation │ │ │ +│ │ │ - Import checking │ │ │ +│ │ │ - Test execution (sandbox) │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────┬─────────────────────────────────────┘ │ +│ │ │ +│ ┌───────────────┴───────────────┐ │ +│ │ Pass? 
│ │ +│ └───────────────┬───────────────┘ │ +│ No │ │ Yes │ +│ ▼ │ │ +│ ┌─────────────────────────────────────┐ │ │ +│ │ Stage 4: Debugging Agent │ │ │ +│ │ ┌─────────────────────────────┐ │ │ │ +│ │ │ - Error analysis │ │ │ │ +│ │ │ - Fix generation │ │ │ │ +│ │ │ - Retry (max_repair_loops) │◄───┼────┘ │ +│ │ └─────────────────────────────┘ │ │ +│ └─────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Output: Generated Code + Execution Report │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Execution Environments │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ +│ │ │ Docker │ │ E2B │ │ Local │ │ │ +│ │ │ Executor │ │ (Cloud) │ │ (Dev only) │ │ │ +│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +### 3.4 Memory System + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Memory System │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Input Sources │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ +│ │ │ ChatGPT │ │ Gemini │ │ Claude │ │ Plain Text │ │ │ +│ │ │ Export │ │ Export │ │ Export │ │ │ │ │ +│ │ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │ │ +│ └─────────┼────────────────┼────────────────┼────────────────┼─────────────┘ │ +│ └────────────────┼────────────────┼────────────────┘ │ +│ ▼ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Memory Extractor │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ - Parse conversation format │ │ │ +│ │ │ - Extract memory candidates │ │ │ +│ │ │ - Classify by type (profile, preference, goal, fact, etc.) │ │ │ +│ │ │ - Calculate confidence scores │ │ │ +│ │ │ - Detect PII risk │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────┬─────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Memory Storage (memory_items table) │ │ +│ │ │ │ +│ │ ┌───────────────────────────────────────────────────────────────┐ │ │ +│ │ │ Memory Item │ │ │ +│ │ │ - kind: profile | preference | goal | constraint | fact | ... 
│ │ │ +│ │ │ - content: string │ │ │ +│ │ │ - confidence: 0.0 - 1.0 │ │ │ +│ │ │ - status: pending | approved | rejected | superseded │ │ │ +│ │ │ - scope: global | track | workspace │ │ │ +│ │ │ - pii_risk: 0 | 1 | 2 │ │ │ +│ │ └───────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────┬─────────────────────────────────────┘ │ +│ │ │ +│ ┌───────────────────────┼───────────────────────┐ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌───────────────────┐ ┌───────────────────┐ ┌───────────────────┐ │ +│ │ Memory Inbox │ │ Context Engine │ │ Quality Metrics │ │ +│ │ │ │ │ │ │ │ +│ │ - Pending review │ │ - Memory inject │ │ - Precision ≥85% │ │ +│ │ - Approve/Reject │ │ - Routing signal │ │ - FP rate ≤5% │ │ +│ │ - Scope mgmt │ │ - Recommendation │ │ - Hit rate ≥80% │ │ +│ └───────────────────┘ └───────────────────┘ └───────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +### 3.5 Influence Calculation System + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Influence Calculation System │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Input: Paper Metadata + External Data │ │ +│ └───────────────────────────────────┬─────────────────────────────────────┘ │ +│ │ │ +│ ┌────────────────────────────┼────────────────────────────┐ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ Academic Metrics│ │Engineering Metrics│ │ Context Analysis│ │ +│ │ │ │ │ │ │ │ +│ │ - Citation count│ │ - GitHub stars │ │ - Citation │ │ +│ │ - H-index │ │ - Forks │ │ sentiment │ │ +│ │ - Venue tier │ │ - Code health │ │ - Dynamic PIS │ │ +│ │ weighting │ │ - Doc coverage │ │ - Momentum │ │ +│ └────────┬────────┘ └────────┬────────┘ └────────┬────────┘ │ +│ │ │ │ │ +│ └────────────────────────────┼────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Influence Calculator │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ Composite Score = Σ (weight_i × metric_i) │ │ │ +│ │ │ │ │ │ +│ │ │ Weights: │ │ │ +│ │ │ - Academic (citations, venue): 0.4 │ │ │ +│ │ │ - Engineering (code, stars): 0.3 │ │ │ +│ │ │ - Momentum (velocity, trend): 0.3 │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────┬─────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Output: Paper Influence Score (PIS) + Breakdown │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 4. 
Data Flow + +### 4.1 Scholar Tracking Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Scholar Tracking Data Flow │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌────────────┐ │ +│ │ Client │ GET /api/track?scholar_id=xxx │ +│ └─────┬──────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────────────────────┐ │ +│ │ API Route (/track) │ │ +│ │ - Parse request │ │ +│ │ - Create StreamingResponse │ │ +│ └─────────────────────────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────────────────────┐ │ +│ │ ScholarPipeline.analyze_paper() │ │ +│ └─────────────────────────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────────────────────┐ │ +│ │ ScholarWorkflowCoordinator │ │ +│ │ │ │ +│ │ Stage 1: ResearchAgent │ │ +│ │ ┌─────────────────────────────────────────────────────────────┐ │ │ +│ │ │ → Semantic Scholar API (paper search, metadata) │ │ │ +│ │ │ → Paper enrichment │ │ │ +│ │ │ → Emit: StreamEvent(progress) │ │ │ +│ │ │ → ScoreShareBus.publish(research_score) │ │ │ +│ │ └─────────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ Stage 2: CodeAnalysisAgent │ │ +│ │ ┌─────────────────────────────────────────────────────────────┐ │ │ +│ │ │ → GitHub API (repo discovery) │ │ │ +│ │ │ → Code health analysis │ │ │ +│ │ │ → Emit: StreamEvent(progress) │ │ │ +│ │ │ → ScoreShareBus.publish(code_score) │ │ │ +│ │ └─────────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ Stage 3: QualityAgent → InfluenceCalculator → ReportWriter │ │ +│ │ ┌─────────────────────────────────────────────────────────────┐ │ │ +│ │ │ → Quality scoring │ │ │ +│ │ │ → Influence calculation │ │ │ +│ │ │ → Markdown report (Jinja2) │ │ │ +│ │ │ → Emit: StreamEvent(result) │ │ │ +│ │ └─────────────────────────────────────────────────────────────┘ │ │ +│ └─────────────────────────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────────────────────┐ │ +│ │ SSE Stream │ │ +│ │ ┌─────────────────────────────────────────────────────────────┐ │ │ +│ │ │ event: progress │ │ │ +│ │ │ data: {"stage": "research", "message": "Analyzing..."} │ │ │ +│ │ │ │ │ │ +│ │ │ event: progress │ │ │ +│ │ │ data: {"stage": "code", "message": "Checking GitHub..."} │ │ │ +│ │ │ │ │ │ +│ │ │ event: result │ │ │ +│ │ │ data: {"report": "...", "scores": {...}} │ │ │ +│ │ │ │ │ │ +│ │ │ event: done │ │ │ +│ │ └─────────────────────────────────────────────────────────────┘ │ │ +│ └─────────────────────────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────┐ │ +│ │ Client │ Receives SSE events, updates UI │ +│ └────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +### 4.2 Research Context Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Research Context Data Flow │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌────────────────────────────────────────────────────────────────────────┐ │ +│ │ User Query: "Find papers on LLM security" │ │ +│ └─────────────────────────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ 
┌────────────────────────────────────────────────────────────────────────┐ │ +│ │ Context Engine │ │ +│ │ │ │ +│ │ 1. Load User Memory (approved items for active track) │ │ +│ │ ┌─────────────────────────────────────────────────────────┐ │ │ +│ │ │ Memory: "I'm researching adversarial ML" │ │ │ +│ │ │ Memory: "Prefer transformer-based methods" │ │ │ +│ │ │ Memory: "Deadline: March 15" │ │ │ +│ │ └─────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ 2. Merge Query with Memory Context │ │ +│ │ ┌─────────────────────────────────────────────────────────┐ │ │ +│ │ │ Merged: "LLM security + adversarial ML + transformers" │ │ │ +│ │ └─────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ 3. Route to Paper Sources │ │ +│ │ ┌─────────────────────────────────────────────────────────┐ │ │ +│ │ │ → Semantic Scholar API │ │ │ +│ │ │ → Local Paper Pool (v1) │ │ │ +│ │ └─────────────────────────────────────────────────────────┘ │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ 4. Rank & Filter Results │ │ +│ │ ┌─────────────────────────────────────────────────────────┐ │ │ +│ │ │ - Relevance scoring │ │ │ +│ │ │ - Memory-influenced ranking │ │ │ +│ │ │ - Diversity balancing │ │ │ +│ │ └─────────────────────────────────────────────────────────┘ │ │ +│ └─────────────────────────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────────────────────────┐ │ +│ │ Recommendations Tab │ │ +│ │ ┌─────────────────────────────────────────────────────────────┐ │ │ +│ │ │ Paper 1: "Adversarial Attacks on LLMs..." [Like] [Save] │ │ │ +│ │ │ Paper 2: "Transformer Security..." [Like] [Save] │ │ │ +│ │ │ Paper 3: "..." [Like] [Save] │ │ │ +│ │ └─────────────────────────────────────────────────────────────┘ │ │ +│ └────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 5. 
External Integrations + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ External Integrations │ +├─────────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ LLM Providers │ │ +│ │ │ │ +│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ │ +│ │ │ Anthropic │ │ OpenAI │ │ DeepSeek │ │ │ +│ │ │ Claude API │ │ GPT-4 API │ │ API │ │ │ +│ │ │ │ │ │ │ │ │ │ +│ │ │ Primary for │ │ Alternative │ │ Cost-effective │ │ │ +│ │ │ agent reasoning│ │ provider │ │ option │ │ │ +│ │ └─────────────────┘ └─────────────────┘ └─────────────────┘ │ │ +│ │ │ │ +│ │ Unified via: LLMClient (llm/base.py) │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Academic Data Sources │ │ +│ │ │ │ +│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ │ +│ │ │ Semantic Scholar│ │ arXiv │ │ OpenAlex │ │ │ +│ │ │ │ │ │ │ (v1 NEW) │ │ │ +│ │ │ - Paper search │ │ - Preprint │ │ - 240M+ works │ │ │ +│ │ │ - Citations │ │ metadata │ │ - Open access │ │ │ +│ │ │ - Author data │ │ - PDF links │ │ - CS coverage │ │ │ +│ │ └─────────────────┘ └─────────────────┘ └─────────────────┘ │ │ +│ │ │ │ +│ │ ┌─────────────────┐ ┌─────────────────┐ │ │ +│ │ │ OpenReview │ │ Conference │ │ │ +│ │ │ │ │ Websites │ │ │ +│ │ │ - Submissions │ │ - S&P, CCS │ │ │ +│ │ │ - Reviews │ │ - USENIX, NDSS │ │ │ +│ │ └─────────────────┘ └─────────────────┘ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Code & Repository Services │ │ +│ │ │ │ +│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ │ +│ │ │ GitHub │ │ HuggingFace │ │ Code Execution │ │ │ +│ │ │ API │ │ │ │ │ │ │ +│ │ │ │ │ - Model cards │ │ - Docker │ │ │ +│ │ │ - Repo metadata │ │ - Checkpoints │ │ - E2B (cloud) │ │ │ +│ │ │ - Stars, forks │ │ │ │ - Local (dev) │ │ │ +│ │ │ - Code analysis │ │ │ │ │ │ │ +│ │ └─────────────────┘ └─────────────────┘ └─────────────────┘ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────────┐ │ +│ │ Persistence │ │ +│ │ │ │ +│ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ +│ │ │ SQLite (Default) / PostgreSQL (Production) │ │ │ +│ │ │ │ │ │ +│ │ │ Tables: │ │ │ +│ │ │ - agent_runs, agent_events (execution tracking) │ │ │ +│ │ │ - memory_items, memory_sources (user memory) │ │ │ +│ │ │ - research_tracks, paper_feedback (research context) │ │ │ +│ │ │ - papers, harvest_runs (v1 NEW - paper pool) │ │ │ +│ │ └─────────────────────────────────────────────────────────────────┘ │ │ +│ └─────────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 6. 
Design Patterns + +### Patterns Used in PaperBot + +| Pattern | Location | Purpose | +|---------|----------|---------| +| **Template Method** | `agents/base.py` | Common agent execution flow with customizable steps | +| **Repository** | `application/ports/` | Abstract data access (EventLogPort) | +| **Adapter** | `infrastructure/llm/` | Unified interface for multiple LLM providers | +| **Pub/Sub** | `core/collaboration/` | AgentCoordinator message broadcasting | +| **Dependency Injection** | `core/di/container.py` | Loose coupling between components | +| **Pipeline** | `core/pipeline/` | Multi-stage processing | +| **Composite** | `infrastructure/event_log/` | Multiple event log backends | +| **Strategy** | `repro/` | Docker/E2B/Local execution strategies | +| **Factory** | `application/services/` | Object creation abstraction | + +### Dependency Direction + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Dependency Direction │ +│ │ +│ ┌─────────────────┐ │ +│ │ Presentation │ ─────────────────┐ │ +│ │ (API) │ │ │ +│ └─────────────────┘ │ │ +│ │ │ │ +│ ▼ │ │ +│ ┌─────────────────┐ │ Dependencies point INWARD │ +│ │ Application │ ─────────────────┤ toward the Domain layer │ +│ │ (Workflows) │ │ │ +│ └─────────────────┘ │ │ +│ │ │ │ +│ ▼ │ │ +│ ┌─────────────────┐ │ │ +│ │ Domain │ ◄────────────────┘ │ +│ │ (Models) │ │ +│ └─────────────────┘ │ +│ ▲ │ +│ │ │ +│ ┌─────────────────┐ │ +│ │ Infrastructure │ ─────────────────────────────────────────────────────────│ +│ │ (External) │ Implements interfaces defined in Domain/Application │ +│ └─────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 7. Configuration + +### Environment Variables + +```bash +# LLM API Keys +OPENAI_API_KEY=sk-... +ANTHROPIC_API_KEY=sk-ant-... + +# External Services +SEMANTIC_SCHOLAR_API_KEY=... # Optional, higher rate limits +GITHUB_TOKEN=ghp_... # Optional, higher rate limits +E2B_API_KEY=... # Optional, cloud sandbox + +# Database +PAPERBOT_DB_URL=sqlite:///data/paperbot.db + +# Execution Mode +PAPERBOT_EXECUTOR=auto|docker|e2b|local +``` + +### Configuration Files + +| File | Purpose | +|------|---------| +| `config/config.yaml` | Main application config (models, venues, thresholds) | +| `config/settings.py` | Pydantic settings validation | +| `config/scholar_subscriptions.yaml` | Tracked scholars list | +| `config/top_venues.yaml` | Venue tier rankings | + +### Key Configuration Options + +```yaml +# config/config.yaml (example structure) +download: + max_retries: 3 + concurrency: 5 + timeout: 30 + +analysis: + parallel: true + max_depth: 3 + +security: + ssl_verify: true + rate_limit: 100 + domain_allowlist: [...] 
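+  # (allowlist entries elided in this example; concrete domains live in config/config.yaml)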
+ +output: + formats: [markdown, html, pdf] + template_dir: templates/ + +cache: + enabled: true + ttl: 3600 + +logging: + level: INFO + format: json +``` + +--- + +## Appendix: File Reference + +### Key Implementation Files + +| Component | File Path | +|-----------|-----------| +| API Entry Point | `src/paperbot/api/main.py` | +| SSE Streaming | `src/paperbot/api/streaming.py` | +| Base Agent | `src/paperbot/agents/base.py` | +| Agent Coordinator | `src/paperbot/core/collaboration/coordinator.py` | +| LLM Client | `src/paperbot/infrastructure/llm/base.py` | +| Paper Model | `src/paperbot/domain/paper.py` | +| DB Models | `src/paperbot/infrastructure/stores/models.py` | +| Event Log | `src/paperbot/infrastructure/event_log/` | +| Paper2Code | `src/paperbot/repro/orchestrator.py` | +| Memory System | `src/paperbot/memory/` | +| Context Engine | `src/paperbot/context_engine/` | + +--- + +*Document generated by Claude Code for PaperBot project.* diff --git a/src/paperbot/api/main.py b/src/paperbot/api/main.py index 45ea821..3b2e915 100644 --- a/src/paperbot/api/main.py +++ b/src/paperbot/api/main.py @@ -20,11 +20,8 @@ memory, research, paperscool, -<<<<<<< HEAD newsletter, -======= harvest, ->>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) ) from paperbot.infrastructure.event_log.logging_event_log import LoggingEventLog from paperbot.infrastructure.event_log.composite_event_log import CompositeEventLog @@ -68,11 +65,8 @@ async def health_check(): app.include_router(memory.router, prefix="/api", tags=["Memory"]) app.include_router(research.router, prefix="/api", tags=["Research"]) app.include_router(paperscool.router, prefix="/api", tags=["PapersCool"]) -<<<<<<< HEAD app.include_router(newsletter.router, prefix="/api", tags=["Newsletter"]) -======= app.include_router(harvest.router, prefix="/api", tags=["Harvest"]) ->>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) @app.on_event("startup") diff --git a/src/paperbot/infrastructure/stores/models.py b/src/paperbot/infrastructure/stores/models.py index 0cf476d..f6f5866 100644 --- a/src/paperbot/infrastructure/stores/models.py +++ b/src/paperbot/infrastructure/stores/models.py @@ -449,91 +449,6 @@ class ResearchMilestoneModel(Base): track = relationship("ResearchTrackModel", back_populates="milestones") - -class PaperModel(Base): - """Canonical paper registry row (deduplicated across sources).""" - - __tablename__ = "papers" - __table_args__ = ( - UniqueConstraint("arxiv_id", name="uq_papers_arxiv_id"), - UniqueConstraint("doi", name="uq_papers_doi"), - ) - - id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) - - arxiv_id: Mapped[Optional[str]] = mapped_column(String(64), nullable=True, index=True) - doi: Mapped[Optional[str]] = mapped_column(String(128), nullable=True, index=True) - - title: Mapped[str] = mapped_column(Text, default="", index=True) - authors_json: Mapped[str] = mapped_column(Text, default="[]") - abstract: Mapped[str] = mapped_column(Text, default="") - - url: Mapped[str] = mapped_column(String(512), default="") - external_url: Mapped[str] = mapped_column(String(512), default="") - pdf_url: Mapped[str] = mapped_column(String(512), default="") - - source: Mapped[str] = mapped_column(String(32), default="papers_cool", index=True) - venue: Mapped[str] = mapped_column(String(256), default="") - published_at: Mapped[Optional[datetime]] = mapped_column( - DateTime(timezone=True), nullable=True, index=True - ) - first_seen_at: Mapped[datetime] = 
mapped_column(DateTime(timezone=True), index=True) - - keywords_json: Mapped[str] = mapped_column(Text, default="[]") - metadata_json: Mapped[str] = mapped_column(Text, default="{}") - - created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), index=True) - updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), index=True) - - judge_scores = relationship( - "PaperJudgeScoreModel", back_populates="paper", cascade="all, delete-orphan" - ) - feedback_rows = relationship("PaperFeedbackModel", back_populates="paper") - reading_status_rows = relationship("PaperReadingStatusModel", back_populates="paper") - - def set_authors(self, values: Optional[list[str]]) -> None: - self.authors_json = json.dumps( - [str(v) for v in (values or []) if str(v).strip()], - ensure_ascii=False, - ) - - def get_authors(self) -> list[str]: - try: - data = json.loads(self.authors_json or "[]") - if isinstance(data, list): - return [str(v) for v in data if str(v).strip()] - except Exception: - pass - return [] - - def set_keywords(self, values: Optional[list[str]]) -> None: - self.keywords_json = json.dumps( - [str(v) for v in (values or []) if str(v).strip()], - ensure_ascii=False, - ) - - def get_keywords(self) -> list[str]: - try: - data = json.loads(self.keywords_json or "[]") - if isinstance(data, list): - return [str(v) for v in data if str(v).strip()] - except Exception: - pass - return [] - - def set_metadata(self, data: Dict[str, Any]) -> None: - self.metadata_json = json.dumps(data or {}, ensure_ascii=False) - - def get_metadata(self) -> Dict[str, Any]: - try: - parsed = json.loads(self.metadata_json or "{}") - if isinstance(parsed, dict): - return parsed - except Exception: - pass - return {} - - class PaperFeedbackModel(Base): """User feedback on recommended/seen papers (track-scoped).""" @@ -751,6 +666,11 @@ class PaperModel(Base): updated_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) deleted_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) # Soft delete + # Relationships + feedback_rows = relationship("PaperFeedbackModel", back_populates="paper") + judge_scores = relationship("PaperJudgeScoreModel", back_populates="paper") + reading_status_rows = relationship("PaperReadingStatusModel", back_populates="paper") + def get_authors(self) -> list: try: return json.loads(self.authors_json or "[]") diff --git a/src/paperbot/infrastructure/stores/paper_store.py b/src/paperbot/infrastructure/stores/paper_store.py index 6e9c3da..f0abdbf 100644 --- a/src/paperbot/infrastructure/stores/paper_store.py +++ b/src/paperbot/infrastructure/stores/paper_store.py @@ -1,4 +1,3 @@ -<<<<<<< HEAD from __future__ import annotations from datetime import datetime, timezone @@ -8,15 +7,6 @@ from paperbot.domain.paper_identity import normalize_arxiv_id, normalize_doi from paperbot.infrastructure.stores.models import Base, PaperJudgeScoreModel, PaperModel -======= -# src/paperbot/infrastructure/stores/paper_store.py -""" -Paper storage repository. - -Handles persistence and retrieval of harvested papers. 
-""" - -from __future__ import annotations import json from dataclasses import dataclass @@ -33,7 +23,6 @@ PaperFeedbackModel, PaperModel, ) ->>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) from paperbot.infrastructure.stores.sqlalchemy_db import SessionProvider, get_db_url @@ -41,7 +30,6 @@ def _utcnow() -> datetime: return datetime.now(timezone.utc) -<<<<<<< HEAD def _safe_list(values: Any) -> List[str]: if not isinstance(values, list): return [] @@ -84,7 +72,6 @@ def _as_utc(value: Optional[datetime]) -> Optional[datetime]: class SqlAlchemyPaperStore: """Canonical paper registry with idempotent upsert for daily workflows.""" -======= @dataclass class LibraryPaper: """Paper with library metadata (saved_at, track_id, action).""" @@ -105,7 +92,6 @@ class PaperStore: - Source tracking - User library (saved papers) """ ->>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) def __init__(self, db_url: Optional[str] = None, *, auto_create_schema: bool = True): self.db_url = db_url or get_db_url() @@ -113,7 +99,6 @@ def __init__(self, db_url: Optional[str] = None, *, auto_create_schema: bool = T if auto_create_schema: Base.metadata.create_all(self._provider.engine) -<<<<<<< HEAD def upsert_paper( self, *, @@ -366,7 +351,6 @@ def _paper_to_dict(row: PaperModel) -> Dict[str, Any]: "created_at": row.created_at.isoformat() if row.created_at else None, "updated_at": row.updated_at.isoformat() if row.updated_at else None, } -======= def upsert_papers_batch( self, papers: List[HarvestedPaper], @@ -838,4 +822,3 @@ def paper_to_dict(paper: PaperModel) -> Dict[str, Any]: "created_at": paper.created_at.isoformat() if paper.created_at else None, "updated_at": paper.updated_at.isoformat() if paper.updated_at else None, } ->>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) diff --git a/src/paperbot/infrastructure/stores/research_store.py b/src/paperbot/infrastructure/stores/research_store.py index 9549e7f..7e654ec 100644 --- a/src/paperbot/infrastructure/stores/research_store.py +++ b/src/paperbot/infrastructure/stores/research_store.py @@ -8,11 +8,12 @@ from sqlalchemy import desc, func, or_, select from sqlalchemy.exc import IntegrityError -<<<<<<< HEAD from paperbot.domain.paper_identity import normalize_arxiv_id, normalize_doi -======= + +from paperbot.domain.paper_identity import normalize_arxiv_id, normalize_doi + from paperbot.utils.logging_config import Logger, LogFiles ->>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) + from paperbot.infrastructure.stores.models import ( Base, PaperFeedbackModel, @@ -346,16 +347,13 @@ def add_paper_feedback( Logger.error("Track not found", file=LogFiles.HARVEST) return None -<<<<<<< HEAD + resolved_paper_ref_id = self._resolve_paper_ref_id( session=session, paper_id=(paper_id or "").strip(), metadata=metadata, ) - -======= Logger.info("Creating new feedback record", file=LogFiles.HARVEST) ->>>>>>> 09ca42d (feat(Harvest): add -- Paper Search and Storage) row = PaperFeedbackModel( user_id=user_id, track_id=track_id, From 02c765aa193cf192d5776fa007f04767d0a44fd6 Mon Sep 17 00:00:00 2001 From: boyu Date: Wed, 11 Feb 2026 12:12:03 +0100 Subject: [PATCH 3/3] [Fix] Fix security issues introduced by the harvest module Closes #33, #34, #35, #36, #37, #38 Signed-off-by: LIU BOYU --- src/paperbot/api/routes/harvest.py | 27 ++++++-- src/paperbot/api/routes/research.py | 54 +++++++++------ .../services/paper_deduplicator.py | 21 ++++-- src/paperbot/context_engine/engine.py | 14 +++- src/paperbot/domain/harvest.py | 6 +- 
.../harvesters/arxiv_harvester.py | 9 ++- .../harvesters/openalex_harvester.py | 7 +- .../infrastructure/stores/paper_store.py | 65 ++++++++++++++++--- .../app/api/papers/[paperId]/save/route.ts | 22 ++++++- 9 files changed, 176 insertions(+), 49 deletions(-) diff --git a/src/paperbot/api/routes/harvest.py b/src/paperbot/api/routes/harvest.py index 10ad62f..491134c 100644 --- a/src/paperbot/api/routes/harvest.py +++ b/src/paperbot/api/routes/harvest.py @@ -31,6 +31,7 @@ # Lazy-initialized stores _paper_store: Optional[PaperStore] = None +_research_store: Optional["SqlAlchemyResearchStore"] = None def _get_paper_store() -> PaperStore: @@ -41,6 +42,16 @@ def _get_paper_store() -> PaperStore: return _paper_store +def _get_research_store() -> "SqlAlchemyResearchStore": + """Lazy initialization of research store.""" + from paperbot.infrastructure.stores.research_store import SqlAlchemyResearchStore + + global _research_store + if _research_store is None: + _research_store = SqlAlchemyResearchStore() + return _research_store + + # ============================================================================ # Harvest Endpoints # ============================================================================ @@ -49,7 +60,7 @@ def _get_paper_store() -> PaperStore: class HarvestRequest(BaseModel): """Request body for harvest endpoint.""" - keywords: List[str] = Field(..., min_items=1, description="Search keywords") + keywords: List[str] = Field(..., min_length=1, description="Search keywords") venues: Optional[List[str]] = Field(None, description="Filter to specific venues") year_from: Optional[int] = Field(None, ge=1900, le=2100, description="Start year") year_to: Optional[int] = Field(None, ge=1900, le=2100, description="End year") @@ -150,6 +161,9 @@ class HarvestRunListResponse(BaseModel): runs: List[HarvestRunResponse] +# TODO(auth): This endpoint lists all harvest runs without user-based filtering. +# Intentional for MVP single-user setup. For multi-user production, add user_id +# filtering so users only see their own harvest runs. @router.get("/harvest/runs", response_model=HarvestRunListResponse) def list_harvest_runs( status: Optional[str] = Query(None, description="Filter by status"), @@ -320,6 +334,9 @@ class LibraryResponse(BaseModel): offset: int +# TODO(auth): user_id is accepted from client without authentication. +# This is intentional for the MVP single-user setup. For multi-user production, +# user_id should come from an authenticated session or JWT token. @router.get("/papers/library", response_model=LibraryResponse) def get_user_library( user_id: str = Query("default", description="User ID"), @@ -383,6 +400,7 @@ def get_paper(paper_id: int): class SavePaperRequest(BaseModel): """Request to save paper to library.""" + # TODO(auth): user_id from client without auth - intentional for MVP single-user setup user_id: str = Field("default", description="User ID") track_id: Optional[int] = Field(None, description="Associated track ID") @@ -394,8 +412,6 @@ def save_paper_to_library(paper_id: int, request: SavePaperRequest): Uses paper_feedback table with action='save'. 
""" - from paperbot.infrastructure.stores.research_store import SqlAlchemyResearchStore - # Verify paper exists store = _get_paper_store() paper = store.get_paper_by_id(paper_id) @@ -403,7 +419,7 @@ def save_paper_to_library(paper_id: int, request: SavePaperRequest): raise HTTPException(status_code=404, detail="Paper not found") # Use research store to record feedback - research_store = SqlAlchemyResearchStore() + research_store = _get_research_store() feedback = research_store.record_paper_feedback( user_id=request.user_id, paper_id=str(paper_id), @@ -414,6 +430,9 @@ def save_paper_to_library(paper_id: int, request: SavePaperRequest): return {"success": True, "feedback": feedback} +# TODO(auth): user_id accepted from query string without authentication. +# Intentional for MVP single-user setup. For multi-user production, user_id +# should come from authenticated session/JWT, not query parameters. @router.delete("/papers/{paper_id}/save") def remove_paper_from_library( paper_id: int, diff --git a/src/paperbot/api/routes/research.py b/src/paperbot/api/routes/research.py index 2f01503..60f8c42 100644 --- a/src/paperbot/api/routes/research.py +++ b/src/paperbot/api/routes/research.py @@ -25,6 +25,7 @@ _memory_store = SqlAlchemyMemoryStore() _track_router = TrackRouter(research_store=_research_store, memory_store=_memory_store) _metric_collector: Optional[MemoryMetricCollector] = None +_paper_store: Optional["PaperStore"] = None def _get_metric_collector() -> MemoryMetricCollector: @@ -35,6 +36,16 @@ def _get_metric_collector() -> MemoryMetricCollector: return _metric_collector +def _get_paper_store() -> "PaperStore": + """Lazy initialization of paper store.""" + from paperbot.infrastructure.stores.paper_store import PaperStore + + global _paper_store + if _paper_store is None: + _paper_store = PaperStore() + return _paper_store + + def _schedule_embedding_precompute( background_tasks: Optional[BackgroundTasks], *, @@ -633,6 +644,7 @@ class PaperFeedbackRequest(BaseModel): paper_venue: Optional[str] = None paper_citation_count: Optional[int] = None paper_url: Optional[str] = None + paper_source: Optional[str] = None # arxiv, semantic_scholar, openalex class PaperFeedbackResponse(BaseModel): @@ -661,22 +673,32 @@ def add_paper_feedback(req: PaperFeedbackRequest): meta["context_rank"] = int(req.context_rank) library_paper_id: Optional[int] = None - actual_paper_id = req.paper_id # If action is "save" and we have paper metadata, insert into papers table if req.action == "save" and req.paper_title: Logger.info("Save action detected, inserting paper into papers table", file=LogFiles.HARVEST) try: from paperbot.domain.harvest import HarvestedPaper, HarvestSource - from paperbot.infrastructure.stores.paper_store import PaperStore - paper_store = PaperStore() + paper_store = _get_paper_store() + + # Determine source from request or default to semantic_scholar + source_str = (req.paper_source or "semantic_scholar").lower() + source_map = { + "arxiv": HarvestSource.ARXIV, + "semantic_scholar": HarvestSource.SEMANTIC_SCHOLAR, + "openalex": HarvestSource.OPENALEX, + } + source = source_map.get(source_str, HarvestSource.SEMANTIC_SCHOLAR) + paper = HarvestedPaper( title=req.paper_title, - source=HarvestSource.SEMANTIC_SCHOLAR, + source=source, abstract=req.paper_abstract or "", authors=req.paper_authors or [], - semantic_scholar_id=req.paper_id, + semantic_scholar_id=req.paper_id if source == HarvestSource.SEMANTIC_SCHOLAR else None, + arxiv_id=req.paper_id if source == HarvestSource.ARXIV else None, + 
openalex_id=req.paper_id if source == HarvestSource.OPENALEX else None, year=req.paper_year, venue=req.paper_venue, citation_count=req.paper_citation_count or 0, @@ -685,19 +707,13 @@ def add_paper_feedback(req: PaperFeedbackRequest): Logger.info("Calling paper store to upsert paper", file=LogFiles.HARVEST) new_count, _ = paper_store.upsert_papers_batch([paper]) - # Get the paper ID from database - from paperbot.infrastructure.stores.models import PaperModel - from sqlalchemy import select - with paper_store._provider.session() as session: - result = session.execute( - select(PaperModel).where( - PaperModel.semantic_scholar_id == req.paper_id - ) - ).scalar_one_or_none() - if result: - library_paper_id = result.id - actual_paper_id = str(result.id) # Use integer ID for feedback - Logger.info(f"Paper saved to library with id={library_paper_id}", file=LogFiles.HARVEST) + # Get the paper ID from database using store method + result = paper_store.get_paper_by_source_id(source, req.paper_id) + if result: + library_paper_id = result.id + # Store library_paper_id in metadata for joins, keep paper_id as external ID + meta["library_paper_id"] = library_paper_id + Logger.info(f"Paper saved to library with id={library_paper_id}", file=LogFiles.HARVEST) except Exception as e: Logger.warning(f"Failed to save paper to library: {e}", file=LogFiles.HARVEST) @@ -705,7 +721,7 @@ def add_paper_feedback(req: PaperFeedbackRequest): fb = _research_store.add_paper_feedback( user_id=req.user_id, track_id=track_id, - paper_id=actual_paper_id, + paper_id=req.paper_id, # Always use external ID for consistency action=req.action, weight=req.weight, metadata=meta, diff --git a/src/paperbot/application/services/paper_deduplicator.py b/src/paperbot/application/services/paper_deduplicator.py index 954fa64..cb0dd60 100644 --- a/src/paperbot/application/services/paper_deduplicator.py +++ b/src/paperbot/application/services/paper_deduplicator.py @@ -67,7 +67,7 @@ def deduplicate( if existing_idx is not None: # Merge metadata into existing paper - self._merge_paper(unique_papers[existing_idx], paper) + self._merge_paper(unique_papers[existing_idx], paper, existing_idx) duplicates_count += 1 else: # Add new paper @@ -128,29 +128,36 @@ def _index_paper(self, paper: HarvestedPaper, idx: int) -> None: title_hash = paper.compute_title_hash() self._title_hash_index[title_hash] = idx - def _merge_paper(self, existing: HarvestedPaper, new: HarvestedPaper) -> None: + def _merge_paper( + self, existing: HarvestedPaper, new: HarvestedPaper, existing_idx: int + ) -> None: """ Merge metadata from new paper into existing. 
+ Args: + existing: The existing paper to merge into + new: The new paper with potentially additional metadata + existing_idx: The index of the existing paper (used for updating indexes) + Strategy: - Fill in missing identifiers - Prefer longer/more complete text fields - Prefer higher citation counts - Merge lists (keywords, fields of study) """ - # Fill in missing identifiers + # Fill in missing identifiers (use existing_idx directly, not _find_index) if not existing.doi and new.doi: existing.doi = new.doi - self._doi_index[new.doi.lower().strip()] = self._find_index(existing) + self._doi_index[new.doi.lower().strip()] = existing_idx if not existing.arxiv_id and new.arxiv_id: existing.arxiv_id = new.arxiv_id - self._arxiv_index[new.arxiv_id.lower().strip()] = self._find_index(existing) + self._arxiv_index[new.arxiv_id.lower().strip()] = existing_idx if not existing.semantic_scholar_id and new.semantic_scholar_id: existing.semantic_scholar_id = new.semantic_scholar_id - self._s2_index[new.semantic_scholar_id.lower().strip()] = self._find_index(existing) + self._s2_index[new.semantic_scholar_id.lower().strip()] = existing_idx if not existing.openalex_id and new.openalex_id: existing.openalex_id = new.openalex_id - self._openalex_index[new.openalex_id.lower().strip()] = self._find_index(existing) + self._openalex_index[new.openalex_id.lower().strip()] = existing_idx # Prefer longer abstract if len(new.abstract) > len(existing.abstract): diff --git a/src/paperbot/context_engine/engine.py b/src/paperbot/context_engine/engine.py index 6d3004c..6b59dfd 100644 --- a/src/paperbot/context_engine/engine.py +++ b/src/paperbot/context_engine/engine.py @@ -503,7 +503,11 @@ async def build_context_pack( "rebuttal": (0.50, 0.40, 0.10), }.get(stage, (0.55, 0.30, 0.15)) - Logger.info(f"Paper search config: offline={self.config.offline}, paper_limit={self.config.paper_limit}", file=LogFiles.HARVEST) + Logger.info( + f"Paper search config: offline={self.config.offline}, " + f"paper_limit={self.config.paper_limit}", + file=LogFiles.HARVEST, + ) if not self.config.offline and self.config.paper_limit > 0: try: searcher = self.paper_searcher @@ -514,9 +518,13 @@ async def build_context_pack( Logger.info("Initialized SemanticScholarSearch", file=LogFiles.HARVEST) fetch_limit = max(30, int(self.config.paper_limit) * 3) - Logger.info(f"Searching papers with query='{merged_query}', limit={fetch_limit}", file=LogFiles.HARVEST) + Logger.info( + f"Searching papers with query='{merged_query}', limit={fetch_limit}", + file=LogFiles.HARVEST, + ) resp = await asyncio.to_thread(searcher.search_papers, merged_query, fetch_limit) - Logger.info(f"Search returned {len(getattr(resp, 'papers', []) or [])} papers", file=LogFiles.HARVEST) + papers_count = len(getattr(resp, "papers", []) or []) + Logger.info(f"Search returned {papers_count} papers", file=LogFiles.HARVEST) raw: List[Dict[str, Any]] = [] for p in getattr(resp, "papers", []) or []: diff --git a/src/paperbot/domain/harvest.py b/src/paperbot/domain/harvest.py index 64230ab..67c4164 100644 --- a/src/paperbot/domain/harvest.py +++ b/src/paperbot/domain/harvest.py @@ -89,7 +89,11 @@ def from_dict(cls, data: Dict[str, Any]) -> "HarvestedPaper": """Create instance from dictionary.""" source = data.get("source", "") if isinstance(source, str): - source = HarvestSource(source) + try: + source = HarvestSource(source) + except ValueError: + # Fallback for empty or invalid source strings + source = HarvestSource.SEMANTIC_SCHOLAR return cls( title=data.get("title", ""), 
source=source, diff --git a/src/paperbot/infrastructure/harvesters/arxiv_harvester.py b/src/paperbot/infrastructure/harvesters/arxiv_harvester.py index 6b51d1c..4c0b815 100644 --- a/src/paperbot/infrastructure/harvesters/arxiv_harvester.py +++ b/src/paperbot/infrastructure/harvesters/arxiv_harvester.py @@ -30,9 +30,13 @@ class ArxivHarvester: ARXIV_API_URL = "https://export.arxiv.org/api/query" REQUEST_INTERVAL = 3.0 # seconds between requests + DEFAULT_TIMEOUT_SECONDS = 30 - def __init__(self, connector: Optional[ArxivConnector] = None): + def __init__( + self, connector: Optional[ArxivConnector] = None, timeout_seconds: int = 30 + ): self.connector = connector or ArxivConnector() + self.timeout_seconds = timeout_seconds self._session: Optional[aiohttp.ClientSession] = None self._last_request_time: float = 0 @@ -42,7 +46,8 @@ def source(self) -> HarvestSource: async def _get_session(self) -> aiohttp.ClientSession: if self._session is None or self._session.closed: - self._session = aiohttp.ClientSession() + timeout = aiohttp.ClientTimeout(total=self.timeout_seconds) + self._session = aiohttp.ClientSession(timeout=timeout) return self._session async def _rate_limit(self) -> None: diff --git a/src/paperbot/infrastructure/harvesters/openalex_harvester.py b/src/paperbot/infrastructure/harvesters/openalex_harvester.py index 4153e42..55d72df 100644 --- a/src/paperbot/infrastructure/harvesters/openalex_harvester.py +++ b/src/paperbot/infrastructure/harvesters/openalex_harvester.py @@ -29,9 +29,11 @@ class OpenAlexHarvester: OPENALEX_API_URL = "https://api.openalex.org/works" REQUEST_INTERVAL = 0.1 # 10 req/s + DEFAULT_TIMEOUT_SECONDS = 30 - def __init__(self, email: Optional[str] = None): + def __init__(self, email: Optional[str] = None, timeout_seconds: int = 30): self.email = email # For polite pool + self.timeout_seconds = timeout_seconds self._session: Optional[aiohttp.ClientSession] = None self._last_request_time: float = 0 @@ -41,7 +43,8 @@ def source(self) -> HarvestSource: async def _get_session(self) -> aiohttp.ClientSession: if self._session is None or self._session.closed: - self._session = aiohttp.ClientSession() + timeout = aiohttp.ClientTimeout(total=self.timeout_seconds) + self._session = aiohttp.ClientSession(timeout=timeout) return self._session async def _rate_limit(self) -> None: diff --git a/src/paperbot/infrastructure/stores/paper_store.py b/src/paperbot/infrastructure/stores/paper_store.py index f0abdbf..b26327a 100644 --- a/src/paperbot/infrastructure/stores/paper_store.py +++ b/src/paperbot/infrastructure/stores/paper_store.py @@ -13,7 +13,7 @@ from datetime import datetime, timezone from typing import Any, Dict, List, Optional, Tuple -from sqlalchemy import Integer, cast, func, or_, select +from sqlalchemy import Integer, String, cast, func, or_, select from paperbot.utils.logging_config import Logger, LogFiles from paperbot.domain.harvest import HarvestedPaper, HarvestSource @@ -522,6 +522,15 @@ def search_papers( Returns: Tuple of (papers, total_count) """ + # Whitelist of allowed sort columns for security + allowed_sort_columns = { + "citation_count": PaperModel.citation_count, + "year": PaperModel.year, + "created_at": PaperModel.created_at, + "updated_at": PaperModel.updated_at, + "title": PaperModel.title, + } + with self._provider.session() as session: stmt = select(PaperModel).where(PaperModel.deleted_at.is_(None)) @@ -535,14 +544,21 @@ def search_papers( ) ) - # Year filters - if year_from: + # Keyword filter (search in keywords_json) + if keywords: + 
keyword_conditions = [ + PaperModel.keywords_json.ilike(f"%{kw}%") for kw in keywords + ] + stmt = stmt.where(or_(*keyword_conditions)) + + # Year filters (use explicit None check to allow year_from=0 if needed) + if year_from is not None: stmt = stmt.where(PaperModel.year >= year_from) - if year_to: + if year_to is not None: stmt = stmt.where(PaperModel.year <= year_to) - # Citation filter - if min_citations: + # Citation filter (use explicit None check to allow min_citations=0) + if min_citations is not None: stmt = stmt.where(PaperModel.citation_count >= min_citations) # Venue filter @@ -558,8 +574,8 @@ def search_papers( count_stmt = select(func.count()).select_from(stmt.subquery()) total_count = session.execute(count_stmt).scalar() or 0 - # Sort - sort_col = getattr(PaperModel, sort_by, PaperModel.citation_count) + # Sort (use whitelist for security) + sort_col = allowed_sort_columns.get(sort_by, PaperModel.citation_count) if sort_order.lower() == "desc": stmt = stmt.order_by(sort_col.desc()) else: @@ -582,6 +598,31 @@ def get_paper_by_id(self, paper_id: int) -> Optional[PaperModel]: ) ).scalar_one_or_none() + def get_paper_by_source_id( + self, source: HarvestSource, source_id: str + ) -> Optional[PaperModel]: + """ + Get a paper by its source-specific ID. + + Args: + source: The harvest source (ARXIV, SEMANTIC_SCHOLAR, OPENALEX) + source_id: The ID from that source + + Returns: + PaperModel if found, None otherwise + """ + with self._provider.session() as session: + if source == HarvestSource.ARXIV: + condition = PaperModel.arxiv_id == source_id + elif source == HarvestSource.OPENALEX: + condition = PaperModel.openalex_id == source_id + else: # Default to SEMANTIC_SCHOLAR + condition = PaperModel.semantic_scholar_id == source_id + + return session.execute( + select(PaperModel).where(condition, PaperModel.deleted_at.is_(None)) + ).scalar_one_or_none() + def get_user_library( self, user_id: str, @@ -610,13 +651,19 @@ def get_user_library( Logger.info("Executing database query to join papers with feedback", file=LogFiles.HARVEST) # First, get all matching paper-feedback pairs + # Join on external IDs (semantic_scholar_id, arxiv_id, openalex_id) + # This avoids CAST errors on PostgreSQL for non-numeric paper_ids + # Also check library_paper_id from metadata if available base_stmt = ( select(PaperModel, PaperFeedbackModel) .join( PaperFeedbackModel, or_( - PaperModel.id == cast(PaperFeedbackModel.paper_id, Integer), PaperModel.semantic_scholar_id == PaperFeedbackModel.paper_id, + PaperModel.arxiv_id == PaperFeedbackModel.paper_id, + PaperModel.openalex_id == PaperFeedbackModel.paper_id, + # For backwards compatibility with numeric IDs stored as strings + cast(PaperModel.id, String) == PaperFeedbackModel.paper_id, ), ) .where( diff --git a/web/src/app/api/papers/[paperId]/save/route.ts b/web/src/app/api/papers/[paperId]/save/route.ts index ea6a24c..494b4ee 100644 --- a/web/src/app/api/papers/[paperId]/save/route.ts +++ b/web/src/app/api/papers/[paperId]/save/route.ts @@ -1,12 +1,26 @@ +import { NextResponse } from "next/server" import { apiBaseUrl, proxyJson } from "../../../research/_base" +// Validate paperId to prevent path traversal attacks +function validatePaperId(paperId: string): number | null { + const parsed = parseInt(paperId, 10) + if (isNaN(parsed) || parsed <= 0 || String(parsed) !== paperId) { + return null + } + return parsed +} + export async function DELETE( req: Request, { params }: { params: Promise<{ paperId: string }> } ) { const { paperId } = await params + const 
validId = validatePaperId(paperId) + if (validId === null) { + return NextResponse.json({ error: "Invalid paper ID" }, { status: 400 }) + } const url = new URL(req.url) - const upstream = `${apiBaseUrl()}/api/papers/${paperId}/save${url.search}` + const upstream = `${apiBaseUrl()}/api/papers/${validId}/save${url.search}` return proxyJson(req, upstream, "DELETE") } @@ -15,6 +29,10 @@ export async function POST( { params }: { params: Promise<{ paperId: string }> } ) { const { paperId } = await params - const upstream = `${apiBaseUrl()}/api/papers/${paperId}/save` + const validId = validatePaperId(paperId) + if (validId === null) { + return NextResponse.json({ error: "Invalid paper ID" }, { status: 400 }) + } + const upstream = `${apiBaseUrl()}/api/papers/${validId}/save` return proxyJson(req, upstream, "POST") }
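
For reviewers, a minimal usage sketch (not part of the patch series) of the `PaperStore.get_paper_by_source_id` helper that PATCH 3/3 introduces to replace the inline session query in `research.py`. The arXiv identifier and the SQLite URL below are illustrative assumptions, not values taken from the diff; constructor arguments and return types follow the signatures shown in the hunks above.

```python
# Illustrative sketch only -- exercises the API added in this series; the
# arXiv ID and db_url are made-up values, not taken from the patch.
from paperbot.domain.harvest import HarvestedPaper, HarvestSource
from paperbot.infrastructure.stores.paper_store import PaperStore

# Assumed local SQLite target (mirrors the PAPERBOT_DB_URL example in the design doc).
store = PaperStore(db_url="sqlite:///data/paperbot.db")

paper = HarvestedPaper(
    title="An Example Paper on Retrieval-Augmented Generation",  # hypothetical
    source=HarvestSource.ARXIV,
    arxiv_id="2401.01234",  # hypothetical identifier
    abstract="",
    authors=[],
    year=2024,
)

# Idempotent upsert; returns (new_count, updated_count) per the store API.
new_count, _ = store.upsert_papers_batch([paper])

# Look the paper up by its source-specific ID instead of querying the session
# directly -- the pattern research.py follows after PATCH 3/3.
row = store.get_paper_by_source_id(HarvestSource.ARXIV, "2401.01234")
if row is not None:
    print(row.id, row.title)
```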