diff --git a/README.md b/README.md
index 0e8adc2..9562189 100644
--- a/README.md
+++ b/README.md
@@ -9,9 +9,9 @@
| 模块 | 说明 |
|------|------|
| **Topic Search** | 多主题聚合检索,支持 papers.cool + arXiv API + Hugging Face Daily Papers 三数据源,跨 query/branch 去重与评分排序,`min_score` 质量过滤 |
-| **DailyPaper** | 日报生成(Markdown/JSON),可选 LLM 增强(摘要/趋势/洞察/相关性),支持定时推送(Email/Slack/钉钉) |
-| **LLM-as-Judge** | 5 维评分(Relevance/Novelty/Rigor/Impact/Clarity)+ 推荐分级(must_read/worth_reading/skim/skip),Token Budget 控制,多轮校准 |
-| **Analyze SSE** | Judge + Trend 分析通过 SSE 实时流式推送,前端增量渲染(逐张 Judge 卡片 / 逐条 Trend 分析) |
+| **DailyPaper** | 日报生成(Markdown/JSON),SSE 实时流式推送全流程进度,可选 LLM 增强(摘要/趋势/洞察/相关性),Judge 评分后自动过滤低质论文,支持定时推送(Email/Slack/钉钉) |
+| **LLM-as-Judge** | 5 维评分(Relevance/Novelty/Rigor/Impact/Clarity)+ 推荐分级(must_read/worth_reading/skim/skip),Token Budget 控制,多轮校准,评分后自动过滤 skip/skim 论文 |
+| **Analyze SSE** | Judge + Trend 分析通过 SSE 实时流式推送,前端增量渲染(逐张 Judge 卡片 / 逐条 Trend 分析),完整 Judge 日志保留 |
| **学者追踪** | 定期监测学者论文,多 Agent 协作(Research/Code/Quality/Reviewer),PIS 影响力评分(引用速度、趋势动量) |
| **深度评审** | 模拟同行评审(初筛→深度批评→决策),输出 Summary/Strengths/Weaknesses/Novelty Score |
| **Paper2Code** | 论文到代码骨架(Planning→Analysis→Generation→Verification),自愈调试,Docker/E2B 沙箱执行 |
@@ -82,6 +82,24 @@ Input Queries ──→ ├─── arXiv API (relevance sort)
└── Web UI (DAG + Tabs: Papers / Insights / Judge)
```
+### DailyPaper SSE 流式管线
+
+当启用 LLM 分析或 Judge 评分时,`/daily` 端点返回 SSE 流式响应,前端实时显示每个阶段的进度:
+
+```text
+Search → Build Report → LLM Enrichment → Judge Scoring → Filter → Save → Notify → Result
+ │ │ │ │ │
+ │ │ │ │ └─ 移除 skip/skim 论文
+ │ │ │ └─ 逐篇评分,实时推送 judge 事件
+ │ │ └─ 逐篇摘要 + 趋势分析 + 洞察
+ │ └─ 构建报告结构
+ └─ 多源检索 + 去重 + 评分
+```
+
+**Post-Judge 过滤**:Judge 评分完成后,自动移除推荐等级为 `skip` 和 `skim` 的论文,只保留 `must_read` 和 `worth_reading` 的论文。完整的 Judge 评分日志保留在 `report.filter.log` 中。
+
+**前端配置持久化**:所有功能开关(LLM/Judge/数据源/邮箱等)默认全部启用,保存在浏览器 localStorage 中,刷新页面不会丢失。
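+
+**SSE 消费示意**(最小示例,仅作演示;URL 以本地部署为例,请求体字段与事件负载格式为假设,请以实际接口为准):
+
+```python
+# 以流式方式读取 /daily 的 SSE 响应(依赖 requests)
+import json
+import requests
+
+payload = {"queries": ["LLM agents"], "enable_llm": True, "enable_judge": True}  # 字段名为假设
+with requests.post(
+    "http://localhost:8000/api/research/paperscool/daily",
+    json=payload,
+    stream=True,
+    timeout=600,
+) as resp:
+    for line in resp.iter_lines(decode_unicode=True):
+        if line and line.startswith("data:"):
+            event = json.loads(line[len("data:"):].strip())
+            print(event.get("stage") or event.get("type"))  # 事件字段名为假设,打印阶段进度
+```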
+
## 界面预览
### Terminal UI(Ink)
@@ -114,6 +132,10 @@ Input Queries ──→ ├─── arXiv API (relevance sort)
|---------------|-----------------|
|  |  |
+### Email 推送
+
+![Email 推送](asset/notify.png)
+
## 快速开始
### 1) 安装
@@ -166,17 +188,34 @@ LLM_REASONING_MODEL=...
每日推送配置(点击展开)
-```bash
-# 通知渠道
-PAPERBOT_NOTIFY_ENABLED=true
-PAPERBOT_NOTIFY_CHANNELS=email,slack,dingding
+DailyPaper 生成后可自动推送摘要到 Email/Slack/钉钉。有两种配置方式:
+
+**方式一:Web UI 配置(推荐)**
+
+在 Topic Workflow 页面的 Settings 面板中:
+1. 勾选 "Email Notification"
+2. 填入收件邮箱地址(如 `you@example.com`)
+3. 运行 DailyPaper,流程结束后会自动发送邮件
+
+> UI 中填写的邮箱会覆盖环境变量中的 `PAPERBOT_NOTIFY_EMAIL_TO`。
+> 所有配置项(LLM/Judge/数据源/邮箱等)会自动持久化到浏览器 localStorage,刷新页面不会丢失。
-# Email (SMTP)
-PAPERBOT_NOTIFY_SMTP_HOST=smtp.example.com
-PAPERBOT_NOTIFY_SMTP_USERNAME=...
-PAPERBOT_NOTIFY_SMTP_PASSWORD=...
-PAPERBOT_NOTIFY_EMAIL_FROM=bot@example.com
-PAPERBOT_NOTIFY_EMAIL_TO=you@example.com
+**方式二:环境变量配置**
+
+```bash
+# 总开关
+PAPERBOT_NOTIFY_ENABLED=true # 是否启用推送(必须为 true 才能发送)
+PAPERBOT_NOTIFY_CHANNELS=email,slack # 启用的推送渠道(逗号分隔)
+
+# Email (SMTP) — 必须配置才能发送邮件
+PAPERBOT_NOTIFY_SMTP_HOST=smtp.qq.com # SMTP 服务器地址
+PAPERBOT_NOTIFY_SMTP_PORT=587 # SMTP 端口(587=STARTTLS, 465=SSL)
+PAPERBOT_NOTIFY_SMTP_USERNAME=your@qq.com # SMTP 登录用户名
+PAPERBOT_NOTIFY_SMTP_PASSWORD=your-auth-code # SMTP 密码或授权码
+PAPERBOT_NOTIFY_SMTP_USE_TLS=true # 是否使用 STARTTLS(端口 587 时为 true)
+PAPERBOT_NOTIFY_SMTP_USE_SSL=false # 是否使用 SSL(端口 465 时为 true)
+PAPERBOT_NOTIFY_EMAIL_FROM=your@qq.com # 发件人地址
+PAPERBOT_NOTIFY_EMAIL_TO=recipient@example.com # 默认收件人(可被 UI 覆盖)
# Slack
PAPERBOT_NOTIFY_SLACK_WEBHOOK_URL=https://hooks.slack.com/...
@@ -185,7 +224,7 @@ PAPERBOT_NOTIFY_SLACK_WEBHOOK_URL=https://hooks.slack.com/...
PAPERBOT_NOTIFY_DINGTALK_WEBHOOK_URL=https://oapi.dingtalk.com/robot/send?access_token=...
PAPERBOT_NOTIFY_DINGTALK_SECRET=SEC...
-# DailyPaper 定时任务
+# DailyPaper 定时任务(ARQ Worker)
PAPERBOT_DAILYPAPER_ENABLED=true
PAPERBOT_DAILYPAPER_CRON_HOUR=8
PAPERBOT_DAILYPAPER_CRON_MINUTE=30
@@ -193,6 +232,15 @@ PAPERBOT_DAILYPAPER_NOTIFY_ENABLED=true
PAPERBOT_DAILYPAPER_NOTIFY_CHANNELS=email,slack
```
+**QQ 邮箱配置示例:**
+1. 登录 QQ 邮箱 → 设置 → 账户 → POP3/SMTP 服务 → 开启
+2. 生成授权码(不是 QQ 密码)
+3. 设置 `SMTP_HOST=smtp.qq.com`, `SMTP_PORT=587`, `SMTP_USE_TLS=true`
+
+**Gmail 配置示例:**
+1. Google 账号 → 安全性 → 两步验证 → 应用专用密码
+2. 设置 `SMTP_HOST=smtp.gmail.com`, `SMTP_PORT=587`, `SMTP_USE_TLS=true`
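+
+**SMTP 连通性自测(可选)**:下面是一个独立的最小脚本示意,与 PaperBot 内部实现无关,仅用于在正式运行前验证上面的 SMTP 配置是否可用(变量请替换为你自己的值):
+
+```python
+# 587 端口走 STARTTLS,465 端口走 SSL 直连
+import smtplib
+from email.message import EmailMessage
+
+host, port, user, password = "smtp.qq.com", 587, "your@qq.com", "your-auth-code"
+
+msg = EmailMessage()
+msg["Subject"], msg["From"], msg["To"] = "PaperBot SMTP test", user, user
+msg.set_content("SMTP 配置验证成功")
+
+if port == 465:
+    server = smtplib.SMTP_SSL(host, port)   # SSL 直连
+else:
+    server = smtplib.SMTP(host, port)       # 先建立明文连接
+    server.starttls()                       # 再升级为 TLS
+server.login(user, password)
+server.send_message(msg)
+server.quit()
+```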
+
### 3) 启动
@@ -229,7 +277,7 @@ arq paperbot.infrastructure.queue.arq_worker.WorkerSettings
| `/api/review` | POST | 深度评审(SSE) |
| `/api/chat` | POST | AI 对话(SSE) |
| `/api/research/paperscool/search` | POST | 主题检索(多源聚合,支持 `min_score` 过滤) |
-| `/api/research/paperscool/daily` | POST | DailyPaper 日报(支持 `notify` 推送) |
+| `/api/research/paperscool/daily` | POST | DailyPaper 日报(LLM/Judge 启用时返回 SSE 流式,否则 JSON;支持 `notify` 推送) |
| `/api/research/paperscool/analyze` | POST | Judge + Trend 流式分析(SSE) |
| `/api/research/tracks` | GET/POST | 研究方向管理 |
| `/api/research/memory/*` | GET/POST | 记忆系统(Inbox/审核/检索) |
diff --git a/alembic/versions/0003_paper_registry.py b/alembic/versions/0003_paper_registry.py
new file mode 100644
index 0000000..2d04a32
--- /dev/null
+++ b/alembic/versions/0003_paper_registry.py
@@ -0,0 +1,89 @@
+"""paper registry
+
+Revision ID: 0003_paper_registry
+Revises: 0002_research_eval_runs
+Create Date: 2026-02-10
+
+Adds canonical papers table for persistent DailyPaper ingestion.
+"""
+
+from __future__ import annotations
+
+import sqlalchemy as sa
+from alembic import context, op
+
+
+revision = "0003_paper_registry"
+down_revision = "0002_research_eval_runs"
+branch_labels = None
+depends_on = None
+
+
+def _is_offline() -> bool:
+ try:
+ return bool(context.is_offline_mode())
+ except Exception:
+ return False
+
+
+def _insp():
+ return sa.inspect(op.get_bind())
+
+
+def _has_table(name: str) -> bool:
+ return _insp().has_table(name)
+
+
+def _get_indexes(table: str) -> set[str]:
+ idx = set()
+ for i in _insp().get_indexes(table):
+ idx.add(str(i.get("name") or ""))
+ return idx
+
+
+def _create_index(name: str, table: str, cols: list[str]) -> None:
+ if _is_offline():
+ op.create_index(name, table, cols)
+ return
+ if name in _get_indexes(table):
+ return
+ op.create_index(name, table, cols)
+
+
+def upgrade() -> None:
+ if _is_offline() or not _has_table("papers"):
+ op.create_table(
+ "papers",
+ sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
+ sa.Column("arxiv_id", sa.String(length=64), nullable=True),
+ sa.Column("doi", sa.String(length=128), nullable=True),
+ sa.Column("title", sa.Text(), server_default="", nullable=False),
+ sa.Column("authors_json", sa.Text(), server_default="[]", nullable=False),
+ sa.Column("abstract", sa.Text(), server_default="", nullable=False),
+ sa.Column("url", sa.String(length=512), server_default="", nullable=False),
+ sa.Column("external_url", sa.String(length=512), server_default="", nullable=False),
+ sa.Column("pdf_url", sa.String(length=512), server_default="", nullable=False),
+ sa.Column("source", sa.String(length=32), server_default="papers_cool", nullable=False),
+ sa.Column("venue", sa.String(length=256), server_default="", nullable=False),
+ sa.Column("published_at", sa.DateTime(timezone=True), nullable=True),
+ sa.Column("first_seen_at", sa.DateTime(timezone=True), nullable=False),
+ sa.Column("keywords_json", sa.Text(), server_default="[]", nullable=False),
+ sa.Column("metadata_json", sa.Text(), server_default="{}", nullable=False),
+ sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
+ sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
+ sa.UniqueConstraint("arxiv_id", name="uq_papers_arxiv_id"),
+ sa.UniqueConstraint("doi", name="uq_papers_doi"),
+ )
+
+ _create_index("ix_papers_arxiv_id", "papers", ["arxiv_id"])
+ _create_index("ix_papers_doi", "papers", ["doi"])
+ _create_index("ix_papers_title", "papers", ["title"])
+ _create_index("ix_papers_source", "papers", ["source"])
+ _create_index("ix_papers_published_at", "papers", ["published_at"])
+ _create_index("ix_papers_first_seen_at", "papers", ["first_seen_at"])
+ _create_index("ix_papers_created_at", "papers", ["created_at"])
+ _create_index("ix_papers_updated_at", "papers", ["updated_at"])
+
+
+def downgrade() -> None:
+ op.drop_table("papers")
diff --git a/alembic/versions/0004_paper_feedback_judge_links.py b/alembic/versions/0004_paper_feedback_judge_links.py
new file mode 100644
index 0000000..f7e6ec5
--- /dev/null
+++ b/alembic/versions/0004_paper_feedback_judge_links.py
@@ -0,0 +1,112 @@
+"""paper feedback/judge links
+
+Revision ID: 0004_paper_feedback_judge_links
+Revises: 0003_paper_registry
+Create Date: 2026-02-10
+
+Adds:
+- paper_judge_scores table
+- paper_feedback.paper_ref_id nullable FK-like reference column
+"""
+
+from __future__ import annotations
+
+import sqlalchemy as sa
+from alembic import context, op
+
+
+revision = "0004_paper_feedback_judge_links"
+down_revision = "0003_paper_registry"
+branch_labels = None
+depends_on = None
+
+
+def _is_offline() -> bool:
+ try:
+ return bool(context.is_offline_mode())
+ except Exception:
+ return False
+
+
+def _insp():
+ return sa.inspect(op.get_bind())
+
+
+def _has_table(name: str) -> bool:
+ return _insp().has_table(name)
+
+
+def _get_columns(table: str) -> set[str]:
+ cols = set()
+ for c in _insp().get_columns(table):
+ cols.add(str(c.get("name") or ""))
+ return cols
+
+
+def _get_indexes(table: str) -> set[str]:
+ idx = set()
+ for i in _insp().get_indexes(table):
+ idx.add(str(i.get("name") or ""))
+ return idx
+
+
+def _create_index(name: str, table: str, cols: list[str]) -> None:
+ if _is_offline():
+ op.create_index(name, table, cols)
+ return
+ if name in _get_indexes(table):
+ return
+ op.create_index(name, table, cols)
+
+
+def upgrade() -> None:
+ if _is_offline() or not _has_table("paper_judge_scores"):
+ op.create_table(
+ "paper_judge_scores",
+ sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
+ sa.Column("paper_id", sa.Integer(), sa.ForeignKey("papers.id"), nullable=False),
+ sa.Column("query", sa.String(length=256), server_default="", nullable=False),
+ sa.Column("overall", sa.Float(), server_default="0.0", nullable=False),
+ sa.Column("relevance", sa.Float(), server_default="0.0", nullable=False),
+ sa.Column("novelty", sa.Float(), server_default="0.0", nullable=False),
+ sa.Column("rigor", sa.Float(), server_default="0.0", nullable=False),
+ sa.Column("impact", sa.Float(), server_default="0.0", nullable=False),
+ sa.Column("clarity", sa.Float(), server_default="0.0", nullable=False),
+ sa.Column("recommendation", sa.String(length=32), server_default="", nullable=False),
+ sa.Column("one_line_summary", sa.Text(), server_default="", nullable=False),
+ sa.Column("judge_model", sa.String(length=128), server_default="", nullable=False),
+ sa.Column("judge_cost_tier", sa.Integer(), nullable=True),
+ sa.Column("scored_at", sa.DateTime(timezone=True), nullable=False),
+ sa.Column("metadata_json", sa.Text(), server_default="{}", nullable=False),
+ sa.UniqueConstraint("paper_id", "query", name="uq_paper_judge_scores_paper_query"),
+ )
+
+ _create_index("ix_paper_judge_scores_paper_id", "paper_judge_scores", ["paper_id"])
+ _create_index("ix_paper_judge_scores_query", "paper_judge_scores", ["query"])
+ _create_index("ix_paper_judge_scores_recommendation", "paper_judge_scores", ["recommendation"])
+ _create_index("ix_paper_judge_scores_scored_at", "paper_judge_scores", ["scored_at"])
+
+ if _is_offline():
+ op.add_column("paper_feedback", sa.Column("paper_ref_id", sa.Integer(), nullable=True))
+ op.create_index("ix_paper_feedback_paper_ref_id", "paper_feedback", ["paper_ref_id"])
+ return
+
+ if "paper_ref_id" not in _get_columns("paper_feedback"):
+ with op.batch_alter_table("paper_feedback") as batch_op:
+ batch_op.add_column(sa.Column("paper_ref_id", sa.Integer(), nullable=True))
+
+ _create_index("ix_paper_feedback_paper_ref_id", "paper_feedback", ["paper_ref_id"])
+
+
+def downgrade() -> None:
+ with op.batch_alter_table("paper_feedback") as batch_op:
+ try:
+ batch_op.drop_index("ix_paper_feedback_paper_ref_id")
+ except Exception:
+ pass
+ try:
+ batch_op.drop_column("paper_ref_id")
+ except Exception:
+ pass
+
+ op.drop_table("paper_judge_scores")
diff --git a/alembic/versions/0005_paper_reading_status.py b/alembic/versions/0005_paper_reading_status.py
new file mode 100644
index 0000000..7fca541
--- /dev/null
+++ b/alembic/versions/0005_paper_reading_status.py
@@ -0,0 +1,79 @@
+"""paper reading status
+
+Revision ID: 0005_paper_reading_status
+Revises: 0004_paper_feedback_judge_links
+Create Date: 2026-02-10
+
+Adds paper_reading_status table for saved/reading/read lifecycle tracking.
+"""
+
+from __future__ import annotations
+
+import sqlalchemy as sa
+from alembic import context, op
+
+
+revision = "0005_paper_reading_status"
+down_revision = "0004_paper_feedback_judge_links"
+branch_labels = None
+depends_on = None
+
+
+def _is_offline() -> bool:
+ try:
+ return bool(context.is_offline_mode())
+ except Exception:
+ return False
+
+
+def _insp():
+ return sa.inspect(op.get_bind())
+
+
+def _has_table(name: str) -> bool:
+ return _insp().has_table(name)
+
+
+def _get_indexes(table: str) -> set[str]:
+ idx = set()
+ for i in _insp().get_indexes(table):
+ idx.add(str(i.get("name") or ""))
+ return idx
+
+
+def _create_index(name: str, table: str, cols: list[str]) -> None:
+ if _is_offline():
+ op.create_index(name, table, cols)
+ return
+ if name in _get_indexes(table):
+ return
+ op.create_index(name, table, cols)
+
+
+def upgrade() -> None:
+ if _is_offline() or not _has_table("paper_reading_status"):
+ op.create_table(
+ "paper_reading_status",
+ sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
+ sa.Column("user_id", sa.String(length=64), nullable=False),
+ sa.Column("paper_id", sa.Integer(), sa.ForeignKey("papers.id"), nullable=False),
+ sa.Column("status", sa.String(length=16), server_default="unread", nullable=False),
+ sa.Column("saved_at", sa.DateTime(timezone=True), nullable=True),
+ sa.Column("read_at", sa.DateTime(timezone=True), nullable=True),
+ sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
+ sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
+ sa.Column("metadata_json", sa.Text(), server_default="{}", nullable=False),
+ sa.UniqueConstraint("user_id", "paper_id", name="uq_paper_reading_status_user_paper"),
+ )
+
+ _create_index("ix_paper_reading_status_user_id", "paper_reading_status", ["user_id"])
+ _create_index("ix_paper_reading_status_paper_id", "paper_reading_status", ["paper_id"])
+ _create_index("ix_paper_reading_status_status", "paper_reading_status", ["status"])
+ _create_index("ix_paper_reading_status_saved_at", "paper_reading_status", ["saved_at"])
+ _create_index("ix_paper_reading_status_read_at", "paper_reading_status", ["read_at"])
+ _create_index("ix_paper_reading_status_created_at", "paper_reading_status", ["created_at"])
+ _create_index("ix_paper_reading_status_updated_at", "paper_reading_status", ["updated_at"])
+
+
+def downgrade() -> None:
+ op.drop_table("paper_reading_status")
diff --git a/alembic/versions/0006_newsletter_subscribers.py b/alembic/versions/0006_newsletter_subscribers.py
new file mode 100644
index 0000000..a188330
--- /dev/null
+++ b/alembic/versions/0006_newsletter_subscribers.py
@@ -0,0 +1,72 @@
+"""newsletter subscribers
+
+Revision ID: 0006_newsletter_subscribers
+Revises: 0005_paper_reading_status
+Create Date: 2026-02-11
+
+Adds newsletter_subscribers table for email subscription management.
+"""
+
+from __future__ import annotations
+
+import sqlalchemy as sa
+from alembic import context, op
+
+
+revision = "0006_newsletter_subscribers"
+down_revision = "0005_paper_reading_status"
+branch_labels = None
+depends_on = None
+
+
+def _is_offline() -> bool:
+ try:
+ return bool(context.is_offline_mode())
+ except Exception:
+ return False
+
+
+def _insp():
+ return sa.inspect(op.get_bind())
+
+
+def _has_table(name: str) -> bool:
+ return _insp().has_table(name)
+
+
+def _get_indexes(table: str) -> set[str]:
+ idx = set()
+ for i in _insp().get_indexes(table):
+ idx.add(str(i.get("name") or ""))
+ return idx
+
+
+def _create_index(name: str, table: str, cols: list[str]) -> None:
+ if _is_offline():
+ op.create_index(name, table, cols)
+ return
+ if name in _get_indexes(table):
+ return
+ op.create_index(name, table, cols)
+
+
+def upgrade() -> None:
+ if _is_offline() or not _has_table("newsletter_subscribers"):
+ op.create_table(
+ "newsletter_subscribers",
+ sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
+ sa.Column("email", sa.String(length=256), nullable=False, unique=True),
+ sa.Column("status", sa.String(length=16), server_default="active", nullable=False),
+ sa.Column("unsub_token", sa.String(length=64), nullable=False, unique=True),
+ sa.Column("subscribed_at", sa.DateTime(timezone=True), nullable=False),
+ sa.Column("unsub_at", sa.DateTime(timezone=True), nullable=True),
+ sa.Column("metadata_json", sa.Text(), server_default="{}", nullable=False),
+ )
+
+ _create_index("ix_newsletter_subscribers_email", "newsletter_subscribers", ["email"])
+ _create_index("ix_newsletter_subscribers_status", "newsletter_subscribers", ["status"])
+ _create_index("ix_newsletter_subscribers_subscribed_at", "newsletter_subscribers", ["subscribed_at"])
+
+
+def downgrade() -> None:
+ op.drop_table("newsletter_subscribers")
diff --git a/asset/notify.png b/asset/notify.png
new file mode 100644
index 0000000..67de3ab
Binary files /dev/null and b/asset/notify.png differ
diff --git a/docs/ROADMAP_TODO.md b/docs/ROADMAP_TODO.md
index b3b8f90..c57f6c3 100644
--- a/docs/ROADMAP_TODO.md
+++ b/docs/ROADMAP_TODO.md
@@ -3,6 +3,16 @@
> 对标 HuggingFace Daily Papers / AlphaXiv 的完整功能规划。
> 本文件同时作为迭代清单使用:完成一项请将 `[ ]` 更新为 `[x]`。
+
+## 已完成进度(2026-02-10)
+
+- [x] 修复 Judge Prompt f-string 语法错误(CI collection error)
+- [x] DailyPaper 去除 `top_k_per_query` 过早截断(确保 `top_n` 能生效)
+- [x] 新增 Repo 批量富化 API:`POST /api/research/paperscool/repos`
+- [x] 新增学者网络 API:`POST /api/research/scholar/network`
+- [x] 新增学者趋势 API:`POST /api/research/scholar/trends`
+- [x] 打通前端 API 代理(Next route handlers)
+
---
## 对标结论
@@ -30,29 +40,29 @@
### 1.1 Paper Registry(论文持久化)
-- [ ] 新增 `PaperModel` 表
+- [x] 新增 `PaperModel` 表
- 字段:`id`(自增)、`arxiv_id`(唯一索引)、`doi`、`title`、`authors_json`、`abstract`、`url`、`external_url`、`pdf_url`、`source`(papers_cool / arxiv_api / semantic_scholar)、`venue`、`published_at`、`first_seen_at`、`keywords_json`、`metadata_json`
- 唯一约束:`(arxiv_id)` 或 `(doi)` 去重
- 文件:`src/paperbot/infrastructure/stores/models.py`
-- [ ] 新增 Alembic 迁移
-- [ ] 新增 `PaperStore`(CRUD + upsert + 按 arxiv_id/doi 查重)
+- [x] 新增 Alembic 迁移
+- [x] 新增 `PaperStore`(CRUD + upsert + 按 arxiv_id/doi 查重)
- 文件:`src/paperbot/infrastructure/stores/paper_store.py`
-- [ ] `build_daily_paper_report()` 完成后自动入库
+- [x] `build_daily_paper_report()` 完成后自动入库
- 文件:`src/paperbot/application/workflows/dailypaper.py`
- 逻辑:遍历 `report["queries"][*]["top_items"]`,逐条 upsert
-- [ ] Judge 评分写入 `PaperModel` 或关联表 `paper_judge_scores`
+- [x] Judge 评分写入 `PaperModel` 或关联表 `paper_judge_scores`
- 字段:`paper_id`、`query`、`overall`、`relevance`、`novelty`、`rigor`、`impact`、`clarity`、`recommendation`、`one_line_summary`、`scored_at`
-- [ ] `PaperFeedbackModel` 的 `paper_id` 关联到 `PaperModel.id`(目前 paper_id 是自由文本)
-- [ ] 论文 ID 归一化工具函数 `normalize_paper_id(url_or_id) -> arxiv_id | doi`
+- [x] `PaperFeedbackModel` 的 `paper_id` 关联到 `PaperModel.id`(目前 paper_id 是自由文本)
+- [x] 论文 ID 归一化工具函数 `normalize_paper_id(url_or_id) -> arxiv_id | doi`
- 文件:`src/paperbot/domain/paper_identity.py`
### 1.2 收藏 / 阅读列表
-- [ ] 新增 `PaperReadingStatusModel` 表(或扩展 `PaperFeedbackModel`)
+- [x] 新增 `PaperReadingStatusModel` 表(或扩展 `PaperFeedbackModel`)
- 字段:`user_id`、`paper_id`、`status`(unread/reading/read/archived)、`saved_at`、`read_at`
-- [ ] API:`GET /api/research/papers/saved`(用户收藏列表,支持排序:judge_score / saved_at / published_at)
-- [ ] API:`POST /api/research/papers/{paper_id}/status`(更新阅读状态)
-- [ ] API:`GET /api/research/papers/{paper_id}`(论文详情,聚合 judge + feedback + summary)
+- [x] API:`GET /api/research/papers/saved`(用户收藏列表,支持排序:judge_score / saved_at / published_at)
+- [x] API:`POST /api/research/papers/{paper_id}/status`(更新阅读状态)
+- [x] API:`GET /api/research/papers/{paper_id}`(论文详情,聚合 judge + feedback + summary)
- [ ] 前端:收藏列表页面组件
- 文件:`web/src/components/research/SavedPapersList.tsx`
@@ -60,13 +70,13 @@
- [ ] 新增 `PaperRepoModel` 表
- 字段:`paper_id`、`repo_url`、`owner`、`name`、`stars`、`forks`、`last_commit_at`、`language`、`description`、`fetched_at`
-- [ ] Enrichment 服务:从论文 abstract/url/external_url 中正则提取 GitHub 链接
- - 文件:`src/paperbot/application/services/repo_enrichment.py`
- - 正则:`github\.com/[\w\-]+/[\w\-]+`
- - 调用 GitHub API 补元数据(stars/forks/language/last_commit)
+- [x] Enrichment 服务:从论文 abstract/url/external_url 中提取 GitHub 链接并补元数据
+ - 当前实现:`src/paperbot/api/routes/paperscool.py`(后续可下沉到 service)
+ - 提取来源:`github_url/external_url/url/pdf_url/alternative_urls + snippet/abstract`
+ - 调用 GitHub API 补元数据(stars/forks/language/updated_at)
- [ ] DailyPaper 生成后异步调用 repo enrichment
- [ ] API:`GET /api/research/papers/{paper_id}/repos`
-- [ ] API:`GET /api/research/paperscool/repos`(批量,含 stars/活跃度)
+- [x] API:`POST /api/research/paperscool/repos`(批量,含 stars/活跃度)
---
@@ -98,11 +108,12 @@
### 2.3 富文本推送
-- [ ] HTML 邮件模板
- - 文件:`src/paperbot/application/services/templates/daily_email.html`
- - 内容:Top 5 论文卡片(标题+摘要+Judge 评分徽章+链接)、趋势摘要、统计数据
- - 使用 Jinja2 渲染
+- [x] HTML 邮件模板(BestBlogs 风格)
+ - 文件:`src/paperbot/application/services/email_template.py`(共享模板)
+ - 布局:本期导读 → 三步精选流程 → 分层推荐(Must Read / Worth Reading / Skim)
+ - 每篇论文"方法大框":研究问题 / 核心方法 / 关键证据 / 适用场景 / 创新点(从 Judge 五维 rationale 自动拼)
- `_send_email()` 发送 `multipart/alternative`(text + html)
+ - SMTP 和 Resend 两个渠道共享同一模板
- [ ] Slack Block Kit 消息
- 将 `_send_slack()` 的 `text` payload 替换为 `blocks` 结构
- 包含 Header Block + Section Blocks(论文卡片)+ Divider
@@ -125,30 +136,27 @@
### 3.1 学者合作网络(Coauthor Graph)
-- [ ] API:`GET /api/research/scholar/network`
- - 参数:`scholar_id`(S2 author ID)、`depth`(默认 1,最大 2)
- - 返回:`{ "nodes": [...], "edges": [...] }`
- - node:`{ "id", "name", "affiliation", "h_index", "paper_count", "citation_count" }`
- - edge:`{ "source", "target", "coauthor_count", "recent_paper_titles" }`
- - 实现:调用 `SemanticScholarClient.get_author()` + `get_author_papers()` → 提取 coauthor → 递归
- - 文件:`src/paperbot/api/routes/scholar.py`
- - 基础设施:`src/paperbot/infrastructure/api_clients/semantic_scholar.py`(已有 `get_author` / `get_author_papers`)
+- [x] API:`POST /api/research/scholar/network`
+ - 参数:`scholar_id` 或 `scholar_name`、`max_papers`、`recent_years`、`max_nodes`
+ - 返回:`{ "scholar", "stats", "nodes", "edges" }`
+ - node:`{ "id", "name", "type", "collab_papers", "citation_sum" }`
+ - edge:`{ "source", "target", "weight", "sample_titles" }`
+ - 实现:调用 `SemanticScholarClient.get_author()` + `get_author_papers()` 聚合 coauthor 图
+ - 文件:`src/paperbot/api/routes/research.py`
+ - 基础设施:`src/paperbot/infrastructure/api_clients/semantic_scholar.py`
- [ ] 前端:学者关系图可视化(复用 xyflow / d3-force)
- 文件:`web/src/components/research/ScholarNetworkGraph.tsx`
### 3.2 学者趋势分析
-- [ ] API:`GET /api/research/scholar/trends`
- - 参数:`scholar_id`、`years`(默认 5)
+- [x] API:`POST /api/research/scholar/trends`
+ - 参数:`scholar_id` 或 `scholar_name`、`max_papers`、`year_window`
- 返回:
- - `yearly_stats`:`[{ "year", "paper_count", "citation_count", "top_venues" }]`
- - `topic_migration`:`[{ "period", "keywords", "shift_direction" }]`(LLM 生成)
- - `citation_velocity`:`{ "recent_3y_avg", "historical_avg", "trend" }`
- - `collaboration_trend`:`{ "unique_coauthors_per_year", "new_collaborators" }`
- - 实现:
- - 基础统计:从 S2 author papers 聚合
- - 主题迁移:复用 `TrendAnalyzer`(`analysis/trend_analyzer.py`),输入学者各年代表论文
- - 文件:`src/paperbot/application/services/scholar_trends.py`
+ - `publication_velocity`:`[{ "year", "papers", "citations" }]`
+ - `topic_distribution` / `venue_distribution`
+ - `trend_summary`:`{ "publication_trend", "citation_trend", "active_years" }`
+ - 实现:从 S2 author papers 聚合统计并生成趋势方向
+ - 文件:`src/paperbot/api/routes/research.py`
- [ ] 前端:学者趋势图表(年度发表量/引用趋势/主题变迁时间线)
- 文件:`web/src/components/research/ScholarTrendsChart.tsx`
@@ -355,6 +363,85 @@ OpenClaw 的"多 Agent"是指将不同消息渠道路由到不同 Agent 实例
- [ ] 全文索引(用于 AI Chat with Paper 的深度问答)
- [ ] 逐段批注(类 AlphaXiv)
+### 4.5 论文框架图提取(Paper Framework Figure Extraction)
+
+> 目标:自动提取论文中的方法框架图(通常是 Figure 1),嵌入邮件推送和论文详情页。
+> 三条提取路径 + LaTeX 快速通道,后续做 A/B Test 对比效果。
+
+**路径 A:LaTeX 源码直提(arXiv 论文优先,精度最高)**
+
+- [ ] arXiv 提供 LaTeX 源码包下载(`https://arxiv.org/e-print/{arxiv_id}`)
+- [ ] 解压 `.tar.gz` → 解析 `.tex` 文件中的 `\includegraphics` 命令
+- [ ] 定位框架图:匹配 `\begin{figure}` 环境 + caption 关键词("overview"、"framework"、"architecture"、"pipeline")
+- [ ] 直接提取对应图片文件(`.pdf`/`.png`/`.eps`)→ 转换为 PNG/WebP
+- 优点:无损质量、精准定位、无需模型推理
+- 缺点:仅限 arXiv 有源码的论文(覆盖率 ~70-80%)
+
+**路径 B:MinerU 文档布局检测(PDF 结构化解析,推荐)**
+
+- [ ] 使用 [MinerU](https://github.com/opendatalab/MinerU)(30k+ stars)解析 PDF
+ - LayoutLMv3 布局检测 → 自动识别 figure 区域 + 导出图片 + 关联 caption 文本
+ - 输出 Markdown/JSON + 图片目录,figure 作为独立元素
+- [ ] 遍历 MinerU 输出的 figure 列表 → 匹配 caption 关键词定位框架图
+- [ ] 备选:[Docling](https://github.com/DS4SD/docling)(IBM 出品,结构化文档解析)或 [Marker](https://github.com/vikparuchuri/marker)(PDF→Markdown,速度快)
+- 优点:从 PDF 内部结构提取原始图片(无损)、自动关联 caption、文档专用模型准确率高
+- 缺点:需下载 LayoutLMv3 权重(~1.5GB),首次推理较慢
+
+**路径 C:SAM 3 视觉语义分割(扫描版 PDF fallback)**
+
+- [ ] 使用 [SAM 3](https://pyimagesearch.com/2026/01/26/sam-3-concept-based-visual-understanding-and-segmentation/)(Meta Segment Anything Model 3)
+ - 支持 concept-based text prompt 分割:`"framework diagram"` / `"architecture overview figure"`
+ - PDF 页面渲染为高 DPI 图片 → SAM 3 分割 → 裁剪导出
+- [ ] 适用场景:扫描版 PDF(图片型,无内嵌矢量图)、MinerU 提取失败的 fallback
+- 优点:不依赖 PDF 内部结构,纯视觉语义理解,对扫描件友好
+- 缺点:图片质量受渲染 DPI 影响(有损)、拿不到 caption 文本、模型较重(ViT-H)
+
+**路径 D:PyMuPDF 轻量启发式(兜底方案)**
+
+- [ ] PyMuPDF(fitz)直接提取 PDF 内嵌位图
+- [ ] 启发式定位:页面位置(前 3 页)+ 图片尺寸(宽度 > 页面 50%)+ 周围文本匹配("Figure 1")
+- [ ] 可选:LLM 视觉模型辅助判断(传入候选图片 → 判断哪张是框架图)
+- 优点:零模型依赖、速度极快
+- 缺点:矢量图可能提取为空、启发式规则覆盖率有限
+
+**提取策略(级联 fallback):**
+
+```
+LaTeX 源码可用? ──是──→ 路径 A(LaTeX 直提)
+ │否
+ ▼
+MinerU 提取成功? ──是──→ 路径 B(布局检测)
+ │否
+ ▼
+扫描版 PDF? ──是──→ 路径 C(SAM 3 分割)
+ │否
+ ▼
+路径 D(PyMuPDF 启发式兜底)
+```
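+
+级联调度本身可以归结为一个很小的分发函数。下面是一个最小示意(各路径的提取函数仅为规划中的占位,签名为假设):
+
+```python
+from pathlib import Path
+from typing import Callable, Optional
+
+# 输入论文元数据 dict,返回框架图文件路径;失败返回 None
+Extractor = Callable[[dict], Optional[Path]]
+
+def extract_framework_figure(paper: dict, extractors: list[Extractor]) -> Optional[Path]:
+    """按 A→B→C→D 的优先级依次尝试,返回第一个成功的结果。"""
+    for extract in extractors:
+        try:
+            figure = extract(paper)
+        except Exception:
+            figure = None  # 单条路径失败不阻断后续 fallback
+        if figure is not None:
+            return figure
+    return None
+
+# 使用示意:extractors 的顺序即级联优先级
+# extract_framework_figure(paper, [latex_extract, mineru_extract, sam3_extract, pymupdf_extract])
+```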
+
+**A/B Test 计划:**
+
+- [ ] 收集 100 篇论文样本(50 有 LaTeX 源码 + 50 仅 PDF)
+- [ ] 人工标注 ground truth(每篇论文的框架图是哪张)
+- [ ] 对比指标:提取成功率、图片质量(SSIM)、定位准确率、耗时
+- [ ] 确定生产环境的级联策略和各路径权重
+
+**通用后处理:**
+
+- [ ] 图片压缩 + 尺寸归一化(邮件内嵌 ≤600px 宽)
+- [ ] 上传到对象存储(S3/R2)或 base64 内嵌邮件
+- [ ] 缓存:`PaperModel` 新增 `framework_figure_url` 字段
+- [ ] 邮件模板集成:在方法大框下方展示框架图缩略图
+
+**文件规划:**
+
+- [ ] `src/paperbot/application/services/figure_extractor.py` — 统一入口 + 级联调度
+- [ ] `src/paperbot/application/services/extractors/latex_extractor.py` — 路径 A
+- [ ] `src/paperbot/application/services/extractors/mineru_extractor.py` — 路径 B
+- [ ] `src/paperbot/application/services/extractors/sam3_extractor.py` — 路径 C
+- [ ] `src/paperbot/application/services/extractors/pymupdf_extractor.py` — 路径 D
+- [ ] 依赖:`magic-pdf`(MinerU)、`segment-anything-3`、`PyMuPDF`、`Pillow`(`tarfile` 为 Python 标准库,无需额外安装)
+
---
## 实现依赖关系
diff --git a/docs/anchor_source_authority_model.md b/docs/anchor_source_authority_model.md
new file mode 100644
index 0000000..297a165
--- /dev/null
+++ b/docs/anchor_source_authority_model.md
@@ -0,0 +1,770 @@
+# 信息源锚点模型:形式化建模与系统设计
+
+> 从噪声信息流中发现高质量锚点,建模信息源之间的权威性传播关系,并个性化标定锚点价值。
+
+---
+
+## 1. 本质问题
+
+这是一个**异构信息网络中的权威性发现与传播问题**(Authority Discovery in Heterogeneous Information Networks)。
+
+它结合了三个经典问题:
+
+1. **信号检测**(Signal Detection)— 从噪声中识别高质量信号
+2. **权威性传播**(Authority Propagation)— PageRank/HITS 的核心思想:权威性不是孤立的属性,而是通过关系网络传播的
+3. **锚点标定**(Anchor Calibration)— 锚点不是绝对的,是相对于观察者(用户研究方向)和时间的
+
+### 1.1 信号检测:从噪声中找锚点
+
+每天面对的信息流本质是一个**低信噪比信道**。arXiv 每天 ~500 篇 CS 论文,绝大多数是噪声(对特定研究方向而言)。锚点就是这个信道中的**高信噪比节点** — 它们不只是自身有价值,而且它们的存在能帮你**校准其他信号的价值**。
+
+**一个好的锚点的本质特征是:它能减少你评估其他信息时的不确定性。**
+
+例:当你知道 Dawn Song 在做 AI Safety,她的新论文就是一个锚点 — 不只因为这篇论文好,而是因为它帮你快速判断:
+- 这个方向是活跃的
+- 这些合作者值得关注
+- 这些 venue 是相关的
+- 这些被引论文可能是基础工作
+
+锚点的信息论定义:**锚点是观测到后能最大程度降低你对信息空间不确定性的节点**。
+
+```
+H(信息空间 | 观测到锚点) << H(信息空间)
+
+其中 H 是信息熵。锚点的质量 ∝ 互信息 I(锚点; 信息空间)
+```
+
+### 1.2 权威传播:锚点之间的关系
+
+这就是 PageRank 的核心洞察:**权威性不是孤立属性,而是通过关系网络传播的**。
+
+```
+锚点学者 ──发表于──→ 锚点 venue
+ │ │
+ └──引用──→ 锚点论文 ──被引用──→ 新论文(被锚点网络"背书")
+ │
+ └──合作──→ 新学者(通过合作关系获得"锚点传播")
+```
+
+**关键:锚点不是单个实体的属性,而是整个网络中相对位置的函数。**
+
+一篇论文被 3 位核心锚点学者引用 vs 被 30 篇普通论文引用 — 这两个事件传达的信号完全不同。前者是领域顶级专家的"背书",后者可能只是普通的文献综述。现有的引用计数无法区分这种差异,但基于网络传播的权威评分可以。
+
+### 1.3 个性化标定:锚点是相对的
+
+同一个学者,对研究 NLP 的人和研究 Systems 的人是完全不同的锚点。同一个会议在 2020 年是锚点,2026 年可能影响力已经转移。所以锚点评分实际上是一个**四元函数**:
+
+```
+anchor_score(source, domain, time, observer) → [0, 1]
+```
+
+- **source**: 信息源实体(学者/venue/网站/repo)
+- **domain**: 领域上下文(安全/ML/系统/SE)
+- **time**: 时间窗口(最近 6 个月 vs 历史全量)
+- **observer**: 用户的研究方向和偏好
+
+---
+
+## 2. 形式化建模
+
+### 2.1 异构信息网络定义
+
+定义异构信息网络 $G = (V, E, \phi, \psi)$,其中:
+
+- $V$ 是节点集合
+- $E \subseteq V \times V$ 是边集合
+- $\phi: V \rightarrow T_V$ 是节点类型映射函数
+- $\psi: E \rightarrow T_E$ 是边类型映射函数
+- $|T_V| + |T_E| > 2$(异构性条件)
+
+#### 节点类型 $T_V$
+
+```
+T_V = {Scholar, Paper, Venue, Website, Topic, Repo}
+```
+
+| 节点类型 | 属性集 | 内在质量信号 |
+|---------|--------|------------|
+| **Scholar** | id, name, h_index, citation_count, paper_count, fields, affiliations | h-index, 总引用, 论文产出率 |
+| **Paper** | id, title, year, citations, venue, judge_scores, abstract | 引用数, Judge 综合分, venue tier |
+| **Venue** | name, domain, tier, acceptance_rate, impact_factor | 领域排名, 接收率, 影响因子 |
+| **Website** | url, type, freshness, coverage | 覆盖率, 更新频率, 数据质量 |
+| **Topic** | keyword, field, trending_score | 论文量增速, 引用集中度 |
+| **Repo** | url, stars, forks, language, last_commit, contributors | stars, 活跃度, 贡献者数 |
+
+#### 边类型 $T_E$
+
+```
+T_E = {authors, published_at, cites, coauthors, belongs_to, listed_on, has_repo, researches}
+```
+
+| 边类型 | 源节点 → 目标节点 | 权威传播含义 | 权重来源 |
+|--------|------------------|------------|---------|
+| **authors** | Scholar → Paper | 学者为论文背书 | 作者排序位置 |
+| **published_at** | Paper → Venue | Venue 为论文背书(接收=认可) | 接收年份 |
+| **cites** | Paper → Paper | 被引论文获得引用方的传播 | 引用上下文(正/负/中性) |
+| **coauthors** | Scholar ↔ Scholar | 合作者之间的信任传递 | 合作频次, 合作年限 |
+| **belongs_to** | Paper → Topic | 论文质量反哺主题热度 | 主题匹配置信度 |
+| **listed_on** | Paper → Website | 数据源的覆盖质量 | 上线时间 |
+| **has_repo** | Paper → Repo | 代码实现增强论文可信度 | 代码与论文匹配度 |
+| **researches** | Scholar → Topic | 学者定义研究方向的权威性 | 该方向的论文数占比 |
+
+#### 元路径(Meta-path)
+
+元路径是异构图上连接两个节点的语义路径模式,用于定义"什么样的关系链构成有意义的权威传播"。
+
+关键元路径定义:
+
+```
+Scholar Authority Paths:
+ P1: Scholar ──authors──→ Paper ──published_at──→ Venue
+ 含义:学者通过在高质量 venue 发表论文获得权威
+
+ P2: Scholar ──authors──→ Paper ──cites──→ Paper ──authors──→ Scholar
+ 含义:学者 A 引用学者 B 的论文 → B 获得 A 传播的权威
+
+ P3: Scholar ──coauthors──→ Scholar ──authors──→ Paper
+ 含义:合作者的论文质量反映到当前学者
+
+Venue Authority Paths:
+ P4: Venue ←──published_at── Paper ──cites──→ Paper ──published_at──→ Venue
+ 含义:Venue A 的论文引用 Venue B 的论文 → B 获得 A 的传播
+
+Topic Authority Paths:
+ P5: Topic ←──belongs_to── Paper ──authors──→ Scholar ──researches──→ Topic
+ 含义:Topic 间通过学者的跨领域工作产生关联
+
+Emerging Source Detection:
+ P6: Scholar ──coauthors──→ Scholar(anchor) ──researches──→ Topic
+ 含义:与锚点学者合作且进入新方向 → 潜力信号
+```
+
+### 2.2 锚点评分公式
+
+对一个 source 节点 $s$,其锚点评分是四个分量的组合:
+
+$$
+\text{AnchorScore}(s) = Q(s) \cdot N(s) \cdot T(s) \cdot R(s, o)
+$$
+
+或者采用加权加法形式(避免任何一项为零导致整体为零):
+
+$$
+\text{AnchorScore}(s) = \alpha \cdot Q(s) + \beta \cdot N(s) + \gamma \cdot T(s) + \delta \cdot R(s, o)
+$$
+
+其中 $\alpha + \beta + \gamma + \delta = 1$,建议初始权重:
+
+| 分量 | 权重 | 说明 |
+|------|------|------|
+| $\alpha$ (内在质量) | 0.30 | 基础门槛,但不应主导 |
+| $\beta$ (网络位置) | 0.35 | 最重要的信号 — 网络效应 |
+| $\gamma$ (时间动态) | 0.15 | 区分活跃 vs 历史锚点 |
+| $\delta$ (观察者相关) | 0.20 | 个性化校准 |
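+
+加权加法形式的一个最小计算示意(假设 Q/N/T/R 均已归一化到 [0, 1],权重取上表默认值):
+
+```python
+DEFAULT_WEIGHTS = {"alpha": 0.30, "beta": 0.35, "gamma": 0.15, "delta": 0.20}
+
+def anchor_score(q: float, n: float, t: float, r: float, w: dict = DEFAULT_WEIGHTS) -> float:
+    """AnchorScore(s) = α·Q + β·N + γ·T + δ·R,各分量取值范围 [0, 1]。"""
+    return w["alpha"] * q + w["beta"] * n + w["gamma"] * t + w["delta"] * r
+
+# 例:anchor_score(0.6, 0.8, 0.4, 0.7) ≈ 0.66
+```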
+
+#### 2.2.1 Q(s) — 内在质量
+
+内在质量是 source 自身的客观属性评分,不依赖网络关系。
+
+**Scholar 内在质量:**
+
+```
+Q_scholar(s) = normalize(
+ w_h · log(1 + h_index) +
+ w_c · log(1 + citation_count) +
+ w_p · min(paper_count / 50, 1.0) +
+ w_v · avg_venue_tier
+)
+
+其中:
+ w_h = 0.40 (h-index 权重,对数缩放)
+ w_c = 0.25 (总引用权重)
+ w_p = 0.10 (论文数量,上限 50 篇后饱和)
+ w_v = 0.25 (平均发表 venue tier)
+```
+
+h-index 使用对数缩放是因为它的分布是重尾的(h=50 和 h=100 的差距远小于 h=5 和 h=10 的差距)。
+
+**Paper 内在质量:**
+
+```
+Q_paper(s) = normalize(
+ w_cite · citation_score(citations) + # 引用分(分段映射)
+ w_venue · venue_tier_score(venue) + # venue tier 分
+ w_judge · judge_overall / 5.0 + # Judge 综合分(如有)
+ w_code · has_code_score # 代码可用性
+)
+
+其中:
+ w_cite = 0.35
+ w_venue = 0.25
+ w_judge = 0.25
+ w_code = 0.15
+```
+
+这里 `citation_score()` 复用现有的 `CITATION_SCORE_RANGES` 分段映射(`domain/influence/weights.py`)。
+
+**Venue 内在质量:**
+
+```
+Q_venue(s) = tier_score(tier) · domain_relevance(domain)
+
+其中:
+ tier_score(tier1) = 1.0
+ tier_score(tier2) = 0.6
+ tier_score(other) = 0.2
+```
+
+**Repo 内在质量:**
+
+```
+Q_repo(s) = normalize(
+ w_stars · stars_score(stars) +
+ w_activity · activity_score(last_commit) +
+ w_contrib · min(contributors / 10, 1.0)
+)
+```
+
+**归一化**:所有 Q(s) 在同类型节点内做 min-max 归一化到 [0, 1]。不同类型之间不直接比较 Q 值。
+
+#### 2.2.2 N(s) — 网络位置(异构 PageRank)
+
+N(s) 度量的是节点在异构网络中的结构重要性。这是锚点评分中最关键的分量 — 它捕获了"被谁引用/合作/发表"的信息。
+
+**基础算法:异构阻尼 PageRank**
+
+传统 PageRank 在同构图上定义:
+
+$$
+PR(v) = \frac{1 - d}{|V|} + d \sum_{u \in \text{in}(v)} \frac{PR(u)}{|\text{out}(u)|}
+$$
+
+在异构图上,需要对不同边类型赋予不同的传播权重:
+
+$$
+N(v) = \frac{1 - d}{|V|} + d \sum_{u \in \text{in}(v)} \frac{w_{\psi(u,v)} \cdot N(u)}{Z(u)}
+$$
+
+其中:
+- $d = 0.85$(阻尼因子)
+- $w_{\psi(u,v)}$ 是边类型 $\psi(u,v)$ 的传播权重
+- $Z(u) = \sum_{v' \in \text{out}(u)} w_{\psi(u,v')}$ 是归一化因子
+
+**边类型传播权重:**
+
+```python
+EDGE_PROPAGATION_WEIGHTS = {
+ "cites": 0.30, # 引用传播最强 — "我认可你的工作"
+ "coauthors": 0.25, # 合作关系 — "我信任你足以共同署名"
+ "published_at": 0.20, # venue 背书 — "这个会议认可了这篇论文"
+ "has_repo": 0.15, # 代码关联 — 实现增强可信度
+ "belongs_to": 0.10, # 主题归属 — 最弱的关联
+}
+```
+
+**算法伪代码:**
+
+```python
+def heterogeneous_pagerank(graph, edge_weights, d=0.85, iterations=20, epsilon=1e-6):
+ """
+ 异构 PageRank — 不同边类型有不同传播权重。
+
+ Args:
+ graph: 异构信息网络
+ edge_weights: Dict[edge_type, float] — 边类型传播权重
+ d: 阻尼因子
+ iterations: 最大迭代次数
+ epsilon: 收敛阈值
+
+ Returns:
+ Dict[node_id, float] — 每个节点的网络权威分数
+ """
+ n = len(graph.nodes)
+
+ # 初始化:用内在质量 Q(s) 作为先验
+ authority = {
+ node.id: node.intrinsic_quality / n
+ for node in graph.nodes
+ }
+
+ for iteration in range(iterations):
+ new_authority = {}
+ max_delta = 0
+
+ for node in graph.nodes:
+ # 从所有入边收集权威性
+ incoming_authority = 0.0
+ for neighbor, edge_type in graph.in_edges(node):
+ w = edge_weights.get(edge_type, 0.1)
+ out_degree = sum(
+ edge_weights.get(et, 0.1)
+ for _, et in graph.out_edges(neighbor)
+ )
+ if out_degree > 0:
+ incoming_authority += w * authority[neighbor.id] / out_degree
+
+ new_score = (1 - d) / n + d * incoming_authority
+ max_delta = max(max_delta, abs(new_score - authority[node.id]))
+ new_authority[node.id] = new_score
+
+ authority = new_authority
+
+ # 收敛检测
+ if max_delta < epsilon:
+ break
+
+ # 归一化到 [0, 1]
+ max_auth = max(authority.values()) or 1
+ return {nid: score / max_auth for nid, score in authority.items()}
+```
+
+**HITS 变体(可选增强):Hub-Authority 双角色**
+
+HITS 算法区分了两种角色:
+- **Authority**:被高质量 hub 指向的节点(好论文 = 被好综述引用的论文)
+- **Hub**:指向很多高质量 authority 的节点(好综述 = 引用了很多好论文的综述)
+
+在 PaperBot 语境下:
+- Survey 论文是典型的 **Hub**(引用大量论文)
+- 被 survey 引用的论文是 **Authority**
+- 学者同时扮演两种角色:作为 Hub(引用他人),作为 Authority(被引用)
+
+```
+authority(v) = ∑_{u → v} hub(u)
+hub(v) = ∑_{v → u} authority(u)
+```
+
+HITS 比 PageRank 能更好地区分"综述型学者"(hub 分高)和"原创型学者"(authority 分高),但计算复杂度略高。建议 Phase 1 用 PageRank,Phase 2 考虑 HITS。
+
+#### 2.2.3 T(s) — 时间动态
+
+时间动态捕获 source 的**当前活跃度**和**趋势方向**。一个 h-index=80 但 5 年没发过论文的学者,和一个 h-index=15 但近 6 个月有 3 篇顶会的新学者,后者的时间动态分数应该更高。
+
+**Scholar 时间动态:**
+
+```
+T_scholar(s) = w_rec · recency(s) + w_vel · velocity(s) + w_trend · trend(s)
+
+其中:
+ recency(s) = exp(-λ · months_since_last_paper)
+ λ = 0.693 / 12 (半衰期 12 个月)
+
+ velocity(s) = min(papers_last_12_months / 5, 1.0)
+ (年产 5 篇以上饱和)
+
+ trend(s) = citation_velocity_trend (来自 DynamicPIS)
+ accelerating → 1.0
+ stable → 0.5
+ declining → 0.1
+
+权重:
+ w_rec = 0.40, w_vel = 0.35, w_trend = 0.25
+```
+
+**Paper 时间动态:**
+
+```
+T_paper(s) = recency_decay(days_since_publish)
+ = exp(-0.693 · days / half_life)
+
+half_life:
+ - 对于 trending 类场景: 30 天 (快速衰减,关注最新)
+ - 对于 foundational 类场景: 365 天 (慢速衰减,经典论文保留)
+```
+
+**Venue 时间动态:**
+
+```
+T_venue(s) = w_accept · normalized_acceptance_rate_trend +
+ w_submit · normalized_submission_growth +
+ w_cite · venue_citation_velocity
+
+(Venue 的时间动态变化较慢,可每年更新一次)
+```
+
+**Repo 时间动态:**
+
+```
+T_repo(s) = w_commit · commit_recency +
+ w_stars · star_velocity +
+ w_issue · issue_resolution_rate
+
+commit_recency = exp(-0.693 · days_since_last_commit / 30)
+star_velocity = stars_last_30_days / max(total_stars, 1)
+```
+
+#### 2.2.4 R(s, o) — 观察者相关性
+
+观察者相关性将全局的权威性投射到特定用户的研究方向上。
+
+**基于 embedding 的语义相似度:**
+
+```
+R(s, o) = cosine_similarity(embed(source_profile), embed(observer_profile))
+
+source_profile = 拼接(
+ source.keywords,
+ source.fields,
+ source.recent_paper_titles[:5], # 最近 5 篇论文标题
+ source.venue_names[:3] # 常发 venue
+)
+
+observer_profile = 拼接(
+ observer.track_keywords,
+ observer.track_methods,
+ observer.liked_paper_titles[:10], # 最近 like 的 10 篇论文
+ observer.search_queries[:5] # 最近 5 次搜索
+)
+```
+
+这里复用现有的 `EmbeddingProvider`(`context_engine/embeddings.py`),使用 `text-embedding-3-small` 计算。
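+
+一个最小的计算示意(`embed()` 为假设的封装,底层即上述 `EmbeddingProvider`):
+
+```python
+import numpy as np
+
+def observer_relevance(source_profile: str, observer_profile: str, embed) -> float:
+    """R(s, o):两段 profile 文本向量的余弦相似度,裁剪到 [0, 1]。"""
+    a = np.asarray(embed(source_profile), dtype=np.float32)
+    b = np.asarray(embed(observer_profile), dtype=np.float32)
+    sim = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))
+    return max(0.0, min(1.0, sim))
+```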
+
+**领域归一化**:
+
+相同 h-index 在不同领域代表不同含义。ML 领域 h-index=30 是中等水平,而理论 CS h-index=30 是顶级。
+
+```
+Q_normalized(s) = Q_raw(s) / Q_median(s.domain)
+
+其中 Q_median(domain) 是该领域所有已知 source 的 Q 中位数。
+```
+
+这需要按领域维护 Q 的分布统计,可以在 `top_venues.yaml` 中按 domain 扩展。
+
+### 2.3 锚点层级判定
+
+锚点不是二元分类,而是一个**连续光谱**。为了实用,划分四个层级:
+
+| 层级 | AnchorScore 区间 | 含义 | 系统行为 |
+|------|-----------------|------|---------|
+| **核心锚点** (Core Anchor) | ≥ 0.8 | 领域奠基者/顶会常客/关键 venue | 主动追踪,新论文自动推送,搜索结果置顶加权 |
+| **活跃锚点** (Active Anchor) | 0.5 ~ 0.8 | 当前产出高、引用增速快、重要 repo | 纳入 DailyPaper 优先排序,Judge 上下文注入 |
+| **潜力锚点** (Emerging Anchor) | 0.3 ~ 0.5 | 新兴学者/新 repo/新趋势 | 标记关注,定期复查,Trending 候选 |
+| **普通源** (Background) | < 0.3 | 背景噪声 | 仅在搜索命中时展示,不主动推荐 |
+
+### 2.4 潜力锚点检测
+
+潜力锚点的识别特别关键 — 这是"有潜力的源头"的核心。
+
+**特征定义:内在质量不高,但动态信号异常强**
+
+```python
+def classify_emerging_anchor(source, window_months=6):
+ """
+ 识别潜力锚点:内在质量一般,但增速异常。
+
+ 核心思想:
+ - 核心锚点 = 高 Q + 高 N(已经很强了)
+ - 潜力锚点 = 中低 Q + 异常高 ΔN 或 ΔT(正在变强)
+ """
+ q = intrinsic_quality(source)
+ t = temporal_momentum(source)
+
+ # 网络位置变化量:最近 window 内的 N(s) 增速
+ n_current = network_authority(source, time=now)
+ n_previous = network_authority(source, time=now - window)
+ delta_n = (n_current - n_previous) / max(n_previous, 0.01)
+
+ # 异常检测:增速是否超过同层级 source 的 2σ
+ peers = get_peers(source, q_range=(q - 0.1, q + 0.1))
+ peer_delta_mean = mean([delta_n_of(p) for p in peers])
+ peer_delta_std = std([delta_n_of(p) for p in peers])
+
+ z_score = (delta_n - peer_delta_mean) / max(peer_delta_std, 0.01)
+
+ if q < 0.5 and z_score > 2.0:
+ return "emerging_anchor"
+ elif q >= 0.8:
+ return "core_anchor"
+ elif q >= 0.5:
+ return "active_anchor"
+ else:
+ return "background"
+```
+
+**Scholar 潜力信号:**
+
+| 信号 | 检测方法 | 示例 |
+|------|---------|------|
+| **顶会突破** | 首次在 tier1 venue 发表 | 博士生的第一篇 NeurIPS |
+| **锚点合作** | 首次与核心锚点合作 | 新学者与 Dawn Song 合著 |
+| **引用爆发** | 近 6 个月引用增速 > 同年龄段 2σ | 一篇论文突然被广泛引用 |
+| **跨领域迁移** | 原本在 A 领域,开始在 B 领域发表 | Systems 学者开始做 ML Security |
+| **代码影响力** | 关联 repo stars 快速增长 | 论文 repo 一月内 1000+ stars |
+
+**Topic 潜力信号:**
+
+| 信号 | 检测方法 |
+|------|---------|
+| **论文聚集** | 近 3 个月该 topic 论文数量异常增长 |
+| **锚点学者进入** | 核心锚点开始在该 topic 发表 |
+| **跨域引用** | 多个不同领域的论文开始引用该 topic 的论文 |
+| **Industry 关注** | 关联 repo 出现企业贡献者/sponsor |
+
+---
+
+## 3. 与经典算法的关系
+
+### 3.1 vs. PageRank
+
+PageRank 是同构有向图上的全局权威评分。本模型的 N(s) 分量是 PageRank 在异构图上的扩展,增加了:
+- 边类型权重(不同关系传播不同强度)
+- 节点类型感知(Scholar 和 Paper 的权威含义不同)
+- Q(s) 先验(初始化不是均匀的,而是用内在质量)
+
+### 3.2 vs. HITS
+
+HITS 区分 Hub(指向好东西的节点)和 Authority(被好东西指向的节点)。在本模型中:
+- Survey 论文是 Hub,被 survey 引用的原创论文是 Authority
+- 数据源网站(papers.cool、arXiv)是 Hub,被它们列出的论文是 Authority
+- HITS 可以作为 N(s) 的替代算法,提供更精细的角色区分
+
+### 3.3 vs. TrustRank
+
+TrustRank 从一组已知可信的种子节点("白名单")开始,沿链接传播信任。在本模型中:
+- Scholar 订阅列表(`scholar_subscriptions.yaml`)是天然的种子锚点
+- `top_venues.yaml` 的 tier1 venue 是天然的种子锚点
+- 可以用 TrustRank 的思路:从种子集出发做有限跳数的权威传播
+
+### 3.4 vs. Metapath2Vec / HAN
+
+Metapath2Vec 和 HAN(Heterogeneous Attention Network)是异构图上的表征学习方法。它们通过元路径引导的随机游走或注意力机制学习节点嵌入。
+
+对于 PaperBot 的规模(数千到数万节点),显式的 PageRank 计算比深度图模型更实用。但如果未来需要更精细的语义相似度(如"找到和 X 学者研究风格相似的学者"),图嵌入方法值得考虑。
+
+---
+
+## 4. 与 PaperBot 现有基础的对接
+
+### 4.1 已有基础设施
+
+| 已有组件 | 对应锚点模型分量 | 当前限制 |
+|---------|-----------------|---------|
+| `InfluenceCalculator` (PIS) | Q(s) — Paper 内在质量 | 仅评估 Paper,未扩展到 Scholar/Venue |
+| `DynamicPISCalculator` | T(s) — 时间动态 | 仅估算引用速度,无真实引用历史 |
+| `top_venues.yaml` (tier1/tier2) | Q(s) — Venue 内在质量 | 静态配置,无自动发现新 venue |
+| `Scholar` domain model | Q(s) — Scholar 内在质量 | h-index 无领域归一化 |
+| Scholar Network API (coauthor graph) | N(s) — 网络位置输入 | 已有 coauthor 数据,但未计算传播分数 |
+| Judge 5 维评分 | Q(s) — Paper 细粒度质量 | 独立评分,未反哺 source 权威 |
+| `TrackRouter` (keyword matching) | R(s,o) — 观察者相关性 | 基于 keyword overlap,非语义嵌入 |
+| `EmbeddingProvider` (OpenAI) | R(s,o) — 语义计算 | 已有基础设施,但未用于 source 评分 |
+| `INFLUENCE_WEIGHTS` | 权重配置 | 现有权重结构可复用 |
+| `_score_record()` in topic search | 搜索排序 | 仅用 token 匹配,未考虑 source 权威 |
+
+### 4.2 关键缺口
+
+1. **无 Source 统一注册** — Scholar/Venue/Repo 是独立的,没有统一的 `Source` 抽象
+2. **无网络级评分** — 所有评分都是实体级的,没有权威传播
+3. **无 Source 间关系建模** — coauthor 数据已有但未用于评分
+4. **Judge 评分单向流** — Judge 评完论文后分数不反哺到学者/venue 的权威
+5. **搜索排序忽略 source** — `_score_record()` 不考虑论文作者/venue 的锚点地位
+
+### 4.3 反哺回路
+
+锚点模型一旦建立,可以反哺现有的多个模块:
+
+```
+Source Authority Layer
+ │
+ ├──→ DailyPaper: global_top 排序时加入 anchor_boost
+ │ anchor_boost = 0.2 * max(author_anchor_scores)
+ │ final_score = base_score + anchor_boost
+ │
+ ├──→ Topic Search: _score_record() 中加入 source_authority_factor
+ │ source_factor = avg(author_anchors) * venue_anchor
+ │ score *= (1 + 0.3 * source_factor)
+ │
+ ├──→ Judge: 评分上下文中注入 "This paper is by [anchor_level] scholar X"
+ │ 帮助 LLM Judge 更好地评估 impact 维度
+ │
+ ├──→ Scholar Tracking: 自动发现潜力锚点,建议订阅
+ │ "New emerging anchor detected: Y (z_score=2.5, 3 ICML papers in 6 months)"
+ │
+ ├──→ Trending: trending_score 中加入 source_authority 权重
+ │ trending_score += 0.15 * avg_author_anchor_score
+ │
+ └──→ 推荐系统: 基于用户锚点偏好推荐新论文
+ "Based on your core anchors, you might find this relevant..."
+```
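+
+以 DailyPaper 与 Topic Search 两条回路为例,加权逻辑的最小示意如下(系数取自上图,函数与字段名为假设):
+
+```python
+def apply_anchor_boosts(base_score: float, author_anchor_scores: list[float], venue_anchor: float) -> dict:
+    """DailyPaper 用加法 boost,Topic Search 用乘法因子。"""
+    anchor_boost = 0.2 * max(author_anchor_scores, default=0.0)
+    avg_author = sum(author_anchor_scores) / max(len(author_anchor_scores), 1)
+    source_factor = avg_author * venue_anchor
+    return {
+        "daily_final_score": base_score + anchor_boost,
+        "search_score": base_score * (1 + 0.3 * source_factor),
+    }
+```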
+
+---
+
+## 5. 实施架构
+
+### 5.1 系统架构
+
+```
+ ┌─────────────────────────────────┐
+ │ Source Authority Layer │
+ │ │
+ ┌─────────┐ │ ┌───────────────────────────┐ │
+ │ S2 API │──────│──│ 1. Source Registry │ │
+ │ (引用/ │ │ │ 统一 Scholar/Venue/Repo │ │
+ │ 合作) │ │ │ 为 Source 实体 │ │
+ └─────────┘ │ └─────────────┬───────────────┘ │
+ │ │ │
+ ┌─────────┐ │ ┌────────────▼────────────────┐ │
+ │ PIS / │──────│──│ 2. Relation Graph │ │
+ │ Judge / │ │ │ Source 间关系网络 │ │
+ │ Venues │ │ │ (引用/合作/发表/主题) │ │
+ └─────────┘ │ └─────────────┬───────────────┘ │
+ │ │ │
+ ┌─────────┐ │ ┌────────────▼────────────────┐ │
+ │ Track │──────│──│ 3. Authority Propagation │ │
+ │ Router │ │ │ 异构 PageRank 迭代计算 │ │
+ │ (用户 │ │ │ Q(s) + N(s) + T(s) │ │
+ │ 偏好) │ │ └─────────────┬───────────────┘ │
+ └─────────┘ │ │ │
+ │ ┌────────────▼────────────────┐ │
+ │ │ 4. Anchor Classifier │ │
+ │ │ 锚点层级判定 │ │
+ │ │ + 潜力锚点异常检测 │ │
+ │ └─────────────┬───────────────┘ │
+ │ │ │
+ │ ┌────────────▼────────────────┐ │
+ │ │ 5. Observer Projection │ │
+ │ │ R(s,o) 个性化投影 │ │
+ │ │ 基于 Track + Embedding │ │
+ │ └────────────────────────────┘ │
+ └─────────────────┬─────────────────┘
+ │
+ ┌───────────────────────┼───────────────────────┐
+ ▼ ▼ ▼
+ DailyPaper Scholar Tracking 推荐系统
+ 排序加权 自动锚点发现 个性化推荐
+```
+
+### 5.2 数据模型
+
+```sql
+-- Source 统一注册表
+CREATE TABLE sources (
+ id TEXT PRIMARY KEY,
+ source_type TEXT NOT NULL, -- 'scholar' | 'venue' | 'website' | 'repo' | 'topic'
+ name TEXT NOT NULL,
+ external_id TEXT, -- S2 author ID / venue name / repo URL
+ intrinsic_score REAL DEFAULT 0, -- Q(s)
+ network_authority REAL DEFAULT 0, -- N(s)
+ temporal_momentum REAL DEFAULT 0, -- T(s)
+ anchor_score REAL DEFAULT 0, -- AnchorScore(s) 全局版
+ anchor_level TEXT DEFAULT 'background', -- core/active/emerging/background
+ attributes_json TEXT, -- 类型特定属性 (h_index, stars, tier, ...)
+ first_seen_at TEXT NOT NULL,
+ last_updated_at TEXT NOT NULL,
+ UNIQUE(source_type, external_id)
+);
+
+-- Source 间关系表
+CREATE TABLE source_relations (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ source_id TEXT NOT NULL,
+ target_id TEXT NOT NULL,
+ relation_type TEXT NOT NULL, -- 'cites' | 'coauthors' | 'published_at' | ...
+ weight REAL DEFAULT 1.0, -- 边权重(合作频次/引用次数/...)
+ evidence_json TEXT, -- 关系证据 (论文ID列表/时间跨度/...)
+ created_at TEXT NOT NULL,
+ updated_at TEXT NOT NULL,
+ UNIQUE(source_id, target_id, relation_type),
+ FOREIGN KEY (source_id) REFERENCES sources(id),
+ FOREIGN KEY (target_id) REFERENCES sources(id)
+);
+
+-- 用户个性化锚点评分
+CREATE TABLE user_anchor_scores (
+ user_id TEXT NOT NULL,
+ source_id TEXT NOT NULL,
+ observer_relevance REAL DEFAULT 0, -- R(s, o)
+ personalized_anchor_score REAL DEFAULT 0,
+ last_computed_at TEXT NOT NULL,
+ PRIMARY KEY (user_id, source_id),
+ FOREIGN KEY (source_id) REFERENCES sources(id)
+);
+
+-- 锚点变化历史(用于潜力检测)
+CREATE TABLE anchor_score_history (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ source_id TEXT NOT NULL,
+ score_type TEXT NOT NULL, -- 'intrinsic' | 'network' | 'temporal' | 'anchor'
+ score_value REAL NOT NULL,
+ computed_at TEXT NOT NULL,
+ FOREIGN KEY (source_id) REFERENCES sources(id)
+);
+CREATE INDEX idx_anchor_history_source_time ON anchor_score_history(source_id, computed_at);
+```
+
+### 5.3 计算调度
+
+锚点评分不需要实时计算(与搜索不同),可以按不同频率批量更新:
+
+| 计算任务 | 频率 | 触发方式 | 说明 |
+|---------|------|---------|------|
+| Q(s) 更新 | 每周 | ARQ Cron | 重新计算内在质量(新引用/新论文/新 stars) |
+| N(s) PageRank | 每周 | Q(s) 更新后 | 在关系图上运行 20 次迭代 |
+| T(s) 动态 | 每日 | DailyPaper 后 | 更新时间衰减和速度指标 |
+| R(s,o) 投影 | 每次 Track 变更 | 用户操作触发 | 重算与当前 Track 的语义相似度 |
+| 潜力检测 | 每周 | N(s) 更新后 | Z-score 异常检测 |
+| 锚点层级判定 | 每周 | 所有分量更新后 | 重新分类 core/active/emerging/background |
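+
+周级 / 日级任务可以直接挂到现有 ARQ Worker 的 `cron_jobs` 上,示意如下(任务函数名为假设):
+
+```python
+from arq import cron
+
+async def refresh_intrinsic_quality(ctx):
+    ...  # 每周重算 Q(s)
+
+async def run_authority_pagerank(ctx):
+    ...  # Q(s) 更新后在关系图上跑异构 PageRank
+
+class WorkerSettings:
+    cron_jobs = [
+        cron(refresh_intrinsic_quality, weekday=0, hour=3, minute=0),  # 每周一 03:00
+        cron(run_authority_pagerank, weekday=0, hour=4, minute=0),     # 每周一 04:00
+    ]
+```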
+
+### 5.4 冷启动策略
+
+新系统没有足够的关系数据来运行 PageRank。冷启动方案:
+
+1. **种子锚点**:`scholar_subscriptions.yaml` 中的学者 + `top_venues.yaml` 中的 tier1 venue 作为初始核心锚点
+2. **反向填充**:对种子锚点调用 S2 API 获取 coauthor 和 cited papers,构建初始关系图
+3. **Bootstrap PageRank**:在初始图上运行 PageRank,产生第一批 N(s) 分数
+4. **DailyPaper 持续积累**:每次 DailyPaper 运行时,新论文的 authors/venues/citations 持续充实关系图
+5. **Judge 反哺**:高分论文的 authors 获得 Q(s) boost,逐步建立更多数据
+
+---
+
+## 6. 与记忆模块的关系
+
+锚点模型和记忆模块(见 `memory_module_complete_proposal.md`)是互补的两个系统:
+
+| 维度 | 锚点模型 | 记忆模块 |
+|------|---------|---------|
+| **关注对象** | 信息源(外部世界的客观属性) | 用户知识(内部世界的主观积累) |
+| **数据来源** | S2 API, 引用网络, venue 配置 | 用户交互, feedback, 聊天记录 |
+| **时间特征** | 周级更新,变化缓慢 | 实时写入,变化快速 |
+| **个性化方式** | R(s,o) 投影 — 全局权威的个人视角 | scope/track — 用户自己的知识结构 |
+| **协作点** | 记忆中的 "interest" 类型记忆 → 提供 observer_profile | 锚点评分 → 影响记忆中的 "insight" 提取优先级 |
+
+**数据流:**
+
+```
+用户行为 ──→ 记忆模块(提取 interest/preference)
+ │
+ └──→ 锚点模型 R(s,o)(用 interest 计算观察者相关性)
+ │
+ └──→ DailyPaper/Search(锚点加权排序)
+ │
+ └──→ 记忆模块(高分论文写入 episode/insight)
+```
+
+---
+
+## 7. 参考文献
+
+### 算法基础
+
+- Page, L., Brin, S., Motwani, R., & Winograd, T. (1999). The PageRank Citation Ranking: Bringing Order to the Web.
+- Kleinberg, J. M. (1999). Authoritative Sources in a Hyperlinked Environment. JACM.
+- Gyöngyi, Z., Garcia-Molina, H., & Pedersen, J. (2004). Combating Web Spam with TrustRank. VLDB.
+- Sun, Y., Han, J., Yan, X., Yu, P. S., & Wu, T. (2011). PathSim: Meta Path-Based Top-K Similarity Search in Heterogeneous Information Networks. VLDB.
+- Dong, Y., Chawla, N. V., & Swami, A. (2017). metapath2vec: Scalable Representation Learning for Heterogeneous Networks. KDD.
+- Wang, X., Ji, H., Shi, C., Wang, B., et al. (2019). Heterogeneous Graph Attention Network. WWW.
+
+### 学术影响力度量
+
+- Hirsch, J. E. (2005). An index to quantify an individual's scientific research output. PNAS.
+- Radicchi, F., Fortunato, S., & Castellano, C. (2008). Universality of citation distributions: Toward an objective measure of scientific impact. PNAS.
+- Wang, D., Song, C., & Barabási, A. L. (2013). Quantifying Long-Term Scientific Impact. Science.
+
+### PaperBot 现有实现
+
+- `src/paperbot/domain/influence/calculator.py` — PIS 评分计算器
+- `src/paperbot/domain/influence/analyzers/dynamic_pis.py` — 引用速度分析
+- `src/paperbot/domain/influence/weights.py` — 评分权重配置
+- `src/paperbot/domain/scholar.py` — Scholar 领域模型
+- `config/top_venues.yaml` — Venue tier 配置
+- `config/scholar_subscriptions.yaml` — 种子锚点配置
diff --git a/docs/memory_module_complete_proposal.md b/docs/memory_module_complete_proposal.md
new file mode 100644
index 0000000..2bd7e59
--- /dev/null
+++ b/docs/memory_module_complete_proposal.md
@@ -0,0 +1,700 @@
+# PaperBot 记忆模块架构设计提案
+
+> 基于 Manus 上下文工程、EverMemOS/Mem0/Zep/Letta 等主流实现、以及 15 篇近期顶会论文的综合调研。
+
+## 1. 调研综述
+
+### 1.1 外部系统调研
+
+| 系统 | 架构 | LoCoMo | LongMemEval-S | 核心思想 |
+|------|------|--------|---------------|---------|
+| **EverMemOS** | 4 层仿脑架构(engram 启发式) | 92.3% | 82% | 前额叶皮层+大脑皮层网络类比,当前 SOTA |
+| **Zep/Graphiti** | 时序知识图谱(Neo4j) | 85.2% | — | 双时态模型,P95 延迟 300ms,检索无需 LLM |
+| **Letta** | 文件系统即记忆 | 74.0% | — | 迭代文件搜索优于专用记忆工具 |
+| **Mem0** | 向量+图双存储 | 64.2% | — | 生产级 SaaS,自动记忆提取管线 |
+| **memU** | 基于文件的 Agent 记忆 | 66.7% | — | 面向 24/7 主动式 Agent |
+
+### 1.2 Manus 上下文工程核心原则
+
+1. **KV-Cache 命中率是第一指标** — 缓存 vs 非缓存 token 成本差 10x
+2. **上下文即 RAM** — LLM 是 CPU,上下文窗口是 RAM,需要"操作系统"管理
+3. **Raw > Compaction > Summarization** — 可逆压缩优先,不可逆摘要最后手段
+4. **文件系统是无限记忆** — 上下文只保留引用,全量数据在外部存储
+5. **上下文隔离** — "Share memory by communicating, don't communicate by sharing memory"
+6. **渐进式披露(Skills)** — 三级加载:元数据(100 tokens) → 指令(<5k) → 资源(按需)
+7. **工具掩码而非移除** — 保持 prompt 前缀稳定以最大化 KV-cache
+8. **todo.md 注意力管理** — 将计划写到上下文尾部,利用 transformer 近因偏差
+
+### 1.3 关键论文发现
+
+| 论文 | 会议 | 关键贡献 |
+|------|------|---------|
+| A-MEM | NeurIPS 2025 | Zettelkasten 式自组织互联笔记网络 |
+| HiMem | arXiv 2026.01 | Episode Memory + Note Memory 两层层级 + 冲突感知重整合 |
+| Agent Workflow Memory | ICML 2025 | 从历史轨迹归纳可复用工作流模板 |
+| RMM (Reflective Memory) | ACL 2025 | 前瞻/回顾双向反思 + RL 精化检索 |
+| Memoria | arXiv 2025.12 | SQL + KG + 向量三存储混合,87.1% 准确率 |
+| ACE | arXiv 2025.10 | Agent 通过重写上下文自我改进,无需权重更新 |
+| TiMem | arXiv 2026.01 | 认知科学启发的时间层级记忆整合 |
+| Collaborative Memory | ICML 2025 | 多用户记忆共享 + 动态访问控制 |
+| Survey of Context Engineering | arXiv 2025.07 | 165 页综述,1400+ 论文,上下文工程形式化框架 |
+
+---
+
+## 2. PaperBot 现状分析
+
+### 2.1 现有记忆架构
+
+```
+src/paperbot/memory/
+├── schema.py # NormalizedMessage, MemoryCandidate, MemoryKind (11种)
+├── extractor.py # 双策略提取:LLM (ModelRouter) + 启发式 (中文正则)
+├── __init__.py # 公共 API
+├── eval/collector.py # 5 个 P0 指标(precision≥85%, FP≤5%, ...)
+└── parsers/
+ ├── common.py # 多格式聊天记录解析
+ └── types.py # ParsedChatLog
+
+src/paperbot/context_engine/
+├── engine.py # ContextEngine — build_context_pack() 632 行
+├── track_router.py # TrackRouter — 多特征 track 评分 356 行
+└── embeddings.py # EmbeddingProvider (OpenAI text-embedding-3-small)
+
+src/paperbot/infrastructure/stores/
+├── memory_store.py # SqlAlchemyMemoryStore 658 行(CRUD + 粗粒度搜索)
+└── models.py # MemoryItemModel, MemorySourceModel, MemoryAuditLogModel
+```
+
+### 2.2 现有问题
+
+| 问题 | 严重度 | 说明 |
+|------|--------|------|
+| **无向量检索** | 🔴 高 | `search_memories()` 使用 SQL `CONTAINS` + 内存 token 评分,无语义匹配 |
+| **无时间感知** | 🔴 高 | 记忆无衰减机制,无时序推理能力 |
+| **无记忆整合** | 🟡 中 | 记忆只有 CRUD,无 consolidation/forgetting/reconsolidation |
+| **层级耦合** | 🟡 中 | ContextEngine 直接依赖 SqlAlchemyMemoryStore,混合 infra 和业务逻辑 |
+| **提取策略单一** | 🟡 中 | 启发式仅支持中文正则,LLM 提取依赖 ModelRouter 可用性 |
+| **无跨记忆关联** | 🟡 中 | 记忆项之间无链接关系(vs A-MEM 的双向链接) |
+| **Scope 隔离不完整** | 🟢 低 | scope_type 有 global/track/project/paper,但 track/paper scope 实际使用有限 |
+
+### 2.3 现有优势(可复用)
+
+- ✅ 完整的 schema 设计(MemoryKind 11 种、scope、confidence、status lifecycle)
+- ✅ 审计日志(MemoryAuditLogModel 全量变更记录)
+- ✅ PII 检测与脱敏(email/phone 正则)
+- ✅ 基于 confidence 的自动审核(≥0.60 自动 approved)
+- ✅ 使用量追踪(last_used_at, use_count)
+- ✅ 评估指标框架(5 个 P0 指标 + MemoryEvalMetricModel)
+
+---
+
+## 3. 架构设计
+
+### 3.1 设计原则
+
+基于调研结论,采用以下原则:
+
+1. **记忆即基础设施** — 记忆模块是独立的 infra 层服务,不依赖任何业务模块(DailyPaper/Judge/Track)
+2. **混合存储** — 结合向量存储(语义)+ 结构化存储(关系/时间)+ 文件存储(全文)
+3. **层级记忆** — 参考 HiMem,区分 Episode Memory(具体事件)和 Note Memory(抽象知识)
+4. **时间感知** — 参考 Zep/Graphiti 的双时态模型(事件时间 + 录入时间)
+5. **渐进式上下文** — 参考 Manus Skills,三级加载控制 token 消耗
+6. **自组织链接** — 参考 A-MEM Zettelkasten,记忆项之间建立双向关联
+7. **上下文工程 > Prompt 工程** — 整个 context payload(记忆/工具/检索结果)作为工程系统设计
+
+### 3.2 分层架构
+
+```
+┌─────────────────────────────────────────────────────────────────────┐
+│ Application Layer(业务消费者,不属于记忆模块) │
+│ │
+│ DailyPaper · Judge · TopicSearch · ScholarPipeline · Paper2Code │
+│ ↓ ↓ ↓ │
+│ ┌──────────────────────────────────────────────────────────────┐ │
+│ │ Context Assembly Service(上下文装配,属于 application 层) │ │
+│ │ - build_context_pack() 从记忆层获取原料 │ │
+│ │ - 按 task/stage 组装成 prompt-ready 上下文 │ │
+│ │ - 实施 token budget 控制和渐进式披露 │ │
+│ └────────────────────────┬─────────────────────────────────────┘ │
+└───────────────────────────│─────────────────────────────────────────┘
+ │ MemoryService Protocol (接口契约)
+┌───────────────────────────│─────────────────────────────────────────┐
+│ Memory Infrastructure Layer(记忆基础设施,独立模块) │
+│ │ │
+│ ┌─────────────────────────┴──────────────────────────────────┐ │
+│ │ MemoryService (Facade) │ │
+│ │ - write(items) 读写入口 │ │
+│ │ - recall(query, scope, k) 检索入口 │ │
+│ │ - forget(item_id, reason) 删除/过期 │ │
+│ │ - consolidate() 定期整合 │ │
+│ │ - link(a, b, relation) 建立关联 │ │
+│ └────────┬───────────┬──────────────┬────────────────────────┘ │
+│ │ │ │ │
+│ ┌────────▼───┐ ┌─────▼──────┐ ┌────▼─────────┐ │
+│ │ Extractor │ │ Retriever │ │ Consolidator │ │
+│ │ (Write) │ │ (Read) │ │ (Maintain) │ │
+│ ├────────────┤ ├────────────┤ ├──────────────┤ │
+│ │ LLM 提取 │ │ 向量检索 │ │ 记忆衰减 │ │
+│ │ 规则提取 │ │ 关键词匹配 │ │ 冲突检测 │ │
+│ │ 结构化导入 │ │ 图遍历 │ │ Episode→Note │ │
+│ │ 自动标签 │ │ 时间过滤 │ │ 链接维护 │ │
+│ │ PII 检测 │ │ scope 过滤 │ │ 过期清理 │ │
+│ └────────┬───┘ └─────┬──────┘ └──────┬───────┘ │
+│ │ │ │ │
+│ ┌────────▼───────────▼───────────────▼───────────────────────┐ │
+│ │ Storage Backends │ │
+│ │ ┌──────────┐ ┌───────────┐ ┌────────────┐ │ │
+│ │ │ SQLite │ │ Vector │ │ File │ │ │
+│ │ │ (结构化) │ │ (语义) │ │ (全文/导出) │ │ │
+│ │ │ │ │ │ │ │ │ │
+│ │ │ items │ │ embeddings│ │ episodes │ │ │
+│ │ │ links │ │ (dim=1536)│ │ exports │ │ │
+│ │ │ audit │ │ │ │ snapshots │ │ │
+│ │ │ sources │ │ │ │ │ │ │
+│ │ └──────────┘ └───────────┘ └────────────┘ │ │
+│ └────────────────────────────────────────────────────────────┘ │
+└───────────────────────────────────────────────────────────────────┘
+```
+
+### 3.3 核心抽象(Protocol 层)
+
+记忆模块对外暴露的接口契约,所有业务模块通过此协议消费记忆服务:
+
+```python
+# src/paperbot/memory/protocol.py
+
+from typing import Protocol, Optional, Sequence
+from dataclasses import dataclass
+
+@dataclass(frozen=True)
+class MemoryItem:
+ """一条记忆项(infrastructure 不关心业务含义)"""
+ id: str
+ kind: str # profile/preference/fact/note/episode/...
+ content: str # 记忆内容文本
+ scope_type: str # global/track/project/paper
+ scope_id: Optional[str]
+ confidence: float # 0.0~1.0
+ tags: tuple[str, ...]
+ created_at: str # ISO 8601
+ event_at: Optional[str] # 事件发生时间(双时态)
+ use_count: int
+ last_used_at: Optional[str]
+ linked_ids: tuple[str, ...] # 关联的其他记忆 ID
+
+@dataclass(frozen=True)
+class RecallResult:
+ """检索结果"""
+ items: Sequence[MemoryItem]
+ scores: Sequence[float] # 与 items 一一对应的相关性分数
+ token_count: int # 估算的 token 消耗
+
+class MemoryService(Protocol):
+ """记忆服务的接口契约 — 业务层只依赖此协议"""
+
+ def write(
+ self,
+ user_id: str,
+ items: Sequence[dict], # kind, content, scope_type, ...
+ source: str = "api", # 来源标识
+ ) -> Sequence[str]: # 返回写入的 item IDs
+ ...
+
+ def recall(
+ self,
+ user_id: str,
+ query: str,
+ *,
+ scope_type: Optional[str] = None,
+ scope_id: Optional[str] = None,
+ kinds: Optional[Sequence[str]] = None,
+ top_k: int = 10,
+ max_tokens: int = 2000,
+ recency_weight: float = 0.2,
+ ) -> RecallResult:
+ ...
+
+ def forget(
+ self,
+ user_id: str,
+ item_id: str,
+ reason: str = "user_request",
+ ) -> bool:
+ ...
+
+ def consolidate(
+ self,
+ user_id: str,
+ scope_type: Optional[str] = None,
+ scope_id: Optional[str] = None,
+ ) -> int: # 返回整合/清理的记忆条数
+ ...
+
+ def link(
+ self,
+ item_a_id: str,
+ item_b_id: str,
+ relation: str = "related", # related/supports/contradicts/supersedes
+ ) -> bool:
+ ...
+
+ def build_context_block(
+ self,
+ user_id: str,
+ query: str,
+ *,
+ max_tokens: int = 1500,
+ scope_type: Optional[str] = None,
+ scope_id: Optional[str] = None,
+ ) -> str:
+ """便捷方法:recall + 格式化为 prompt-ready 文本块"""
+ ...
+```
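+
+业务侧的消费方式示意(假设调用处已注入某个 `MemoryService` 实现,参数值仅为举例):
+
+```python
+def build_dailypaper_prompt_context(memory: MemoryService, user_id: str, query: str) -> str:
+    """DailyPaper 组装 prompt 前,从记忆层取一段受 token 预算约束的上下文块。"""
+    return memory.build_context_block(
+        user_id,
+        query,
+        max_tokens=1500,
+        scope_type="track",
+        scope_id="llm-agents",  # 示例 track,非真实配置
+    )
+```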
+
+### 3.4 记忆类型体系
+
+参考 HiMem(Episode + Note)和 A-MEM(Zettelkasten)设计两层记忆:
+
+```
+Memory Types
+├── Episode Memory(具体事件记忆)
+│ ├── paper_read: 用户阅读了某篇论文
+│ ├── search_query: 用户执行的搜索查询
+│ ├── feedback: 用户对论文的 like/dislike/save
+│ ├── workflow_run: 执行了 DailyPaper/Judge/Analyze 流程
+│ └── interaction: 用户与系统的对话片段
+│
+└── Note Memory(抽象知识记忆)
+ ├── profile: 用户身份信息(姓名/机构/职称)
+ ├── preference: 用户偏好(语言/格式/模型选择)
+ ├── interest: 研究兴趣(主题/方法/venue)
+ ├── fact: 用户陈述的事实
+ ├── goal: 研究目标
+ ├── constraint: 约束条件(deadline/scope)
+ ├── decision: 用户做出的决定
+ └── insight: 从论文中提炼的洞察
+```
+
+**Episode → Note 整合规则**(Consolidator 负责):
+
+| Episode 类型 | 整合目标 | 触发条件 |
+|-------------|---------|---------|
+| 多次 `paper_read` 同领域 | → `interest` Note | ≥3 篇同 keyword 论文 |
+| 多次 `feedback` like | → `preference` Note | ≥5 次 like 同 venue/method |
+| `search_query` 重复模式 | → `interest` Note | ≥3 次相似查询 |
+| `workflow_run` 常用配置 | → `preference` Note | ≥3 次相同 workflow 参数 |
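+
+以表中第一条规则为例,触发判定可以用如下最小逻辑示意(`list_memories` 的参数形式、keyword 存放在 tags 中均为假设,实际以 store 实现为准):
+
+```python
+from collections import Counter
+
+
+def find_interest_candidates(store, user_id: str, min_count: int = 3) -> list[str]:
+    """统计 paper_read Episode 的 keyword 频次,达到阈值即作为 interest Note 候选主题。"""
+    episodes = store.list_memories(
+        user_id=user_id, memory_layer="episode", kind="paper_read", status="approved"
+    )
+    keyword_counter: Counter[str] = Counter()
+    for ep in episodes:
+        for tag in ep.get("tags") or []:
+            keyword_counter[tag] += 1
+    return [kw for kw, count in keyword_counter.items() if count >= min_count]
+```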
+
+### 3.5 存储层设计
+
+#### 3.5.1 SQLite 结构化存储(主存储)
+
+扩展现有 `MemoryItemModel`,新增字段:
+
+```sql
+-- 记忆项(扩展现有表)
+ALTER TABLE memory_items ADD COLUMN memory_layer TEXT DEFAULT 'note';
+ -- 'episode' | 'note'
+ALTER TABLE memory_items ADD COLUMN event_at TEXT;
+ -- 双时态:事件发生时间(vs 已有的 created_at 录入时间)
+ALTER TABLE memory_items ADD COLUMN embedding_id TEXT;
+ -- 关联到 memory_embeddings 表
+ALTER TABLE memory_items ADD COLUMN decay_factor REAL DEFAULT 1.0;
+ -- 衰减因子,定期更新
+
+-- 记忆关联(新表)
+CREATE TABLE memory_links (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ source_id TEXT NOT NULL,
+ target_id TEXT NOT NULL,
+ relation TEXT NOT NULL DEFAULT 'related',
+ -- related | supports | contradicts | supersedes | derived_from
+ weight REAL DEFAULT 1.0,
+ created_at TEXT NOT NULL,
+ UNIQUE(source_id, target_id, relation),
+ FOREIGN KEY (source_id) REFERENCES memory_items(id),
+ FOREIGN KEY (target_id) REFERENCES memory_items(id)
+);
+
+-- 记忆向量(新表)
+CREATE TABLE memory_embeddings (
+ id TEXT PRIMARY KEY,
+ item_id TEXT NOT NULL,
+ model TEXT NOT NULL DEFAULT 'text-embedding-3-small',
+ embedding BLOB NOT NULL, -- numpy float32 序列化
+ dim INTEGER NOT NULL DEFAULT 1536,
+ created_at TEXT NOT NULL,
+ FOREIGN KEY (item_id) REFERENCES memory_items(id)
+);
+```
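+
+`embedding` 列按 numpy float32 字节串存储,并用 `dim` 字段做一致性校验。下面是一个最小的序列化/反序列化示意(函数名为示例,非既有代码):
+
+```python
+import numpy as np
+
+
+def embedding_to_blob(vec: np.ndarray) -> bytes:
+    """将 embedding 转为可写入 BLOB 列的 float32 字节串。"""
+    return np.asarray(vec, dtype=np.float32).tobytes()
+
+
+def blob_to_embedding(blob: bytes, dim: int = 1536) -> np.ndarray:
+    """从 BLOB 还原 embedding,并按 dim 字段校验维度。"""
+    vec = np.frombuffer(blob, dtype=np.float32)
+    if vec.shape[0] != dim:
+        raise ValueError(f"embedding dim mismatch: expected {dim}, got {vec.shape[0]}")
+    return vec
+```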
+
+#### 3.5.2 向量检索策略
+
+考虑到 PaperBot 是单用户/小团队工具,不需要大规模向量数据库:
+
+| 方案 | 优点 | 缺点 | 推荐 |
+|------|------|------|------|
+| **SQLite + numpy cosine** | 零依赖,现有技术栈 | 线性扫描,>10K 条时变慢 | ✅ Phase 1 |
+| **sqlite-vec** | SQLite 扩展,原生向量 | 需编译安装 | Phase 2 |
+| **Qdrant (本地模式)** | 高性能 ANN | 新增依赖 | Phase 3 (可选) |
+| **FAISS** | 成熟高效 | C++ 编译依赖 | Phase 3 (可选) |
+
+**Phase 1 实现**:在 `memory_embeddings` 表中存储 embedding blob,检索时加载到内存做 cosine similarity。对于 < 5000 条记忆,检索延迟可以控制在 50ms 以内。
+
+```python
+# src/paperbot/memory/retriever.py (核心检索逻辑)
+
+import numpy as np
+
+def vector_search(
+ query_embedding: np.ndarray,
+ candidate_embeddings: list[tuple[str, np.ndarray]], # (item_id, embedding)
+ top_k: int = 10,
+) -> list[tuple[str, float]]:
+ """余弦相似度检索"""
+ if not candidate_embeddings:
+ return []
+ ids = [c[0] for c in candidate_embeddings]
+ matrix = np.stack([c[1] for c in candidate_embeddings])
+ # 归一化
+ query_norm = query_embedding / (np.linalg.norm(query_embedding) + 1e-9)
+ matrix_norm = matrix / (np.linalg.norm(matrix, axis=1, keepdims=True) + 1e-9)
+ scores = matrix_norm @ query_norm
+ top_indices = np.argsort(scores)[::-1][:top_k]
+ return [(ids[i], float(scores[i])) for i in top_indices]
+```
+
+#### 3.5.3 检索管线(Hybrid Recall)
+
+参考 Zep 的混合检索策略,组合三路信号:
+
+```
+Query → ┌── 向量检索(语义匹配) ─── weight: 0.50
+ ├── 关键词匹配(BM25/token)── weight: 0.25
+ └── scope/tag 精确过滤 ── weight: 0.25
+ │
+ Merge & Re-rank
+ │
+ Time Decay × Score
+ │
+ Token Budget Trim
+ │
+ RecallResult
+```
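+
+三路信号中,向量检索可直接复用上文的 `vector_search`;关键词匹配这一路可以先用简单的 token 重叠率起步(BM25 为后续可选项)。以下是一个最小示意,分词规则与函数名均为演示用假设:
+
+```python
+import re
+
+
+def _tokenize(text: str) -> set[str]:
+    # 简化分词:英文/数字按单词切分,中文按连续汉字块切分
+    return set(re.findall(r"[a-z0-9]+|[\u4e00-\u9fff]+", text.lower()))
+
+
+def keyword_score(query: str, content: str) -> float:
+    """返回 0~1 的关键词重叠得分(对应权重 0.25 的关键词信号)。"""
+    q_tokens = _tokenize(query)
+    if not q_tokens:
+        return 0.0
+    return len(q_tokens & _tokenize(content)) / len(q_tokens)
+```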
+
+**时间衰减公式**(参考 Trending 评分公式):
+
+```python
+import math
+
+def time_decay(days_since_event: float, half_life: float = 30.0) -> float:
+ """记忆时间衰减 — 半衰期默认 30 天"""
+ return math.exp(-0.693 * days_since_event / half_life)
+
+def recall_score(
+ semantic_sim: float,
+ keyword_score: float,
+ scope_match: float,
+ days_old: float,
+ use_count: int,
+ recency_weight: float = 0.2,
+) -> float:
+ """综合检索评分"""
+ base = semantic_sim * 0.50 + keyword_score * 0.25 + scope_match * 0.25
+ decay = time_decay(days_old)
+ usage_boost = min(math.log1p(use_count) * 0.05, 0.2) # 使用频率加成,上限 0.2
+ return base * (1 - recency_weight + recency_weight * decay) + usage_boost
+```
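+
+管线最后一步 Token Budget Trim 可以按综合得分降序累加、超出预算即截断,结果直接填充 `RecallResult`。下面是一个示意(token 估算用 `len(content) // 4` 粗略代替 tokenizer,属于假设):
+
+```python
+from paperbot.memory.protocol import MemoryItem  # 见 3.3 的 MemoryItem 定义
+
+
+def trim_to_budget(
+    scored_items: list[tuple[MemoryItem, float]],  # (item, recall_score)
+    max_tokens: int,
+) -> tuple[list[MemoryItem], list[float], int]:
+    """按得分降序保留记忆,直到 token 预算耗尽;返回 (items, scores, token_count)。"""
+    ranked = sorted(scored_items, key=lambda pair: pair[1], reverse=True)
+    kept_items: list[MemoryItem] = []
+    kept_scores: list[float] = []
+    used = 0
+    for item, score in ranked:
+        cost = max(1, len(item.content) // 4)  # 粗略 token 估算
+        if used + cost > max_tokens:
+            break
+        kept_items.append(item)
+        kept_scores.append(score)
+        used += cost
+    return kept_items, kept_scores, used
+```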
+
+### 3.6 记忆整合(Consolidator)
+
+定期运行的后台任务,负责:
+
+1. **Episode → Note 升级**:将频繁出现的 Episode 模式提炼为 Note
+2. **冲突检测**:检查新记忆与旧记忆的矛盾(参考 HiMem 冲突感知重整合)
+3. **衰减清理**:`decay_factor` 低于阈值的记忆标记为 superseded
+4. **链接维护**:自动发现相似记忆并建立关联
+
+```python
+# src/paperbot/memory/consolidator.py (简化示意)
+
+class MemoryConsolidator:
+ """记忆整合器 — 定期运行"""
+
+ def __init__(self, store, embedding_provider, llm_service=None):
+ self.store = store
+ self.embedder = embedding_provider
+ self.llm = llm_service
+
+ async def run(self, user_id: str) -> ConsolidationReport:
+ report = ConsolidationReport()
+
+ # 1. 衰减更新
+ report.decayed = await self._update_decay_factors(user_id)
+
+ # 2. Episode → Note 整合
+ report.consolidated = await self._consolidate_episodes(user_id)
+
+ # 3. 自动链接发现
+ report.links_created = await self._discover_links(user_id)
+
+ # 4. 过期清理
+ report.expired = await self._cleanup_expired(user_id)
+
+ return report
+
+ async def _consolidate_episodes(self, user_id: str) -> int:
+ """将相似 episode 聚类并提炼为 note"""
+ episodes = self.store.list_memories(
+ user_id=user_id,
+ memory_layer="episode",
+ status="approved",
+ min_count=3, # 至少 3 个相似 episode 才整合
+ )
+ # 按 embedding 聚类 → 每个簇生成一条 Note
+ # 如果 LLM 可用,用 LLM 生成摘要;否则用模板
+ ...
+
+ async def _discover_links(self, user_id: str) -> int:
+ """基于 embedding 相似度自动发现关联"""
+ items = self.store.list_memories(user_id=user_id, status="approved")
+ # 对所有 items 的 embedding 做 pairwise cosine
+ # similarity > 0.85 → 建立 'related' 链接
+ ...
+```
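+
+上面省略的 `_update_decay_factors` 可以复用 3.5.3 的 `time_decay` 按如下思路补充。其中 `store.update_memory` 为假设接口,`list_memories` 返回 dict 列表、时间字段为 ISO 8601 字符串也是此处的简化假设:
+
+```python
+    async def _update_decay_factors(self, user_id: str) -> int:
+        """按 event_at/created_at 距今天数刷新 decay_factor,返回更新条数。"""
+        from datetime import datetime, timezone
+
+        now = datetime.now(timezone.utc)
+        updated = 0
+        for item in self.store.list_memories(user_id=user_id, status="approved"):
+            ts = item.get("event_at") or item.get("created_at")
+            if not ts:
+                continue
+            dt = datetime.fromisoformat(ts)
+            if dt.tzinfo is None:
+                dt = dt.replace(tzinfo=timezone.utc)
+            factor = time_decay((now - dt).days)  # 半衰期公式见 3.5.3
+            self.store.update_memory(item["id"], decay_factor=factor)  # 假设的更新接口
+            updated += 1
+        return updated
+```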
+
+### 3.7 上下文装配(与 Manus 原则对齐)
+
+将现有的 `ContextEngine.build_context_pack()` 重构为 **上下文装配服务**,位于 application 层(不属于记忆 infra):
+
+```python
+# src/paperbot/application/services/context_assembly.py
+
+class ContextAssemblyService:
+ """上下文装配 — 从记忆层获取原料,按 task/stage 组装"""
+
+ def __init__(self, memory: MemoryService, track_router: TrackRouter):
+ self.memory = memory
+ self.router = track_router
+
+ def build_context(
+ self,
+ user_id: str,
+ task_type: str, # "judge" | "daily" | "search" | "chat"
+ query: str,
+ *,
+ track_id: Optional[str] = None,
+ max_tokens: int = 3000,
+ ) -> ContextPack:
+ # 1. 路由到 track
+ track = self.router.suggest_track(query, user_id) if not track_id else ...
+
+ # 2. 按优先级和 token budget 分配
+ budget = TokenBudget(total=max_tokens)
+
+ # Level 1: 用户画像(profile/preference)— 始终包含
+ profile_block = self.memory.build_context_block(
+ user_id, query="user profile",
+ max_tokens=budget.allocate("profile", 300),
+ kinds=["profile", "preference"],
+ )
+
+ # Level 2: 任务相关记忆 — 按 scope 和 query 检索
+ task_block = self.memory.build_context_block(
+ user_id, query=query,
+ max_tokens=budget.allocate("task", 1200),
+ scope_type="track" if track else None,
+ scope_id=track.id if track else None,
+ )
+
+ # Level 3: 历史洞察 — 仅在 budget 允许时包含
+ insight_block = ""
+ remaining = budget.remaining()
+ if remaining > 200:
+ insight_block = self.memory.build_context_block(
+ user_id, query=query,
+ max_tokens=remaining,
+ kinds=["insight", "decision"],
+ )
+
+ return ContextPack(
+ profile=profile_block,
+ task_memories=task_block,
+ insights=insight_block,
+ track=track,
+ token_usage=budget.used(),
+ )
+```
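+
+上面用到的 `TokenBudget` 并非现有类,这里给出一个只覆盖 `allocate/remaining/used` 的最小示意:
+
+```python
+class TokenBudget:
+    """简单的 token 预算分配器:按名字记账,超出总额时按剩余额度截断。"""
+
+    def __init__(self, total: int):
+        self.total = total
+        self._allocations: dict[str, int] = {}
+
+    def allocate(self, name: str, requested: int) -> int:
+        granted = min(requested, self.remaining())
+        self._allocations[name] = granted
+        return granted
+
+    def remaining(self) -> int:
+        return max(0, self.total - self.used())
+
+    def used(self) -> int:
+        return sum(self._allocations.values())
+```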
+
+### 3.8 渐进式上下文管理(三级加载)
+
+参考 Manus Skills 的 Progressive Disclosure:
+
+| 级别 | 何时加载 | 内容 | Token 消耗 |
+|------|---------|------|-----------|
+| **L0: 元数据** | 每次 LLM 调用 | 用户名 + 当前 track 名 + "has N memories" | ~50 tokens |
+| **L1: 画像** | task 开始时 | profile + preferences + goals | ~300 tokens |
+| **L2: 任务记忆** | query 确定后 | recall(query) 结果的 top-k | ~1200 tokens |
+| **L3: 深度上下文** | 仅在需要时 | 完整 insights + linked items + episode 详情 | 按需分配 |
+
+```python
+# 实际使用示例(在 DailyPaper workflow 中)
+
+# L0: 始终包含
+system_prompt = f"User: {user_name}. Research track: {track_name}."
+
+# L1: workflow 开始时获取
+profile = memory.build_context_block(user_id, "user profile", max_tokens=300)
+
+# L2: 每个 query 的 judge 评分时获取
+for query in queries:
+ task_ctx = memory.build_context_block(
+ user_id, query, max_tokens=1200, scope_type="track"
+ )
+ judge_prompt = f"{system_prompt}\n\n{profile}\n\n{task_ctx}\n\n{paper_abstract}"
+```
+
+---
+
+## 4. 迁移计划
+
+### Phase 0: 接口定义 + 向量化(无破坏性变更)
+
+**目标**:在不修改现有功能的前提下,为记忆系统添加向量检索能力。
+
+- [ ] 新建 `src/paperbot/memory/protocol.py`(MemoryService Protocol 定义)
+- [ ] 新建 `src/paperbot/memory/retriever.py`(向量检索 + 混合检索实现)
+- [ ] 新增 `memory_embeddings` 表 + Alembic 迁移(迁移脚本骨架见下方示意)
+- [ ] 新增 `memory_links` 表 + Alembic 迁移
+- [ ] 扩展 `MemoryItemModel`:添加 `memory_layer`、`event_at`、`embedding_id`、`decay_factor` 字段
+- [ ] 在现有 `SqlAlchemyMemoryStore.add_memories()` 中异步计算 embedding
+- [ ] 在现有 `SqlAlchemyMemoryStore.search_memories()` 中加入向量检索分支
+
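+Phase 0 的 Alembic 迁移可以参考如下骨架(revision 标识为占位符,`memory_links` 表的建表语句同理,此处省略;字段类型以 3.5.1 的 SQL 为准):
+
+```python
+# alembic/versions/xxx_add_memory_vectors.py — 迁移脚本骨架示意
+from alembic import op
+import sqlalchemy as sa
+
+revision = "xxx_add_memory_vectors"   # 占位符
+down_revision = None                  # 实际值取决于当前迁移链
+
+
+def upgrade() -> None:
+    # memory_items 扩展字段
+    op.add_column("memory_items", sa.Column("memory_layer", sa.Text(), server_default="note"))
+    op.add_column("memory_items", sa.Column("event_at", sa.Text(), nullable=True))
+    op.add_column("memory_items", sa.Column("embedding_id", sa.Text(), nullable=True))
+    op.add_column("memory_items", sa.Column("decay_factor", sa.Float(), server_default="1.0"))
+
+    # 新表:memory_embeddings
+    op.create_table(
+        "memory_embeddings",
+        sa.Column("id", sa.Text(), primary_key=True),
+        sa.Column("item_id", sa.Text(), sa.ForeignKey("memory_items.id"), nullable=False),
+        sa.Column("model", sa.Text(), nullable=False, server_default="text-embedding-3-small"),
+        sa.Column("embedding", sa.LargeBinary(), nullable=False),
+        sa.Column("dim", sa.Integer(), nullable=False, server_default="1536"),
+        sa.Column("created_at", sa.Text(), nullable=False),
+    )
+
+
+def downgrade() -> None:
+    op.drop_table("memory_embeddings")
+    for column in ("decay_factor", "embedding_id", "event_at", "memory_layer"):
+        op.drop_column("memory_items", column)
+```
+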
+### Phase 1: 分离 Facade + Consolidator
+
+**目标**:建立 MemoryService Facade,实现 Protocol 契约,使业务层通过 Protocol 消费。
+
+- [ ] 新建 `src/paperbot/memory/service.py`(MemoryServiceImpl,实现 MemoryService Protocol)
+- [ ] 新建 `src/paperbot/memory/consolidator.py`(MemoryConsolidator)
+- [ ] Episode/Note 双层记忆类型支持(memory_layer 字段实际使用)
+- [ ] `recall()` 方法实现混合检索管线
+- [ ] `link()` 方法实现记忆关联
+- [ ] 将 `ContextEngine` 中的记忆相关逻辑迁移到 `ContextAssemblyService`
+- [ ] DI 容器注册 `MemoryService`
+
+### Phase 2: 业务集成 + 自动记忆生成
+
+**目标**:让各 workflow 自动产生和消费记忆。
+
+- [ ] `dailypaper.py` 完成后自动写入 Episode(search_query + workflow_run)
+- [ ] `paper_judge.py` 评分后将高分论文洞察写入 Note(insight)
+- [ ] `feedback` 路由处理后写入 Episode(feedback)
+- [ ] Judge prompt 注入用户画像和研究偏好记忆
+- [ ] Track Router 使用向量化记忆提升路由准确度
+- [ ] Consolidator 注册到 ARQ Worker 定期执行
+
+### Phase 3: 高级功能
+
+- [ ] 时间衰减调度(decay_factor 定期更新)
+- [ ] 冲突检测(新记忆 vs 旧记忆的语义矛盾检查,思路示意见本清单后)
+- [ ] 自动链接发现(embedding 相似度 > 阈值自动建立关联)
+- [ ] 记忆导出/快照(备份到文件系统,参考 Manus 文件即记忆模式)
+- [ ] 可选升级到 sqlite-vec 或 Qdrant
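+
+其中冲突检测可以先用 embedding 相似度筛出语义相近的旧记忆,再交给 LLM 判定是否矛盾;无 LLM 时仅打标等待人工复核。以下为思路示意,`store.load_embedding` 与 `llm.check_contradiction` 均为假设接口:
+
+```python
+import numpy as np
+
+
+def _cosine(a: np.ndarray, b: np.ndarray) -> float:
+    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))
+
+
+def detect_conflicts(store, embedder, llm, user_id: str, new_item: dict) -> list[dict]:
+    """返回可能与 new_item 语义矛盾的既有记忆列表。"""
+    new_vec = np.asarray(embedder.embed(new_item["content"]), dtype=np.float32)
+    conflicts: list[dict] = []
+    for old in store.list_memories(user_id=user_id, status="approved"):
+        old_vec = store.load_embedding(old["id"])  # 假设:返回 np.ndarray 或 None
+        if old_vec is None:
+            continue
+        if _cosine(new_vec, old_vec) < 0.80:  # 语义不相近,无需比较
+            continue
+        if llm is None:
+            conflicts.append({"item": old, "reason": "high_similarity_needs_review"})
+        elif llm.check_contradiction(new_item["content"], old["content"]):
+            conflicts.append({"item": old, "reason": "llm_contradiction"})
+    return conflicts
+```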
+
+---
+
+## 5. 文件清单
+
+| 文件 | 类型 | Phase | 说明 |
+|------|------|-------|------|
+| `src/paperbot/memory/protocol.py` | **新建** | 0 | MemoryService Protocol 接口定义 |
+| `src/paperbot/memory/retriever.py` | **新建** | 0 | 向量检索 + 混合检索 |
+| `src/paperbot/memory/service.py` | **新建** | 1 | MemoryServiceImpl (Facade) |
+| `src/paperbot/memory/consolidator.py` | **新建** | 1 | 记忆整合器 |
+| `src/paperbot/memory/types.py` | **新建** | 0 | MemoryItem, RecallResult 等数据类 |
+| `src/paperbot/infrastructure/stores/models.py` | 修改 | 0 | 扩展 MemoryItemModel + 新增 MemoryLinkModel/MemoryEmbeddingModel |
+| `src/paperbot/infrastructure/stores/memory_store.py` | 修改 | 0-1 | 添加向量检索/链接 CRUD |
+| `src/paperbot/context_engine/engine.py` | 修改 | 1 | 迁移记忆逻辑到 ContextAssemblyService |
+| `src/paperbot/application/services/context_assembly.py` | **新建** | 1 | 上下文装配服务 |
+| `src/paperbot/memory/extractor.py` | 修改 | 1 | 适配新的 MemoryService 写入接口 |
+| `src/paperbot/application/workflows/dailypaper.py` | 修改 | 2 | 自动写入 Episode 记忆 |
+| `src/paperbot/application/workflows/analysis/paper_judge.py` | 修改 | 2 | 注入记忆上下文 |
+| `src/paperbot/infrastructure/queue/arq_worker.py` | 修改 | 2 | 注册 Consolidator 定时任务 |
+| `alembic/versions/xxx_add_memory_vectors.py` | **新建** | 0 | 数据库迁移 |
+
+---
+
+## 6. 与上下文工程的关系
+
+### 6.1 关键定位
+
+```
+┌──────────────────────────────────────────────────────────────┐
+│ Context Engineering │
+│ │
+│ ┌───────────┐ ┌─────────────┐ ┌────────────────────────┐ │
+│ │ Memory │ │ Retrieval │ │ Context Management │ │
+│ │ (本模块) │ │ (RAG/搜索) │ │ (token budget/压缩) │ │
+│ │ │ │ │ │ │ │
+│ │ 用户画像 │ │ 论文检索 │ │ 渐进式加载 (L0-L3) │ │
+│ │ 研究偏好 │ │ 学者数据 │ │ Compaction (引用替代) │ │
+│ │ 交互历史 │ │ 代码仓库 │ │ Summarization (摘要) │ │
+│ │ 知识积累 │ │ │ │ Scope isolation │ │
+│ └─────┬─────┘ └──────┬──────┘ └───────────┬────────────┘ │
+│ │ │ │ │
+│ └───────────────┼──────────────────────┘ │
+│ ▼ │
+│ Context Assembly Service │
+│ (组装 prompt payload) │
+└──────────────────────────────────────────────────────────────┘
+```
+
+### 6.2 对齐 Manus 原则
+
+| Manus 原则 | PaperBot 对应设计 |
+|-----------|-----------------|
+| 文件系统即无限记忆 | Episode 全文存文件系统,DB 只存引用和元数据 |
+| Raw > Compaction > Summarization | L2 检索返回原文;L1 返回 profile 摘要;L0 返回元数据 |
+| 上下文隔离 | scope_type 隔离:每个 Track 的记忆互不干扰 |
+| KV-Cache 稳定性 | Profile 块(L1)放 prompt 前部,很少变化,利于缓存 |
+| 工具掩码而非移除 | 记忆 recall 按 scope/kinds 过滤,而非修改 prompt 模板 |
+| todo.md 注意力引导 | 将当前 research goal 放到 prompt 末尾 |
+| 保留错误上下文 | 记忆中保留 "contradiction" 和 "superseded" 标记 |
+
+---
+
+## 7. 参考文献
+
+### 系统与框架
+
+- [Manus Context Engineering](https://manus.im/blog/Context-Engineering-for-AI-Agents-Lessons-from-Building-Manus)
+- [LangChain × Manus Webinar](https://blog.langchain.com/context-engineering-for-agents/)
+- [Manus Skills Standard](https://manus.im/blog/manus-skills)
+- [EverMemOS](https://github.com/EverMind-AI/EverMemOS) — 92.3% LoCoMo
+- [Zep/Graphiti](https://github.com/getzep/graphiti) — 时序知识图谱
+- [Mem0](https://github.com/mem0ai/mem0) — 生产级记忆层
+- [Letta](https://www.letta.com/blog/benchmarking-ai-agent-memory) — 文件系统即记忆
+- [memU](https://github.com/NevaMind-AI/memU) — 主动式 Agent 记忆
+
+### 学术论文
+
+1. A-MEM: Agentic Memory for LLM Agents — NeurIPS 2025 ([arXiv:2502.12110](https://arxiv.org/abs/2502.12110))
+2. HiMem: Hierarchical Long-Term Memory — arXiv 2026 ([arXiv:2601.06377](https://arxiv.org/abs/2601.06377))
+3. Agent Workflow Memory — ICML 2025 ([arXiv:2409.07429](https://arxiv.org/abs/2409.07429))
+4. RMM: Reflective Memory Management — ACL 2025 ([arXiv:2503.08026](https://arxiv.org/abs/2503.08026))
+5. Memoria: Scalable Agentic Memory — arXiv 2025 ([arXiv:2512.12686](https://arxiv.org/abs/2512.12686))
+6. ACE: Agentic Context Engineering — arXiv 2025 ([arXiv:2510.04618](https://arxiv.org/abs/2510.04618))
+7. TiMem: Temporal-Hierarchical Memory — arXiv 2026 ([arXiv:2601.02845](https://arxiv.org/abs/2601.02845))
+8. Collaborative Memory — ICML 2025 ([arXiv:2505.18279](https://arxiv.org/abs/2505.18279))
+9. Memory in the Age of AI Agents: Survey — arXiv 2025 ([arXiv:2512.13564](https://arxiv.org/abs/2512.13564))
+10. Survey of Context Engineering — arXiv 2025 ([arXiv:2507.13334](https://arxiv.org/abs/2507.13334))
+11. M+: Extending MemoryLLM — ICML 2025 ([arXiv:2502.00592](https://arxiv.org/abs/2502.00592))
+12. Mem0 Paper — arXiv 2025 ([arXiv:2504.19413](https://arxiv.org/abs/2504.19413))
+13. Episodic Memory Risks — SaTML 2025 ([arXiv:2501.11739](https://arxiv.org/abs/2501.11739))
+14. Episodic Memory: Suggesting Next Tasks — arXiv 2025 ([arXiv:2511.17775](https://arxiv.org/abs/2511.17775))
+15. Zep: Temporal KG Architecture — arXiv 2025 ([arXiv:2501.13956](https://arxiv.org/abs/2501.13956))
+
+### Benchmark
+
+- [LoCoMo](https://snap-research.github.io/locomo/) — 300-turn 长对话记忆评估
+- [LongMemEval](https://arxiv.org/abs/2410.10813) — 500 问题,5 核心记忆能力 (ICLR 2025)
+- [MemAgents Workshop Proposal](https://openreview.net/pdf?id=U51WxL382H) — ICLR 2026 Workshop
diff --git a/docs/notifydemo.pdf b/docs/notifydemo.pdf
new file mode 100644
index 0000000..10665ba
Binary files /dev/null and b/docs/notifydemo.pdf differ
diff --git a/src/paperbot/api/main.py b/src/paperbot/api/main.py
index af9ab10..8a41f07 100644
--- a/src/paperbot/api/main.py
+++ b/src/paperbot/api/main.py
@@ -20,6 +20,7 @@
memory,
research,
paperscool,
+ newsletter,
)
from paperbot.infrastructure.event_log.logging_event_log import LoggingEventLog
from paperbot.infrastructure.event_log.composite_event_log import CompositeEventLog
@@ -63,6 +64,7 @@ async def health_check():
app.include_router(memory.router, prefix="/api", tags=["Memory"])
app.include_router(research.router, prefix="/api", tags=["Research"])
app.include_router(paperscool.router, prefix="/api", tags=["PapersCool"])
+app.include_router(newsletter.router, prefix="/api", tags=["Newsletter"])
@app.on_event("startup")
diff --git a/src/paperbot/api/routes/__init__.py b/src/paperbot/api/routes/__init__.py
index d091df5..6a1344d 100644
--- a/src/paperbot/api/routes/__init__.py
+++ b/src/paperbot/api/routes/__init__.py
@@ -13,6 +13,7 @@
memory,
research,
paperscool,
+ newsletter,
)
__all__ = [
@@ -28,4 +29,5 @@
"memory",
"research",
"paperscool",
+ "newsletter",
]
diff --git a/src/paperbot/api/routes/newsletter.py b/src/paperbot/api/routes/newsletter.py
new file mode 100644
index 0000000..03e5e42
--- /dev/null
+++ b/src/paperbot/api/routes/newsletter.py
@@ -0,0 +1,75 @@
+from __future__ import annotations
+
+import re
+from functools import lru_cache
+from typing import Any, Dict
+
+from fastapi import APIRouter, HTTPException
+from fastapi.responses import HTMLResponse
+from pydantic import BaseModel, Field
+
+from paperbot.infrastructure.stores.subscriber_store import SubscriberStore
+
+router = APIRouter()
+
+_EMAIL_RE = re.compile(r"^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}$")
+
+
+@lru_cache(maxsize=1)
+def _get_subscriber_store() -> SubscriberStore:
+ """Lazy-init subscriber store on first use, not at import time."""
+ return SubscriberStore()
+
+
+class SubscribeRequest(BaseModel):
+ email: str = Field(..., min_length=3, max_length=256)
+
+
+class SubscribeResponse(BaseModel):
+ ok: bool
+ email: str
+ message: str
+
+
+@router.post("/newsletter/subscribe", response_model=SubscribeResponse)
+def subscribe(req: SubscribeRequest):
+ email = req.email.strip().lower()
+ if not _EMAIL_RE.match(email):
+ raise HTTPException(status_code=400, detail="Invalid email format")
+
+ result = _get_subscriber_store().add_subscriber(email)
+ return SubscribeResponse(
+ ok=True,
+ email=result["email"],
+ message="Subscribed successfully",
+ )
+
+
+@router.get("/newsletter/unsubscribe/{token}")
+def unsubscribe(token: str):
+ if not token or len(token) > 64:
+ raise HTTPException(status_code=400, detail="Invalid token")
+
+ ok = _get_subscriber_store().remove_subscriber(token)
+ if not ok:
+ raise HTTPException(status_code=404, detail="Token not found")
+
+ return HTMLResponse(
+ """
+<html><head><title>Unsubscribed</title></head>
+<body>
+<h1>Unsubscribed</h1>
+<p>You have been removed from the PaperBot DailyPaper newsletter.</p>
+</body></html>
+"""
+ )
+
+
+class SubscriberCountResponse(BaseModel):
+ active: int
+ total: int
+
+
+@router.get("/newsletter/subscribers", response_model=SubscriberCountResponse)
+def list_subscribers():
+ counts = _get_subscriber_store().get_subscriber_count()
+ return SubscriberCountResponse(**counts)
diff --git a/src/paperbot/api/routes/paperscool.py b/src/paperbot/api/routes/paperscool.py
index 5d5ee44..aa61198 100644
--- a/src/paperbot/api/routes/paperscool.py
+++ b/src/paperbot/api/routes/paperscool.py
@@ -1,7 +1,12 @@
from __future__ import annotations
import copy
+import os
+import re
from typing import Any, Dict, List, Optional
+from urllib.parse import urlparse
+
+import requests
from fastapi import APIRouter, HTTPException
from fastapi.responses import StreamingResponse
@@ -16,15 +21,45 @@
apply_judge_scores_to_report,
build_daily_paper_report,
enrich_daily_paper_report,
+ ingest_daily_report_to_registry,
normalize_llm_features,
normalize_output_formats,
+ persist_judge_scores_to_registry,
render_daily_paper_markdown,
select_judge_candidates,
)
from paperbot.application.workflows.paperscool_topic_search import PapersCoolTopicSearchWorkflow
+from paperbot.utils.text_processing import extract_github_url
router = APIRouter()
+_ALLOWED_REPORT_BASE = os.path.abspath("./reports")
+
+
+def _sanitize_output_dir(raw: str) -> str:
+ """Prevent path traversal — resolve and ensure output stays under ./reports/."""
+ resolved = os.path.abspath(raw)
+    if resolved != _ALLOWED_REPORT_BASE and not resolved.startswith(_ALLOWED_REPORT_BASE + os.sep):
+ return os.path.join(_ALLOWED_REPORT_BASE, "dailypaper")
+ return resolved
+
+
+_EMAIL_RE = re.compile(r"^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}$")
+
+
+def _validate_email_list(emails: List[str]) -> List[str]:
+ """Validate and sanitize email list — reject header injection attempts."""
+ cleaned: List[str] = []
+ for e in emails:
+ addr = (e or "").strip()
+ if not addr:
+ continue
+ if "\n" in addr or "\r" in addr:
+ continue
+ if _EMAIL_RE.match(addr):
+ cleaned.append(addr)
+ return cleaned
+
class PapersCoolSearchRequest(BaseModel):
queries: List[str] = Field(default_factory=list)
@@ -52,18 +87,19 @@ class DailyPaperRequest(BaseModel):
show_per_branch: int = Field(25, ge=1, le=200)
min_score: float = Field(0.0, ge=0.0, description="Drop papers scoring below this threshold")
title: str = "DailyPaper Digest"
- top_n: int = Field(10, ge=1, le=50)
+ top_n: int = Field(10, ge=1, le=200)
formats: List[str] = Field(default_factory=lambda: ["both"])
save: bool = False
- output_dir: str = "./reports/dailypaper"
+ output_dir: str = Field("./reports/dailypaper", description="Relative path under project root for saving reports")
enable_llm_analysis: bool = False
llm_features: List[str] = Field(default_factory=lambda: ["summary"])
enable_judge: bool = False
judge_runs: int = Field(1, ge=1, le=5)
- judge_max_items_per_query: int = Field(5, ge=1, le=20)
+ judge_max_items_per_query: int = Field(5, ge=1, le=200)
judge_token_budget: int = Field(0, ge=0, le=2_000_000)
notify: bool = False
notify_channels: List[str] = Field(default_factory=list)
+ notify_email_to: List[str] = Field(default_factory=list)
class DailyPaperResponse(BaseModel):
@@ -80,11 +116,25 @@ class PapersCoolAnalyzeRequest(BaseModel):
run_trends: bool = False
run_insight: bool = False
judge_runs: int = Field(1, ge=1, le=5)
- judge_max_items_per_query: int = Field(5, ge=1, le=20)
+ judge_max_items_per_query: int = Field(5, ge=1, le=200)
judge_token_budget: int = Field(0, ge=0, le=2_000_000)
trend_max_items_per_query: int = Field(3, ge=1, le=20)
+class PapersCoolReposRequest(BaseModel):
+ report: Optional[Dict[str, Any]] = None
+ papers: List[Dict[str, Any]] = Field(default_factory=list)
+ max_items: int = Field(100, ge=1, le=1000)
+ include_github_api: bool = True
+
+
+class PapersCoolReposResponse(BaseModel):
+ total_candidates: int
+ matched_repos: int
+ github_api_used: bool
+ repos: List[Dict[str, Any]]
+
+
@router.post("/research/paperscool/search", response_model=PapersCoolSearchResponse)
def topic_search(req: PapersCoolSearchRequest):
cleaned_queries = [q.strip() for q in req.queries if (q or "").strip()]
@@ -106,44 +156,380 @@ def topic_search(req: PapersCoolSearchRequest):
return PapersCoolSearchResponse(**result)
-@router.post("/research/paperscool/daily", response_model=DailyPaperResponse)
-def generate_daily_report(req: DailyPaperRequest):
+async def _dailypaper_stream(req: DailyPaperRequest):
+ """SSE generator for the full DailyPaper pipeline."""
+ cleaned_queries = [q.strip() for q in req.queries if (q or "").strip()]
+
+ # Phase 1 — Search
+ yield StreamEvent(type="progress", data={"phase": "search", "message": "Searching papers..."})
+ workflow = PapersCoolTopicSearchWorkflow()
+ effective_top_k = max(int(req.top_k_per_query), int(req.top_n), 1)
+ search_result = workflow.run(
+ queries=cleaned_queries,
+ sources=req.sources,
+ branches=req.branches,
+ top_k_per_query=effective_top_k,
+ show_per_branch=req.show_per_branch,
+ min_score=req.min_score,
+ )
+ summary = search_result.get("summary") or {}
+ yield StreamEvent(
+ type="search_done",
+ data={
+ "items_count": len(search_result.get("items") or []),
+ "queries_count": len(search_result.get("queries") or []),
+ "unique_items": int(summary.get("unique_items") or 0),
+ },
+ )
+
+ # Phase 2 — Build Report
+ yield StreamEvent(type="progress", data={"phase": "build", "message": "Building report..."})
+ report = build_daily_paper_report(search_result=search_result, title=req.title, top_n=req.top_n)
+ yield StreamEvent(
+ type="report_built",
+ data={
+ "queries_count": len(report.get("queries") or []),
+ "global_top_count": len(report.get("global_top") or []),
+ "report": report,
+ },
+ )
+
+ # Phase 3 — LLM Enrichment
+ if req.enable_llm_analysis:
+ features = normalize_llm_features(req.llm_features)
+ if features:
+ llm_service = get_llm_service()
+ llm_block: Dict[str, Any] = {
+ "enabled": True,
+ "features": features,
+ "query_trends": [],
+ "daily_insight": "",
+ }
+
+ summary_done = 0
+ summary_total = 0
+ if "summary" in features or "relevance" in features:
+ for query in report.get("queries") or []:
+ summary_total += len((query.get("top_items") or [])[:3])
+
+ yield StreamEvent(
+ type="progress",
+ data={"phase": "llm", "message": "Starting LLM enrichment...", "total": summary_total},
+ )
+
+ for query in report.get("queries") or []:
+ query_name = query.get("normalized_query") or query.get("raw_query") or ""
+ top_items = (query.get("top_items") or [])[:3]
+
+ if "summary" in features:
+ for item in top_items:
+ item["ai_summary"] = llm_service.summarize_paper(
+ title=item.get("title") or "",
+ abstract=item.get("snippet") or item.get("abstract") or "",
+ )
+ summary_done += 1
+ yield StreamEvent(
+ type="llm_summary",
+ data={
+ "title": item.get("title") or "Untitled",
+ "query": query_name,
+ "ai_summary": item["ai_summary"],
+ "done": summary_done,
+ "total": summary_total,
+ },
+ )
+
+ if "relevance" in features:
+ for item in top_items:
+ item["relevance"] = llm_service.assess_relevance(paper=item, query=query_name)
+ if "summary" not in features:
+ summary_done += 1
+
+ if "trends" in features and top_items:
+ trend_text = llm_service.analyze_trends(topic=query_name, papers=top_items)
+ llm_block["query_trends"].append({"query": query_name, "analysis": trend_text})
+ yield StreamEvent(
+ type="trend",
+ data={
+ "query": query_name,
+ "analysis": trend_text,
+ "done": len(llm_block["query_trends"]),
+ "total": len(report.get("queries") or []),
+ },
+ )
+
+ if "insight" in features:
+ yield StreamEvent(type="progress", data={"phase": "insight", "message": "Generating daily insight..."})
+ llm_block["daily_insight"] = llm_service.generate_daily_insight(report)
+ yield StreamEvent(type="insight", data={"analysis": llm_block["daily_insight"]})
+
+ report["llm_analysis"] = llm_block
+ yield StreamEvent(
+ type="llm_done",
+ data={
+ "summaries_count": summary_done,
+ "trends_count": len(llm_block["query_trends"]),
+ },
+ )
+
+ # Phase 4 — Judge
+ if req.enable_judge:
+ llm_service_j = get_llm_service()
+ judge = PaperJudge(llm_service=llm_service_j)
+ selection = select_judge_candidates(
+ report,
+ max_items_per_query=req.judge_max_items_per_query,
+ n_runs=req.judge_runs,
+ token_budget=req.judge_token_budget,
+ )
+ selected = list(selection.get("selected") or [])
+ recommendation_count: Dict[str, int] = {
+ "must_read": 0,
+ "worth_reading": 0,
+ "skim": 0,
+ "skip": 0,
+ }
+
+ yield StreamEvent(
+ type="progress",
+ data={
+ "phase": "judge",
+ "message": "Starting judge scoring",
+ "total": len(selected),
+ "budget": selection.get("budget") or {},
+ },
+ )
+
+ queries = list(report.get("queries") or [])
+ for idx, row in enumerate(selected, start=1):
+ query_index = int(row.get("query_index") or 0)
+ item_index = int(row.get("item_index") or 0)
+
+ if query_index >= len(queries):
+ continue
+
+ query = queries[query_index]
+ query_name = query.get("normalized_query") or query.get("raw_query") or ""
+ top_items = list(query.get("top_items") or [])
+ if item_index >= len(top_items):
+ continue
+
+ item = top_items[item_index]
+ if req.judge_runs > 1:
+ judgment = judge.judge_with_calibration(
+ paper=item,
+ query=query_name,
+ n_runs=max(1, int(req.judge_runs)),
+ )
+ else:
+ judgment = judge.judge_single(paper=item, query=query_name)
+
+ j_payload = judgment.to_dict()
+ item["judge"] = j_payload
+ rec = j_payload.get("recommendation")
+ if rec in recommendation_count:
+ recommendation_count[rec] += 1
+
+ yield StreamEvent(
+ type="judge",
+ data={
+ "query": query_name,
+ "title": item.get("title") or "Untitled",
+ "judge": j_payload,
+ "done": idx,
+ "total": len(selected),
+ },
+ )
+
+ for query in report.get("queries") or []:
+ top_items = list(query.get("top_items") or [])
+ if not top_items:
+ continue
+ capped_count = min(len(top_items), max(1, int(req.judge_max_items_per_query)))
+ capped = top_items[:capped_count]
+ capped.sort(
+ key=lambda it: float((it.get("judge") or {}).get("overall") or -1), reverse=True
+ )
+ query["top_items"] = capped + top_items[capped_count:]
+
+ report["judge"] = {
+ "enabled": True,
+ "max_items_per_query": int(req.judge_max_items_per_query),
+ "n_runs": int(max(1, int(req.judge_runs))),
+ "recommendation_count": recommendation_count,
+ "budget": selection.get("budget") or {},
+ }
+ yield StreamEvent(type="judge_done", data=report["judge"])
+
+ # Phase 4b — Filter: remove papers below "worth_reading"
+ KEEP_RECOMMENDATIONS = {"must_read", "worth_reading"}
+ yield StreamEvent(
+ type="progress",
+ data={"phase": "filter", "message": "Filtering papers by judge recommendation..."},
+ )
+ filter_log: List[Dict[str, Any]] = []
+ total_before = 0
+ total_after = 0
+ for query in report.get("queries") or []:
+ query_name = query.get("normalized_query") or query.get("raw_query") or ""
+ items_before = list(query.get("top_items") or [])
+ total_before += len(items_before)
+ kept: List[Dict[str, Any]] = []
+ removed: List[Dict[str, Any]] = []
+ for item in items_before:
+ j = item.get("judge")
+ if isinstance(j, dict):
+ rec = j.get("recommendation", "")
+ if rec in KEEP_RECOMMENDATIONS:
+ kept.append(item)
+ else:
+ removed.append(item)
+ filter_log.append({
+ "query": query_name,
+ "title": item.get("title") or "Untitled",
+ "recommendation": rec,
+ "overall": j.get("overall"),
+ "action": "removed",
+ })
+ else:
+ # No judge score — keep by default (unjudged papers)
+ kept.append(item)
+ total_after += len(kept)
+ query["top_items"] = kept
+
+ # Also filter global_top
+ global_before = list(report.get("global_top") or [])
+ global_kept = []
+ for item in global_before:
+ j = item.get("judge")
+ if isinstance(j, dict):
+ rec = j.get("recommendation", "")
+ if rec in KEEP_RECOMMENDATIONS:
+ global_kept.append(item)
+ else:
+ global_kept.append(item)
+ report["global_top"] = global_kept
+
+ report["filter"] = {
+ "enabled": True,
+ "keep_recommendations": list(KEEP_RECOMMENDATIONS),
+ "total_before": total_before,
+ "total_after": total_after,
+ "removed_count": total_before - total_after,
+ "log": filter_log,
+ }
+ yield StreamEvent(
+ type="filter_done",
+ data={
+ "total_before": total_before,
+ "total_after": total_after,
+ "removed_count": total_before - total_after,
+ "log": filter_log,
+ },
+ )
+
+ # Phase 5 — Persist + Notify
+ yield StreamEvent(type="progress", data={"phase": "save", "message": "Saving to registry..."})
+ try:
+ ingest_summary = ingest_daily_report_to_registry(report)
+ report["registry_ingest"] = ingest_summary
+ except Exception as exc:
+ report["registry_ingest"] = {"error": str(exc)}
+
+ if req.enable_judge:
+ try:
+ report["judge_registry_ingest"] = persist_judge_scores_to_registry(report)
+ except Exception as exc:
+ report["judge_registry_ingest"] = {"error": str(exc)}
+
+ markdown = render_daily_paper_markdown(report)
+
+ markdown_path = None
+ json_path = None
+ notify_result: Optional[Dict[str, Any]] = None
+ if req.save:
+ reporter = DailyPaperReporter(output_dir=_sanitize_output_dir(req.output_dir))
+ artifacts = reporter.write(
+ report=report,
+ markdown=markdown,
+ formats=normalize_output_formats(req.formats),
+ slug=req.title,
+ )
+ markdown_path = artifacts.markdown_path
+ json_path = artifacts.json_path
+
+ if req.notify:
+ yield StreamEvent(type="progress", data={"phase": "notify", "message": "Sending notifications..."})
+ notify_service = DailyPushService.from_env()
+ notify_result = notify_service.push_dailypaper(
+ report=report,
+ markdown=markdown,
+ markdown_path=markdown_path,
+ json_path=json_path,
+ channels_override=req.notify_channels or None,
+ email_to_override=_validate_email_list(req.notify_email_to) or None,
+ )
+
+ yield StreamEvent(
+ type="result",
+ data={
+ "report": report,
+ "markdown": markdown,
+ "markdown_path": markdown_path,
+ "json_path": json_path,
+ "notify_result": notify_result,
+ },
+ )
+
+
+@router.post("/research/paperscool/daily")
+async def generate_daily_report(req: DailyPaperRequest):
cleaned_queries = [q.strip() for q in req.queries if (q or "").strip()]
if not cleaned_queries:
raise HTTPException(status_code=400, detail="queries is required")
+ # Fast sync path when no LLM/Judge — avoids SSE overhead
+ if not req.enable_llm_analysis and not req.enable_judge:
+ return _sync_daily_report(req, cleaned_queries)
+
+ # SSE streaming path for long-running operations
+ return StreamingResponse(
+ wrap_generator(_dailypaper_stream(req)),
+ media_type="text/event-stream",
+ headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
+ )
+
+
+def _sync_daily_report(req: DailyPaperRequest, cleaned_queries: List[str]):
+ """Original synchronous path for fast requests (no LLM/Judge)."""
workflow = PapersCoolTopicSearchWorkflow()
+ effective_top_k = max(int(req.top_k_per_query), int(req.top_n), 1)
try:
search_result = workflow.run(
queries=cleaned_queries,
sources=req.sources,
branches=req.branches,
- top_k_per_query=req.top_k_per_query,
+ top_k_per_query=effective_top_k,
show_per_branch=req.show_per_branch,
min_score=req.min_score,
)
except Exception as exc:
raise HTTPException(status_code=502, detail=f"daily search failed: {exc}") from exc
report = build_daily_paper_report(search_result=search_result, title=req.title, top_n=req.top_n)
- if req.enable_llm_analysis:
- report = enrich_daily_paper_report(
- report,
- llm_features=normalize_llm_features(req.llm_features),
- )
- if req.enable_judge:
- report = apply_judge_scores_to_report(
- report,
- max_items_per_query=req.judge_max_items_per_query,
- n_runs=req.judge_runs,
- judge_token_budget=req.judge_token_budget,
- )
+
+ try:
+ ingest_summary = ingest_daily_report_to_registry(report)
+ report["registry_ingest"] = ingest_summary
+ except Exception as exc:
+ report["registry_ingest"] = {"error": str(exc)}
+
markdown = render_daily_paper_markdown(report)
markdown_path = None
json_path = None
notify_result: Optional[Dict[str, Any]] = None
if req.save:
- reporter = DailyPaperReporter(output_dir=req.output_dir)
+ reporter = DailyPaperReporter(output_dir=_sanitize_output_dir(req.output_dir))
artifacts = reporter.write(
report=report,
markdown=markdown,
@@ -161,6 +547,7 @@ def generate_daily_report(req: DailyPaperRequest):
markdown_path=markdown_path,
json_path=json_path,
channels_override=req.notify_channels or None,
+ email_to_override=_validate_email_list(req.notify_email_to) or None,
)
return DailyPaperResponse(
@@ -172,6 +559,194 @@ def generate_daily_report(req: DailyPaperRequest):
)
+_GITHUB_REPO_RE = re.compile(r"https?://github\.com/([\w.-]+)/([\w.-]+)", re.IGNORECASE)
+
+
+def _normalize_github_repo_url(raw_url: str | None) -> Optional[str]:
+ if not raw_url:
+ return None
+ candidate = (raw_url or "").strip()
+ if not candidate:
+ return None
+ if "github.com" not in candidate.lower():
+ return None
+ if not candidate.startswith("http"):
+ candidate = f"https://{candidate}"
+
+ parsed = urlparse(candidate)
+ if "github.com" not in (parsed.netloc or "").lower():
+ return None
+
+ match = _GITHUB_REPO_RE.search(f"{parsed.scheme}://{parsed.netloc}{parsed.path}")
+ if not match:
+ return None
+
+ owner, repo = match.group(1), match.group(2)
+ if repo.endswith(".git"):
+ repo = repo[:-4]
+ return f"https://github.com/{owner}/{repo}"
+
+
+def _extract_repo_url_from_paper(paper: Dict[str, Any]) -> Optional[str]:
+ candidates: List[str] = []
+ for key in ("github_url", "external_url", "url", "pdf_url"):
+ value = paper.get(key)
+ if isinstance(value, str) and value:
+ candidates.append(value)
+
+ for alt in paper.get("alternative_urls") or []:
+ if isinstance(alt, str) and alt:
+ candidates.append(alt)
+
+ for candidate in candidates:
+ normalized = _normalize_github_repo_url(candidate)
+ if normalized:
+ return normalized
+
+ text_blob_parts = [
+ str(paper.get("title") or ""),
+ str(paper.get("snippet") or paper.get("abstract") or ""),
+ " ".join(str(k) for k in (paper.get("keywords") or [])),
+ ]
+ extracted = extract_github_url("\n".join(text_blob_parts))
+ return _normalize_github_repo_url(extracted)
+
+
+def _flatten_report_papers(report: Dict[str, Any]) -> List[Dict[str, Any]]:
+ rows: List[Dict[str, Any]] = []
+ for query in report.get("queries") or []:
+ query_name = query.get("normalized_query") or query.get("raw_query") or ""
+ for item in query.get("top_items") or []:
+ row = dict(item)
+ row.setdefault("_query", query_name)
+ rows.append(row)
+
+ for item in report.get("global_top") or []:
+ row = dict(item)
+ if "_query" not in row:
+ matched = row.get("matched_queries") or []
+ row["_query"] = matched[0] if matched else ""
+ rows.append(row)
+
+ deduped: List[Dict[str, Any]] = []
+ seen: set[str] = set()
+ for item in rows:
+ key = f"{item.get('url') or ''}|{item.get('title') or ''}"
+ if key in seen:
+ continue
+ seen.add(key)
+ deduped.append(item)
+ return deduped
+
+
+def _fetch_github_repo_metadata(repo_url: str, token: Optional[str]) -> Dict[str, Any]:
+ normalized = _normalize_github_repo_url(repo_url)
+ if not normalized:
+ return {"ok": False, "error": "invalid_repo_url"}
+
+ match = _GITHUB_REPO_RE.search(normalized)
+ if not match:
+ return {"ok": False, "error": "invalid_repo_url"}
+
+ owner, repo = match.group(1), match.group(2)
+ api_url = f"https://api.github.com/repos/{owner}/{repo}"
+
+ headers = {"Accept": "application/vnd.github+json", "User-Agent": "PaperBot/1.0"}
+ if token:
+ headers["Authorization"] = f"Bearer {token}"
+
+ try:
+ resp = requests.get(api_url, headers=headers, timeout=15)
+ if resp.status_code != 200:
+ return {
+ "ok": False,
+ "status": resp.status_code,
+ "error": "github_api_error",
+ "repo_url": normalized,
+ }
+
+ payload = resp.json()
+ return {
+ "ok": True,
+ "status": resp.status_code,
+ "repo_url": normalized,
+ "full_name": payload.get("full_name") or f"{owner}/{repo}",
+ "description": payload.get("description") or "",
+ "stars": int(payload.get("stargazers_count") or 0),
+ "forks": int(payload.get("forks_count") or 0),
+ "open_issues": int(payload.get("open_issues_count") or 0),
+ "watchers": int(payload.get("subscribers_count") or payload.get("watchers_count") or 0),
+ "language": payload.get("language") or "",
+ "license": (payload.get("license") or {}).get("spdx_id") or "",
+ "updated_at": payload.get("updated_at"),
+ "pushed_at": payload.get("pushed_at"),
+ "archived": bool(payload.get("archived")),
+ "topics": payload.get("topics") or [],
+ "html_url": payload.get("html_url") or normalized,
+ }
+ except Exception as exc:
+ return {
+ "ok": False,
+ "error": f"github_api_exception: {exc}",
+ "repo_url": normalized,
+ }
+
+
+@router.post("/research/paperscool/repos", response_model=PapersCoolReposResponse)
+def enrich_papers_with_repo_data(req: PapersCoolReposRequest):
+ papers: List[Dict[str, Any]] = []
+ if isinstance(req.report, dict):
+ papers.extend(_flatten_report_papers(req.report))
+ papers.extend(list(req.papers or []))
+
+ if not papers:
+ raise HTTPException(status_code=400, detail="report or papers is required")
+
+ deduped: List[Dict[str, Any]] = []
+ seen: set[str] = set()
+ for item in papers:
+ key = f"{item.get('url') or ''}|{item.get('title') or ''}"
+ if key in seen:
+ continue
+ seen.add(key)
+ deduped.append(item)
+
+ selected = deduped[: max(1, int(req.max_items))]
+ token = os.getenv("GITHUB_TOKEN") or os.getenv("GH_TOKEN")
+
+ # TODO: GitHub API calls are sequential — switch to concurrent.futures or
+ # async httpx with bounded concurrency to avoid multi-minute requests and
+ # rate-limit exhaustion (60 req/hr unauthenticated, 5000 authenticated).
+ repos: List[Dict[str, Any]] = []
+ for item in selected:
+ repo_url = _extract_repo_url_from_paper(item)
+ if not repo_url:
+ continue
+
+ row: Dict[str, Any] = {
+ "title": item.get("title") or "Untitled",
+ "query": item.get("_query") or ", ".join(item.get("matched_queries") or []),
+ "paper_url": item.get("url") or item.get("external_url") or "",
+ "repo_url": repo_url,
+ }
+ if req.include_github_api:
+ row["github"] = _fetch_github_repo_metadata(repo_url=repo_url, token=token)
+ repos.append(row)
+
+ if req.include_github_api:
+ repos.sort(
+ key=lambda row: int(((row.get("github") or {}).get("stars") or -1)),
+ reverse=True,
+ )
+
+ return PapersCoolReposResponse(
+ total_candidates=len(selected),
+ matched_repos=len(repos),
+ github_api_used=bool(req.include_github_api),
+ repos=repos,
+ )
+
+
async def _paperscool_analyze_stream(req: PapersCoolAnalyzeRequest):
report = copy.deepcopy(req.report)
llm_service = get_llm_service()
@@ -264,11 +839,11 @@ async def _paperscool_analyze_stream(req: PapersCoolAnalyzeRequest):
},
)
+ queries = list(report.get("queries") or [])
for idx, row in enumerate(selected, start=1):
query_index = int(row.get("query_index") or 0)
item_index = int(row.get("item_index") or 0)
- queries = list(report.get("queries") or [])
if query_index >= len(queries):
continue
@@ -323,6 +898,10 @@ async def _paperscool_analyze_stream(req: PapersCoolAnalyzeRequest):
"recommendation_count": recommendation_count,
"budget": selection.get("budget") or {},
}
+ try:
+ report["judge_registry_ingest"] = persist_judge_scores_to_registry(report)
+ except Exception as exc:
+ report["judge_registry_ingest"] = {"error": str(exc)}
yield StreamEvent(type="judge_done", data=report["judge"])
markdown = render_daily_paper_markdown(report)
diff --git a/src/paperbot/api/routes/research.py b/src/paperbot/api/routes/research.py
index 8d19649..abb50f4 100644
--- a/src/paperbot/api/routes/research.py
+++ b/src/paperbot/api/routes/research.py
@@ -1,6 +1,10 @@
from __future__ import annotations
-from typing import Any, Dict, List, Optional
+from collections import Counter
+from datetime import datetime, timezone
+import os
+import re
+from typing import Any, Dict, List, Optional, Tuple
from fastapi import APIRouter, BackgroundTasks, HTTPException, Query
from pydantic import BaseModel, Field
@@ -8,6 +12,7 @@
from paperbot.context_engine import ContextEngine, ContextEngineConfig
from paperbot.context_engine.track_router import TrackRouter
from paperbot.infrastructure.stores.memory_store import SqlAlchemyMemoryStore
+from paperbot.infrastructure.api_clients.semantic_scholar import SemanticScholarClient
from paperbot.infrastructure.stores.research_store import SqlAlchemyResearchStore
from paperbot.memory.eval.collector import MemoryMetricCollector
from paperbot.memory.extractor import extract_memories
@@ -391,7 +396,8 @@ def bulk_moderate(req: BulkModerateRequest, background_tasks: BackgroundTasks):
# A rejection of an auto-approved (confidence >= 0.60) item is a false positive
if req.status == "rejected" and items_before:
high_confidence_rejected = sum(
- 1 for item in items_before
+ 1
+ for item in items_before
if item.get("confidence", 0) >= 0.60 and item.get("status") == "approved"
)
if high_confidence_rejected > 0:
@@ -445,10 +451,15 @@ def bulk_move(req: BulkMoveRequest, background_tasks: BackgroundTasks):
class MemoryFeedbackRequest(BaseModel):
"""Request to record feedback on retrieved memories."""
+
user_id: str = "default"
memory_ids: List[int] = Field(..., min_length=1, description="IDs of memories being rated")
- helpful_ids: List[int] = Field(default_factory=list, description="IDs of memories that were helpful")
- not_helpful_ids: List[int] = Field(default_factory=list, description="IDs of memories that were not helpful")
+ helpful_ids: List[int] = Field(
+ default_factory=list, description="IDs of memories that were helpful"
+ )
+ not_helpful_ids: List[int] = Field(
+ default_factory=list, description="IDs of memories that were not helpful"
+ )
context_run_id: Optional[int] = None
query: Optional[str] = None
@@ -666,6 +677,58 @@ def list_paper_feedback(
return PaperFeedbackListResponse(user_id=user_id, track_id=track_id, items=items)
+class PaperReadingStatusRequest(BaseModel):
+ user_id: str = "default"
+ status: str = Field(..., min_length=1) # unread/reading/read/archived
+ mark_saved: Optional[bool] = None
+ metadata: Dict[str, Any] = {}
+
+
+class PaperReadingStatusResponse(BaseModel):
+ status: Dict[str, Any]
+
+
+class SavedPapersResponse(BaseModel):
+ user_id: str
+ items: List[Dict[str, Any]]
+
+
+class PaperDetailResponse(BaseModel):
+ detail: Dict[str, Any]
+
+
+@router.post("/research/papers/{paper_id}/status", response_model=PaperReadingStatusResponse)
+def update_paper_status(paper_id: str, req: PaperReadingStatusRequest):
+ status = _research_store.set_paper_reading_status(
+ user_id=req.user_id,
+ paper_id=paper_id,
+ status=req.status,
+ metadata=req.metadata,
+ mark_saved=req.mark_saved,
+ )
+ if not status:
+ raise HTTPException(status_code=404, detail="Paper not found in registry")
+ return PaperReadingStatusResponse(status=status)
+
+
+@router.get("/research/papers/saved", response_model=SavedPapersResponse)
+def list_saved_papers(
+ user_id: str = "default",
+ sort_by: str = Query("saved_at"),
+ limit: int = Query(200, ge=1, le=1000),
+):
+ items = _research_store.list_saved_papers(user_id=user_id, sort_by=sort_by, limit=limit)
+ return SavedPapersResponse(user_id=user_id, items=items)
+
+
+@router.get("/research/papers/{paper_id}", response_model=PaperDetailResponse)
+def get_paper_detail(paper_id: str, user_id: str = "default"):
+ detail = _research_store.get_paper_detail(paper_id=paper_id, user_id=user_id)
+ if not detail:
+ raise HTTPException(status_code=404, detail="Paper not found in registry")
+ return PaperDetailResponse(detail=detail)
+
+
class RouterSuggestRequest(BaseModel):
user_id: str = "default"
query: str = Field(..., min_length=1)
@@ -740,3 +803,384 @@ async def build_context(req: ContextRequest):
return ContextResponse(context_pack=pack)
finally:
await engine.close()
+
+
+class ScholarNetworkRequest(BaseModel):
+ scholar_id: Optional[str] = None
+ scholar_name: Optional[str] = None
+ max_papers: int = Field(100, ge=1, le=500)
+ recent_years: int = Field(5, ge=0, le=30)
+ max_nodes: int = Field(40, ge=5, le=200)
+
+
+class ScholarNetworkResponse(BaseModel):
+ scholar: Dict[str, Any]
+ stats: Dict[str, Any]
+ nodes: List[Dict[str, Any]]
+ edges: List[Dict[str, Any]]
+
+
+class ScholarTrendsRequest(BaseModel):
+ scholar_id: Optional[str] = None
+ scholar_name: Optional[str] = None
+ max_papers: int = Field(200, ge=1, le=1000)
+ year_window: int = Field(10, ge=3, le=30)
+
+
+class ScholarTrendsResponse(BaseModel):
+ scholar: Dict[str, Any]
+ stats: Dict[str, Any]
+ publication_velocity: List[Dict[str, Any]]
+ topic_distribution: List[Dict[str, Any]]
+ venue_distribution: List[Dict[str, Any]]
+ recent_papers: List[Dict[str, Any]]
+ trend_summary: Dict[str, Any]
+
+
+def _resolve_scholar_identity(
+ *, scholar_id: Optional[str], scholar_name: Optional[str]
+) -> Tuple[str, Optional[str]]:
+ if scholar_id and scholar_id.strip():
+ return scholar_id.strip(), None
+
+ if not scholar_name or not scholar_name.strip():
+ raise HTTPException(status_code=400, detail="scholar_id or scholar_name is required")
+
+ from paperbot.agents.scholar_tracking.scholar_profile_agent import ScholarProfileAgent
+
+ name_key = scholar_name.strip().lower()
+ try:
+ profile = ScholarProfileAgent()
+ for scholar in profile.list_tracked_scholars():
+ if (scholar.name or "").strip().lower() != name_key:
+ continue
+ if not scholar.semantic_scholar_id:
+ break
+ return scholar.semantic_scholar_id, scholar.name
+ except Exception as exc:
+ raise HTTPException(
+ status_code=502, detail=f"failed to load scholar profile: {exc}"
+ ) from exc
+
+ raise HTTPException(
+ status_code=404,
+ detail="Scholar not found in subscriptions. Provide scholar_id directly or add scholar to subscriptions.",
+ )
+
+
+def _safe_int(value: Any, default: int = 0) -> int:
+ try:
+ return int(value)
+ except Exception:
+ return default
+
+
+def _extract_year_from_paper(paper: Dict[str, Any]) -> Optional[int]:
+ year = _safe_int(paper.get("year"), 0)
+ if year > 0:
+ return year
+
+ date_value = str(paper.get("publicationDate") or paper.get("publication_date") or "")
+ match = re.search(r"(20\d{2}|19\d{2})", date_value)
+ if match:
+ return _safe_int(match.group(1), 0) or None
+ return None
+
+
+def _unwrap_author_paper_row(row: Dict[str, Any]) -> Dict[str, Any]:
+ if isinstance(row.get("paper"), dict):
+ return row["paper"]
+ return row
+
+
+def _trend_direction(values: List[float]) -> str:
+ if len(values) < 2:
+ return "flat"
+ pivot = max(1, len(values) // 2)
+ older = sum(values[:pivot]) / max(1, len(values[:pivot]))
+ recent = sum(values[pivot:]) / max(1, len(values[pivot:]))
+ if recent > older * 1.15:
+ return "up"
+ if recent < older * 0.85:
+ return "down"
+ return "flat"
+
+
+@router.post("/research/scholar/network", response_model=ScholarNetworkResponse)
+async def scholar_network(req: ScholarNetworkRequest):
+ scholar_id, resolved_name = _resolve_scholar_identity(
+ scholar_id=req.scholar_id,
+ scholar_name=req.scholar_name,
+ )
+
+ api_key = os.getenv("SEMANTIC_SCHOLAR_API_KEY") or os.getenv("S2_API_KEY")
+ client = SemanticScholarClient(api_key=api_key)
+ try:
+ author = await client.get_author(
+ scholar_id,
+ fields=["name", "affiliations", "paperCount", "citationCount", "hIndex"],
+ )
+ paper_rows = await client.get_author_papers(
+ scholar_id,
+ limit=max(1, int(req.max_papers)),
+ fields=["title", "year", "citationCount", "authors", "url", "publicationDate"],
+ )
+ finally:
+ await client.close()
+
+ target_name = (author or {}).get("name") or resolved_name or req.scholar_name or scholar_id
+ target_key = str(scholar_id)
+ min_year: Optional[int] = None
+ if req.recent_years > 0:
+ min_year = datetime.now(timezone.utc).year - int(req.recent_years) + 1
+
+ collaborators: Dict[str, Dict[str, Any]] = {}
+ papers_used = 0
+
+ for raw_row in paper_rows:
+ paper = _unwrap_author_paper_row(raw_row)
+ year = _extract_year_from_paper(paper)
+ if min_year and year and year < min_year:
+ continue
+
+ parsed_authors: List[Tuple[str, str]] = []
+ has_target_author = False
+ for author_row in paper.get("authors") or []:
+ if isinstance(author_row, dict):
+ author_id = str(author_row.get("authorId") or "")
+ author_name = str(author_row.get("name") or "").strip()
+ else:
+ author_id = ""
+ author_name = str(author_row or "").strip()
+ if not author_name:
+ continue
+ parsed_authors.append((author_id, author_name))
+ if author_id and author_id == target_key:
+ has_target_author = True
+ elif not author_id and author_name.lower() == str(target_name).lower():
+ has_target_author = True
+
+ if not parsed_authors or not has_target_author:
+ continue
+
+ papers_used += 1
+ paper_title = str(paper.get("title") or "Untitled")
+ citation_count = _safe_int(paper.get("citationCount"), 0)
+
+ for author_id, author_name in parsed_authors:
+ if (author_id and author_id == target_key) or (
+ not author_id and author_name.lower() == str(target_name).lower()
+ ):
+ continue
+
+ node_id = f"author:{author_id}" if author_id else f"name:{author_name.lower()}"
+ item = collaborators.setdefault(
+ node_id,
+ {
+ "id": node_id,
+ "author_id": author_id or None,
+ "name": author_name,
+ "collab_papers": 0,
+ "citation_sum": 0,
+ "recent_year": year,
+ "sample_titles": [],
+ },
+ )
+ item["collab_papers"] += 1
+ item["citation_sum"] += citation_count
+ if year and (not item.get("recent_year") or year > int(item.get("recent_year") or 0)):
+ item["recent_year"] = year
+ if len(item["sample_titles"]) < 3:
+ item["sample_titles"].append(paper_title)
+
+ ranked = sorted(
+ collaborators.values(),
+ key=lambda row: (int(row["collab_papers"]), int(row["citation_sum"])),
+ reverse=True,
+ )
+ ranked = ranked[: max(0, int(req.max_nodes) - 1)]
+
+ nodes = [
+ {
+ "id": f"author:{target_key}",
+ "author_id": target_key,
+ "name": target_name,
+ "type": "target",
+ "collab_papers": papers_used,
+ "citation_sum": _safe_int((author or {}).get("citationCount"), 0),
+ }
+ ]
+ nodes.extend(
+ [
+ {
+ "id": row["id"],
+ "author_id": row["author_id"],
+ "name": row["name"],
+ "type": "coauthor",
+ "collab_papers": row["collab_papers"],
+ "citation_sum": row["citation_sum"],
+ "recent_year": row.get("recent_year"),
+ }
+ for row in ranked
+ ]
+ )
+
+ edges = [
+ {
+ "source": f"author:{target_key}",
+ "target": row["id"],
+ "weight": row["collab_papers"],
+ "citation_sum": row["citation_sum"],
+ "sample_titles": row["sample_titles"],
+ }
+ for row in ranked
+ ]
+
+ scholar_payload = {
+ "scholar_id": target_key,
+ "name": target_name,
+ "affiliations": (author or {}).get("affiliations") or [],
+ "paper_count": _safe_int((author or {}).get("paperCount"), 0),
+ "citation_count": _safe_int((author or {}).get("citationCount"), 0),
+ "h_index": _safe_int((author or {}).get("hIndex"), 0),
+ }
+
+ return ScholarNetworkResponse(
+ scholar=scholar_payload,
+ stats={
+ "papers_fetched": len(paper_rows),
+ "papers_used": papers_used,
+ "coauthor_count": len(ranked),
+ "recent_years": int(req.recent_years),
+ },
+ nodes=nodes,
+ edges=edges,
+ )
+
+
+@router.post("/research/scholar/trends", response_model=ScholarTrendsResponse)
+async def scholar_trends(req: ScholarTrendsRequest):
+ scholar_id, resolved_name = _resolve_scholar_identity(
+ scholar_id=req.scholar_id,
+ scholar_name=req.scholar_name,
+ )
+
+ api_key = os.getenv("SEMANTIC_SCHOLAR_API_KEY") or os.getenv("S2_API_KEY")
+ client = SemanticScholarClient(api_key=api_key)
+ try:
+ author = await client.get_author(
+ scholar_id,
+ fields=["name", "affiliations", "paperCount", "citationCount", "hIndex"],
+ )
+ paper_rows = await client.get_author_papers(
+ scholar_id,
+ limit=max(1, int(req.max_papers)),
+ fields=[
+ "title",
+ "year",
+ "citationCount",
+ "venue",
+ "fieldsOfStudy",
+ "publicationDate",
+ "url",
+ ],
+ )
+ finally:
+ await client.close()
+
+ current_year = datetime.now(timezone.utc).year
+ min_year = current_year - int(req.year_window) + 1
+
+ year_buckets: Dict[int, Dict[str, int]] = {}
+ topic_counter: Counter[str] = Counter()
+ venue_counter: Counter[str] = Counter()
+ recent_papers: List[Dict[str, Any]] = []
+
+ for raw_row in paper_rows:
+ paper = _unwrap_author_paper_row(raw_row)
+ year = _extract_year_from_paper(paper)
+ if year is None or year < min_year:
+ continue
+
+ citation_count = _safe_int(paper.get("citationCount"), 0)
+ bucket = year_buckets.setdefault(year, {"papers": 0, "citations": 0})
+ bucket["papers"] += 1
+ bucket["citations"] += citation_count
+
+ for topic in paper.get("fieldsOfStudy") or paper.get("fields_of_study") or []:
+ topic_name = str(topic).strip()
+ if topic_name:
+ topic_counter[topic_name] += 1
+
+ venue = str(
+ paper.get("venue")
+ or (
+ (paper.get("publicationVenue") or {}).get("name")
+ if isinstance(paper.get("publicationVenue"), dict)
+ else ""
+ )
+ or ""
+ ).strip()
+ if venue:
+ venue_counter[venue] += 1
+
+ recent_papers.append(
+ {
+ "title": paper.get("title") or "Untitled",
+ "year": year,
+ "citation_count": citation_count,
+ "venue": venue,
+ "url": paper.get("url") or "",
+ }
+ )
+
+ yearly = [
+ {
+ "year": year,
+ "papers": stats["papers"],
+ "citations": stats["citations"],
+ }
+ for year, stats in sorted(year_buckets.items())
+ ]
+
+ recent_papers.sort(
+ key=lambda row: (int(row.get("year") or 0), int(row.get("citation_count") or 0)),
+ reverse=True,
+ )
+
+ paper_series = [float(row["papers"]) for row in yearly]
+ citation_series = [float(row["citations"]) for row in yearly]
+
+ trend_summary = {
+ "publication_trend": _trend_direction(paper_series),
+ "citation_trend": _trend_direction(citation_series),
+ "active_years": len(yearly),
+ "window": int(req.year_window),
+ }
+
+ scholar_payload = {
+ "scholar_id": str(scholar_id),
+ "name": (author or {}).get("name") or resolved_name or req.scholar_name or str(scholar_id),
+ "affiliations": (author or {}).get("affiliations") or [],
+ "paper_count": _safe_int((author or {}).get("paperCount"), 0),
+ "citation_count": _safe_int((author or {}).get("citationCount"), 0),
+ "h_index": _safe_int((author or {}).get("hIndex"), 0),
+ }
+
+ return ScholarTrendsResponse(
+ scholar=scholar_payload,
+ stats={
+ "papers_fetched": len(paper_rows),
+ "papers_in_window": sum(item["papers"] for item in yearly),
+ "year_window": int(req.year_window),
+ },
+ publication_velocity=yearly,
+ topic_distribution=[
+ {"topic": topic, "count": count} for topic, count in topic_counter.most_common(15)
+ ],
+ venue_distribution=[
+ {"venue": venue, "count": count} for venue, count in venue_counter.most_common(15)
+ ],
+ recent_papers=recent_papers[:10],
+ trend_summary=trend_summary,
+ )
diff --git a/src/paperbot/application/services/daily_push_service.py b/src/paperbot/application/services/daily_push_service.py
index 6f2fd98..3458cf5 100644
--- a/src/paperbot/application/services/daily_push_service.py
+++ b/src/paperbot/application/services/daily_push_service.py
@@ -8,6 +8,7 @@
import smtplib
import time
from dataclasses import dataclass, field
+from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formataddr
from typing import Any, Dict, List, Optional
@@ -93,6 +94,7 @@ def push_dailypaper(
markdown_path: Optional[str] = None,
json_path: Optional[str] = None,
channels_override: Optional[List[str]] = None,
+ email_to_override: Optional[List[str]] = None,
) -> Dict[str, Any]:
channels = channels_override or self.config.channels
channels = [c.strip().lower() for c in channels if c and c.strip()]
@@ -102,21 +104,34 @@ def push_dailypaper(
if not channels:
return {"sent": False, "reason": "no channels configured", "channels": channels}
+ # Determine effective email recipients (local var, no shared state mutation)
+ effective_email_to = self.config.email_to
+ if email_to_override:
+ cleaned = [e.strip() for e in email_to_override if (e or "").strip()]
+ if cleaned:
+ effective_email_to = cleaned
+
subject = self._build_subject(report)
text = self._build_text(
report, markdown=markdown, markdown_path=markdown_path, json_path=json_path
)
+ html_body = self._build_html(report)
results: Dict[str, Any] = {"sent": False, "channels": channels, "results": {}}
any_success = False
for channel in channels:
try:
if channel == "email":
- self._send_email(subject=subject, body=text)
+ self._send_email(
+ subject=subject, body=text, html_body=html_body,
+ recipients=effective_email_to,
+ )
elif channel == "slack":
self._send_slack(subject=subject, body=text)
elif channel in {"dingtalk", "dingding"}:
self._send_dingtalk(subject=subject, body=text)
+ elif channel == "resend":
+ self._send_resend(report=report, markdown=markdown or text)
else:
raise ValueError(f"unsupported channel: {channel}")
results["results"][channel] = {"ok": True}
@@ -143,53 +158,47 @@ def _build_text(
markdown_path: Optional[str],
json_path: Optional[str],
) -> str:
- stats = report.get("stats") or {}
- lines: List[str] = []
- lines.append(str(report.get("title") or "DailyPaper Digest"))
- lines.append("Date: " + str(report.get("date") or "-"))
- lines.append("Unique Items: " + str(stats.get("unique_items", 0)))
- lines.append("Total Query Hits: " + str(stats.get("total_query_hits", 0)))
- lines.append("")
-
- global_top = list(report.get("global_top") or [])[:5]
- if global_top:
- lines.append("Top 5 Papers:")
- for idx, item in enumerate(global_top, start=1):
- title = item.get("title") or "Untitled"
- score = float(item.get("score") or 0)
- url = item.get("url") or ""
- if url:
- lines.append(f"{idx}. {title} (score={score:.4f})\n {url}")
- else:
- lines.append(f"{idx}. {title} (score={score:.4f})")
- lines.append("")
+ from paperbot.application.services.email_template import build_digest_text
+ text = build_digest_text(report)
+
+ extras: List[str] = []
if markdown_path:
- lines.append(f"Markdown: {markdown_path}")
+ extras.append(f"Markdown: {markdown_path}")
if json_path:
- lines.append(f"JSON: {json_path}")
+ extras.append(f"JSON: {json_path}")
+ if extras:
+ text += "\n" + "\n".join(extras)
+
+ return text
- if markdown and not markdown_path:
- lines.append("")
- lines.append("---")
- lines.append(markdown[:3000])
+ def _build_html(self, report: Dict[str, Any]) -> str:
+ from paperbot.application.services.email_template import build_digest_html
- return "\n".join(lines).strip()
+ return build_digest_html(report)
- def _send_email(self, *, subject: str, body: str) -> None:
+ def _send_email(
+ self, *, subject: str, body: str, html_body: str = "",
+ recipients: Optional[List[str]] = None,
+ ) -> None:
if not self.config.smtp_host:
raise ValueError("PAPERBOT_NOTIFY_SMTP_HOST is required for email notifications")
- if not self.config.email_to:
+ email_to = recipients or self.config.email_to
+ if not email_to:
raise ValueError("PAPERBOT_NOTIFY_EMAIL_TO is required for email notifications")
from_addr = self.config.email_from or self.config.smtp_username
if not from_addr:
raise ValueError("PAPERBOT_NOTIFY_EMAIL_FROM or SMTP username is required")
- msg = MIMEText(body, _subtype="plain", _charset="utf-8")
+ msg = MIMEMultipart("alternative")
msg["Subject"] = subject
msg["From"] = formataddr(("PaperBot", from_addr))
- msg["To"] = ", ".join(self.config.email_to)
+ msg["To"] = ", ".join(email_to)
+
+ msg.attach(MIMEText(body, _subtype="plain", _charset="utf-8"))
+ if html_body:
+ msg.attach(MIMEText(html_body, _subtype="html", _charset="utf-8"))
if self.config.smtp_use_ssl:
server = smtplib.SMTP_SSL(
@@ -211,7 +220,7 @@ def _send_email(self, *, subject: str, body: str) -> None:
server.ehlo()
if self.config.smtp_username:
server.login(self.config.smtp_username, self.config.smtp_password)
- server.sendmail(from_addr, self.config.email_to, msg.as_string())
+ server.sendmail(from_addr, email_to, msg.as_string())
def _send_slack(self, *, subject: str, body: str) -> None:
url = self.config.slack_webhook_url
@@ -268,3 +277,27 @@ def _dingtalk_signed_url(self, webhook_url: str) -> str:
parsed = urlparse(webhook_url)
sep = "&" if parsed.query else "?"
return f"{webhook_url}{sep}timestamp={timestamp}&sign={sign_qs}"
+
+ def _send_resend(self, *, report: Dict[str, Any], markdown: str) -> None:
+ from paperbot.application.services.resend_service import ResendEmailService
+ from paperbot.infrastructure.stores.subscriber_store import SubscriberStore
+
+ resend = ResendEmailService.from_env()
+ if not resend:
+ raise ValueError("PAPERBOT_RESEND_API_KEY is required for resend channel")
+
+ store = SubscriberStore()
+ tokens = store.get_active_subscribers_with_tokens()
+ if not tokens:
+ logger.info("Resend: no active subscribers, skipping")
+ return
+
+ result = resend.send_digest(
+ to=list(tokens.keys()),
+ report=report,
+ markdown=markdown,
+ unsub_tokens=tokens,
+ )
+ ok_count = sum(1 for v in result.values() if v.get("ok"))
+ fail_count = len(result) - ok_count
+ logger.info("Resend digest sent: ok=%d fail=%d", ok_count, fail_count)
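
A stdlib-only sketch of the multipart/alternative message that `_send_email` now assembles; addresses, subject, and bodies are placeholders, and nothing is actually sent:

```python
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formataddr

# Placeholder values; in the service they come from DailyPushConfig and the
# shared email_template builders.
subject = "[PaperBot] DailyPaper Digest - 2025-01-01"
text_body = "PaperBot DailyPaper\nDate: 2025-01-01"
html_body = "<html><body><h1>PaperBot DailyPaper</h1></body></html>"
from_addr = "bot@example.com"
recipients = ["you@example.com"]

# multipart/alternative: clients render the richest part they support (HTML),
# falling back to the plain-text part otherwise.
msg = MIMEMultipart("alternative")
msg["Subject"] = subject
msg["From"] = formataddr(("PaperBot", from_addr))
msg["To"] = ", ".join(recipients)
msg.attach(MIMEText(text_body, _subtype="plain", _charset="utf-8"))
msg.attach(MIMEText(html_body, _subtype="html", _charset="utf-8"))

# The assembled string is what sendmail(from_addr, recipients, ...) receives.
print(msg.as_string()[:120])
```
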
diff --git a/src/paperbot/application/services/email_template.py b/src/paperbot/application/services/email_template.py
new file mode 100644
index 0000000..a51a142
--- /dev/null
+++ b/src/paperbot/application/services/email_template.py
@@ -0,0 +1,527 @@
+"""Shared HTML / plain-text email template for DailyPaper digest.
+
+Used by both SMTP (daily_push_service) and Resend (resend_service) channels.
+
+Layout (BestBlogs-inspired):
+ 1. Header — title, date, stats
+ 2. 本期导读 — trend narrative from llm_analysis
+ 3. 三步精选流程 — static methodology blurb
+ 4. 分层推荐 — papers grouped by recommendation tier
+ - Must Read → full 方法大框
+ - Worth Reading → full 方法大框
+ - Skim → compact card (title + one-liner)
+ 5. Footer — unsubscribe
+"""
+from __future__ import annotations
+
+import html as _html
+from typing import Any, Dict, List, Optional, Tuple
+
+# ── colour palette ──────────────────────────────────────────────
+_BLUE = "#2563eb"
+_DARK_BLUE = "#1e40af"
+_ORANGE = "#f59e0b"
+_GREEN = "#16a34a"
+_TEAL = "#0d9488"
+_GRAY_50 = "#f9fafb"
+_GRAY_100 = "#f3f4f6"
+_GRAY_200 = "#e5e7eb"
+_GRAY_400 = "#9ca3af"
+_GRAY_500 = "#6b7280"
+_GRAY_900 = "#111827"
+
+_TIER_STYLES: Dict[str, Dict[str, str]] = {
+ "must_read": {"color": _GREEN, "bg": "#f0fdf4", "label": "🔥 Must Read", "border": _GREEN},
+ "worth_reading": {"color": _BLUE, "bg": "#eff6ff", "label": "👍 Worth Reading", "border": _BLUE},
+ "skim": {"color": _GRAY_500, "bg": _GRAY_50, "label": "📋 Skim", "border": _GRAY_400},
+}
+
+DEFAULT_MAX_PER_TIER = 15
+
+
+# ── helpers ─────────────────────────────────────────────────────
+
+def _esc(val: Any) -> str:
+ return _html.escape(str(val)) if val else ""
+
+
+def _truncate(text: str, limit: int = 300) -> str:
+ text = text.strip()
+ if len(text) <= limit:
+ return text
+ return text[:limit].rsplit(" ", 1)[0] + " …"
+
+
+def _query_name(q: Dict[str, Any], fallback: str = "Query") -> str:
+ """Extract display name from a query dict (handles multiple key conventions)."""
+ return (
+ q.get("query") or q.get("raw_query") or q.get("normalized_query")
+ or q.get("name") or fallback
+ )
+
+
+def _collect_all_papers(report: Dict[str, Any]) -> List[Dict[str, Any]]:
+ """Deduplicate papers from queries + global_top, preserving order."""
+ seen: set = set()
+ out: List[Dict[str, Any]] = []
+ for q in report.get("queries") or []:
+ for item in q.get("top_items") or []:
+ key = item.get("title") or id(item)
+ if key not in seen:
+ seen.add(key)
+ out.append(item)
+ for item in report.get("global_top") or []:
+ key = item.get("title") or id(item)
+ if key not in seen:
+ seen.add(key)
+ out.append(item)
+ return out
+
+
+def _group_by_tier(
+ papers: List[Dict[str, Any]], *, max_per_tier: int = DEFAULT_MAX_PER_TIER
+) -> List[Tuple[str, List[Dict[str, Any]]]]:
+ """Group papers into (must_read, worth_reading, skim) tiers."""
+ buckets: Dict[str, List[Dict[str, Any]]] = {
+ "must_read": [], "worth_reading": [], "skim": [],
+ }
+ for p in papers:
+ judge = p.get("judge") or {}
+ rec = judge.get("recommendation", "skim")
+ if rec == "skip":
+ continue
+ bucket = buckets.get(rec, buckets["skim"])
+ bucket.append(p)
+ # Sort each tier by score descending
+ for items in buckets.values():
+ items.sort(key=lambda x: float(x.get("score") or 0), reverse=True)
+ return [
+ (tier, buckets[tier][:max_per_tier])
+ for tier in ("must_read", "worth_reading", "skim")
+ if buckets.get(tier)
+ ]
+
+
+# ── HTML components ─────────────────────────────────────────────
+
+def _method_framework_html(item: Dict[str, Any]) -> str:
+ """Build the 方法大框 from judge dimension rationales + snippet."""
+ judge: Dict[str, Any] = item.get("judge") or {}
+ snippet = str(item.get("snippet") or "")
+
+ rows: List[Tuple[str, str]] = []
+
+ # 研究问题 ← relevance rationale
+ rel = (judge.get("relevance") or {}).get("rationale", "")
+ if rel:
+ rows.append(("🎯 研究问题", rel))
+
+ # 核心方法 ← snippet (abstract) truncated
+ if snippet:
+ rows.append(("🔬 核心方法", _truncate(snippet, 250)))
+
+ # 关键证据 ← rigor rationale
+ rig = (judge.get("rigor") or {}).get("rationale", "")
+ if rig:
+ rows.append(("📊 关键证据", rig))
+
+ # 适用场景 ← impact rationale
+ imp = (judge.get("impact") or {}).get("rationale", "")
+ if imp:
+ rows.append(("🏷️ 适用场景", imp))
+
+ # 创新点 ← novelty rationale
+ nov = (judge.get("novelty") or {}).get("rationale", "")
+ if nov:
+ rows.append(("💡 创新点", nov))
+
+ if not rows:
+ return ""
+
+ inner = "".join(
+ f'<tr><td style="padding:4px 8px;font-size:13px;color:{_GRAY_500};white-space:nowrap;vertical-align:top;">{_esc(label)}</td>'
+ f'<td style="padding:4px 8px;font-size:13px;color:{_GRAY_900};">{_esc(val)}</td></tr>'
+ for label, val in rows
+ )
+ return (
+ f'<table style="width:100%;border-collapse:collapse;background:{_GRAY_50};'
+ f'border:1px solid {_GRAY_200};border-radius:6px;margin-top:8px;">{inner}</table>'
+ )
+
+
+def _paper_card_full_html(idx: int, item: Dict[str, Any]) -> str:
+ """Full paper card with method framework (for must_read / worth_reading)."""
+ title = _esc(item.get("title") or "Untitled")
+ url = _esc(item.get("url") or item.get("external_url") or "")
+ score = float(item.get("score") or 0)
+ venue = _esc(item.get("subject_or_venue") or "")
+ authors: List[str] = list(item.get("authors") or [])
+ judge: Dict[str, Any] = item.get("judge") or {}
+ one_line = str(judge.get("one_line_summary") or "")
+ overall = judge.get("overall", 0)
+
+ # title link
+ if url:
+ title_html = f'<a href="{url}" style="color:{_DARK_BLUE};text-decoration:none;">{title}</a>'
+ else:
+ title_html = f'<span style="color:{_GRAY_900};">{title}</span>'
+
+ # metadata pills
+ pills: List[str] = []
+ pill_style = f'background:{_GRAY_100};color:{_GRAY_500};border-radius:10px;padding:2px 8px;font-size:12px;'
+ if venue:
+ pills.append(f'<span style="{pill_style}">📍 {venue}</span>')
+ pills.append(f'<span style="{pill_style}">⭐ {score:.2f}</span>')
+ if overall:
+ pills.append(f'<span style="{pill_style}">Judge {overall:.1f}/5</span>')
+ if authors:
+ author_str = _esc(", ".join(authors[:3]))
+ if len(authors) > 3:
+ author_str += " et al."
+ pills.append(f'<span style="{pill_style}">👤 {author_str}</span>')
+ meta_html = " ".join(pills)
+
+ # one-line summary
+ summary_html = ""
+ if one_line:
+ summary_html = f'<p style="margin:6px 0 0;font-size:13px;color:{_GRAY_900};">💬 {_esc(one_line)}</p>'
+
+ framework = _method_framework_html(item)
+
+ return (
+ f'<div style="border:1px solid {_GRAY_200};border-radius:8px;padding:12px 16px;margin-bottom:12px;">'
+ f'<div style="font-size:15px;font-weight:600;color:{_GRAY_900};">{idx}. {title_html}</div>'
+ f'<div style="margin-top:6px;">{meta_html}</div>'
+ f'{summary_html}'
+ f'{framework}'
+ f'</div>'
+ )
+
+
+def _paper_card_compact_html(idx: int, item: Dict[str, Any]) -> str:
+ """Compact card for skim-tier papers (title + score + one-liner)."""
+ title = _esc(item.get("title") or "Untitled")
+ url = _esc(item.get("url") or item.get("external_url") or "")
+ score = float(item.get("score") or 0)
+ judge: Dict[str, Any] = item.get("judge") or {}
+ one_line = _esc(str(judge.get("one_line_summary") or ""))
+
+ if url:
+ title_html = f'<a href="{url}" style="color:{_DARK_BLUE};text-decoration:none;">{title}</a>'
+ else:
+ title_html = title
+
+ summary = f' — <span style="color:{_GRAY_500};">{one_line}</span>' if one_line else ""
+ return (
+ f'<div style="padding:6px 0;border-bottom:1px solid {_GRAY_100};font-size:13px;color:{_GRAY_900};">'
+ f'{idx}. {title_html}'
+ f' <span style="color:{_GRAY_400};">(⭐ {score:.2f})</span>'
+ f'{summary}'
+ f'</div>'
+ )
+
+
+def _tier_section_html(tier: str, items: List[Dict[str, Any]]) -> str:
+ """Render a recommendation tier section."""
+ style = _TIER_STYLES.get(tier, _TIER_STYLES["skim"])
+ use_full = tier in ("must_read", "worth_reading")
+
+ if use_full:
+ cards = "\n".join(_paper_card_full_html(i, it) for i, it in enumerate(items, 1))
+ else:
+ cards = "\n".join(_paper_card_compact_html(i, it) for i, it in enumerate(items, 1))
+
+ return (
+ f'<div style="margin:20px 0;">'
+ f'<div style="background:{style["bg"]};border-left:4px solid {style["border"]};'
+ f'border-radius:4px;padding:8px 12px;margin-bottom:10px;">'
+ f'<span style="color:{style["color"]};font-size:16px;font-weight:700;">{style["label"]}</span>'
+ f' <span style="color:{_GRAY_500};font-size:13px;">({len(items)})</span>'
+ f'</div>'
+ f'{cards}'
+ f'</div>'
+ )
+
+
+def _intro_section_html(report: Dict[str, Any]) -> str:
+ """本期导读 — narrative intro from llm_analysis."""
+ llm = report.get("llm_analysis") or {}
+ daily_insight = str(llm.get("daily_insight") or "").strip()
+ query_trends = llm.get("query_trends") or []
+
+ parts: List[str] = []
+ if daily_insight:
+ parts.append(f'<p style="margin:0 0 10px;font-size:14px;line-height:1.7;color:{_GRAY_900};">{_esc(daily_insight)}</p>')
+
+ if query_trends:
+ trend_items = "".join(
+ f'<li style="margin-bottom:6px;"><strong>{_esc(t.get("query", ""))}</strong>: '
+ f'{_esc(_truncate(str(t.get("analysis", "")), 200))}</li>'
+ for t in query_trends[:5]
+ )
+ parts.append(f'<ul style="margin:0;padding-left:20px;font-size:13px;color:{_GRAY_500};">{trend_items}</ul>')
+
+ if not parts:
+ # Fallback: summarize from queries
+ queries = report.get("queries") or []
+ if queries:
+ topics = [_query_name(q) for q in queries if q.get("top_items")]
+ topics = [t for t in topics if t and t != "Query"]
+ if topics:
+ topic_str = "、".join(_esc(t) for t in topics[:5])
+ parts.append(
+ f'<p style="margin:0;font-size:14px;color:{_GRAY_900};">'
+ f'本期主要关注方向:{topic_str}。</p>'
+ )
+
+ if not parts:
+ return ""
+
+ return (
+ f'<div style="background:{_GRAY_50};border:1px solid {_GRAY_200};border-radius:8px;padding:16px 20px;margin:16px 0;">'
+ f'<h2 style="margin:0 0 10px;font-size:17px;color:{_DARK_BLUE};">📖 本期导读</h2>'
+ f'{"".join(parts)}'
+ f'</div>'
+ )
+
+
+def _process_section_html(stats: Dict[str, Any]) -> str:
+ """三步精选流程 — static methodology blurb."""
+ unique = stats.get("unique_items", 0)
+ hits = stats.get("total_query_hits", 0)
+ return (
+ f'<div style="background:{_GRAY_50};border:1px solid {_GRAY_200};border-radius:8px;padding:14px 18px;margin:16px 0;">'
+ f'<h3 style="margin:0 0 8px;font-size:15px;color:{_ORANGE};">🔍 三步精选流程</h3>'
+ f'<p style="margin:0 0 4px;font-size:13px;color:{_GRAY_900};">① 聚合收集 — 从 Papers.cool / arXiv 抓取 {unique} 篇,匹配 {hits} 次命中</p>'
+ f'<p style="margin:0 0 4px;font-size:13px;color:{_GRAY_900};">② AI 智能分析 — Judge 多维评分 + Trend 趋势洞察 + Insight 综合研判</p>'
+ f'<p style="margin:0;font-size:13px;color:{_GRAY_900};">③ 分层推荐 — Must Read / Worth Reading / Skim 三级分流,聚焦高价值论文</p>'
+ f'</div>'
+ )
+
+
+# ═══════════════════════════════════════════════════════════════
+# Public API
+# ═══════════════════════════════════════════════════════════════
+
+def build_digest_html(
+ report: Dict[str, Any],
+ *,
+ unsub_link: Optional[str] = None,
+ max_per_tier: int = DEFAULT_MAX_PER_TIER,
+) -> str:
+ """Build a rich HTML email body from a DailyPaper *report* dict."""
+ date_str = _esc(report.get("date") or "")
+ stats = report.get("stats") or {}
+ unique = stats.get("unique_items", 0)
+ hits = stats.get("total_query_hits", 0)
+
+ # ── header ──
+ header = (
+ f'<div style="background:{_DARK_BLUE};border-radius:8px 8px 0 0;padding:20px 24px;text-align:center;">'
+ f'<h1 style="margin:0;font-size:22px;color:#ffffff;">📄 PaperBot DailyPaper</h1>'
+ f'<p style="margin:6px 0 0;font-size:13px;color:{_GRAY_200};">'
+ f'{date_str} · {unique} papers · {hits} hits</p>'
+ f'</div>'
+ )
+
+ # ── 本期导读 ──
+ intro = _intro_section_html(report)
+
+ # ── 三步精选流程 ──
+ process = _process_section_html(stats)
+
+ # ── 分层推荐 ──
+ all_papers = _collect_all_papers(report)
+ tiers = _group_by_tier(all_papers, max_per_tier=max_per_tier)
+ tier_html = "\n".join(_tier_section_html(t, items) for t, items in tiers)
+
+ if not tier_html:
+ # Fallback: no judge data, show flat list by query
+ tier_html = _fallback_query_sections_html(report, max_per_tier)
+
+ # ── footer ──
+ unsub_html = ""
+ if unsub_link:
+ unsub_html = f' <a href="{_esc(unsub_link)}" style="color:{_GRAY_400};">Unsubscribe</a>'
+ footer = (
+ f'<div style="text-align:center;padding:16px 0;font-size:12px;color:{_GRAY_400};">'
+ f'You received this from PaperBot DailyPaper.{unsub_html}'
+ f'</div>'
+ )
+
+ return (
+ f'<html><body style="margin:0;padding:0;background:{_GRAY_100};">'
+ f'<div style="max-width:680px;margin:0 auto;padding:16px;background:#ffffff;'
+ f'font-family:Helvetica,Arial,sans-serif;color:{_GRAY_900};">'
+ f'{header}{intro}{process}{tier_html}{footer}'
+ f'</div></body></html>'
+ )
+
+
+def _fallback_query_sections_html(report: Dict[str, Any], max_items: int) -> str:
+ """Fallback when no judge data: group by query like before."""
+ section_colors = ["#f59e0b", "#3b82f6", "#10b981", "#8b5cf6", "#ef4444"]
+ queries = list(report.get("queries") or [])
+ sections: List[str] = []
+ if queries:
+ for qi, q in enumerate(queries):
+ color = section_colors[qi % len(section_colors)]
+ q_name = _query_name(q, fallback=f"Query {qi + 1}")
+ top_items = list(q.get("top_items") or [])[:max_items]
+ if not top_items:
+ continue
+ cards = "\n".join(_paper_card_full_html(i, it) for i, it in enumerate(top_items, 1))
+ sections.append(
+ f'<div style="margin:20px 0;">'
+ f'<div style="border-left:4px solid {color};padding:4px 12px;margin-bottom:10px;">'
+ f'<span style="font-size:16px;font-weight:700;color:{color};">{_esc(q_name)}</span>'
+ f' <span style="font-size:13px;color:{_GRAY_500};">({len(top_items)} hits)</span>'
+ f'</div>'
+ f'{cards}'
+ f'</div>'
+ )
+ else:
+ global_top = list(report.get("global_top") or [])[:max_items]
+ if global_top:
+ cards = "\n".join(_paper_card_full_html(i, it) for i, it in enumerate(global_top, 1))
+ sections.append(
+ f'<div style="margin:20px 0;">'
+ f'<h2 style="margin:0 0 10px;font-size:16px;color:{_GRAY_900};">Top Papers</h2>'
+ f'{cards}'
+ f'</div>'
+ )
+ return "\n".join(sections)
+
+
+# ── plain-text version ──────────────────────────────────────────
+
+def build_digest_text(
+ report: Dict[str, Any],
+ *,
+ unsub_link: Optional[str] = None,
+ max_per_tier: int = DEFAULT_MAX_PER_TIER,
+) -> str:
+ """Build a plain-text fallback from a DailyPaper *report* dict."""
+ lines: List[str] = []
+ lines.append("📄 PaperBot DailyPaper")
+ title = str(report.get("title") or "DailyPaper Digest")
+ lines.append(title)
+ lines.append(f"Date: {report.get('date', '-')}")
+ stats = report.get("stats") or {}
+ lines.append(f"Papers: {stats.get('unique_items', 0)} · Hits: {stats.get('total_query_hits', 0)}")
+ lines.append("")
+
+ # 导读
+ llm = report.get("llm_analysis") or {}
+ daily_insight = str(llm.get("daily_insight") or "").strip()
+ if daily_insight:
+ lines.append("📖 本期导读")
+ lines.append(daily_insight)
+ lines.append("")
+ query_trends = llm.get("query_trends") or []
+ if query_trends:
+ for t in query_trends[:5]:
+ lines.append(f" · {t.get('query', '')}: {_truncate(str(t.get('analysis', '')), 200)}")
+ lines.append("")
+
+ # 三步精选
+ lines.append(f"🔍 三步精选: 聚合 {stats.get('unique_items', 0)} 篇 → AI Judge 评分 → 分层推荐")
+ lines.append("")
+
+ # 分层推荐
+ all_papers = _collect_all_papers(report)
+ tiers = _group_by_tier(all_papers, max_per_tier=max_per_tier)
+
+ tier_labels = {"must_read": "🔥 Must Read", "worth_reading": "👍 Worth Reading", "skim": "📋 Skim"}
+
+ if tiers:
+ for tier, items in tiers:
+ label = tier_labels.get(tier, tier)
+ lines.append(f"{'='*50}")
+ lines.append(f"{label} ({len(items)})")
+ lines.append(f"{'='*50}")
+ lines.append("")
+ use_full = tier in ("must_read", "worth_reading")
+ for idx, item in enumerate(items, 1):
+ _append_paper_text(lines, idx, item, full=use_full)
+ lines.append("")
+ else:
+ # Fallback: by query
+ for q in report.get("queries") or []:
+ q_name = _query_name(q)
+ top_items = list(q.get("top_items") or [])[:max_per_tier]
+ if not top_items:
+ continue
+ lines.append(f"▌ {q_name} ({len(top_items)} hits)")
+ lines.append("")
+ for idx, item in enumerate(top_items, 1):
+ _append_paper_text(lines, idx, item, full=True)
+ lines.append("")
+
+ lines.append("---")
+ if unsub_link:
+ lines.append(f"Unsubscribe: {unsub_link}")
+ else:
+ lines.append("You received this from PaperBot DailyPaper.")
+ return "\n".join(lines)
+
+
+def _append_paper_text(
+ lines: List[str], idx: int, item: Dict[str, Any], *, full: bool = True
+) -> None:
+ title = item.get("title") or "Untitled"
+ url = item.get("url") or ""
+ score = float(item.get("score") or 0)
+ venue = item.get("subject_or_venue") or ""
+ authors = list(item.get("authors") or [])
+ judge: Dict[str, Any] = item.get("judge") or {}
+ one_line = str(judge.get("one_line_summary") or "")
+ rec = judge.get("recommendation", "")
+
+ badge = ""
+ if rec == "must_read":
+ badge = "[Must Read] "
+ elif rec == "worth_reading":
+ badge = "[Worth Reading] "
+
+ lines.append(f" {idx}. {badge}{title} (⭐ {score:.2f})")
+ meta_parts: List[str] = []
+ if venue:
+ meta_parts.append(venue)
+ if authors:
+ meta_parts.append(", ".join(authors[:3]) + (" et al." if len(authors) > 3 else ""))
+ if meta_parts:
+ lines.append(f" {' | '.join(meta_parts)}")
+ if url:
+ lines.append(f" {url}")
+ if one_line:
+ lines.append(f" 💬 {one_line}")
+
+ if full:
+ # 方法大框 text version
+ snippet = str(item.get("snippet") or "")
+ rel = str((judge.get("relevance") or {}).get("rationale", ""))
+ rig = str((judge.get("rigor") or {}).get("rationale", ""))
+ imp = str((judge.get("impact") or {}).get("rationale", ""))
+ nov = str((judge.get("novelty") or {}).get("rationale", ""))
+
+ framework: List[Tuple[str, str]] = []
+ if rel:
+ framework.append(("🎯 研究问题", rel))
+ if snippet:
+ framework.append(("🔬 核心方法", _truncate(snippet, 200)))
+ if rig:
+ framework.append(("📊 关键证据", rig))
+ if imp:
+ framework.append(("🏷️ 适用场景", imp))
+ if nov:
+ framework.append(("💡 创新点", nov))
+
+ if framework:
+ for label, val in framework:
+ lines.append(f" {label}: {val}")
+
+ lines.append("")
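
A minimal usage sketch of the shared template, assuming the `paperbot` package is importable; the report dict below is a toy stand-in for what `build_daily_paper_report` produces, and the unsubscribe URL is a placeholder:

```python
from paperbot.application.services.email_template import build_digest_html, build_digest_text

# A deliberately tiny report; real ones come from build_daily_paper_report().
report = {
    "title": "DailyPaper Digest",
    "date": "2025-01-01",
    "stats": {"unique_items": 1, "total_query_hits": 1},
    "queries": [
        {
            "query": "LLM agents",
            "top_items": [
                {
                    "title": "Example Paper",
                    "url": "https://arxiv.org/abs/2401.00001",
                    "score": 0.91,
                    "judge": {
                        "recommendation": "must_read",
                        "overall": 4.2,
                        "one_line_summary": "A toy entry to exercise the template.",
                    },
                }
            ],
        }
    ],
    "global_top": [],
}

text = build_digest_text(report, unsub_link="https://example.com/unsub/abc")
html = build_digest_html(report, unsub_link="https://example.com/unsub/abc")
print(text)
print(len(html), "chars of HTML")
```
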
diff --git a/src/paperbot/application/services/resend_service.py b/src/paperbot/application/services/resend_service.py
new file mode 100644
index 0000000..912120f
--- /dev/null
+++ b/src/paperbot/application/services/resend_service.py
@@ -0,0 +1,104 @@
+from __future__ import annotations
+
+import logging
+import os
+from typing import Any, Dict, List, Optional
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+
+class ResendEmailService:
+ """Send emails via the Resend REST API (no SDK dependency)."""
+
+ API_URL = "https://api.resend.com/emails"
+
+ def __init__(self, api_key: str, from_email: str, unsub_base_url: str):
+ self.api_key = api_key
+ self.from_email = from_email
+ self.unsub_base_url = unsub_base_url.rstrip("/")
+
+ @classmethod
+ def from_env(cls) -> Optional["ResendEmailService"]:
+ api_key = os.getenv("PAPERBOT_RESEND_API_KEY", "").strip()
+ if not api_key:
+ return None
+ from_email = os.getenv(
+ "PAPERBOT_RESEND_FROM", "PaperBot <bot@example.com>"
+ )
+ unsub_base_url = os.getenv(
+ "PAPERBOT_RESEND_UNSUB_URL", "http://localhost:3000"
+ )
+ return cls(
+ api_key=api_key,
+ from_email=from_email,
+ unsub_base_url=unsub_base_url,
+ )
+
+ def send(
+ self, *, to: List[str], subject: str, html_body: str, text: str
+ ) -> Dict[str, Any]:
+ resp = requests.post(
+ self.API_URL,
+ headers={"Authorization": f"Bearer {self.api_key}"},
+ json={
+ "from": self.from_email,
+ "to": to,
+ "subject": subject,
+ "html": html_body,
+ "text": text,
+ },
+ timeout=15,
+ )
+ resp.raise_for_status()
+ return resp.json()
+
+ def send_digest(
+ self,
+ *,
+ to: List[str],
+ report: Dict[str, Any],
+ markdown: str,
+ unsub_tokens: Dict[str, str],
+ ) -> Dict[str, Any]:
+ """Send DailyPaper digest to subscribers, each with their own unsubscribe link."""
+ results: Dict[str, Any] = {}
+ date_str = report.get("date", "")
+ subject = f"[PaperBot] {report.get('title', 'DailyPaper Digest')}"
+ if date_str:
+ subject += f" - {date_str}"
+
+ for email_addr in to:
+ token = unsub_tokens.get(email_addr, "")
+ if not token:
+ logger.warning("Resend: no unsub token for subscriber, skipping")
+ results[email_addr] = {"ok": False, "error": "missing_unsub_token"}
+ continue
+ unsub_link = f"{self.unsub_base_url}/api/newsletter/unsubscribe/{token}"
+ html_body = self._render_html(report, markdown, unsub_link)
+ text = self._render_text(report, markdown, unsub_link)
+ try:
+ r = self.send(
+ to=[email_addr], subject=subject, html_body=html_body, text=text
+ )
+ results[email_addr] = {"ok": True, "id": r.get("id")}
+ except Exception as e:
+ masked = email_addr[:2] + "***" + email_addr[email_addr.index("@"):] if "@" in email_addr else "***"
+ logger.warning("Resend failed for %s: %s", masked, e)
+ results[email_addr] = {"ok": False, "error": str(e)}
+ return results
+
+ def _render_html(
+ self, report: Dict[str, Any], markdown: str, unsub_link: str
+ ) -> str:
+ from paperbot.application.services.email_template import build_digest_html
+
+ return build_digest_html(report, unsub_link=unsub_link)
+
+ def _render_text(
+ self, report: Dict[str, Any], markdown: str, unsub_link: str
+ ) -> str:
+ from paperbot.application.services.email_template import build_digest_text
+
+ return build_digest_text(report, unsub_link=unsub_link)
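
A hedged usage sketch: `from_env()` returns `None` when `PAPERBOT_RESEND_API_KEY` is unset, so the example degrades to a no-op; the recipient, token, and report values are placeholders (in production the tokens come from `SubscriberStore`):

```python
from paperbot.application.services.resend_service import ResendEmailService

service = ResendEmailService.from_env()
if service is None:
    print("Resend not configured (PAPERBOT_RESEND_API_KEY unset)")
else:
    # One unsubscribe token per recipient; each email gets its own unsub link.
    results = service.send_digest(
        to=["you@example.com"],
        report={"title": "DailyPaper Digest", "date": "2025-01-01", "stats": {}},
        markdown="# DailyPaper Digest",
        unsub_tokens={"you@example.com": "token-123"},
    )
    print(results)
```
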
diff --git a/src/paperbot/application/workflows/dailypaper.py b/src/paperbot/application/workflows/dailypaper.py
index cc55e91..42b5308 100644
--- a/src/paperbot/application/workflows/dailypaper.py
+++ b/src/paperbot/application/workflows/dailypaper.py
@@ -10,6 +10,7 @@
from paperbot.application.services.llm_service import LLMService, get_llm_service
from paperbot.application.workflows.analysis.paper_judge import PaperJudge
+from paperbot.infrastructure.stores.paper_store import SqlAlchemyPaperStore
SUPPORTED_LLM_FEATURES = ("summary", "trends", "insight", "relevance")
@@ -54,6 +55,67 @@ def build_daily_paper_report(
}
+def _iter_report_papers(report: Dict[str, Any]) -> List[Dict[str, Any]]:
+ rows: List[Dict[str, Any]] = []
+ for query in report.get("queries") or []:
+ for item in query.get("top_items") or []:
+ row = dict(item)
+ row.setdefault("source", (report.get("sources") or [report.get("source")])[0])
+ rows.append(row)
+
+ for item in report.get("global_top") or []:
+ row = dict(item)
+ row.setdefault("source", (report.get("sources") or [report.get("source")])[0])
+ rows.append(row)
+
+ deduped: List[Dict[str, Any]] = []
+ seen: set[str] = set()
+ for item in rows:
+ key = f"{item.get('url') or ''}|{item.get('title') or ''}"
+ if key in seen:
+ continue
+ seen.add(key)
+ deduped.append(item)
+ return deduped
+
+
+def ingest_daily_report_to_registry(
+ report: Dict[str, Any],
+ *,
+ paper_store: Optional[SqlAlchemyPaperStore] = None,
+) -> Dict[str, int]:
+ """Persist DailyPaper report items to canonical paper registry."""
+
+ store = paper_store or SqlAlchemyPaperStore()
+ generated_at = report.get("generated_at")
+ seen_at: Optional[datetime] = None
+ if isinstance(generated_at, str) and generated_at.strip():
+ stamp = generated_at.strip()
+ if stamp.endswith("Z"):
+ stamp = f"{stamp[:-1]}+00:00"
+ try:
+ seen_at = datetime.fromisoformat(stamp)
+ if seen_at.tzinfo is None:
+ seen_at = seen_at.replace(tzinfo=timezone.utc)
+ except Exception:
+ seen_at = None
+
+ papers = _iter_report_papers(report)
+ source_hint = (report.get("sources") or [report.get("source") or "papers_cool"])[0]
+ return store.upsert_many(papers=papers, source_hint=source_hint, seen_at=seen_at)
+
+
+def persist_judge_scores_to_registry(
+ report: Dict[str, Any],
+ *,
+ paper_store: Optional[SqlAlchemyPaperStore] = None,
+) -> Dict[str, int]:
+ """Persist LLM judge outputs into structured judge score table."""
+
+ store = paper_store or SqlAlchemyPaperStore()
+ return store.upsert_judge_scores_from_report(report)
+
+
def enrich_daily_paper_report(
report: Dict[str, Any],
*,
diff --git a/src/paperbot/core/abstractions/executable.py b/src/paperbot/core/abstractions/executable.py
index 7ada4dd..d4beab9 100644
--- a/src/paperbot/core/abstractions/executable.py
+++ b/src/paperbot/core/abstractions/executable.py
@@ -110,22 +110,27 @@ def ensure_execution_result(raw: Union[ExecutionResult[TOutput], Dict[str, Any],
return raw
if isinstance(raw, dict):
- # 兼容旧的 status 字段
- success = raw.get("success")
- if success is None:
- status = raw.get("status")
- success = status == "success" if status is not None else True
- error = raw.get("error")
- data = raw.get("data")
- metadata = raw.get("metadata", {})
- duration_ms = raw.get("duration_ms")
- return ExecutionResult(
- success=bool(success),
- error=error,
- data=data,
- metadata=metadata if isinstance(metadata, dict) else {},
- duration_ms=duration_ms,
- )
+ # Only treat as a result dict if it has recognized result keys
+ result_keys = {"success", "status", "data", "error"}
+ if result_keys & raw.keys():
+ # 兼容旧的 status 字段
+ success = raw.get("success")
+ if success is None:
+ status = raw.get("status")
+ success = status == "success" if status is not None else True
+ error = raw.get("error")
+ data = raw.get("data")
+ metadata = raw.get("metadata", {})
+ duration_ms = raw.get("duration_ms")
+ return ExecutionResult(
+ success=bool(success),
+ error=error,
+ data=data,
+ metadata=metadata if isinstance(metadata, dict) else {},
+ duration_ms=duration_ms,
+ )
+ # No recognized keys — treat the entire dict as data
+ return ExecutionResult.ok(raw) # type: ignore[arg-type]
return ExecutionResult.ok(raw) # type: ignore[arg-type]
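
A small illustration of the changed `ensure_execution_result` behaviour, assuming `ExecutionResult.ok(data)` marks the result successful and stores the payload as `data`:

```python
from paperbot.core.abstractions.executable import ensure_execution_result

# A dict that carries envelope keys is still unpacked field by field ...
wrapped = ensure_execution_result({"status": "success", "data": {"papers": 3}})
print(wrapped.success, wrapped.data)   # True {'papers': 3}

# ... while an arbitrary payload dict (no success/status/data/error keys) is
# now kept whole as the result's data instead of being shredded into fields.
payload = ensure_execution_result({"papers": 3, "queries": ["llm agents"]})
print(payload.success, payload.data)   # True {'papers': 3, 'queries': ['llm agents']}
```
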
diff --git a/src/paperbot/core/errors/errors.py b/src/paperbot/core/errors/errors.py
index a9a78c0..b88a964 100644
--- a/src/paperbot/core/errors/errors.py
+++ b/src/paperbot/core/errors/errors.py
@@ -26,17 +26,20 @@ def __str__(self) -> str:
return f"[{self.code}] {self.message}"
+@dataclass
class LLMError(PaperBotError):
- code = "LLM_ERROR"
+ code: str = "LLM_ERROR"
+@dataclass
class APIError(PaperBotError):
- code = "API_ERROR"
+ code: str = "API_ERROR"
+@dataclass
class ValidationError(PaperBotError):
- code = "VALIDATION_ERROR"
- severity = ErrorSeverity.WARNING
+ severity: ErrorSeverity = ErrorSeverity.WARNING
+ code: str = "VALIDATION_ERROR"
T = TypeVar("T")
diff --git a/src/paperbot/domain/paper_identity.py b/src/paperbot/domain/paper_identity.py
new file mode 100644
index 0000000..c508dcd
--- /dev/null
+++ b/src/paperbot/domain/paper_identity.py
@@ -0,0 +1,66 @@
+from __future__ import annotations
+
+import re
+from typing import Optional
+
+
+_ARXIV_ID_RE = re.compile(
+ r"(?P<id>(?:\d{4}\.\d{4,5})(?:v\d+)?|[a-z\-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?)",
+ re.IGNORECASE,
+)
+_DOI_RE = re.compile(r"(?P<doi>10\.\d{4,9}/[-._;()/:A-Z0-9]+)", re.IGNORECASE)
+
+
+def normalize_arxiv_id(value: str | None) -> Optional[str]:
+ text = (value or "").strip()
+ if not text:
+ return None
+
+ lowered = text.lower()
+ for marker in ("arxiv.org/abs/", "arxiv.org/pdf/"):
+ idx = lowered.find(marker)
+ if idx >= 0:
+ text = text[idx + len(marker) :]
+ break
+
+ text = text.replace("arxiv:", "")
+ text = text.split("?", 1)[0].split("#", 1)[0]
+ if text.lower().endswith(".pdf"):
+ text = text[:-4]
+ text = text.strip(" /")
+
+ match = _ARXIV_ID_RE.search(text)
+ if not match:
+ return None
+ return match.group("id")
+
+
+def normalize_doi(value: str | None) -> Optional[str]:
+ text = (value or "").strip()
+ if not text:
+ return None
+
+ lowered = text.lower()
+ for marker in ("doi.org/", "dx.doi.org/"):
+ idx = lowered.find(marker)
+ if idx >= 0:
+ text = text[idx + len(marker) :]
+ break
+
+ text = text.split("?", 1)[0].split("#", 1)[0].strip()
+ match = _DOI_RE.search(text)
+ if not match:
+ return None
+ return match.group("doi").strip().lower()
+
+
+def normalize_paper_id(url_or_id: str | None) -> Optional[str]:
+ arxiv_id = normalize_arxiv_id(url_or_id)
+ if arxiv_id:
+ return f"arxiv:{arxiv_id}"
+
+ doi = normalize_doi(url_or_id)
+ if doi:
+ return f"doi:{doi}"
+
+ return None
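
Expected behaviour of the new identity helpers on a few representative inputs (plain function calls, no external services involved):

```python
from paperbot.domain.paper_identity import normalize_paper_id

print(normalize_paper_id("https://arxiv.org/abs/2401.12345v2"))  # arxiv:2401.12345v2
print(normalize_paper_id("arxiv:hep-th/9901001"))                # arxiv:hep-th/9901001
print(normalize_paper_id("https://doi.org/10.1000/182"))         # doi:10.1000/182
print(normalize_paper_id("not a paper reference"))               # None
```
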
diff --git a/src/paperbot/infrastructure/queue/arq_worker.py b/src/paperbot/infrastructure/queue/arq_worker.py
index 1d8eb9f..7256b18 100644
--- a/src/paperbot/infrastructure/queue/arq_worker.py
+++ b/src/paperbot/infrastructure/queue/arq_worker.py
@@ -382,8 +382,10 @@ async def daily_papers_job(
apply_judge_scores_to_report,
build_daily_paper_report,
enrich_daily_paper_report,
+ ingest_daily_report_to_registry,
normalize_llm_features,
normalize_output_formats,
+ persist_judge_scores_to_registry,
render_daily_paper_markdown,
)
from paperbot.application.services.daily_push_service import DailyPushService
@@ -413,6 +415,18 @@ async def daily_papers_job(
n_runs=max(1, int(judge_runs)),
judge_token_budget=max(0, int(judge_token_budget)),
)
+
+ try:
+ report["registry_ingest"] = ingest_daily_report_to_registry(report)
+ except Exception as exc:
+ report["registry_ingest"] = {"error": str(exc)}
+
+ if enable_judge:
+ try:
+ report["judge_registry_ingest"] = persist_judge_scores_to_registry(report)
+ except Exception as exc:
+ report["judge_registry_ingest"] = {"error": str(exc)}
+
markdown = render_daily_paper_markdown(report)
markdown_path = None
diff --git a/src/paperbot/infrastructure/stores/models.py b/src/paperbot/infrastructure/stores/models.py
index 3a1204e..726f29b 100644
--- a/src/paperbot/infrastructure/stores/models.py
+++ b/src/paperbot/infrastructure/stores/models.py
@@ -450,6 +450,90 @@ class ResearchMilestoneModel(Base):
track = relationship("ResearchTrackModel", back_populates="milestones")
+class PaperModel(Base):
+ """Canonical paper registry row (deduplicated across sources)."""
+
+ __tablename__ = "papers"
+ __table_args__ = (
+ UniqueConstraint("arxiv_id", name="uq_papers_arxiv_id"),
+ UniqueConstraint("doi", name="uq_papers_doi"),
+ )
+
+ id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
+
+ arxiv_id: Mapped[Optional[str]] = mapped_column(String(64), nullable=True, index=True)
+ doi: Mapped[Optional[str]] = mapped_column(String(128), nullable=True, index=True)
+
+ title: Mapped[str] = mapped_column(Text, default="", index=True)
+ authors_json: Mapped[str] = mapped_column(Text, default="[]")
+ abstract: Mapped[str] = mapped_column(Text, default="")
+
+ url: Mapped[str] = mapped_column(String(512), default="")
+ external_url: Mapped[str] = mapped_column(String(512), default="")
+ pdf_url: Mapped[str] = mapped_column(String(512), default="")
+
+ source: Mapped[str] = mapped_column(String(32), default="papers_cool", index=True)
+ venue: Mapped[str] = mapped_column(String(256), default="")
+ published_at: Mapped[Optional[datetime]] = mapped_column(
+ DateTime(timezone=True), nullable=True, index=True
+ )
+ first_seen_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), index=True)
+
+ keywords_json: Mapped[str] = mapped_column(Text, default="[]")
+ metadata_json: Mapped[str] = mapped_column(Text, default="{}")
+
+ created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), index=True)
+ updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), index=True)
+
+ judge_scores = relationship(
+ "PaperJudgeScoreModel", back_populates="paper", cascade="all, delete-orphan"
+ )
+ feedback_rows = relationship("PaperFeedbackModel", back_populates="paper")
+ reading_status_rows = relationship("PaperReadingStatusModel", back_populates="paper")
+
+ def set_authors(self, values: Optional[list[str]]) -> None:
+ self.authors_json = json.dumps(
+ [str(v) for v in (values or []) if str(v).strip()],
+ ensure_ascii=False,
+ )
+
+ def get_authors(self) -> list[str]:
+ try:
+ data = json.loads(self.authors_json or "[]")
+ if isinstance(data, list):
+ return [str(v) for v in data if str(v).strip()]
+ except Exception:
+ pass
+ return []
+
+ def set_keywords(self, values: Optional[list[str]]) -> None:
+ self.keywords_json = json.dumps(
+ [str(v) for v in (values or []) if str(v).strip()],
+ ensure_ascii=False,
+ )
+
+ def get_keywords(self) -> list[str]:
+ try:
+ data = json.loads(self.keywords_json or "[]")
+ if isinstance(data, list):
+ return [str(v) for v in data if str(v).strip()]
+ except Exception:
+ pass
+ return []
+
+ def set_metadata(self, data: Dict[str, Any]) -> None:
+ self.metadata_json = json.dumps(data or {}, ensure_ascii=False)
+
+ def get_metadata(self) -> Dict[str, Any]:
+ try:
+ parsed = json.loads(self.metadata_json or "{}")
+ if isinstance(parsed, dict):
+ return parsed
+ except Exception:
+ pass
+ return {}
+
+
class PaperFeedbackModel(Base):
"""User feedback on recommended/seen papers (track-scoped)."""
@@ -461,6 +545,9 @@ class PaperFeedbackModel(Base):
track_id: Mapped[int] = mapped_column(Integer, ForeignKey("research_tracks.id"), index=True)
paper_id: Mapped[str] = mapped_column(String(64), index=True)
+ paper_ref_id: Mapped[Optional[int]] = mapped_column(
+ Integer, ForeignKey("papers.id"), nullable=True, index=True
+ )
action: Mapped[str] = mapped_column(String(16), index=True) # like/dislike/skip/save/cite
weight: Mapped[float] = mapped_column(Float, default=0.0)
@@ -468,6 +555,64 @@ class PaperFeedbackModel(Base):
metadata_json: Mapped[str] = mapped_column(Text, default="{}")
track = relationship("ResearchTrackModel", back_populates="paper_feedback")
+ paper = relationship("PaperModel", back_populates="feedback_rows")
+
+
+class PaperJudgeScoreModel(Base):
+ """Structured LLM-as-Judge scores linked to canonical papers."""
+
+ __tablename__ = "paper_judge_scores"
+ __table_args__ = (
+ UniqueConstraint("paper_id", "query", name="uq_paper_judge_scores_paper_query"),
+ )
+
+ id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
+ paper_id: Mapped[int] = mapped_column(Integer, ForeignKey("papers.id"), index=True)
+ query: Mapped[str] = mapped_column(String(256), default="", index=True)
+
+ overall: Mapped[float] = mapped_column(Float, default=0.0)
+ relevance: Mapped[float] = mapped_column(Float, default=0.0)
+ novelty: Mapped[float] = mapped_column(Float, default=0.0)
+ rigor: Mapped[float] = mapped_column(Float, default=0.0)
+ impact: Mapped[float] = mapped_column(Float, default=0.0)
+ clarity: Mapped[float] = mapped_column(Float, default=0.0)
+
+ recommendation: Mapped[str] = mapped_column(String(32), default="", index=True)
+ one_line_summary: Mapped[str] = mapped_column(Text, default="")
+ judge_model: Mapped[str] = mapped_column(String(128), default="")
+ judge_cost_tier: Mapped[Optional[int]] = mapped_column(Integer, nullable=True)
+
+ scored_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), index=True)
+ metadata_json: Mapped[str] = mapped_column(Text, default="{}")
+
+ paper = relationship("PaperModel", back_populates="judge_scores")
+
+
+class PaperReadingStatusModel(Base):
+ """Per-user reading lifecycle state for a paper."""
+
+ __tablename__ = "paper_reading_status"
+ __table_args__ = (
+ UniqueConstraint("user_id", "paper_id", name="uq_paper_reading_status_user_paper"),
+ )
+
+ id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
+ user_id: Mapped[str] = mapped_column(String(64), index=True)
+ paper_id: Mapped[int] = mapped_column(Integer, ForeignKey("papers.id"), index=True)
+
+ status: Mapped[str] = mapped_column(String(16), default="unread", index=True)
+ saved_at: Mapped[Optional[datetime]] = mapped_column(
+ DateTime(timezone=True), nullable=True, index=True
+ )
+ read_at: Mapped[Optional[datetime]] = mapped_column(
+ DateTime(timezone=True), nullable=True, index=True
+ )
+
+ created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), index=True)
+ updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), index=True)
+ metadata_json: Mapped[str] = mapped_column(Text, default="{}")
+
+ paper = relationship("PaperModel", back_populates="reading_status_rows")
class ResearchTrackEmbeddingModel(Base):
@@ -489,6 +634,25 @@ class ResearchTrackEmbeddingModel(Base):
track = relationship("ResearchTrackModel", back_populates="embeddings")
+class NewsletterSubscriberModel(Base):
+ """Email newsletter subscriber for DailyPaper digest delivery.
+
+ TODO(GDPR): email stored as plaintext — consider encryption-at-rest or
+ hashing. Add a hard-delete method for GDPR/CCPA right-to-erasure (current
+ unsubscribe only sets status='unsubscribed', no row purge).
+ """
+
+ __tablename__ = "newsletter_subscribers"
+
+ id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
+ email: Mapped[str] = mapped_column(String(256), unique=True, index=True)
+ status: Mapped[str] = mapped_column(String(16), default="active", index=True)
+ unsub_token: Mapped[str] = mapped_column(String(64), unique=True)
+ subscribed_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), index=True)
+ unsub_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True)
+ metadata_json: Mapped[str] = mapped_column(Text, default="{}")
+
+
class ResearchContextRunModel(Base):
"""
One context build event (routing + recommendations), used for replay/eval.
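
A quick sketch of the `PaperModel` JSON helpers on a detached instance; no session or database is needed, and the field values are placeholders:

```python
from paperbot.infrastructure.stores.models import PaperModel

row = PaperModel(title="Example Paper", source="arxiv")
row.set_authors(["Ada Lovelace", "  ", "Alan Turing"])  # blank entries are dropped
row.set_keywords(["agents", "evaluation"])
row.set_metadata({"score": 0.91, "matched_queries": ["llm agents"]})

print(row.get_authors())   # ['Ada Lovelace', 'Alan Turing']
print(row.get_keywords())  # ['agents', 'evaluation']
print(row.get_metadata())  # {'score': 0.91, 'matched_queries': ['llm agents']}
```
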
diff --git a/src/paperbot/infrastructure/stores/paper_store.py b/src/paperbot/infrastructure/stores/paper_store.py
new file mode 100644
index 0000000..e4f8c87
--- /dev/null
+++ b/src/paperbot/infrastructure/stores/paper_store.py
@@ -0,0 +1,317 @@
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from typing import Any, Dict, Iterable, List, Optional
+
+from sqlalchemy import desc, func, select
+
+from paperbot.domain.paper_identity import normalize_arxiv_id, normalize_doi
+from paperbot.infrastructure.stores.models import Base, PaperJudgeScoreModel, PaperModel
+from paperbot.infrastructure.stores.sqlalchemy_db import SessionProvider, get_db_url
+
+
+def _utcnow() -> datetime:
+ return datetime.now(timezone.utc)
+
+
+def _safe_list(values: Any) -> List[str]:
+ if not isinstance(values, list):
+ return []
+ return [str(v).strip() for v in values if str(v).strip()]
+
+
+def _parse_datetime(value: Any) -> Optional[datetime]:
+ if isinstance(value, datetime):
+ return value if value.tzinfo else value.replace(tzinfo=timezone.utc)
+ if not value:
+ return None
+
+ text = str(value).strip()
+ if not text:
+ return None
+
+ if text.endswith("Z"):
+ text = f"{text[:-1]}+00:00"
+ try:
+ parsed = datetime.fromisoformat(text)
+ return parsed if parsed.tzinfo else parsed.replace(tzinfo=timezone.utc)
+ except Exception:
+ return None
+
+
+def _safe_float(value: Any) -> float:
+ try:
+ return float(value)
+ except Exception:
+ return 0.0
+
+
+def _as_utc(value: Optional[datetime]) -> Optional[datetime]:
+ if value is None:
+ return None
+ if value.tzinfo is None:
+ return value.replace(tzinfo=timezone.utc)
+ return value.astimezone(timezone.utc)
+
+
+class SqlAlchemyPaperStore:
+ """Canonical paper registry with idempotent upsert for daily workflows."""
+
+ def __init__(self, db_url: Optional[str] = None, *, auto_create_schema: bool = True):
+ self.db_url = db_url or get_db_url()
+ self._provider = SessionProvider(self.db_url)
+ if auto_create_schema:
+ Base.metadata.create_all(self._provider.engine)
+
+ def upsert_paper(
+ self,
+ *,
+ paper: Dict[str, Any],
+ source_hint: Optional[str] = None,
+ seen_at: Optional[datetime] = None,
+ ) -> Dict[str, Any]:
+ now = _utcnow()
+ first_seen = seen_at or now
+
+ title = str(paper.get("title") or "").strip()
+ url = str(paper.get("url") or "").strip()
+ external_url = str(paper.get("external_url") or "").strip()
+ pdf_url = str(paper.get("pdf_url") or "").strip()
+ abstract = str(paper.get("snippet") or paper.get("abstract") or "").strip()
+
+ arxiv_id = (
+ normalize_arxiv_id(paper.get("arxiv_id"))
+ or normalize_arxiv_id(paper.get("paper_id"))
+ or normalize_arxiv_id(url)
+ or normalize_arxiv_id(external_url)
+ or normalize_arxiv_id(pdf_url)
+ )
+ doi = (
+ normalize_doi(paper.get("doi"))
+ or normalize_doi(url)
+ or normalize_doi(external_url)
+ or normalize_doi(pdf_url)
+ )
+
+ source = (
+ source_hint
+ or (paper.get("sources") or [None])[0]
+ or paper.get("source")
+ or "papers_cool"
+ )
+ venue = str(paper.get("subject_or_venue") or paper.get("venue") or "").strip()
+ published_at = _parse_datetime(
+ paper.get("published_at") or paper.get("published") or paper.get("publicationDate")
+ )
+
+ authors = _safe_list(paper.get("authors"))
+ keywords = _safe_list(paper.get("keywords"))
+
+ metadata = {
+ "paper_id": paper.get("paper_id"),
+ "matched_queries": _safe_list(paper.get("matched_queries")),
+ "branches": _safe_list(paper.get("branches")),
+ "score": paper.get("score"),
+ "pdf_stars": paper.get("pdf_stars"),
+ "kimi_stars": paper.get("kimi_stars"),
+ "alternative_urls": _safe_list(paper.get("alternative_urls")),
+ }
+
+ with self._provider.session() as session:
+ # TODO: title+url fallback query uses scalar_one_or_none() which
+ # raises MultipleResultsFound if duplicates exist. Switch to
+ # .first() or add .limit(1) for safety.
+ row = None
+ if arxiv_id:
+ row = session.execute(
+ select(PaperModel).where(PaperModel.arxiv_id == arxiv_id)
+ ).scalar_one_or_none()
+ if row is None and doi:
+ row = session.execute(
+ select(PaperModel).where(PaperModel.doi == doi)
+ ).scalar_one_or_none()
+ if row is None and url:
+ row = session.execute(
+ select(PaperModel).where(PaperModel.url == url)
+ ).scalar_one_or_none()
+ if row is None and title:
+ row = session.execute(
+ select(PaperModel).where(
+ func.lower(PaperModel.title) == title.lower(),
+ PaperModel.url == url,
+ )
+ ).scalar_one_or_none()
+
+ created = row is None
+ if row is None:
+ row = PaperModel(
+ first_seen_at=_as_utc(first_seen) or now,
+ created_at=now,
+ updated_at=now,
+ )
+ session.add(row)
+
+ # Keep earliest first_seen_at for existing records.
+ existing_seen = _as_utc(row.first_seen_at)
+ candidate_seen = _as_utc(first_seen) or now
+ if not existing_seen or candidate_seen < existing_seen:
+ row.first_seen_at = candidate_seen
+
+ if arxiv_id:
+ row.arxiv_id = arxiv_id
+ if doi:
+ row.doi = doi
+
+ row.title = title or row.title or ""
+ row.abstract = abstract or row.abstract or ""
+ row.url = url or row.url or ""
+ row.external_url = external_url or row.external_url or ""
+ row.pdf_url = pdf_url or row.pdf_url or ""
+ row.source = str(source or row.source or "papers_cool")
+ row.venue = venue or row.venue or ""
+ row.published_at = _as_utc(published_at) or _as_utc(row.published_at)
+ # TODO: unconditional set_authors/set_keywords/set_metadata may wipe
+ # existing data when new paper dict has empty values. Consider
+ # preserving existing values when incoming data is empty:
+ # row.set_authors(authors or row.get_authors())
+ row.set_authors(authors)
+ row.set_keywords(keywords)
+ row.set_metadata(metadata)
+ row.updated_at = now
+
+ session.commit()
+ session.refresh(row)
+
+ payload = self._paper_to_dict(row)
+ payload["_created"] = created
+ return payload
+
+ def upsert_many(
+ self,
+ *,
+ papers: Iterable[Dict[str, Any]],
+ source_hint: Optional[str] = None,
+ seen_at: Optional[datetime] = None,
+ ) -> Dict[str, int]:
+ created = 0
+ updated = 0
+ total = 0
+
+ for paper in papers:
+ if not isinstance(paper, dict):
+ continue
+ result = self.upsert_paper(paper=paper, source_hint=source_hint, seen_at=seen_at)
+ total += 1
+ if result.get("_created"):
+ created += 1
+ else:
+ updated += 1
+
+ return {"total": total, "created": created, "updated": updated}
+
+ def list_recent(self, *, limit: int = 50, source: Optional[str] = None) -> List[Dict[str, Any]]:
+ with self._provider.session() as session:
+ stmt = select(PaperModel)
+ if source:
+ stmt = stmt.where(PaperModel.source == source)
+ stmt = stmt.order_by(desc(PaperModel.first_seen_at), desc(PaperModel.id)).limit(
+ max(1, int(limit))
+ )
+ rows = session.execute(stmt).scalars().all()
+ return [self._paper_to_dict(row) for row in rows]
+
+ def upsert_judge_scores_from_report(self, report: Dict[str, Any]) -> Dict[str, int]:
+ now = _utcnow()
+ scored_at = _parse_datetime(report.get("generated_at")) or now
+
+ created = 0
+ updated = 0
+ total = 0
+
+ for query in report.get("queries") or []:
+ query_name = str(query.get("normalized_query") or query.get("raw_query") or "").strip()
+ if not query_name:
+ continue
+ for item in query.get("top_items") or []:
+ if not isinstance(item, dict):
+ continue
+ judge = item.get("judge")
+ if not isinstance(judge, dict):
+ continue
+
+ paper_row = self.upsert_paper(
+ paper=item,
+ source_hint=(report.get("sources") or [report.get("source")])[0],
+ seen_at=scored_at,
+ )
+ paper_db_id = int(paper_row.get("id") or 0)
+ if paper_db_id <= 0:
+ continue
+
+ total += 1
+ with self._provider.session() as session:
+ row = session.execute(
+ select(PaperJudgeScoreModel).where(
+ PaperJudgeScoreModel.paper_id == paper_db_id,
+ PaperJudgeScoreModel.query == query_name,
+ )
+ ).scalar_one_or_none()
+
+ was_created = row is None
+ if row is None:
+ row = PaperJudgeScoreModel(
+ paper_id=paper_db_id,
+ query=query_name,
+ scored_at=scored_at,
+ )
+ session.add(row)
+
+ row.overall = _safe_float(judge.get("overall"))
+ row.relevance = _safe_float((judge.get("relevance") or {}).get("score"))
+ row.novelty = _safe_float((judge.get("novelty") or {}).get("score"))
+ row.rigor = _safe_float((judge.get("rigor") or {}).get("score"))
+ row.impact = _safe_float((judge.get("impact") or {}).get("score"))
+ row.clarity = _safe_float((judge.get("clarity") or {}).get("score"))
+ row.recommendation = str(judge.get("recommendation") or "")
+ row.one_line_summary = str(judge.get("one_line_summary") or "")
+ row.judge_model = str(judge.get("judge_model") or "")
+ try:
+ row.judge_cost_tier = (
+ int(judge.get("judge_cost_tier"))
+ if judge.get("judge_cost_tier") is not None
+ else None
+ )
+ except (ValueError, TypeError):
+ row.judge_cost_tier = None
+ row.scored_at = scored_at
+ row.metadata_json = "{}"
+
+ session.commit()
+ if was_created:
+ created += 1
+ else:
+ updated += 1
+
+ return {"total": total, "created": created, "updated": updated}
+
+ @staticmethod
+ def _paper_to_dict(row: PaperModel) -> Dict[str, Any]:
+ return {
+ "id": int(row.id),
+ "arxiv_id": row.arxiv_id,
+ "doi": row.doi,
+ "title": row.title,
+ "authors": row.get_authors(),
+ "abstract": row.abstract,
+ "url": row.url,
+ "external_url": row.external_url,
+ "pdf_url": row.pdf_url,
+ "source": row.source,
+ "venue": row.venue,
+ "published_at": row.published_at.isoformat() if row.published_at else None,
+ "first_seen_at": row.first_seen_at.isoformat() if row.first_seen_at else None,
+ "keywords": row.get_keywords(),
+ "metadata": row.get_metadata(),
+ "created_at": row.created_at.isoformat() if row.created_at else None,
+ "updated_at": row.updated_at.isoformat() if row.updated_at else None,
+ }
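
A sketch of the idempotent upsert against a throwaway SQLite registry (again assuming `SessionProvider` accepts any standard SQLAlchemy URL); the two input dicts resolve to the same arXiv id, so the second is treated as an update:

```python
from paperbot.infrastructure.stores.paper_store import SqlAlchemyPaperStore

# Throwaway sqlite registry; the default constructor would use get_db_url().
store = SqlAlchemyPaperStore(db_url="sqlite:///./paperbot_registry_demo.db")

papers = [
    {
        "title": "Example Paper",
        "url": "https://arxiv.org/abs/2401.00001",
        "authors": ["Ada Lovelace"],
        "snippet": "A toy abstract.",
        "score": 0.91,
    },
    # Same arXiv id, reached via the PDF URL, so it resolves to the same registry row.
    {
        "title": "Example Paper",
        "pdf_url": "https://arxiv.org/pdf/2401.00001.pdf",
    },
]

print(store.upsert_many(papers=papers, source_hint="papers_cool"))
# First run: {'total': 2, 'created': 1, 'updated': 1}; re-running only updates.
```
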
diff --git a/src/paperbot/infrastructure/stores/research_store.py b/src/paperbot/infrastructure/stores/research_store.py
index 37597c7..425724a 100644
--- a/src/paperbot/infrastructure/stores/research_store.py
+++ b/src/paperbot/infrastructure/stores/research_store.py
@@ -5,13 +5,17 @@
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional
-from sqlalchemy import desc, select
+from sqlalchemy import desc, func, or_, select
from sqlalchemy.exc import IntegrityError
+from paperbot.domain.paper_identity import normalize_arxiv_id, normalize_doi
from paperbot.infrastructure.stores.models import (
Base,
PaperFeedbackModel,
+ PaperJudgeScoreModel,
+ PaperModel,
PaperImpressionModel,
+ PaperReadingStatusModel,
ResearchContextRunModel,
ResearchMilestoneModel,
ResearchTaskModel,
@@ -326,6 +330,7 @@ def add_paper_feedback(
metadata: Optional[Dict[str, Any]] = None,
) -> Optional[Dict[str, Any]]:
now = _utcnow()
+ metadata = dict(metadata or {})
with self._provider.session() as session:
track = session.execute(
select(ResearchTrackModel).where(
@@ -335,16 +340,35 @@ def add_paper_feedback(
if track is None:
return None
+ resolved_paper_ref_id = self._resolve_paper_ref_id(
+ session=session,
+ paper_id=(paper_id or "").strip(),
+ metadata=metadata,
+ )
+
row = PaperFeedbackModel(
user_id=user_id,
track_id=track_id,
paper_id=(paper_id or "").strip(),
+ paper_ref_id=resolved_paper_ref_id,
action=(action or "").strip(),
weight=float(weight or 0.0),
ts=now,
metadata_json=json.dumps(metadata or {}, ensure_ascii=False),
)
session.add(row)
+
+ if resolved_paper_ref_id and (action or "").strip() == "save":
+ self._upsert_reading_status_row(
+ session=session,
+ user_id=user_id,
+ paper_ref_id=resolved_paper_ref_id,
+ status="unread",
+ mark_saved=True,
+ metadata=metadata,
+ now=now,
+ )
+
track.updated_at = now
session.add(track)
session.commit()
@@ -394,6 +418,218 @@ def list_paper_feedback_ids(
ids.add(pid)
return ids
+ def set_paper_reading_status(
+ self,
+ *,
+ user_id: str,
+ paper_id: str,
+ status: str,
+ metadata: Optional[Dict[str, Any]] = None,
+ mark_saved: Optional[bool] = None,
+ ) -> Optional[Dict[str, Any]]:
+ now = _utcnow()
+ metadata = dict(metadata or {})
+ with self._provider.session() as session:
+ paper_ref_id = self._resolve_paper_ref_id(
+ session=session,
+ paper_id=(paper_id or "").strip(),
+ metadata=metadata,
+ )
+ if not paper_ref_id:
+ return None
+
+ row = self._upsert_reading_status_row(
+ session=session,
+ user_id=user_id,
+ paper_ref_id=paper_ref_id,
+ status=status,
+ mark_saved=mark_saved,
+ metadata=metadata,
+ now=now,
+ )
+ session.commit()
+ session.refresh(row)
+ return self._reading_status_to_dict(row)
+
+ def list_saved_papers(
+ self,
+ *,
+ user_id: str,
+ limit: int = 200,
+ sort_by: str = "saved_at",
+ ) -> List[Dict[str, Any]]:
+ with self._provider.session() as session:
+ saved_at_by_paper: Dict[int, datetime] = {}
+
+ status_rows = (
+ session.execute(
+ select(PaperReadingStatusModel).where(
+ PaperReadingStatusModel.user_id == user_id,
+ PaperReadingStatusModel.saved_at.is_not(None),
+ )
+ )
+ .scalars()
+ .all()
+ )
+ for row in status_rows:
+ if row.paper_id and row.saved_at:
+ saved_at_by_paper[int(row.paper_id)] = row.saved_at
+
+ feedback_rows = (
+ session.execute(
+ select(PaperFeedbackModel).where(
+ PaperFeedbackModel.user_id == user_id,
+ PaperFeedbackModel.action == "save",
+ PaperFeedbackModel.paper_ref_id.is_not(None),
+ )
+ )
+ .scalars()
+ .all()
+ )
+ for row in feedback_rows:
+ pid = int(row.paper_ref_id or 0)
+ if pid <= 0:
+ continue
+ current = saved_at_by_paper.get(pid)
+ if current is None or (row.ts and row.ts > current):
+ saved_at_by_paper[pid] = row.ts or _utcnow()
+
+ paper_ids = list(saved_at_by_paper.keys())
+ if not paper_ids:
+ return []
+
+ papers = (
+ session.execute(select(PaperModel).where(PaperModel.id.in_(paper_ids)))
+ .scalars()
+ .all()
+ )
+ status_by_paper = {
+ int(row.paper_id): row
+ for row in session.execute(
+ select(PaperReadingStatusModel).where(
+ PaperReadingStatusModel.user_id == user_id,
+ PaperReadingStatusModel.paper_id.in_(paper_ids),
+ )
+ )
+ .scalars()
+ .all()
+ }
+
+ latest_judge_by_paper: Dict[int, PaperJudgeScoreModel] = {}
+ for pid in paper_ids:
+ judge = session.execute(
+ select(PaperJudgeScoreModel)
+ .where(PaperJudgeScoreModel.paper_id == pid)
+ .order_by(desc(PaperJudgeScoreModel.scored_at), desc(PaperJudgeScoreModel.id))
+ .limit(1)
+ ).scalar_one_or_none()
+ if judge is not None:
+ latest_judge_by_paper[pid] = judge
+
+ rows: List[Dict[str, Any]] = []
+ for paper in papers:
+ pid = int(paper.id)
+ status_row = status_by_paper.get(pid)
+ judge_row = latest_judge_by_paper.get(pid)
+ rows.append(
+ {
+ "paper": self._paper_to_dict(paper),
+ "saved_at": (
+ saved_at_by_paper.get(pid).isoformat()
+ if saved_at_by_paper.get(pid)
+ else None
+ ),
+ "reading_status": (
+ self._reading_status_to_dict(status_row) if status_row else None
+ ),
+ "latest_judge": self._judge_score_to_dict(judge_row) if judge_row else None,
+ }
+ )
+
+ if sort_by == "judge_score":
+ rows.sort(
+ key=lambda row: float(((row.get("latest_judge") or {}).get("overall") or 0.0)),
+ reverse=True,
+ )
+ elif sort_by == "published_at":
+ rows.sort(
+ key=lambda row: str(((row.get("paper") or {}).get("published_at") or "")),
+ reverse=True,
+ )
+ else:
+ rows.sort(key=lambda row: str(row.get("saved_at") or ""), reverse=True)
+
+ return rows[: max(1, int(limit))]
+
+ def get_paper_detail(
+ self, *, paper_id: str, user_id: str = "default"
+ ) -> Optional[Dict[str, Any]]:
+ with self._provider.session() as session:
+ paper_ref_id = self._resolve_paper_ref_id(
+ session=session,
+ paper_id=(paper_id or "").strip(),
+ metadata={},
+ )
+ if not paper_ref_id:
+ return None
+
+ paper = session.execute(
+ select(PaperModel).where(PaperModel.id == int(paper_ref_id))
+ ).scalar_one_or_none()
+ if not paper:
+ return None
+
+ reading_status = session.execute(
+ select(PaperReadingStatusModel).where(
+ PaperReadingStatusModel.user_id == user_id,
+ PaperReadingStatusModel.paper_id == int(paper_ref_id),
+ )
+ ).scalar_one_or_none()
+
+ judge_scores = (
+ session.execute(
+ select(PaperJudgeScoreModel)
+ .where(PaperJudgeScoreModel.paper_id == int(paper_ref_id))
+ .order_by(desc(PaperJudgeScoreModel.scored_at), desc(PaperJudgeScoreModel.id))
+ )
+ .scalars()
+ .all()
+ )
+
+ feedback_rows = (
+ session.execute(
+ select(PaperFeedbackModel)
+ .where(
+ PaperFeedbackModel.user_id == user_id,
+ PaperFeedbackModel.paper_ref_id == int(paper_ref_id),
+ )
+ .order_by(desc(PaperFeedbackModel.ts), desc(PaperFeedbackModel.id))
+ .limit(100)
+ )
+ .scalars()
+ .all()
+ )
+
+ feedback_summary: Dict[str, int] = {}
+ for row in feedback_rows:
+ action = str(row.action or "")
+ if not action:
+ continue
+ feedback_summary[action] = feedback_summary.get(action, 0) + 1
+
+ return {
+ "paper": self._paper_to_dict(paper),
+ "reading_status": (
+ self._reading_status_to_dict(reading_status) if reading_status else None
+ ),
+ "latest_judge": (
+ self._judge_score_to_dict(judge_scores[0]) if judge_scores else None
+ ),
+ "judge_scores": [self._judge_score_to_dict(row) for row in judge_scores],
+ "feedback_summary": feedback_summary,
+ "feedback_rows": [self._feedback_to_dict(row) for row in feedback_rows],
+ }
+
def create_context_run(
self,
*,
@@ -678,6 +914,209 @@ def _milestone_to_dict(m: ResearchMilestoneModel) -> Dict[str, Any]:
"updated_at": m.updated_at.isoformat() if m.updated_at else None,
}
+ @staticmethod
+ def _normalize_reading_status(value: str) -> str:
+ normalized = (value or "").strip().lower()
+ if normalized in {"unread", "reading", "read", "archived"}:
+ return normalized
+ return "unread"
+
+ def _upsert_reading_status_row(
+ self,
+ *,
+ session,
+ user_id: str,
+ paper_ref_id: int,
+ status: str,
+ mark_saved: Optional[bool],
+ metadata: Optional[Dict[str, Any]],
+ now: datetime,
+ ) -> PaperReadingStatusModel:
+ row = session.execute(
+ select(PaperReadingStatusModel).where(
+ PaperReadingStatusModel.user_id == user_id,
+ PaperReadingStatusModel.paper_id == int(paper_ref_id),
+ )
+ ).scalar_one_or_none()
+ if row is None:
+ row = PaperReadingStatusModel(
+ user_id=user_id,
+ paper_id=int(paper_ref_id),
+ created_at=now,
+ updated_at=now,
+ )
+ session.add(row)
+
+ row.status = self._normalize_reading_status(status)
+ row.updated_at = now
+
+ if row.status == "read" and row.read_at is None:
+ row.read_at = now
+
+ if mark_saved is True:
+ row.saved_at = row.saved_at or now
+ elif mark_saved is False:
+ row.saved_at = None
+
+ if metadata is not None:
+ row.metadata_json = json.dumps(metadata, ensure_ascii=False)
+
+ session.add(row)
+ return row
+
+ @staticmethod
+ def _paper_to_dict(p: PaperModel) -> Dict[str, Any]:
+ try:
+ metadata = json.loads(p.metadata_json or "{}")
+ if not isinstance(metadata, dict):
+ metadata = {}
+ except Exception:
+ metadata = {}
+
+ return {
+ "id": int(p.id),
+ "arxiv_id": p.arxiv_id,
+ "doi": p.doi,
+ "title": p.title,
+ "authors": p.get_authors(),
+ "abstract": p.abstract,
+ "url": p.url,
+ "external_url": p.external_url,
+ "pdf_url": p.pdf_url,
+ "source": p.source,
+ "venue": p.venue,
+ "published_at": p.published_at.isoformat() if p.published_at else None,
+ "first_seen_at": p.first_seen_at.isoformat() if p.first_seen_at else None,
+ "keywords": p.get_keywords(),
+ "metadata": metadata,
+ }
+
+ @staticmethod
+ def _judge_score_to_dict(row: PaperJudgeScoreModel) -> Dict[str, Any]:
+ try:
+ metadata = json.loads(row.metadata_json or "{}")
+ if not isinstance(metadata, dict):
+ metadata = {}
+ except Exception:
+ metadata = {}
+
+ return {
+ "id": int(row.id),
+ "paper_id": int(row.paper_id),
+ "query": row.query,
+ "overall": float(row.overall or 0.0),
+ "relevance": float(row.relevance or 0.0),
+ "novelty": float(row.novelty or 0.0),
+ "rigor": float(row.rigor or 0.0),
+ "impact": float(row.impact or 0.0),
+ "clarity": float(row.clarity or 0.0),
+ "recommendation": row.recommendation,
+ "one_line_summary": row.one_line_summary,
+ "judge_model": row.judge_model,
+ "judge_cost_tier": row.judge_cost_tier,
+ "scored_at": row.scored_at.isoformat() if row.scored_at else None,
+ "metadata": metadata,
+ }
+
+ @staticmethod
+ def _reading_status_to_dict(row: PaperReadingStatusModel) -> Dict[str, Any]:
+ try:
+ metadata = json.loads(row.metadata_json or "{}")
+ if not isinstance(metadata, dict):
+ metadata = {}
+ except Exception:
+ metadata = {}
+ return {
+ "id": int(row.id),
+ "user_id": row.user_id,
+ "paper_id": int(row.paper_id),
+ "status": row.status,
+ "saved_at": row.saved_at.isoformat() if row.saved_at else None,
+ "read_at": row.read_at.isoformat() if row.read_at else None,
+ "created_at": row.created_at.isoformat() if row.created_at else None,
+ "updated_at": row.updated_at.isoformat() if row.updated_at else None,
+ "metadata": metadata,
+ }
+
+ @staticmethod
+ def _resolve_paper_ref_id(
+ *,
+ session,
+ paper_id: str,
+ metadata: Dict[str, Any],
+ ) -> Optional[int]:
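+ # Resolution order: numeric registry id -> arXiv id -> DOI -> known URL -> case-insensitive title match.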
+ pid = (paper_id or "").strip()
+ if not pid:
+ return None
+
+ if pid.isdigit():
+ row = session.execute(
+ select(PaperModel).where(PaperModel.id == int(pid))
+ ).scalar_one_or_none()
+ if row is not None:
+ return int(row.id)
+
+ arxiv_id = normalize_arxiv_id(pid)
+ doi = normalize_doi(pid)
+
+ url_candidates = []
+ for key in ("paper_url", "url", "external_url", "pdf_url"):
+ value = metadata.get(key)
+ if isinstance(value, str) and value.strip():
+ url_candidates.append(value.strip())
+ if pid.startswith("http"):
+ url_candidates.append(pid)
+
+ if not arxiv_id:
+ for candidate in url_candidates:
+ arxiv_id = normalize_arxiv_id(candidate)
+ if arxiv_id:
+ break
+ if not doi:
+ for candidate in url_candidates:
+ doi = normalize_doi(candidate)
+ if doi:
+ break
+
+ if arxiv_id:
+ row = session.execute(
+ select(PaperModel).where(PaperModel.arxiv_id == arxiv_id)
+ ).scalar_one_or_none()
+ if row is not None:
+ return int(row.id)
+
+ if doi:
+ row = session.execute(
+ select(PaperModel).where(PaperModel.doi == doi)
+ ).scalar_one_or_none()
+ if row is not None:
+ return int(row.id)
+
+ # Use .scalars().first() so duplicate rows sharing a URL or title do not raise MultipleResultsFound.
+ if url_candidates:
+ row = (
+ session.execute(
+ select(PaperModel).where(
+ or_(
+ PaperModel.url.in_(url_candidates),
+ PaperModel.external_url.in_(url_candidates),
+ PaperModel.pdf_url.in_(url_candidates),
+ )
+ )
+ )
+ .scalars()
+ .first()
+ )
+ if row is not None:
+ return int(row.id)
+
+ title = str(metadata.get("title") or "").strip()
+ if title:
+ row = (
+ session.execute(
+ select(PaperModel).where(func.lower(PaperModel.title) == title.lower())
+ )
+ .scalars()
+ .first()
+ )
+ if row is not None:
+ return int(row.id)
+
+ return None
+
@staticmethod
def _feedback_to_dict(f: PaperFeedbackModel) -> Dict[str, Any]:
try:
@@ -691,6 +1130,7 @@ def _feedback_to_dict(f: PaperFeedbackModel) -> Dict[str, Any]:
"user_id": f.user_id,
"track_id": f.track_id,
"paper_id": f.paper_id,
+ "paper_ref_id": f.paper_ref_id,
"action": f.action,
"weight": float(f.weight or 0.0),
"ts": f.ts.isoformat() if f.ts else None,
diff --git a/src/paperbot/infrastructure/stores/subscriber_store.py b/src/paperbot/infrastructure/stores/subscriber_store.py
new file mode 100644
index 0000000..ec399b7
--- /dev/null
+++ b/src/paperbot/infrastructure/stores/subscriber_store.py
@@ -0,0 +1,118 @@
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from typing import Any, Dict, List, Optional
+from uuid import uuid4
+
+from sqlalchemy import select
+
+from paperbot.infrastructure.stores.models import NewsletterSubscriberModel
+from paperbot.infrastructure.stores.sqlalchemy_db import SessionProvider, get_db_url
+
+
+def _utcnow() -> datetime:
+ return datetime.now(timezone.utc)
+
+
+class SubscriberStore:
+ """CRUD operations for newsletter subscribers."""
+
+ def __init__(self, db_url: Optional[str] = None, *, auto_create_schema: bool = True):
+ self.db_url = db_url or get_db_url()
+ self._provider = SessionProvider(self.db_url)
+ if auto_create_schema:
+ NewsletterSubscriberModel.__table__.create(self._provider.engine, checkfirst=True)
+
+ def add_subscriber(self, email: str) -> Dict[str, Any]:
+ email = email.strip().lower()
+ with self._provider.session() as session:
+ existing = session.execute(
+ select(NewsletterSubscriberModel).where(
+ NewsletterSubscriberModel.email == email
+ )
+ ).scalar_one_or_none()
+
+ if existing:
+ if existing.status == "unsubscribed":
+ existing.status = "active"
+ existing.unsub_at = None
+ existing.subscribed_at = _utcnow()
+ session.commit()
+ return self._row_to_dict(existing)
+
+ row = NewsletterSubscriberModel(
+ email=email,
+ status="active",
+ unsub_token=uuid4().hex,
+ subscribed_at=_utcnow(),
+ metadata_json="{}",
+ )
+ session.add(row)
+ session.commit()
+ session.refresh(row)
+ return self._row_to_dict(row)
+
+ def remove_subscriber(self, unsub_token: str) -> bool:
+ with self._provider.session() as session:
+ row = session.execute(
+ select(NewsletterSubscriberModel).where(
+ NewsletterSubscriberModel.unsub_token == unsub_token
+ )
+ ).scalar_one_or_none()
+ if not row:
+ return False
+ if row.status == "unsubscribed":
+ return True
+ row.status = "unsubscribed"
+ row.unsub_at = _utcnow()
+ session.commit()
+ return True
+
+ def get_active_subscribers(self) -> List[str]:
+ with self._provider.session() as session:
+ rows = session.execute(
+ select(NewsletterSubscriberModel).where(
+ NewsletterSubscriberModel.status == "active"
+ )
+ ).scalars().all()
+ return [r.email for r in rows]
+
+ def get_active_subscribers_with_tokens(self) -> Dict[str, str]:
+ with self._provider.session() as session:
+ rows = session.execute(
+ select(NewsletterSubscriberModel).where(
+ NewsletterSubscriberModel.status == "active"
+ )
+ ).scalars().all()
+ return {r.email: r.unsub_token for r in rows}
+
+ def get_subscriber_by_email(self, email: str) -> Optional[Dict[str, Any]]:
+ email = email.strip().lower()
+ with self._provider.session() as session:
+ row = session.execute(
+ select(NewsletterSubscriberModel).where(
+ NewsletterSubscriberModel.email == email
+ )
+ ).scalar_one_or_none()
+ if not row:
+ return None
+ return self._row_to_dict(row)
+
+ def get_subscriber_count(self) -> Dict[str, int]:
+ with self._provider.session() as session:
+ all_rows = session.execute(
+ select(NewsletterSubscriberModel)
+ ).scalars().all()
+ active = sum(1 for r in all_rows if r.status == "active")
+ return {"active": active, "total": len(all_rows)}
+
+ @staticmethod
+ def _row_to_dict(row: NewsletterSubscriberModel) -> Dict[str, Any]:
+ return {
+ "id": row.id,
+ "email": row.email,
+ "status": row.status,
+ "unsub_token": row.unsub_token,
+ "subscribed_at": row.subscribed_at.isoformat() if row.subscribed_at else None,
+ "unsub_at": row.unsub_at.isoformat() if row.unsub_at else None,
+ }
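+
+
+# Illustrative usage (the SQLite URL below is only an example):
+#   store = SubscriberStore(db_url="sqlite:///subscribers.db")
+#   sub = store.add_subscriber("alice@example.com")
+#   active = store.get_active_subscribers()   # ["alice@example.com"]
+#   store.remove_subscriber(sub["unsub_token"])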
diff --git a/src/paperbot/presentation/cli/main.py b/src/paperbot/presentation/cli/main.py
index c7f5599..39d9019 100644
--- a/src/paperbot/presentation/cli/main.py
+++ b/src/paperbot/presentation/cli/main.py
@@ -19,7 +19,9 @@
apply_judge_scores_to_report,
build_daily_paper_report,
enrich_daily_paper_report,
+ ingest_daily_report_to_registry,
normalize_llm_features,
+ persist_judge_scores_to_registry,
normalize_output_formats,
render_daily_paper_markdown,
)
@@ -77,7 +79,7 @@ def create_parser() -> argparse.ArgumentParser:
action="append",
dest="sources",
default=None,
- help="数据源名称,可重复指定;默认 papers_cool",
+ help="数据源名称,可重复指定;默认 papers_cool(可选 arxiv_api / hf_daily)",
)
topic_search_parser.add_argument("--top-k", type=int, default=5, help="每个主题保留的结果数")
topic_search_parser.add_argument(
@@ -101,7 +103,7 @@ def create_parser() -> argparse.ArgumentParser:
action="append",
dest="sources",
default=None,
- help="数据源名称,可重复指定;默认 papers_cool",
+ help="数据源名称,可重复指定;默认 papers_cool(可选 arxiv_api / hf_daily)",
)
daily_parser.add_argument(
"--branch",
@@ -286,11 +288,12 @@ def _run_daily_paper(parsed: argparse.Namespace) -> int:
sources = parsed.sources or ["papers_cool"]
workflow = _create_topic_search_workflow()
+ effective_top_k = max(1, int(parsed.top_k), int(parsed.top_n))
search_result = workflow.run(
queries=queries,
sources=sources,
branches=branches,
- top_k_per_query=max(1, int(parsed.top_k)),
+ top_k_per_query=effective_top_k,
show_per_branch=max(1, int(parsed.show)),
)
@@ -315,6 +318,18 @@ def _run_daily_paper(parsed: argparse.Namespace) -> int:
n_runs=max(1, int(parsed.judge_runs)),
judge_token_budget=max(0, int(parsed.judge_token_budget)),
)
+
+ try:
+ report["registry_ingest"] = ingest_daily_report_to_registry(report)
+ except Exception as exc:
+ report["registry_ingest"] = {"error": str(exc)}
+
+ if judge_enabled:
+ try:
+ report["judge_registry_ingest"] = persist_judge_scores_to_registry(report)
+ except Exception as exc:
+ report["judge_registry_ingest"] = {"error": str(exc)}
+
markdown = render_daily_paper_markdown(report)
markdown_path = None
diff --git a/tests/unit/test_newsletter.py b/tests/unit/test_newsletter.py
new file mode 100644
index 0000000..6fd2905
--- /dev/null
+++ b/tests/unit/test_newsletter.py
@@ -0,0 +1,182 @@
+"""Tests for newsletter subscription system."""
+from __future__ import annotations
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from paperbot.infrastructure.stores.subscriber_store import SubscriberStore
+
+
+@pytest.fixture()
+def store(tmp_path):
+ db_url = f"sqlite:///{tmp_path / 'test.db'}"
+ return SubscriberStore(db_url=db_url, auto_create_schema=True)
+
+
+class TestSubscriberStore:
+ def test_add_subscriber(self, store: SubscriberStore):
+ result = store.add_subscriber("alice@example.com")
+ assert result["email"] == "alice@example.com"
+ assert result["status"] == "active"
+ assert result["unsub_token"]
+
+ def test_add_subscriber_idempotent(self, store: SubscriberStore):
+ r1 = store.add_subscriber("bob@example.com")
+ r2 = store.add_subscriber("bob@example.com")
+ assert r1["email"] == r2["email"]
+ assert r1["unsub_token"] == r2["unsub_token"]
+
+ def test_add_subscriber_normalizes_email(self, store: SubscriberStore):
+ result = store.add_subscriber(" Alice@Example.COM ")
+ assert result["email"] == "alice@example.com"
+
+ def test_remove_subscriber(self, store: SubscriberStore):
+ sub = store.add_subscriber("charlie@example.com")
+ ok = store.remove_subscriber(sub["unsub_token"])
+ assert ok is True
+ info = store.get_subscriber_by_email("charlie@example.com")
+ assert info is not None
+ assert info["status"] == "unsubscribed"
+
+ def test_remove_subscriber_invalid_token(self, store: SubscriberStore):
+ ok = store.remove_subscriber("nonexistent_token")
+ assert ok is False
+
+ def test_remove_subscriber_idempotent(self, store: SubscriberStore):
+ sub = store.add_subscriber("dave@example.com")
+ store.remove_subscriber(sub["unsub_token"])
+ ok = store.remove_subscriber(sub["unsub_token"])
+ assert ok is True
+
+ def test_resubscribe_after_unsubscribe(self, store: SubscriberStore):
+ sub = store.add_subscriber("eve@example.com")
+ store.remove_subscriber(sub["unsub_token"])
+ resub = store.add_subscriber("eve@example.com")
+ assert resub["status"] == "active"
+ assert resub["unsub_at"] is None
+
+ def test_get_active_subscribers(self, store: SubscriberStore):
+ store.add_subscriber("a@example.com")
+ store.add_subscriber("b@example.com")
+ sub_c = store.add_subscriber("c@example.com")
+ store.remove_subscriber(sub_c["unsub_token"])
+
+ active = store.get_active_subscribers()
+ assert sorted(active) == ["a@example.com", "b@example.com"]
+
+ def test_get_active_subscribers_with_tokens(self, store: SubscriberStore):
+ store.add_subscriber("x@example.com")
+ store.add_subscriber("y@example.com")
+ tokens = store.get_active_subscribers_with_tokens()
+ assert len(tokens) == 2
+ assert "x@example.com" in tokens
+ assert "y@example.com" in tokens
+
+ def test_get_subscriber_count(self, store: SubscriberStore):
+ store.add_subscriber("one@example.com")
+ sub = store.add_subscriber("two@example.com")
+ store.remove_subscriber(sub["unsub_token"])
+
+ counts = store.get_subscriber_count()
+ assert counts["active"] == 1
+ assert counts["total"] == 2
+
+ def test_get_subscriber_by_email_not_found(self, store: SubscriberStore):
+ assert store.get_subscriber_by_email("nobody@example.com") is None
+
+
+class TestResendEmailService:
+ def test_from_env_returns_none_without_key(self):
+ from paperbot.application.services.resend_service import ResendEmailService
+
+ with patch.dict("os.environ", {}, clear=True):
+ svc = ResendEmailService.from_env()
+ assert svc is None
+
+ def test_from_env_returns_instance_with_key(self):
+ from paperbot.application.services.resend_service import ResendEmailService
+
+ env = {"PAPERBOT_RESEND_API_KEY": "re_test_key"}
+ with patch.dict("os.environ", env, clear=True):
+ svc = ResendEmailService.from_env()
+ assert svc is not None
+ assert svc.api_key == "re_test_key"
+
+ def test_render_text(self):
+ from paperbot.application.services.resend_service import ResendEmailService
+
+ svc = ResendEmailService(
+ api_key="test", from_email="test@test.com", unsub_base_url="https://example.com"
+ )
+ report = {
+ "title": "Test Digest",
+ "date": "2026-02-11",
+ "stats": {"unique_items": 3},
+ "global_top": [
+ {"title": "Paper A", "url": "https://arxiv.org/abs/1", "score": 9.5},
+ {"title": "Paper B", "url": "", "score": 8.0},
+ ],
+ }
+ text = svc._render_text(report, "", "https://example.com/unsub/abc")
+ assert "Test Digest" in text
+ assert "Paper A" in text
+ assert "Unsubscribe" in text
+
+ def test_render_html(self):
+ from paperbot.application.services.resend_service import ResendEmailService
+
+ svc = ResendEmailService(
+ api_key="test", from_email="test@test.com", unsub_base_url="https://example.com"
+ )
+ report = {
+ "title": "Test Digest",
+ "date": "2026-02-11",
+ "stats": {"unique_items": 1, "total_query_hits": 5},
+ "global_top": [
+ {
+ "title": "Paper X",
+ "url": "https://arxiv.org/abs/2",
+ "score": 7.0,
+ "judge": {"recommendation": "must_read", "one_line_summary": "Great paper"},
+ },
+ ],
+ }
+ html = svc._render_html(report, "", "https://example.com/unsub/xyz")
+ assert "PaperBot" in html
+ assert "Paper X" in html
+ assert "Must Read" in html
+ assert "Unsubscribe" in html
+
+
+class TestNewsletterRoutes:
+ @pytest.fixture()
+ def client(self, tmp_path):
+ import os
+ os.environ["PAPERBOT_DB_URL"] = f"sqlite:///{tmp_path / 'test.db'}"
+ from fastapi.testclient import TestClient
+ from paperbot.api.main import app
+ return TestClient(app)
+
+ def test_subscribe(self, client):
+ resp = client.post("/api/newsletter/subscribe", json={"email": "test@example.com"})
+ assert resp.status_code == 200
+ data = resp.json()
+ assert data["ok"] is True
+ assert data["email"] == "test@example.com"
+
+ def test_subscribe_invalid_email(self, client):
+ resp = client.post("/api/newsletter/subscribe", json={"email": "not-an-email"})
+ assert resp.status_code == 400
+
+ def test_subscribers_count(self, client):
+ client.post("/api/newsletter/subscribe", json={"email": "a@example.com"})
+ client.post("/api/newsletter/subscribe", json={"email": "b@example.com"})
+ resp = client.get("/api/newsletter/subscribers")
+ assert resp.status_code == 200
+ data = resp.json()
+ assert data["active"] >= 2
+
+ def test_unsubscribe_invalid_token(self, client):
+ resp = client.get("/api/newsletter/unsubscribe/invalid_token_123")
+ assert resp.status_code == 404
diff --git a/tests/unit/test_paper_identity.py b/tests/unit/test_paper_identity.py
new file mode 100644
index 0000000..b5d9b58
--- /dev/null
+++ b/tests/unit/test_paper_identity.py
@@ -0,0 +1,17 @@
+from paperbot.domain.paper_identity import normalize_arxiv_id, normalize_doi, normalize_paper_id
+
+
+def test_normalize_arxiv_id_from_urls_and_prefixes():
+ assert normalize_arxiv_id("arXiv:2501.12345v2") == "2501.12345v2"
+ assert normalize_arxiv_id("https://arxiv.org/abs/2501.12345") == "2501.12345"
+ assert normalize_arxiv_id("https://arxiv.org/pdf/2501.12345.pdf") == "2501.12345"
+
+
+def test_normalize_doi_from_url_or_raw():
+ assert normalize_doi("https://doi.org/10.1145/123.456") == "10.1145/123.456"
+ assert normalize_doi("10.48550/arXiv.2501.12345") == "10.48550/arxiv.2501.12345"
+
+
+def test_normalize_paper_id_prefers_arxiv():
+ assert normalize_paper_id("https://arxiv.org/abs/2501.12345") == "arxiv:2501.12345"
+ assert normalize_paper_id("https://doi.org/10.1145/123.456") == "doi:10.1145/123.456"
diff --git a/tests/unit/test_paper_judge_persistence.py b/tests/unit/test_paper_judge_persistence.py
new file mode 100644
index 0000000..6e09499
--- /dev/null
+++ b/tests/unit/test_paper_judge_persistence.py
@@ -0,0 +1,137 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from sqlalchemy import select
+
+from paperbot.infrastructure.stores.models import PaperJudgeScoreModel
+from paperbot.infrastructure.stores.paper_store import SqlAlchemyPaperStore
+from paperbot.infrastructure.stores.research_store import SqlAlchemyResearchStore
+
+
+def _judged_report():
+ return {
+ "title": "Daily",
+ "date": "2026-02-10",
+ "generated_at": "2026-02-10T00:00:00+00:00",
+ "source": "papers.cool",
+ "sources": ["papers_cool"],
+ "queries": [
+ {
+ "raw_query": "ICL压缩",
+ "normalized_query": "icl compression",
+ "top_items": [
+ {
+ "title": "UniICL",
+ "url": "https://arxiv.org/abs/2501.12345",
+ "pdf_url": "https://arxiv.org/pdf/2501.12345.pdf",
+ "authors": ["A"],
+ "snippet": "compress context",
+ "judge": {
+ "overall": 4.2,
+ "recommendation": "must_read",
+ "one_line_summary": "good",
+ "judge_model": "fake",
+ "judge_cost_tier": 1,
+ "relevance": {"score": 5},
+ "novelty": {"score": 4},
+ "rigor": {"score": 4},
+ "impact": {"score": 4},
+ "clarity": {"score": 4},
+ },
+ }
+ ],
+ }
+ ],
+ "global_top": [],
+ }
+
+
+def test_upsert_judge_scores_from_report_is_idempotent(tmp_path: Path):
+ db_path = tmp_path / "judge-registry.db"
+ store = SqlAlchemyPaperStore(db_url=f"sqlite:///{db_path}")
+
+ report = _judged_report()
+ first = store.upsert_judge_scores_from_report(report)
+ second = store.upsert_judge_scores_from_report(report)
+
+ assert first == {"total": 1, "created": 1, "updated": 0}
+ assert second == {"total": 1, "created": 0, "updated": 1}
+
+ with store._provider.session() as session:
+ rows = session.execute(select(PaperJudgeScoreModel)).scalars().all()
+ assert len(rows) == 1
+ assert rows[0].query == "icl compression"
+ assert float(rows[0].overall) == 4.2
+
+
+def test_feedback_links_to_paper_registry_row(tmp_path: Path):
+ db_path = tmp_path / "feedback-link.db"
+ db_url = f"sqlite:///{db_path}"
+
+ paper_store = SqlAlchemyPaperStore(db_url=db_url)
+ research_store = SqlAlchemyResearchStore(db_url=db_url)
+
+ paper = paper_store.upsert_paper(
+ paper={
+ "title": "UniICL",
+ "url": "https://arxiv.org/abs/2501.12345",
+ "pdf_url": "https://arxiv.org/pdf/2501.12345.pdf",
+ }
+ )
+
+ track = research_store.create_track(user_id="default", name="t1", activate=True)
+ feedback = research_store.add_paper_feedback(
+ user_id="default",
+ track_id=int(track["id"]),
+ paper_id="https://arxiv.org/abs/2501.12345",
+ action="save",
+ metadata={"url": "https://arxiv.org/abs/2501.12345", "title": "UniICL"},
+ )
+
+ assert feedback is not None
+ assert feedback["paper_ref_id"] == int(paper["id"])
+
+
+def test_saved_list_and_detail_from_research_store(tmp_path: Path):
+ db_path = tmp_path / "saved-detail.db"
+ db_url = f"sqlite:///{db_path}"
+
+ paper_store = SqlAlchemyPaperStore(db_url=db_url)
+ research_store = SqlAlchemyResearchStore(db_url=db_url)
+
+ paper = paper_store.upsert_paper(
+ paper={
+ "title": "UniICL",
+ "url": "https://arxiv.org/abs/2501.12345",
+ "pdf_url": "https://arxiv.org/pdf/2501.12345.pdf",
+ }
+ )
+
+ track = research_store.create_track(user_id="u1", name="track-u1", activate=True)
+ feedback = research_store.add_paper_feedback(
+ user_id="u1",
+ track_id=int(track["id"]),
+ paper_id=str(paper["id"]),
+ action="save",
+ metadata={"title": "UniICL"},
+ )
+ assert feedback and feedback["paper_ref_id"] == int(paper["id"])
+
+ status = research_store.set_paper_reading_status(
+ user_id="u1",
+ paper_id=str(paper["id"]),
+ status="read",
+ mark_saved=True,
+ )
+ assert status is not None
+ assert status["status"] == "read"
+
+ saved = research_store.list_saved_papers(user_id="u1", limit=10)
+ assert len(saved) == 1
+ assert saved[0]["paper"]["title"] == "UniICL"
+
+ detail = research_store.get_paper_detail(user_id="u1", paper_id=str(paper["id"]))
+ assert detail is not None
+ assert detail["paper"]["title"] == "UniICL"
+ assert detail["reading_status"]["status"] == "read"
diff --git a/tests/unit/test_paper_store.py b/tests/unit/test_paper_store.py
new file mode 100644
index 0000000..26b909e
--- /dev/null
+++ b/tests/unit/test_paper_store.py
@@ -0,0 +1,77 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from paperbot.application.workflows.dailypaper import (
+ build_daily_paper_report,
+ ingest_daily_report_to_registry,
+)
+from paperbot.infrastructure.stores.paper_store import SqlAlchemyPaperStore
+
+
+def _sample_search_result():
+ return {
+ "source": "papers.cool",
+ "sources": ["papers_cool", "arxiv_api"],
+ "queries": [
+ {
+ "raw_query": "ICL压缩",
+ "normalized_query": "icl compression",
+ "total_hits": 1,
+ "items": [
+ {
+ "title": "UniICL",
+ "url": "https://arxiv.org/abs/2501.12345",
+ "external_url": "https://arxiv.org/abs/2501.12345",
+ "pdf_url": "https://arxiv.org/pdf/2501.12345.pdf",
+ "score": 10.2,
+ "snippet": "compress in-context learning",
+ "authors": ["A", "B"],
+ "keywords": ["icl", "compression"],
+ "matched_queries": ["icl compression"],
+ }
+ ],
+ }
+ ],
+ "items": [
+ {
+ "title": "UniICL",
+ "url": "https://arxiv.org/abs/2501.12345",
+ "external_url": "https://arxiv.org/abs/2501.12345",
+ "pdf_url": "https://arxiv.org/pdf/2501.12345.pdf",
+ "score": 10.2,
+ "snippet": "compress in-context learning",
+ "authors": ["A", "B"],
+ "keywords": ["icl", "compression"],
+ "matched_queries": ["icl compression"],
+ }
+ ],
+ "summary": {
+ "unique_items": 1,
+ "total_query_hits": 1,
+ },
+ }
+
+
+def test_ingest_daily_report_to_registry_is_idempotent(tmp_path: Path):
+ db_path = tmp_path / "paper-registry.db"
+ store = SqlAlchemyPaperStore(db_url=f"sqlite:///{db_path}")
+
+ report = build_daily_paper_report(search_result=_sample_search_result(), title="Daily", top_n=5)
+
+ first = ingest_daily_report_to_registry(report, paper_store=store)
+ second = ingest_daily_report_to_registry(report, paper_store=store)
+
+ assert first["total"] == 1
+ assert first["created"] == 1
+ assert first["updated"] == 0
+
+ assert second["total"] == 1
+ assert second["created"] == 0
+ assert second["updated"] == 1
+
+ rows = store.list_recent(limit=5)
+ assert len(rows) == 1
+ assert rows[0]["arxiv_id"] == "2501.12345"
+ assert rows[0]["title"] == "UniICL"
+ assert rows[0]["authors"] == ["A", "B"]
diff --git a/tests/unit/test_paperscool_route.py b/tests/unit/test_paperscool_route.py
index 79cd72a..c4f9fb8 100644
--- a/tests/unit/test_paperscool_route.py
+++ b/tests/unit/test_paperscool_route.py
@@ -4,8 +4,24 @@
from paperbot.api.routes import paperscool as paperscool_route
+def _parse_sse_events(text: str):
+ """Parse SSE text into a list of event dicts."""
+ import json
+ events = []
+ for line in text.split("\n"):
+ if line.startswith("data: "):
+ payload = line[6:].strip()
+ if payload == "[DONE]":
+ continue
+ try:
+ events.append(json.loads(payload))
+ except Exception:
+ pass
+ return events
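+
+
+# The parser above assumes the standard SSE framing emitted by the streaming
+# endpoints under test: each event is a single `data: {...json...}` line, and
+# the stream ends with a `data: [DONE]` sentinel that carries no payload.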
+
+
class _FakeWorkflow:
- def run(self, *, queries, sources, branches, top_k_per_query, show_per_branch):
+ def run(self, *, queries, sources, branches, top_k_per_query, show_per_branch, min_score=0.0):
return {
"source": "papers.cool",
"fetched_at": "2026-02-09T00:00:00+00:00",
@@ -113,15 +129,20 @@ def test_paperscool_daily_route_success(monkeypatch, tmp_path):
def test_paperscool_daily_route_with_llm_enrichment(monkeypatch):
monkeypatch.setattr(paperscool_route, "PapersCoolTopicSearchWorkflow", _FakeWorkflow)
- called = {"value": False}
+ class _FakeLLMService:
+ def summarize_paper(self, *, title, abstract):
+ return f"summary of {title}"
- def _fake_enrich(report, *, llm_features, llm_service=None, max_items_per_query=3):
- called["value"] = True
- report = dict(report)
- report["llm_analysis"] = {"enabled": True, "features": llm_features}
- return report
+ def assess_relevance(self, *, paper, query):
+ return {"score": 4, "reason": "relevant"}
- monkeypatch.setattr(paperscool_route, "enrich_daily_paper_report", _fake_enrich)
+ def analyze_trends(self, *, topic, papers):
+ return f"trend:{topic}:{len(papers)}"
+
+ def generate_daily_insight(self, report):
+ return "daily insight"
+
+ monkeypatch.setattr(paperscool_route, "get_llm_service", lambda: _FakeLLMService())
with TestClient(api_main.app) as client:
resp = client.post(
@@ -134,36 +155,44 @@ def _fake_enrich(report, *, llm_features, llm_service=None, max_items_per_query=
)
assert resp.status_code == 200
- payload = resp.json()
- assert called["value"] is True
- assert payload["report"]["llm_analysis"]["enabled"] is True
+ # SSE stream response
+ events = _parse_sse_events(resp.text)
+ types = [e.get("type") for e in events]
+ assert "llm_done" in types
+ result_event = next(e for e in events if e.get("type") == "result")
+ assert result_event["data"]["report"]["llm_analysis"]["enabled"] is True
def test_paperscool_daily_route_with_judge(monkeypatch):
monkeypatch.setattr(paperscool_route, "PapersCoolTopicSearchWorkflow", _FakeWorkflow)
- called = {"value": False}
-
- def _fake_judge(
- report,
- *,
- llm_service=None,
- max_items_per_query=5,
- n_runs=1,
- judge_token_budget=0,
- ):
- called["value"] = True
- report = dict(report)
- report["judge"] = {
- "enabled": True,
- "max_items_per_query": max_items_per_query,
- "n_runs": n_runs,
- "recommendation_count": {"must_read": 1, "worth_reading": 0, "skim": 0, "skip": 0},
- "budget": {"token_budget": judge_token_budget, "judged_items": 1},
- }
- return report
+ class _FakeJudgment:
+ def to_dict(self):
+ return {
+ "relevance": {"score": 5, "rationale": ""},
+ "novelty": {"score": 4, "rationale": ""},
+ "rigor": {"score": 4, "rationale": ""},
+ "impact": {"score": 4, "rationale": ""},
+ "clarity": {"score": 4, "rationale": ""},
+ "overall": 4.2,
+ "one_line_summary": "good",
+ "recommendation": "must_read",
+ "judge_model": "fake",
+ "judge_cost_tier": 1,
+ }
+
+ class _FakeJudge:
+ def __init__(self, llm_service=None):
+ pass
- monkeypatch.setattr(paperscool_route, "apply_judge_scores_to_report", _fake_judge)
+ def judge_single(self, *, paper, query):
+ return _FakeJudgment()
+
+ def judge_with_calibration(self, *, paper, query, n_runs=1):
+ return _FakeJudgment()
+
+ monkeypatch.setattr(paperscool_route, "get_llm_service", lambda: object())
+ monkeypatch.setattr(paperscool_route, "PaperJudge", _FakeJudge)
with TestClient(api_main.app) as client:
resp = client.post(
@@ -177,9 +206,12 @@ def _fake_judge(
)
assert resp.status_code == 200
- payload = resp.json()
- assert called["value"] is True
- assert payload["report"]["judge"]["enabled"] is True
+ events = _parse_sse_events(resp.text)
+ types = [e.get("type") for e in events]
+ assert "judge" in types
+ assert "judge_done" in types
+ result_event = next(e for e in events if e.get("type") == "result")
+ assert result_event["data"]["report"]["judge"]["enabled"] is True
def test_paperscool_analyze_route_stream(monkeypatch):
@@ -256,3 +288,369 @@ def judge_with_calibration(self, *, paper, query, n_runs=1):
assert '"type": "trend"' in text
assert '"type": "judge"' in text
assert "[DONE]" in text
+
+
+def test_paperscool_repos_route_extracts_and_enriches(monkeypatch):
+ class _FakeResp:
+ status_code = 200
+
+ def json(self):
+ return {
+ "full_name": "owner/repo",
+ "stargazers_count": 42,
+ "forks_count": 7,
+ "open_issues_count": 1,
+ "watchers_count": 5,
+ "language": "Python",
+ "license": {"spdx_id": "MIT"},
+ "updated_at": "2026-02-01T00:00:00Z",
+ "pushed_at": "2026-02-02T00:00:00Z",
+ "archived": False,
+ "topics": ["llm"],
+ "html_url": "https://github.com/owner/repo",
+ }
+
+ monkeypatch.setattr(paperscool_route.requests, "get", lambda *args, **kwargs: _FakeResp())
+
+ with TestClient(api_main.app) as client:
+ resp = client.post(
+ "/api/research/paperscool/repos",
+ json={
+ "papers": [
+ {
+ "title": "Repo Paper",
+ "url": "https://papers.cool/arxiv/1234",
+ "external_url": "https://github.com/owner/repo",
+ }
+ ],
+ "include_github_api": True,
+ },
+ )
+
+ assert resp.status_code == 200
+ payload = resp.json()
+ assert payload["matched_repos"] == 1
+ assert payload["repos"][0]["repo_url"] == "https://github.com/owner/repo"
+ assert payload["repos"][0]["github"]["stars"] == 42
+
+
+def test_paperscool_daily_route_persists_judge_scores(monkeypatch):
+ monkeypatch.setattr(paperscool_route, "PapersCoolTopicSearchWorkflow", _FakeWorkflow)
+
+ class _FakeJudgment:
+ def to_dict(self):
+ return {
+ "relevance": {"score": 5, "rationale": ""},
+ "novelty": {"score": 4, "rationale": ""},
+ "rigor": {"score": 4, "rationale": ""},
+ "impact": {"score": 4, "rationale": ""},
+ "clarity": {"score": 4, "rationale": ""},
+ "overall": 4.2,
+ "one_line_summary": "good",
+ "recommendation": "must_read",
+ "judge_model": "fake",
+ "judge_cost_tier": 1,
+ }
+
+ class _FakeJudge:
+ def __init__(self, llm_service=None):
+ pass
+
+ def judge_single(self, *, paper, query):
+ return _FakeJudgment()
+
+ def judge_with_calibration(self, *, paper, query, n_runs=1):
+ return _FakeJudgment()
+
+ monkeypatch.setattr(paperscool_route, "get_llm_service", lambda: object())
+ monkeypatch.setattr(paperscool_route, "PaperJudge", _FakeJudge)
+
+ with TestClient(api_main.app) as client:
+ resp = client.post(
+ "/api/research/paperscool/daily",
+ json={
+ "queries": ["ICL压缩"],
+ "enable_judge": True,
+ },
+ )
+
+ assert resp.status_code == 200
+ events = _parse_sse_events(resp.text)
+ result_event = next(e for e in events if e.get("type") == "result")
+ report = result_event["data"]["report"]
+ # Judge registry ingest should have been attempted
+ assert "judge_registry_ingest" in report
+
+
+class _FakeWorkflowMultiPaper:
+ """Workflow returning multiple papers for filter testing."""
+
+ def run(self, *, queries, sources, branches, top_k_per_query, show_per_branch, min_score=0.0):
+ return {
+ "source": "papers.cool",
+ "fetched_at": "2026-02-10T00:00:00+00:00",
+ "sources": sources,
+ "queries": [
+ {
+ "raw_query": queries[0],
+ "normalized_query": "icl compression",
+ "tokens": ["icl", "compression"],
+ "total_hits": 3,
+ "items": [
+ {
+ "paper_id": "p1",
+ "title": "GoodPaper",
+ "url": "https://papers.cool/venue/p1",
+ "score": 10.0,
+ "snippet": "excellent work",
+ "keywords": ["icl"],
+ "branches": branches,
+ "matched_queries": ["icl compression"],
+ },
+ {
+ "paper_id": "p2",
+ "title": "MediocreWork",
+ "url": "https://papers.cool/venue/p2",
+ "score": 5.0,
+ "snippet": "average",
+ "keywords": ["icl"],
+ "branches": branches,
+ "matched_queries": ["icl compression"],
+ },
+ {
+ "paper_id": "p3",
+ "title": "WeakPaper",
+ "url": "https://papers.cool/venue/p3",
+ "score": 2.0,
+ "snippet": "not great",
+ "keywords": ["icl"],
+ "branches": branches,
+ "matched_queries": ["icl compression"],
+ },
+ ],
+ }
+ ],
+ "items": [],
+ "summary": {
+ "unique_items": 3,
+ "total_query_hits": 3,
+ },
+ }
+
+
+def test_dailypaper_sse_filter_removes_low_papers(monkeypatch):
+ """End-to-end: judge scores papers, filter removes 'skip' and 'skim'."""
+ monkeypatch.setattr(paperscool_route, "PapersCoolTopicSearchWorkflow", _FakeWorkflowMultiPaper)
+
+ # Judge returns different recommendations per paper title
+ class _VaryingJudgment:
+ def __init__(self, title):
+ self._title = title
+
+ def to_dict(self):
+ rec_map = {
+ "GoodPaper": ("must_read", 4.5),
+ "MediocreWork": ("skim", 2.9),
+ "WeakPaper": ("skip", 1.8),
+ }
+ rec, overall = rec_map.get(self._title, ("skip", 1.0))
+ return {
+ "relevance": {"score": 4, "rationale": ""},
+ "novelty": {"score": 3, "rationale": ""},
+ "rigor": {"score": 3, "rationale": ""},
+ "impact": {"score": 3, "rationale": ""},
+ "clarity": {"score": 3, "rationale": ""},
+ "overall": overall,
+ "one_line_summary": f"summary of {self._title}",
+ "recommendation": rec,
+ "judge_model": "fake",
+ "judge_cost_tier": 1,
+ }
+
+ class _FakeJudge:
+ def __init__(self, llm_service=None):
+ pass
+
+ def judge_single(self, *, paper, query):
+ return _VaryingJudgment(paper.get("title", ""))
+
+ def judge_with_calibration(self, *, paper, query, n_runs=1):
+ return _VaryingJudgment(paper.get("title", ""))
+
+ monkeypatch.setattr(paperscool_route, "get_llm_service", lambda: object())
+ monkeypatch.setattr(paperscool_route, "PaperJudge", _FakeJudge)
+
+ with TestClient(api_main.app) as client:
+ resp = client.post(
+ "/api/research/paperscool/daily",
+ json={
+ "queries": ["ICL压缩"],
+ "enable_judge": True,
+ "judge_max_items_per_query": 10,
+ },
+ )
+
+ assert resp.status_code == 200
+ events = _parse_sse_events(resp.text)
+ types = [e.get("type") for e in events]
+
+ # All expected phases present
+ assert "judge" in types
+ assert "judge_done" in types
+ assert "filter_done" in types
+ assert "result" in types
+
+ # Check filter_done event
+ filter_event = next(e for e in events if e.get("type") == "filter_done")
+ assert filter_event["data"]["total_before"] == 3
+ assert filter_event["data"]["total_after"] == 1 # only GoodPaper kept
+ assert filter_event["data"]["removed_count"] == 2
+
+ # Check filter log has details for removed papers
+ filter_log = filter_event["data"]["log"]
+ removed_titles = {entry["title"] for entry in filter_log}
+ assert "MediocreWork" in removed_titles
+ assert "WeakPaper" in removed_titles
+ assert "GoodPaper" not in removed_titles
+
+ # Check final result only has the kept paper
+ result_event = next(e for e in events if e.get("type") == "result")
+ final_report = result_event["data"]["report"]
+ final_items = final_report["queries"][0]["top_items"]
+ assert len(final_items) == 1
+ assert final_items[0]["title"] == "GoodPaper"
+
+ # Judge log events should have all 3 papers (complete log)
+ judge_events = [e for e in events if e.get("type") == "judge"]
+ assert len(judge_events) == 3
+ judge_titles = {e["data"]["title"] for e in judge_events}
+ assert judge_titles == {"GoodPaper", "MediocreWork", "WeakPaper"}
+
+
+def test_dailypaper_sse_full_pipeline_llm_judge_filter(monkeypatch):
+ """End-to-end: LLM enrichment + Judge + Filter in one SSE stream."""
+ monkeypatch.setattr(paperscool_route, "PapersCoolTopicSearchWorkflow", _FakeWorkflowMultiPaper)
+
+ class _FakeLLMService:
+ def summarize_paper(self, *, title, abstract):
+ return f"summary of {title}"
+
+ def assess_relevance(self, *, paper, query):
+ return {"score": 4, "reason": "relevant"}
+
+ def analyze_trends(self, *, topic, papers):
+ return f"trend:{topic}:{len(papers)}"
+
+ def generate_daily_insight(self, report):
+ return "daily insight"
+
+ def complete(self, **kwargs):
+ return "{}"
+
+ def describe_task_provider(self, task_type):
+ return {"model_name": "fake", "cost_tier": 1}
+
+ class _VaryingJudgment:
+ def __init__(self, title):
+ self._title = title
+
+ def to_dict(self):
+ rec_map = {
+ "GoodPaper": ("must_read", 4.5),
+ "MediocreWork": ("worth_reading", 3.7),
+ "WeakPaper": ("skip", 1.8),
+ }
+ rec, overall = rec_map.get(self._title, ("skip", 1.0))
+ return {
+ "relevance": {"score": 4, "rationale": ""},
+ "novelty": {"score": 3, "rationale": ""},
+ "rigor": {"score": 3, "rationale": ""},
+ "impact": {"score": 3, "rationale": ""},
+ "clarity": {"score": 3, "rationale": ""},
+ "overall": overall,
+ "one_line_summary": f"summary of {self._title}",
+ "recommendation": rec,
+ "judge_model": "fake",
+ "judge_cost_tier": 1,
+ }
+
+ class _FakeJudge:
+ def __init__(self, llm_service=None):
+ pass
+
+ def judge_single(self, *, paper, query):
+ return _VaryingJudgment(paper.get("title", ""))
+
+ def judge_with_calibration(self, *, paper, query, n_runs=1):
+ return _VaryingJudgment(paper.get("title", ""))
+
+ monkeypatch.setattr(paperscool_route, "get_llm_service", lambda: _FakeLLMService())
+ monkeypatch.setattr(paperscool_route, "PaperJudge", _FakeJudge)
+
+ with TestClient(api_main.app) as client:
+ resp = client.post(
+ "/api/research/paperscool/daily",
+ json={
+ "queries": ["ICL压缩"],
+ "enable_llm_analysis": True,
+ "llm_features": ["summary", "trends"],
+ "enable_judge": True,
+ "judge_max_items_per_query": 10,
+ },
+ )
+
+ assert resp.status_code == 200
+ events = _parse_sse_events(resp.text)
+ types = [e.get("type") for e in events]
+
+ # Full pipeline phases
+ assert "search_done" in types
+ assert "report_built" in types
+ assert "llm_summary" in types
+ assert "trend" in types
+ assert "llm_done" in types
+ assert "judge" in types
+ assert "judge_done" in types
+ assert "filter_done" in types
+ assert "result" in types
+
+ # Filter keeps must_read + worth_reading, removes skip
+ filter_event = next(e for e in events if e.get("type") == "filter_done")
+ assert filter_event["data"]["total_after"] == 2 # GoodPaper + MediocreWork
+ assert filter_event["data"]["removed_count"] == 1 # WeakPaper
+
+ # Final report has 2 papers
+ result_event = next(e for e in events if e.get("type") == "result")
+ final_items = result_event["data"]["report"]["queries"][0]["top_items"]
+ assert len(final_items) == 2
+ final_titles = {item["title"] for item in final_items}
+ assert final_titles == {"GoodPaper", "MediocreWork"}
+
+ # LLM analysis present
+ assert result_event["data"]["report"]["llm_analysis"]["enabled"] is True
+
+ # Filter metadata in report
+ assert result_event["data"]["report"]["filter"]["enabled"] is True
+
+
+def test_dailypaper_sync_path_no_llm_no_judge(monkeypatch):
+ """When no LLM/Judge, endpoint returns sync JSON (not SSE)."""
+ monkeypatch.setattr(paperscool_route, "PapersCoolTopicSearchWorkflow", _FakeWorkflow)
+
+ with TestClient(api_main.app) as client:
+ resp = client.post(
+ "/api/research/paperscool/daily",
+ json={
+ "queries": ["ICL压缩"],
+ "sources": ["papers_cool"],
+ "branches": ["arxiv", "venue"],
+ },
+ )
+
+ assert resp.status_code == 200
+ # Should be JSON, not SSE
+ payload = resp.json()
+ assert "report" in payload
+ assert payload["report"]["stats"]["unique_items"] == 1
+ # No filter block in sync path
+ assert "filter" not in payload["report"]
diff --git a/tests/unit/test_research_paper_registry_routes.py b/tests/unit/test_research_paper_registry_routes.py
new file mode 100644
index 0000000..daa5fd2
--- /dev/null
+++ b/tests/unit/test_research_paper_registry_routes.py
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from fastapi.testclient import TestClient
+
+from paperbot.api import main as api_main
+from paperbot.api.routes import research as research_route
+from paperbot.infrastructure.stores.paper_store import SqlAlchemyPaperStore
+from paperbot.infrastructure.stores.research_store import SqlAlchemyResearchStore
+
+
+def _prepare_db(tmp_path: Path):
+ db_path = tmp_path / "paper-routes.db"
+ db_url = f"sqlite:///{db_path}"
+
+ paper_store = SqlAlchemyPaperStore(db_url=db_url)
+ research_store = SqlAlchemyResearchStore(db_url=db_url)
+
+ paper = paper_store.upsert_paper(
+ paper={
+ "title": "UniICL",
+ "url": "https://arxiv.org/abs/2501.12345",
+ "pdf_url": "https://arxiv.org/pdf/2501.12345.pdf",
+ }
+ )
+ track = research_store.create_track(user_id="u1", name="u1-track", activate=True)
+ research_store.add_paper_feedback(
+ user_id="u1",
+ track_id=int(track["id"]),
+ paper_id=str(paper["id"]),
+ action="save",
+ metadata={"title": "UniICL"},
+ )
+ return research_store, int(paper["id"])
+
+
+def test_saved_and_detail_routes(tmp_path, monkeypatch):
+ store, paper_id = _prepare_db(tmp_path)
+ monkeypatch.setattr(research_route, "_research_store", store)
+
+ with TestClient(api_main.app) as client:
+ saved = client.get("/api/research/papers/saved", params={"user_id": "u1"})
+ detail = client.get(f"/api/research/papers/{paper_id}", params={"user_id": "u1"})
+
+ assert saved.status_code == 200
+ assert len(saved.json()["items"]) == 1
+
+ assert detail.status_code == 200
+ payload = detail.json()["detail"]
+ assert payload["paper"]["id"] == paper_id
+ assert payload["paper"]["title"] == "UniICL"
+
+
+def test_update_status_route(tmp_path, monkeypatch):
+ store, paper_id = _prepare_db(tmp_path)
+ monkeypatch.setattr(research_route, "_research_store", store)
+
+ with TestClient(api_main.app) as client:
+ resp = client.post(
+ f"/api/research/papers/{paper_id}/status",
+ json={"user_id": "u1", "status": "reading", "mark_saved": True},
+ )
+
+ assert resp.status_code == 200
+ payload = resp.json()["status"]
+ assert payload["paper_id"] == paper_id
+ assert payload["status"] == "reading"
diff --git a/tests/unit/test_research_scholar_routes.py b/tests/unit/test_research_scholar_routes.py
new file mode 100644
index 0000000..ff678b7
--- /dev/null
+++ b/tests/unit/test_research_scholar_routes.py
@@ -0,0 +1,90 @@
+from fastapi.testclient import TestClient
+
+from paperbot.api import main as api_main
+from paperbot.api.routes import research as research_route
+
+
+class _FakeSemanticScholarClient:
+ def __init__(self, api_key=None):
+ self.api_key = api_key
+
+ async def get_author(self, author_id, fields=None):
+ return {
+ "authorId": author_id,
+ "name": "Alice",
+ "affiliations": ["Lab"],
+ "paperCount": 10,
+ "citationCount": 100,
+ "hIndex": 12,
+ }
+
+ async def get_author_papers(self, author_id, limit=100, fields=None):
+ return [
+ {
+ "title": "Paper A",
+ "year": 2025,
+ "citationCount": 10,
+ "venue": "NeurIPS",
+ "fieldsOfStudy": ["Machine Learning"],
+ "authors": [
+ {"authorId": author_id, "name": "Alice"},
+ {"authorId": "c1", "name": "Bob"},
+ ],
+ },
+ {
+ "title": "Paper B",
+ "year": 2024,
+ "citationCount": 4,
+ "venue": "ICLR",
+ "fieldsOfStudy": ["Machine Learning", "Optimization"],
+ "authors": [
+ {"authorId": author_id, "name": "Alice"},
+ {"authorId": "c1", "name": "Bob"},
+ {"authorId": "c2", "name": "Carol"},
+ ],
+ },
+ ]
+
+ async def close(self):
+ return None
+
+
+def test_scholar_network_route(monkeypatch):
+ monkeypatch.setattr(research_route, "SemanticScholarClient", _FakeSemanticScholarClient)
+
+ with TestClient(api_main.app) as client:
+ resp = client.post(
+ "/api/research/scholar/network",
+ json={
+ "scholar_id": "s1",
+ "max_papers": 20,
+ "recent_years": 10,
+ "max_nodes": 10,
+ },
+ )
+
+ assert resp.status_code == 200
+ payload = resp.json()
+ assert payload["scholar"]["name"] == "Alice"
+ assert payload["stats"]["coauthor_count"] == 2
+ assert len(payload["edges"]) == 2
+
+
+def test_scholar_trends_route(monkeypatch):
+ monkeypatch.setattr(research_route, "SemanticScholarClient", _FakeSemanticScholarClient)
+
+ with TestClient(api_main.app) as client:
+ resp = client.post(
+ "/api/research/scholar/trends",
+ json={
+ "scholar_id": "s1",
+ "max_papers": 20,
+ "year_window": 10,
+ },
+ )
+
+ assert resp.status_code == 200
+ payload = resp.json()
+ assert payload["scholar"]["name"] == "Alice"
+ assert len(payload["publication_velocity"]) >= 1
+ assert payload["trend_summary"]["publication_trend"] in {"up", "down", "flat"}
diff --git a/web/src/app/api/newsletter/subscribe/route.ts b/web/src/app/api/newsletter/subscribe/route.ts
new file mode 100644
index 0000000..db7f679
--- /dev/null
+++ b/web/src/app/api/newsletter/subscribe/route.ts
@@ -0,0 +1,7 @@
+export const runtime = "nodejs"
+
+import { apiBaseUrl, proxyJson } from "../../research/_base"
+
+export async function POST(req: Request) {
+ return proxyJson(req, `${apiBaseUrl()}/api/newsletter/subscribe`, "POST")
+}
diff --git a/web/src/app/api/newsletter/subscribers/route.ts b/web/src/app/api/newsletter/subscribers/route.ts
new file mode 100644
index 0000000..e96390b
--- /dev/null
+++ b/web/src/app/api/newsletter/subscribers/route.ts
@@ -0,0 +1,7 @@
+export const runtime = "nodejs"
+
+import { apiBaseUrl, proxyJson } from "../../research/_base"
+
+export async function GET(req: Request) {
+ return proxyJson(req, `${apiBaseUrl()}/api/newsletter/subscribers`, "GET")
+}
diff --git a/web/src/app/api/newsletter/unsubscribe/[token]/route.ts b/web/src/app/api/newsletter/unsubscribe/[token]/route.ts
new file mode 100644
index 0000000..c644c2b
--- /dev/null
+++ b/web/src/app/api/newsletter/unsubscribe/[token]/route.ts
@@ -0,0 +1,29 @@
+export const runtime = "nodejs"
+
+import { apiBaseUrl } from "../../../research/_base"
+
+export async function GET(
+ _req: Request,
+ { params }: { params: Promise<{ token: string }> },
+) {
+ const { token } = await params
+ try {
+ const upstream = await fetch(
+ `${apiBaseUrl()}/api/newsletter/unsubscribe/${encodeURIComponent(token)}`,
+ )
+ const text = await upstream.text()
+ return new Response(text, {
+ status: upstream.status,
+ headers: {
+ "Content-Type": upstream.headers.get("content-type") || "text/html",
+ },
+ })
+ } catch (error) {
+ const detail = error instanceof Error ? error.message : String(error)
+ const escaped = detail.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;")
+ return new Response(
+ `<html><body><p>Error: ${escaped}</p></body></html>`,
+ { status: 502, headers: { "Content-Type": "text/html" } },
+ )
+ }
+}
diff --git a/web/src/app/api/research/_base.ts b/web/src/app/api/research/_base.ts
index d414b55..d0fdde7 100644
--- a/web/src/app/api/research/_base.ts
+++ b/web/src/app/api/research/_base.ts
@@ -4,21 +4,32 @@ export function apiBaseUrl() {
export async function proxyJson(req: Request, upstreamUrl: string, method: string) {
const body = method === "GET" ? undefined : await req.text()
- const upstream = await fetch(upstreamUrl, {
- method,
- headers: {
- Accept: "application/json",
- "Content-Type": req.headers.get("content-type") || "application/json",
- },
- body,
- })
- const text = await upstream.text()
- return new Response(text, {
- status: upstream.status,
- headers: {
- "Content-Type": upstream.headers.get("content-type") || "application/json",
- "Cache-Control": "no-cache",
- },
- })
-}
+ try {
+ const upstream = await fetch(upstreamUrl, {
+ method,
+ headers: {
+ Accept: "application/json",
+ "Content-Type": req.headers.get("content-type") || "application/json",
+ },
+ body,
+ })
+ const text = await upstream.text()
+ return new Response(text, {
+ status: upstream.status,
+ headers: {
+ "Content-Type": upstream.headers.get("content-type") || "application/json",
+ "Cache-Control": "no-cache",
+ },
+ })
+ } catch (error) {
+ const detail = error instanceof Error ? error.message : String(error)
+ return Response.json(
+ {
+ detail: `Upstream API unreachable: ${upstreamUrl}`,
+ error: detail,
+ },
+ { status: 502 },
+ )
+ }
+}
diff --git a/web/src/app/api/research/papers/[paperId]/route.ts b/web/src/app/api/research/papers/[paperId]/route.ts
new file mode 100644
index 0000000..565b86d
--- /dev/null
+++ b/web/src/app/api/research/papers/[paperId]/route.ts
@@ -0,0 +1,16 @@
+export const runtime = "nodejs"
+
+import { apiBaseUrl, proxyJson } from "../../_base"
+
+export async function GET(
+ req: Request,
+ { params }: { params: Promise<{ paperId: string }> },
+) {
+ const { paperId } = await params
+ const url = new URL(req.url)
+ return proxyJson(
+ req,
+ `${apiBaseUrl()}/api/research/papers/${encodeURIComponent(paperId)}?${url.searchParams.toString()}`,
+ "GET",
+ )
+}
diff --git a/web/src/app/api/research/papers/[paperId]/status/route.ts b/web/src/app/api/research/papers/[paperId]/status/route.ts
new file mode 100644
index 0000000..b8c340e
--- /dev/null
+++ b/web/src/app/api/research/papers/[paperId]/status/route.ts
@@ -0,0 +1,11 @@
+export const runtime = "nodejs"
+
+import { apiBaseUrl, proxyJson } from "../../../_base"
+
+export async function POST(
+ req: Request,
+ { params }: { params: Promise<{ paperId: string }> },
+) {
+ const { paperId } = await params
+ return proxyJson(req, `${apiBaseUrl()}/api/research/papers/${encodeURIComponent(paperId)}/status`, "POST")
+}
diff --git a/web/src/app/api/research/papers/saved/route.ts b/web/src/app/api/research/papers/saved/route.ts
new file mode 100644
index 0000000..d3446de
--- /dev/null
+++ b/web/src/app/api/research/papers/saved/route.ts
@@ -0,0 +1,12 @@
+export const runtime = "nodejs"
+
+import { apiBaseUrl, proxyJson } from "../../_base"
+
+export async function GET(req: Request) {
+ const url = new URL(req.url)
+ return proxyJson(
+ req,
+ `${apiBaseUrl()}/api/research/papers/saved?${url.searchParams.toString()}`,
+ "GET",
+ )
+}
diff --git a/web/src/app/api/research/paperscool/daily/route.ts b/web/src/app/api/research/paperscool/daily/route.ts
index a59a5f4..f38c39b 100644
--- a/web/src/app/api/research/paperscool/daily/route.ts
+++ b/web/src/app/api/research/paperscool/daily/route.ts
@@ -1,7 +1,50 @@
export const runtime = "nodejs"
-import { apiBaseUrl, proxyJson } from "../../_base"
+import { apiBaseUrl } from "../../_base"
export async function POST(req: Request) {
- return proxyJson(req, `${apiBaseUrl()}/api/research/paperscool/daily`, "POST")
+ const body = await req.text()
+ const contentType = req.headers.get("content-type") || "application/json"
+
+ let upstream: Response
+ try {
+ upstream = await fetch(`${apiBaseUrl()}/api/research/paperscool/daily`, {
+ method: "POST",
+ headers: {
+ "Content-Type": contentType,
+ Accept: "text/event-stream, application/json",
+ },
+ body,
+ })
+ } catch (error) {
+ const detail = error instanceof Error ? error.message : String(error)
+ return Response.json(
+ { detail: "Upstream API unreachable", error: detail },
+ { status: 502 },
+ )
+ }
+
+ const upstreamContentType = upstream.headers.get("content-type") || ""
+
+ // SSE stream path — pipe through without buffering
+ if (upstreamContentType.includes("text/event-stream")) {
+ return new Response(upstream.body, {
+ status: upstream.status,
+ headers: {
+ "Content-Type": "text/event-stream",
+ "Cache-Control": "no-cache",
+ Connection: "keep-alive",
+ },
+ })
+ }
+
+ // JSON fallback (fast path when no LLM/Judge)
+ const text = await upstream.text()
+ return new Response(text, {
+ status: upstream.status,
+ headers: {
+ "Content-Type": upstreamContentType || "application/json",
+ "Cache-Control": "no-cache",
+ },
+ })
}
diff --git a/web/src/app/api/research/paperscool/repos/route.ts b/web/src/app/api/research/paperscool/repos/route.ts
new file mode 100644
index 0000000..5380050
--- /dev/null
+++ b/web/src/app/api/research/paperscool/repos/route.ts
@@ -0,0 +1,7 @@
+export const runtime = "nodejs"
+
+import { apiBaseUrl, proxyJson } from "../../_base"
+
+export async function POST(req: Request) {
+ return proxyJson(req, `${apiBaseUrl()}/api/research/paperscool/repos`, "POST")
+}
diff --git a/web/src/app/api/research/scholar/network/route.ts b/web/src/app/api/research/scholar/network/route.ts
new file mode 100644
index 0000000..203c211
--- /dev/null
+++ b/web/src/app/api/research/scholar/network/route.ts
@@ -0,0 +1,7 @@
+export const runtime = "nodejs"
+
+import { apiBaseUrl, proxyJson } from "../../_base"
+
+export async function POST(req: Request) {
+ return proxyJson(req, `${apiBaseUrl()}/api/research/scholar/network`, "POST")
+}
diff --git a/web/src/app/api/research/scholar/trends/route.ts b/web/src/app/api/research/scholar/trends/route.ts
new file mode 100644
index 0000000..737da1b
--- /dev/null
+++ b/web/src/app/api/research/scholar/trends/route.ts
@@ -0,0 +1,7 @@
+export const runtime = "nodejs"
+
+import { apiBaseUrl, proxyJson } from "../../_base"
+
+export async function POST(req: Request) {
+ return proxyJson(req, `${apiBaseUrl()}/api/research/scholar/trends`, "POST")
+}
diff --git a/web/src/app/scholars/[id]/page.tsx b/web/src/app/scholars/[id]/page.tsx
index 4129c99..48907b9 100644
--- a/web/src/app/scholars/[id]/page.tsx
+++ b/web/src/app/scholars/[id]/page.tsx
@@ -28,7 +28,14 @@ export default async function ScholarProfilePage({ params }: { params: Promise<{
{scholar.affiliation}
{scholar.location}
-
+ {scholar.website ? (
+ <a href={scholar.website}>Website</a>
+ ) : (
+ <span>Website N/A</span>
+ )}
diff --git a/web/src/components/research/TopicWorkflowDashboard.tsx b/web/src/components/research/TopicWorkflowDashboard.tsx
index 7ed5297..b030bcb 100644
--- a/web/src/components/research/TopicWorkflowDashboard.tsx
+++ b/web/src/components/research/TopicWorkflowDashboard.tsx
@@ -1,6 +1,6 @@
"use client"
-import { useMemo, useState } from "react"
+import { useCallback, useEffect, useMemo, useRef, useState } from "react"
import Markdown from "react-markdown"
import remarkGfm from "remark-gfm"
import {
@@ -9,6 +9,7 @@ import {
ChevronRightIcon,
FilterIcon,
Loader2Icon,
+ MailIcon,
PlusIcon,
PlayIcon,
SettingsIcon,
@@ -79,6 +80,20 @@ type SearchItem = {
judge?: JudgeResult
}
+type RepoRow = {
+ title: string
+ query?: string
+ paper_url?: string
+ repo_url: string
+ github?: {
+ ok?: boolean
+ stars?: number
+ language?: string
+ updated_at?: string
+ error?: string
+ }
+}
+
type StepStatus = "pending" | "running" | "done" | "error" | "skipped"
/* ── Helpers ──────────────────────────────────────────── */
@@ -205,6 +220,108 @@ function buildDagStatuses(args: {
return statuses
}
+/* ── Stream Progress ─────────────────────────────────── */
+
+type StreamPhase = "idle" | "search" | "build" | "llm" | "insight" | "judge" | "filter" | "save" | "notify" | "done" | "error"
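+// Each phase roughly maps to a backend SSE event observed in the stream
+// (e.g. search_done, report_built, llm_done, judge, filter_done, result).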
+
+const PHASE_LABELS: Record<StreamPhase, string> = {
+ idle: "Idle",
+ search: "Searching papers",
+ build: "Building report",
+ llm: "LLM enrichment",
+ insight: "Generating insights",
+ judge: "Judge scoring",
+ filter: "Filtering papers",
+ save: "Saving",
+ notify: "Sending notifications",
+ done: "Done",
+ error: "Error",
+}
+
+const PHASE_ORDER: StreamPhase[] = ["search", "build", "llm", "insight", "judge", "filter", "save", "notify", "done"]
+
+function useElapsed(startTime: number | null) {
+ const [elapsed, setElapsed] = useState(0)
+ useEffect(() => {
+ if (!startTime) { setElapsed(0); return }
+ setElapsed(Math.round((Date.now() - startTime) / 1000))
+ const id = setInterval(() => setElapsed(Math.round((Date.now() - startTime) / 1000)), 1000)
+ return () => clearInterval(id)
+ }, [startTime])
+ return elapsed
+}
+
+function StreamProgressCard({
+ streamPhase,
+ streamLog,
+ streamProgress,
+ startTime,
+}: {
+ streamPhase: StreamPhase
+ streamLog: string[]
+ streamProgress: { done: number; total: number }
+ startTime: number | null
+}) {
+ const elapsed = useElapsed(startTime)
+ const currentIdx = PHASE_ORDER.indexOf(streamPhase)
+ const pct = streamProgress.total > 0
+ ? Math.round((streamProgress.done / streamProgress.total) * 100)
+ : currentIdx >= 0
+ ? Math.round(((currentIdx + 0.5) / PHASE_ORDER.length) * 100)
+ : 0
+
+ return (
+
+
+
+
+
+ {PHASE_LABELS[streamPhase] || streamPhase}
+
+
+ {streamProgress.total > 0 && (
+ {streamProgress.done}/{streamProgress.total}
+ )}
+ {elapsed > 0 && {elapsed}s }
+
+
+
+
+ {PHASE_ORDER.slice(0, -1).map((p) => {
+ const idx = PHASE_ORDER.indexOf(p)
+ const status = idx < currentIdx ? "done" : idx === currentIdx ? "active" : "pending"
+ return (
+
+
+
+ {PHASE_LABELS[p]}
+
+
+ )
+ })}
+
+ {streamLog.length > 0 && (
+
+
+ {streamLog.slice(-20).map((line, idx) => (
+
{line}
+ ))}
+
+
+ )}
+
+
+ )
+}
+
/* ── Paper Card ───────────────────────────────────────── */
function PaperCard({ item, query, onOpenDetail }: { item: SearchItem; query?: string; onOpenDetail: (item: SearchItem) => void }) {
@@ -341,6 +458,7 @@ function ConfigSheetBody(props: {
useVenue: boolean; setUseVenue: (v: boolean) => void
usePapersCool: boolean; setUsePapersCool: (v: boolean) => void
useArxivApi: boolean; setUseArxivApi: (v: boolean) => void
+ useHFDaily: boolean; setUseHFDaily: (v: boolean) => void
enableLLM: boolean; setEnableLLM: (v: boolean) => void
useSummary: boolean; setUseSummary: (v: boolean) => void
useTrends: boolean; setUseTrends: (v: boolean) => void
@@ -350,16 +468,21 @@ function ConfigSheetBody(props: {
judgeRuns: number; setJudgeRuns: (v: number) => void
judgeMaxItems: number; setJudgeMaxItems: (v: number) => void
judgeTokenBudget: number; setJudgeTokenBudget: (v: number) => void
+ notifyEmail: string; setNotifyEmail: (v: string) => void
+ notifyEnabled: boolean; setNotifyEnabled: (v: boolean) => void
+ resendEnabled: boolean; setResendEnabled: (v: boolean) => void
}) {
const {
queryItems, setQueryItems, topK, setTopK, topN, setTopN,
showPerBranch, setShowPerBranch, saveDaily, setSaveDaily,
outputDir, setOutputDir, useArxiv, setUseArxiv, useVenue, setUseVenue,
- usePapersCool, setUsePapersCool, useArxivApi, setUseArxivApi, enableLLM, setEnableLLM,
+ usePapersCool, setUsePapersCool, useArxivApi, setUseArxivApi, useHFDaily, setUseHFDaily, enableLLM, setEnableLLM,
useSummary, setUseSummary, useTrends, setUseTrends,
useInsight, setUseInsight, useRelevance, setUseRelevance,
enableJudge, setEnableJudge, judgeRuns, setJudgeRuns,
judgeMaxItems, setJudgeMaxItems, judgeTokenBudget, setJudgeTokenBudget,
+ notifyEmail, setNotifyEmail, notifyEnabled, setNotifyEnabled,
+ resendEnabled, setResendEnabled,
} = props
const updateQuery = (idx: number, value: string) => {
@@ -404,6 +527,7 @@ function ConfigSheetBody(props: {
setUsePapersCool(Boolean(v))} /> papers.cool
setUseArxivApi(Boolean(v))} /> arXiv API
+ setUseHFDaily(Boolean(v))} /> HF Daily
setUseArxiv(Boolean(v))} /> arxiv
@@ -450,11 +574,115 @@ function ConfigSheetBody(props: {
{enableJudge && (
)}
+
+
+
+
+
+ setNotifyEnabled(Boolean(v))} />
+ Email Notification
+
+ {notifyEnabled && (
+
+
+ Email Address
+ setNotifyEmail(e.target.value)}
+ placeholder="you@example.com"
+ className="h-8 text-sm"
+ />
+
+
+ Requires PAPERBOT_NOTIFY_SMTP_* env vars on the backend. The email address here overrides PAPERBOT_NOTIFY_EMAIL_TO.
+
+
+ )}
+
+
+
+
+ setResendEnabled(Boolean(v))} />
+ Newsletter (Resend)
+
+ {resendEnabled && (
+
+
+ Send digest to all newsletter subscribers via Resend API. Requires PAPERBOT_RESEND_API_KEY env var.
+
+
+
+ )}
+
+
+ )
+}
+
+/* ── Newsletter Subscribe Widget ─────────────────────── */
+
+function NewsletterSubscribeWidget() {
+ const [email, setEmail] = useState("")
+ const [status, setStatus] = useState<"idle" | "loading" | "ok" | "error">("idle")
+ const [message, setMessage] = useState("")
+ const [subCount, setSubCount] = useState<{ active: number; total: number } | null>(null)
+
+ const fetchCount = useCallback(async () => {
+ try {
+ const res = await fetch("/api/newsletter/subscribers")
+ if (res.ok) setSubCount(await res.json())
+ } catch { /* ignore */ }
+ }, [])
+
+ useEffect(() => { fetchCount() }, [fetchCount])
+
+ async function handleSubscribe() {
+ if (!email.trim()) return
+ setStatus("loading"); setMessage("")
+ try {
+ const res = await fetch("/api/newsletter/subscribe", {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({ email: email.trim() }),
+ })
+ const data = await res.json()
+ if (res.ok) {
+ setStatus("ok"); setMessage(data.message || "Subscribed!"); setEmail("")
+ fetchCount()
+ } else {
+ setStatus("error"); setMessage(data.detail || "Failed to subscribe")
+ }
+ } catch (err) {
+ setStatus("error"); setMessage(String(err))
+ }
+ }
+
+ return (
+
+
+ { setEmail(e.target.value); setStatus("idle") }}
+ placeholder="subscriber@example.com"
+ className="h-8 text-sm"
+ onKeyDown={(e) => e.key === "Enter" && handleSubscribe()}
+ />
+
+          {status === "loading" ? <Loader2Icon className="h-4 w-4 animate-spin" /> : "Subscribe"}
+
+
+ {message && (
+
{message}
+ )}
+ {subCount && (
+
{subCount.active} active subscriber{subCount.active !== 1 ? "s" : ""}
+ )}
)
}
@@ -462,43 +690,76 @@ function ConfigSheetBody(props: {
/* ── Main Dashboard ───────────────────────────────────── */
export default function TopicWorkflowDashboard() {
- /* Config state (local) */
+ /* Config state (local — queries only) */
const [queryItems, setQueryItems] = useState([...DEFAULT_QUERIES])
- const [topK, setTopK] = useState(5)
- const [topN, setTopN] = useState(10)
- const [showPerBranch, setShowPerBranch] = useState(25)
- const [saveDaily, setSaveDaily] = useState(false)
- const [outputDir, setOutputDir] = useState("./reports/dailypaper")
- const [useArxiv, setUseArxiv] = useState(true)
- const [useVenue, setUseVenue] = useState(true)
- const [usePapersCool, setUsePapersCool] = useState(true)
- const [useArxivApi, setUseArxivApi] = useState(false)
- const persistedDailyResult = useWorkflowStore.getState().dailyResult
- const [enableLLM, setEnableLLM] = useState(
- () => Boolean(persistedDailyResult?.report?.llm_analysis?.enabled),
- )
- const [useSummary, setUseSummary] = useState(true)
- const [useTrends, setUseTrends] = useState(true)
- const [useInsight, setUseInsight] = useState(true)
- const [useRelevance, setUseRelevance] = useState(false)
- const [enableJudge, setEnableJudge] = useState(
- () => Boolean(persistedDailyResult?.report?.judge?.enabled),
- )
- const [judgeRuns, setJudgeRuns] = useState(1)
- const [judgeMaxItems, setJudgeMaxItems] = useState(5)
- const [judgeTokenBudget, setJudgeTokenBudget] = useState(0)
/* Persisted state (zustand) */
const store = useWorkflowStore()
- const { searchResult, dailyResult, phase, analyzeLog } = store
+ const { searchResult, dailyResult, phase, analyzeLog, notifyEmail, notifyEnabled, config } = store
+ const resendEnabled = store.resendEnabled
+ const uc = store.updateConfig
+
+ /* Derived config accessors — read from persisted store */
+ const topK = config.topK
+ const setTopK = (v: number) => uc({ topK: v })
+ const topN = config.topN
+ const setTopN = (v: number) => uc({ topN: v })
+ const showPerBranch = config.showPerBranch
+ const setShowPerBranch = (v: number) => uc({ showPerBranch: v })
+ const saveDaily = config.saveDaily
+ const setSaveDaily = (v: boolean) => uc({ saveDaily: v })
+ const outputDir = config.outputDir
+ const setOutputDir = (v: string) => uc({ outputDir: v })
+ const useArxiv = config.useArxiv
+ const setUseArxiv = (v: boolean) => uc({ useArxiv: v })
+ const useVenue = config.useVenue
+ const setUseVenue = (v: boolean) => uc({ useVenue: v })
+ const usePapersCool = config.usePapersCool
+ const setUsePapersCool = (v: boolean) => uc({ usePapersCool: v })
+ const useArxivApi = config.useArxivApi
+ const setUseArxivApi = (v: boolean) => uc({ useArxivApi: v })
+ const useHFDaily = config.useHFDaily
+ const setUseHFDaily = (v: boolean) => uc({ useHFDaily: v })
+ const enableLLM = config.enableLLM
+ const setEnableLLM = (v: boolean) => uc({ enableLLM: v })
+ const useSummary = config.useSummary
+ const setUseSummary = (v: boolean) => uc({ useSummary: v })
+ const useTrends = config.useTrends
+ const setUseTrends = (v: boolean) => uc({ useTrends: v })
+ const useInsight = config.useInsight
+ const setUseInsight = (v: boolean) => uc({ useInsight: v })
+ const useRelevance = config.useRelevance
+ const setUseRelevance = (v: boolean) => uc({ useRelevance: v })
+ const enableJudge = config.enableJudge
+ const setEnableJudge = (v: boolean) => uc({ enableJudge: v })
+ const judgeRuns = config.judgeRuns
+ const setJudgeRuns = (v: number) => uc({ judgeRuns: v })
+ const judgeMaxItems = config.judgeMaxItems
+ const setJudgeMaxItems = (v: number) => uc({ judgeMaxItems: v })
+ const judgeTokenBudget = config.judgeTokenBudget
+ const setJudgeTokenBudget = (v: number) => uc({ judgeTokenBudget: v })
/* Transient loading state (not persisted) */
const [loadingSearch, setLoadingSearch] = useState(false)
const [loadingDaily, setLoadingDaily] = useState(false)
const [loadingAnalyze, setLoadingAnalyze] = useState(false)
const [analyzeProgress, setAnalyzeProgress] = useState({ done: 0, total: 0 })
+ const [loadingRepos, setLoadingRepos] = useState(false)
+  const [repoRows, setRepoRows] = useState<RepoRow[]>([])
+  const [repoError, setRepoError] = useState<string | null>(null)
const [error, setError] = useState(null)
+ /* Stream progress state */
+  const [streamPhase, setStreamPhase] = useState<StreamPhase>("idle")
+  const [streamLog, setStreamLog] = useState<string[]>([])
+ const [streamProgress, setStreamProgress] = useState({ done: 0, total: 0 })
+  const streamStartRef = useRef<number | null>(null)
+  const streamAbortRef = useRef<AbortController | null>(null)
+
+ const addStreamLog = useCallback((line: string) => {
+ setStreamLog((prev) => [...prev.slice(-50), line])
+ }, [])
+
/* UI state */
const [dagOpen, setDagOpen] = useState(false)
const [selectedPaper, setSelectedPaper] = useState(null)
@@ -506,7 +767,14 @@ export default function TopicWorkflowDashboard() {
const queries = useMemo(() => queryItems.map((q) => q.trim()).filter(Boolean), [queryItems])
const branches = useMemo(() => [useArxiv ? "arxiv" : "", useVenue ? "venue" : ""].filter(Boolean), [useArxiv, useVenue])
- const sources = useMemo(() => [usePapersCool ? "papers_cool" : "", useArxivApi ? "arxiv_api" : ""].filter(Boolean), [usePapersCool, useArxivApi])
+ const sources = useMemo(
+ () => [
+ usePapersCool ? "papers_cool" : "",
+ useArxivApi ? "arxiv_api" : "",
+ useHFDaily ? "hf_daily" : "",
+ ].filter(Boolean),
+ [usePapersCool, useArxivApi, useHFDaily],
+ )
const llmFeatures = useMemo(
() => [useSummary ? "summary" : "", useTrends ? "trends" : "", useInsight ? "insight" : "", useRelevance ? "relevance" : ""].filter(Boolean),
[useInsight, useRelevance, useSummary, useTrends],
@@ -542,6 +810,8 @@ export default function TopicWorkflowDashboard() {
[phase, error, enableLLM, enableJudge, hasSearchData, hasReportData, hasLLMData, hasJudgeData, schedulerDone],
)
+ const paperDataSource = dailyResult?.report?.queries ? "dailypaper" : searchResult?.items ? "search" : null
+
const allPapers = useMemo(() => {
const items: Array = []
if (dailyResult?.report?.queries) {
@@ -603,6 +873,7 @@ export default function TopicWorkflowDashboard() {
/* Actions */
async function runTopicSearch() {
setLoadingSearch(true); setError(null); store.setPhase("searching")
+ store.setDailyResult(null); store.clearAnalyzeLog()
try {
const res = await fetch("/api/research/paperscool/search", {
method: "POST", headers: { "Content-Type": "application/json" },
@@ -615,23 +886,267 @@ export default function TopicWorkflowDashboard() {
} catch (err) { setError(String(err)); store.setPhase("error") } finally { setLoadingSearch(false) }
}
- async function runDailyPaper() {
- setLoadingDaily(true); setError(null); store.setPhase("reporting")
+ async function runDailyPaperStream() {
+ streamAbortRef.current?.abort()
+ const controller = new AbortController()
+ streamAbortRef.current = controller
+ setLoadingDaily(true); setError(null); setRepoRows([]); setRepoError(null)
+ store.setPhase("reporting"); store.clearAnalyzeLog()
+ setStreamPhase("search"); setStreamLog([]); setStreamProgress({ done: 0, total: 0 })
+ streamStartRef.current = Date.now()
+
+ const requestBody = {
+ queries, sources, branches, top_k_per_query: topK, show_per_branch: showPerBranch, top_n: topN,
+ title: "DailyPaper Digest", formats: ["both"], save: saveDaily, output_dir: outputDir,
+ enable_llm_analysis: enableLLM, llm_features: llmFeatures,
+ enable_judge: enableJudge, judge_runs: judgeRuns,
+ judge_max_items_per_query: judgeMaxItems, judge_token_budget: judgeTokenBudget,
+ notify: notifyEnabled || resendEnabled,
+ notify_channels: [...(notifyEnabled ? ["email"] : []), ...(resendEnabled ? ["resend"] : [])],
+ notify_email_to: notifyEnabled && notifyEmail.trim() ? [notifyEmail.trim()] : [],
+ }
+
+ let streamFailed = false
try {
const res = await fetch("/api/research/paperscool/daily", {
- method: "POST", headers: { "Content-Type": "application/json" },
- body: JSON.stringify({
- queries, sources, branches, top_k_per_query: topK, show_per_branch: showPerBranch, top_n: topN,
- title: "DailyPaper Digest", formats: ["both"], save: saveDaily, output_dir: outputDir,
- enable_llm_analysis: enableLLM, llm_features: llmFeatures,
- enable_judge: enableJudge, judge_runs: judgeRuns,
- judge_max_items_per_query: judgeMaxItems, judge_token_budget: judgeTokenBudget,
- }),
+ method: "POST",
+ headers: { "Content-Type": "application/json", Accept: "text/event-stream, application/json" },
+ body: JSON.stringify(requestBody),
+ signal: controller.signal,
})
if (!res.ok) throw new Error(await res.text())
- store.setDailyResult(await res.json())
- store.setPhase("reported")
- } catch (err) { setError(String(err)); store.setPhase("error") } finally { setLoadingDaily(false) }
+
+ const contentType = res.headers.get("content-type") || ""
+
+ // JSON fallback (fast path — no LLM/Judge)
+ if (!contentType.includes("text/event-stream")) {
+ const data = await res.json()
+ store.setDailyResult(data)
+ store.setPhase("reported")
+ setStreamPhase("done")
+ return
+ }
+
+ // SSE streaming path
+ if (!res.body) throw new Error("No response body for SSE stream")
+
+ for await (const event of readSSE(res.body)) {
+ if (event.type === "progress") {
+ const d = (event.data || {}) as { phase?: string; message?: string; total?: number }
+ const p = (d.phase || "search") as StreamPhase
+ setStreamPhase(p)
+ addStreamLog(`[${p}] ${d.message || "running"}`)
+ if (d.total && d.total > 0) {
+ setStreamProgress({ done: 0, total: d.total })
+ }
+ continue
+ }
+
+ if (event.type === "search_done") {
+ const d = (event.data || {}) as { items_count?: number; unique_items?: number }
+ addStreamLog(`search done: ${d.unique_items || 0} unique papers`)
+ setStreamPhase("build")
+ continue
+ }
+
+ if (event.type === "report_built") {
+ const d = (event.data || {}) as { report?: DailyResult["report"]; queries_count?: number; global_top_count?: number }
+ addStreamLog(`report built: ${d.queries_count || 0} queries, ${d.global_top_count || 0} global top`)
+ if (d.report) {
+ store.setDailyResult({ report: d.report, markdown: "" })
+ }
+ continue
+ }
+
+ if (event.type === "llm_summary") {
+ const d = (event.data || {}) as { title?: string; query?: string; ai_summary?: string; done?: number; total?: number }
+ setStreamProgress({ done: d.done || 0, total: d.total || 0 })
+ addStreamLog(`summary ${d.done || 0}/${d.total || 0}: ${d.title || "paper"}`)
+ if (d.query && d.title && d.ai_summary) {
+ store.updateDailyResult((prev) => {
+ const nextQueries = (prev.report.queries || []).map((query) => {
+ const queryName = query.normalized_query || query.raw_query || ""
+ if (queryName !== d.query) return query
+ const nextItems = (query.top_items || []).map((item) => {
+ if (item.title === d.title) return { ...item, ai_summary: d.ai_summary }
+ return item
+ })
+ return { ...query, top_items: nextItems }
+ })
+ return { ...prev, report: { ...prev.report, queries: nextQueries } }
+ })
+ }
+ continue
+ }
+
+ if (event.type === "trend") {
+ const d = (event.data || {}) as { query?: string; analysis?: string; done?: number; total?: number }
+ addStreamLog(`trend ${d.done || 0}/${d.total || 0}: ${d.query || "query"}`)
+ if (d.query && typeof d.analysis === "string") {
+ store.updateDailyResult((prev) => {
+ const llmAnalysis = prev.report.llm_analysis || { enabled: true, features: [], daily_insight: "", query_trends: [] }
+ const features = new Set(llmAnalysis.features || [])
+ features.add("trends")
+ const trendList = [...(llmAnalysis.query_trends || [])]
+ const existingIndex = trendList.findIndex((item) => item.query === d.query)
+ if (existingIndex >= 0) {
+ trendList[existingIndex] = { query: d.query!, analysis: d.analysis! }
+ } else {
+ trendList.push({ query: d.query!, analysis: d.analysis! })
+ }
+ return {
+ ...prev,
+ report: {
+ ...prev.report,
+ llm_analysis: { ...llmAnalysis, enabled: true, features: Array.from(features), query_trends: trendList },
+ },
+ }
+ })
+ }
+ continue
+ }
+
+ if (event.type === "insight") {
+ const d = (event.data || {}) as { analysis?: string }
+ addStreamLog("insight generated")
+ if (typeof d.analysis === "string") {
+ store.updateDailyResult((prev) => {
+ const llmAnalysis = prev.report.llm_analysis || { enabled: true, features: [], daily_insight: "", query_trends: [] }
+ const features = new Set(llmAnalysis.features || [])
+ features.add("insight")
+ return {
+ ...prev,
+ report: {
+ ...prev.report,
+ llm_analysis: { ...llmAnalysis, enabled: true, features: Array.from(features), daily_insight: d.analysis! },
+ },
+ }
+ })
+ }
+ continue
+ }
+
+ if (event.type === "llm_done") {
+ const d = (event.data || {}) as { summaries_count?: number; trends_count?: number }
+ addStreamLog(`LLM done: ${d.summaries_count || 0} summaries, ${d.trends_count || 0} trends`)
+ setStreamPhase("judge")
+ continue
+ }
+
+ if (event.type === "judge") {
+ const d = (event.data || {}) as { query?: string; title?: string; judge?: SearchItem["judge"]; done?: number; total?: number }
+ setStreamProgress({ done: d.done || 0, total: d.total || 0 })
+ setStreamPhase("judge")
+ const rec = d.judge?.recommendation || "?"
+ const overall = d.judge?.overall != null ? Number(d.judge.overall).toFixed(2) : "?"
+ addStreamLog(`judge ${d.done || 0}/${d.total || 0}: [${rec} ${overall}] ${d.title || "paper"} (${d.query || ""})`)
+ // TODO: refactor judge update — current nested map + matched flag is hard
+ // to follow. Use findIndex to locate target query+item, then apply a
+ // single immutable update. See PR #25 review for suggested approach.
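+          // A minimal sketch of that single-pass shape (hypothetical, not wired in;
+          // it assumes the same normalized_query/title matching used below and drops
+          // the cross-query fallback, which could be layered back on if still needed):
+          //
+          //   const queriesList = prev.report.queries || []
+          //   const qIdx = queriesList.findIndex(
+          //     (q) => (q.normalized_query || q.raw_query || "") === d.query,
+          //   )
+          //   const itemsList = qIdx >= 0 ? queriesList[qIdx].top_items || [] : []
+          //   const iIdx = itemsList.findIndex((it) => it.title === d.title)
+          //   if (qIdx < 0 || iIdx < 0) return prev
+          //   const nextItems = [...itemsList]
+          //   nextItems[iIdx] = { ...nextItems[iIdx], judge: d.judge }
+          //   const nextQueries = [...queriesList]
+          //   nextQueries[qIdx] = { ...queriesList[qIdx], top_items: nextItems }
+          //   return { ...prev, report: { ...prev.report, queries: nextQueries } }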
+ if (d.query && d.title && d.judge) {
+ store.updateDailyResult((prev) => {
+ const sourceQueries = prev.report.queries || []
+ let matched = false
+ const nextQueries = sourceQueries.map((query) => {
+ const queryName = query.normalized_query || query.raw_query || ""
+ if (queryName !== d.query) return query
+ const nextItems = (query.top_items || []).map((item) => {
+ if (item.title === d.title) { matched = true; return { ...item, judge: d.judge } }
+ return item
+ })
+ return { ...query, top_items: nextItems }
+ })
+ if (!matched) {
+ const fallbackQueries = nextQueries.map((query) => {
+ if (matched) return query
+ const nextItems = (query.top_items || []).map((item) => {
+ if (!matched && item.title === d.title) { matched = true; return { ...item, judge: d.judge } }
+ return item
+ })
+ return { ...query, top_items: nextItems }
+ })
+ return { ...prev, report: { ...prev.report, queries: fallbackQueries } }
+ }
+ return { ...prev, report: { ...prev.report, queries: nextQueries } }
+ })
+ }
+ continue
+ }
+
+ if (event.type === "judge_done") {
+ const d = (event.data || {}) as DailyResult["report"]["judge"]
+ store.updateDailyResult((prev) => ({
+ ...prev,
+ report: { ...prev.report, judge: d || prev.report.judge },
+ }))
+ addStreamLog("judge scoring complete")
+ continue
+ }
+
+ if (event.type === "filter_done") {
+ const d = (event.data || {}) as {
+ total_before?: number
+ total_after?: number
+ removed_count?: number
+ log?: Array<{ query?: string; title?: string; recommendation?: string; overall?: number; action?: string }>
+ }
+ setStreamPhase("filter")
+ addStreamLog(`filter: ${d.total_before || 0} papers -> ${d.total_after || 0} kept, ${d.removed_count || 0} removed`)
+ if (d.log) {
+ for (const entry of d.log) {
+ addStreamLog(` removed [${entry.recommendation || "?"}] ${entry.title || "?"} (${entry.query || ""})`)
+ }
+ }
+        // No store update is needed here; the filtered report arrives with the final
+        // "result" event. The filter event only carries counts and a removal log.
+ continue
+ }
+
+ if (event.type === "result") {
+ const d = (event.data || {}) as {
+ report?: DailyResult["report"]
+ markdown?: string
+ markdown_path?: string | null
+ json_path?: string | null
+            notify_result?: Record<string, unknown> | null
+ }
+ if (d.report) {
+ store.setDailyResult({
+ report: d.report,
+ markdown: typeof d.markdown === "string" ? d.markdown : "",
+ markdown_path: d.markdown_path,
+ json_path: d.json_path,
+ })
+ }
+ setStreamPhase("done")
+ addStreamLog("stream complete")
+ continue
+ }
+
+ if (event.type === "error") {
+ const d = (event.data || {}) as { message?: string; detail?: string }
+ const msg = event.message || d.message || d.detail || "Unknown stream error"
+ addStreamLog(`[error] ${msg}`)
+ setError(`DailyPaper failed: ${msg}`)
+ streamFailed = true
+ setStreamPhase("error")
+ store.setPhase("error")
+ break
+ }
+ }
+ if (!streamFailed) {
+ store.setPhase("reported")
+ }
+ } catch (err) {
+ streamFailed = true
+ setError(String(err))
+ setStreamPhase("error")
+ store.setPhase("error")
+ } finally {
+ setLoadingDaily(false)
+ streamStartRef.current = null
+ streamAbortRef.current = null
+ }
}
async function runAnalyzeStream() {
@@ -641,7 +1156,12 @@ export default function TopicWorkflowDashboard() {
const runInsight = Boolean(enableLLM && useInsight)
if (!runJudge && !runTrends && !runInsight) { setError("Enable Judge, LLM trends, or LLM insight before analyzing."); return }
+ streamAbortRef.current?.abort()
+ const controller = new AbortController()
+ streamAbortRef.current = controller
setLoadingAnalyze(true); setError(null); store.clearAnalyzeLog(); setAnalyzeProgress({ done: 0, total: 0 }); store.setPhase("reporting")
+ setStreamPhase("idle"); setStreamLog([]); setStreamProgress({ done: 0, total: 0 })
+ streamStartRef.current = Date.now()
store.addAnalyzeLog(
`[start] run_judge=${runJudge} run_trends=${runTrends} run_insight=${runInsight} llm_enabled=${enableLLM} judge_enabled=${enableJudge}`,
)
@@ -649,6 +1169,7 @@ export default function TopicWorkflowDashboard() {
store.addAnalyzeLog("[hint] Analyze stream currently supports trends and daily insight.")
}
+ let streamFailed = false
try {
const res = await fetch("/api/research/paperscool/analyze", {
method: "POST", headers: { "Content-Type": "application/json" },
@@ -657,6 +1178,7 @@ export default function TopicWorkflowDashboard() {
judge_runs: judgeRuns, judge_max_items_per_query: judgeMaxItems,
judge_token_budget: judgeTokenBudget, trend_max_items_per_query: 3,
}),
+ signal: controller.signal,
})
if (!res.ok || !res.body) throw new Error(await res.text())
@@ -841,30 +1363,63 @@ export default function TopicWorkflowDashboard() {
const msg = event.message || d.message || d.detail || "Unknown analyze stream error"
store.addAnalyzeLog(`[error] ${msg}`)
setError(`Analyze failed: ${msg}`)
+ streamFailed = true
store.setPhase("error")
break
}
}
- if (store.phase !== "error") {
+ if (!streamFailed) {
store.setPhase("reported")
}
- } catch (err) { setError(String(err)); store.setPhase("error") } finally { setLoadingAnalyze(false) }
+ } catch (err) {
+ streamFailed = true
+ setError(String(err))
+ store.setPhase("error")
+ } finally {
+ setLoadingAnalyze(false)
+ streamStartRef.current = null
+ streamAbortRef.current = null
+ }
+ }
+
+ async function runRepoEnrichment() {
+ if (!dailyResult?.report) {
+ setRepoError("Generate DailyPaper first.")
+ return
+ }
+
+ setLoadingRepos(true)
+ setRepoError(null)
+ try {
+ const res = await fetch("/api/research/paperscool/repos", {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({
+ report: dailyResult.report,
+ max_items: 500,
+ include_github_api: true,
+ }),
+ })
+ if (!res.ok) throw new Error(await res.text())
+ const payload = await res.json() as { repos?: RepoRow[] }
+ setRepoRows(payload.repos || [])
+ } catch (err) {
+ setRepoError(String(err))
+ } finally {
+ setLoadingRepos(false)
+ }
}
const isLoading = loadingSearch || loadingDaily || loadingAnalyze
const canSearch = queries.length > 0 && branches.length > 0 && sources.length > 0
const loadingLabel = loadingSearch
? "Searching sources..."
- : loadingDaily
- ? "Generating DailyPaper report and enrichment..."
- : "Running judge/trend/insight enrichment..."
+ : "Running judge/trend/insight enrichment..."
const loadingHint = loadingAnalyze && analyzeProgress.total > 0
? `${analyzeProgress.done}/${analyzeProgress.total} judged`
- : loadingDaily
- ? "Fetching, ranking, and composing report"
- : loadingSearch
- ? "Multi-query retrieval in progress"
- : "Waiting for LLM events"
+ : loadingSearch
+ ? "Multi-query retrieval in progress"
+ : "Waiting for LLM events"
return (
@@ -883,7 +1438,7 @@ export default function TopicWorkflowDashboard() {
{loadingSearch ? : } Search
-
+
{loadingDaily ? : } DailyPaper
@@ -907,11 +1462,14 @@ export default function TopicWorkflowDashboard() {
queryItems, setQueryItems, topK, setTopK, topN, setTopN,
showPerBranch, setShowPerBranch, saveDaily, setSaveDaily,
outputDir, setOutputDir, useArxiv, setUseArxiv, useVenue, setUseVenue,
- usePapersCool, setUsePapersCool, useArxivApi, setUseArxivApi, enableLLM, setEnableLLM,
+ usePapersCool, setUsePapersCool, useArxivApi, setUseArxivApi, useHFDaily, setUseHFDaily, enableLLM, setEnableLLM,
useSummary, setUseSummary, useTrends, setUseTrends,
useInsight, setUseInsight, useRelevance, setUseRelevance,
enableJudge, setEnableJudge, judgeRuns, setJudgeRuns,
judgeMaxItems, setJudgeMaxItems, judgeTokenBudget, setJudgeTokenBudget,
+ notifyEmail, setNotifyEmail: store.setNotifyEmail,
+ notifyEnabled, setNotifyEnabled: store.setNotifyEnabled,
+ resendEnabled, setResendEnabled: store.setResendEnabled,
}} />
@@ -921,7 +1479,18 @@ export default function TopicWorkflowDashboard() {
{error && {error}
}
- {isLoading && (
+ {/* Stream progress card for DailyPaper SSE */}
+ {loadingDaily && streamPhase !== "idle" && (
+        <StreamProgressCard
+          streamPhase={streamPhase}
+          streamLog={streamLog}
+          streamProgress={streamProgress}
+          startTime={streamStartRef.current}
+        />
+ )}
+
+ {/* Generic loading card for search / analyze */}
+      {isLoading && !loadingDaily && (
@@ -994,7 +1563,14 @@ export default function TopicWorkflowDashboard() {
{/* Papers */}
-
{allPapers.length} papers
+
+
{allPapers.length} papers
+ {paperDataSource && (
+
+ {paperDataSource === "dailypaper" ? "DailyPaper" : "Search"}
+
+ )}
+
Sort:
setSortBy(e.target.value as "score" | "judge")}>
@@ -1022,7 +1598,7 @@ export default function TopicWorkflowDashboard() {
))
) : (
- Run a search or generate a DailyPaper to see papers here.
+ Run Search to find papers, then DailyPaper to rank and compose a report, then Analyze to run Judge/Trends.
)}
@@ -1238,6 +1814,51 @@ export default function TopicWorkflowDashboard() {
+
+
+
+ Repository Enrichment
+
+              {loadingRepos ? <Loader2Icon className="h-4 w-4 animate-spin" /> : null}
+ {loadingRepos ? "Enriching..." : "Find Repos"}
+
+
+
+
+ {repoError && {repoError}
}
+ {repoRows.length > 0 ? (
+
+
+
+
+ Title
+ Repository
+ Stars
+ Language
+
+
+
+ {repoRows.map((row, idx) => (
+
+
+                    {row.paper_url ? <a href={row.paper_url} target="_blank" rel="noreferrer">{row.title}</a> : row.title}
+
+
+ {row.repo_url}
+
+ {row.github?.stars ?? "-"}
+ {row.github?.language || "-"}
+
+ ))}
+
+
+
+ ) : (
+ Click "Find Repos" to enrich papers with code repositories.
+ )}
+
+
+
>
) : isLoading ? (
diff --git a/web/src/lib/api.ts b/web/src/lib/api.ts
index e8dd303..28af73b 100644
--- a/web/src/lib/api.ts
+++ b/web/src/lib/api.ts
@@ -1,6 +1,29 @@
import { Activity, Paper, PaperDetails, Scholar, ScholarDetails, Stats, WikiConcept, TrendingTopic, PipelineTask, ReadingQueueItem, LLMUsageRecord } from "./types"
-const API_BASE_URL = "http://localhost:8000/api"
+const API_BASE_URL = process.env.PAPERBOT_API_BASE_URL || "http://127.0.0.1:8000/api"
+
+function slugToName(slug: string): string {
+ return slug
+ .split("-")
+ .filter(Boolean)
+ .map((w) => w.charAt(0).toUpperCase() + w.slice(1))
+ .join(" ")
+}
+
+async function postJson<T>(path: string, payload: Record<string, unknown>): Promise<T | null> {
+ try {
+ const res = await fetch(`${API_BASE_URL}${path}`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify(payload),
+ cache: "no-store",
+ })
+ if (!res.ok) return null
+ return await res.json() as T
+ } catch {
+ return null
+ }
+}
export async function fetchStats(): Promise {
// TODO: Replace with real API call
@@ -184,37 +207,130 @@ export async function fetchPaperDetails(id: string): Promise {
}
}
+// TODO: add unit tests for fetchScholarDetails — cover successful network+trends,
+// partial responses, and both-null fallback path.
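+// One possible shape for the fallback-path test (hypothetical sketch; it assumes
+// vitest is the test runner for this package and that fetchPapers resolves from
+// local mock data rather than the network):
+//
+//   import { expect, it, vi } from "vitest"
+//   import { fetchScholarDetails } from "./api"
+//
+//   it("falls back to the mock profile when both endpoints are unreachable", async () => {
+//     vi.stubGlobal("fetch", vi.fn().mockRejectedValue(new Error("backend down")))
+//     const details = await fetchScholarDetails("dawn-song")
+//     expect(details.id).toBe("dawn-song")
+//     expect(details.name).toBe("Dawn Song")
+//   })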
export async function fetchScholarDetails(id: string): Promise {
- const papers = await fetchPapers()
+ const scholarName = slugToName(id)
+
+ type ScholarNetworkResponse = {
+ scholar?: { name?: string; affiliations?: string[]; citation_count?: number; paper_count?: number; h_index?: number }
+ stats?: { papers_used?: number }
+ nodes?: Array<{ name?: string; type?: string; collab_papers?: number }>
+ }
+
+ type ScholarTrendsResponse = {
+ scholar?: { name?: string; affiliations?: string[]; citation_count?: number; paper_count?: number; h_index?: number }
+ trend_summary?: { publication_trend?: "up" | "down" | "flat"; citation_trend?: "up" | "down" | "flat" }
+ topic_distribution?: Array<{ topic?: string; count?: number }>
+ recent_papers?: Array<{ title?: string; year?: number; citation_count?: number; venue?: string; url?: string }>
+ }
+
+ const [network, trends] = await Promise.all([
+    postJson<ScholarNetworkResponse>("/research/scholar/network", {
+ scholar_name: scholarName,
+ max_papers: 120,
+ recent_years: 5,
+ max_nodes: 30,
+ }),
+    postJson<ScholarTrendsResponse>("/research/scholar/trends", {
+ scholar_name: scholarName,
+ max_papers: 200,
+ year_window: 10,
+ }),
+ ])
+
+ // TODO: mock fallback is hardcoded to Dawn Song — replace with generic
+ // placeholder or remove entirely once real scholar data is always available.
+ // Fallback to mock data if the scholar is not configured in subscriptions yet.
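+  // If the hardcoded profile is replaced, the fallback branch below could return a
+  // scholar-agnostic placeholder instead (hypothetical sketch; field names mirror
+  // the existing mock object):
+  //
+  //   return {
+  //     id,
+  //     name: scholarName,
+  //     affiliation: "Unknown affiliation",
+  //     h_index: 0,
+  //     papers_tracked: 0,
+  //     recent_activity: "No tracked activity yet",
+  //     status: "idle",
+  //     bio: `No subscription data is available for ${scholarName} yet.`,
+  //     location: "N/A",
+  //     website: "",
+  //     expertise_radar: [],
+  //     publications: [],
+  //     co_authors: [],
+  //     stats: { total_citations: 0, papers_count: 0, h_index: 0 },
+  //   }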
+ if (!network && !trends) {
+ const papers = await fetchPapers()
+ return {
+ id,
+ name: scholarName,
+ affiliation: "University of California, Berkeley",
+ h_index: 120,
+ papers_tracked: 45,
+ recent_activity: "Published 2 days ago",
+ status: "active",
+ bio: "Dawn Song is a Professor in the Department of Electrical Engineering and Computer Science at UC Berkeley. Her research interest lies in deep learning, security, and blockchain.",
+ location: "Berkeley, CA",
+ website: "https://dawnsong.io",
+ expertise_radar: [
+ { subject: "Security", A: 100, fullMark: 100 },
+ { subject: "Deep Learning", A: 90, fullMark: 100 },
+ { subject: "Blockchain", A: 80, fullMark: 100 },
+ { subject: "Systems", A: 85, fullMark: 100 },
+ { subject: "Privacy", A: 95, fullMark: 100 },
+ ],
+ publications: papers,
+ co_authors: [
+ { name: "Dan Hendrycks", avatar: "https://avatar.vercel.sh/dan.png" },
+ { name: "Kevin Eykholt", avatar: "https://avatar.vercel.sh/kevin.png" },
+ ],
+ stats: {
+ total_citations: 54321,
+ papers_count: 230,
+ h_index: 120,
+ },
+ }
+ }
+
+ const scholar = network?.scholar || trends?.scholar || {}
+ const topicDist = (trends?.topic_distribution || []).slice(0, 5)
+ const maxTopicCount = Math.max(1, ...topicDist.map((t) => Number(t.count || 0)))
+
+ const publications: Paper[] = (trends?.recent_papers || []).slice(0, 15).map((paper, idx) => ({
+ id: `sch-${id}-paper-${idx}`,
+ title: String(paper.title || "Untitled"),
+ venue: String(paper.venue || "Unknown venue"),
+ authors: String(scholar.name || scholarName),
+ citations: Number(paper.citation_count || 0),
+ status: "analyzing",
+ tags: topicDist.map((t) => String(t.topic || "")).filter(Boolean).slice(0, 3),
+ }))
+
+ const coauthors = (network?.nodes || [])
+ .filter((n) => n.type === "coauthor")
+ .slice(0, 12)
+ .map((n) => {
+ const name = String(n.name || "Unknown")
+ const collab = Number(n.collab_papers || 0)
+ return {
+ name: collab > 0 ? `${name} (${collab})` : name,
+ avatar: `https://avatar.vercel.sh/${encodeURIComponent(name)}.png`,
+ }
+ })
+
+ const publicationTrend = trends?.trend_summary?.publication_trend || "flat"
+ const recentActivity = publicationTrend === "up"
+ ? "Publication trend up"
+ : publicationTrend === "down"
+ ? "Publication trend down"
+ : "Publication trend stable"
return {
id,
- name: id.split('-').map(w => w.charAt(0).toUpperCase() + w.slice(1)).join(' '),
- affiliation: "University of California, Berkeley",
- h_index: 120,
- papers_tracked: 45,
- recent_activity: "Published 2 days ago",
- status: "active",
- bio: "Dawn Song is a Professor in the Department of Electrical Engineering and Computer Science at UC Berkeley. Her research interest lies in deep learning, security, and blockchain.",
- location: "Berkeley, CA",
- website: "https://dawnsong.io",
- expertise_radar: [
- { subject: 'Security', A: 100, fullMark: 100 },
- { subject: 'Deep Learning', A: 90, fullMark: 100 },
- { subject: 'Blockchain', A: 80, fullMark: 100 },
- { subject: 'Systems', A: 85, fullMark: 100 },
- { subject: 'Privacy', A: 95, fullMark: 100 },
- ],
- publications: papers,
- co_authors: [
- { name: "Dan Hendrycks", avatar: "https://avatar.vercel.sh/dan.png" },
- { name: "Kevin Eykholt", avatar: "https://avatar.vercel.sh/kevin.png" }
- ],
+ name: String(scholar.name || scholarName),
+ affiliation: String((scholar.affiliations || ["Unknown affiliation"])[0] || "Unknown affiliation"),
+ h_index: Number(scholar.h_index || 0),
+ papers_tracked: Number(scholar.paper_count || 0),
+ recent_activity: recentActivity,
+ status: publicationTrend === "up" ? "active" : "idle",
+ bio: `Trend snapshot: ${trends?.trend_summary?.citation_trend || "flat"} citation trend over the recent analysis window.`,
+ location: "N/A",
+ website: "",
+ expertise_radar: topicDist.map((t) => ({
+ subject: String(t.topic || "Topic"),
+ A: Math.round((Number(t.count || 0) / maxTopicCount) * 100),
+ fullMark: 100,
+ })),
+ publications,
+ co_authors: coauthors,
stats: {
- total_citations: 54321,
- papers_count: 230,
- h_index: 120
- }
+ total_citations: Number(scholar.citation_count || 0),
+ papers_count: Number(scholar.paper_count || 0),
+ h_index: Number(scholar.h_index || 0),
+ },
}
}
diff --git a/web/src/lib/stores/workflow-store.ts b/web/src/lib/stores/workflow-store.ts
index 49e0f17..6e59be2 100644
--- a/web/src/lib/stores/workflow-store.ts
+++ b/web/src/lib/stores/workflow-store.ts
@@ -88,6 +88,50 @@ export type DailyResult = {
export type WorkflowPhase = "idle" | "searching" | "searched" | "reporting" | "reported" | "error"
+export type WorkflowConfig = {
+ enableLLM: boolean
+ enableJudge: boolean
+ useSummary: boolean
+ useTrends: boolean
+ useInsight: boolean
+ useRelevance: boolean
+ useArxiv: boolean
+ useVenue: boolean
+ usePapersCool: boolean
+ useArxivApi: boolean
+ useHFDaily: boolean
+ saveDaily: boolean
+ topK: number
+ topN: number
+ showPerBranch: number
+ judgeRuns: number
+ judgeMaxItems: number
+ judgeTokenBudget: number
+ outputDir: string
+}
+
+const DEFAULT_CONFIG: WorkflowConfig = {
+ enableLLM: true,
+ enableJudge: true,
+ useSummary: true,
+ useTrends: true,
+ useInsight: true,
+ useRelevance: true,
+ useArxiv: true,
+ useVenue: true,
+ usePapersCool: true,
+ useArxivApi: true,
+ useHFDaily: true,
+ saveDaily: true,
+ topK: 5,
+ topN: 10,
+ showPerBranch: 25,
+ judgeRuns: 1,
+ judgeMaxItems: 20,
+ judgeTokenBudget: 0,
+ outputDir: "./reports/dailypaper",
+}
+
interface WorkflowState {
/* Persisted results */
searchResult: SearchResult | null
@@ -95,14 +139,22 @@ interface WorkflowState {
phase: WorkflowPhase
analyzeLog: string[]
lastUpdated: string | null
+ notifyEmail: string
+ notifyEnabled: boolean
+ resendEnabled: boolean
+ config: WorkflowConfig
/* Actions */
setSearchResult: (result: SearchResult) => void
- setDailyResult: (result: DailyResult) => void
+ setDailyResult: (result: DailyResult | null) => void
updateDailyResult: (updater: (prev: DailyResult) => DailyResult) => void
setPhase: (phase: WorkflowPhase) => void
addAnalyzeLog: (line: string) => void
clearAnalyzeLog: () => void
+ setNotifyEmail: (email: string) => void
+ setNotifyEnabled: (enabled: boolean) => void
+ setResendEnabled: (enabled: boolean) => void
+  updateConfig: (patch: Partial<WorkflowConfig>) => void
clearAll: () => void
}
@@ -114,6 +166,10 @@ export const useWorkflowStore = create<WorkflowState>()(
phase: "idle",
analyzeLog: [],
lastUpdated: null,
+ notifyEmail: "",
+ notifyEnabled: false,
+ resendEnabled: false,
+ config: { ...DEFAULT_CONFIG },
setSearchResult: (result) =>
set({ searchResult: result, lastUpdated: new Date().toISOString() }),
@@ -135,6 +191,15 @@ export const useWorkflowStore = create()(
clearAnalyzeLog: () => set({ analyzeLog: [] }),
+ setNotifyEmail: (email) => set({ notifyEmail: email }),
+
+ setNotifyEnabled: (enabled) => set({ notifyEnabled: enabled }),
+
+ setResendEnabled: (enabled) => set({ resendEnabled: enabled }),
+
+ updateConfig: (patch) =>
+ set((s) => ({ config: { ...s.config, ...patch } })),
+
clearAll: () =>
set({
searchResult: null,
@@ -152,6 +217,10 @@ export const useWorkflowStore = create()(
phase: state.phase === "searching" || state.phase === "reporting" ? "idle" : state.phase,
analyzeLog: state.analyzeLog.slice(-50),
lastUpdated: state.lastUpdated,
+ notifyEmail: state.notifyEmail,
+ notifyEnabled: state.notifyEnabled,
+ resendEnabled: state.resendEnabled,
+ config: state.config,
}),
},
),