From 34d7e2a218e1fdb39b5cc8744ac434f11ffd596c Mon Sep 17 00:00:00 2001 From: boyu Date: Sat, 3 Jan 2026 23:14:14 +0100 Subject: [PATCH 1/9] feat(memory): implement P0 scope and acceptance criteria Add memory type taxonomy, isolation strategy, and acceptance metrics: - docs: memory_types.md, memory_isolation.md, P0 design doc - feat: MemoryMetricCollector for tracking 5 P0 metrics - feat: MemoryEvalMetricModel for persisting metrics - feat: P0 hooks in research routes (false_positive, deletion_compliance) - feat: /research/memory/feedback endpoint for retrieval_hit_rate - feat: /memory/metrics endpoints for viewing recorded metrics - feat: get_items_by_ids() method in memory_store - test: deletion_compliance and retrieval_hit_rate eval scripts - ci: add memory P0 acceptance tests to CI workflow P0 Acceptance Metrics: - extraction_precision: >= 85% - false_positive_rate: <= 5% - retrieval_hit_rate: >= 80% - injection_pollution_rate: <= 2% - deletion_compliance: 100% Generated with [LIU BOYU] --- .github/workflows/ci.yml | 7 + docs/P0_memory_scope_acceptance_design.md | 739 ++++++++++++++++++ docs/memory_isolation.md | 144 ++++ docs/memory_types.md | 141 ++++ evals/memory/__init__.py | 7 + evals/memory/fixtures/retrieval_queries.json | 35 + evals/memory/fixtures/sample_memories.json | 42 + evals/memory/test_deletion_compliance.py | 150 ++++ evals/memory/test_retrieval_hit_rate.py | 161 ++++ src/paperbot/api/routes/memory.py | 44 ++ src/paperbot/api/routes/research.py | 134 ++++ .../infrastructure/stores/memory_store.py | 18 + src/paperbot/infrastructure/stores/models.py | 28 + src/paperbot/memory/eval/__init__.py | 14 + src/paperbot/memory/eval/collector.py | 240 ++++++ 15 files changed, 1904 insertions(+) create mode 100644 docs/P0_memory_scope_acceptance_design.md create mode 100644 docs/memory_isolation.md create mode 100644 docs/memory_types.md create mode 100644 evals/memory/__init__.py create mode 100644 evals/memory/fixtures/retrieval_queries.json create mode 
100644 evals/memory/fixtures/sample_memories.json create mode 100644 evals/memory/test_deletion_compliance.py create mode 100644 evals/memory/test_retrieval_hit_rate.py create mode 100644 src/paperbot/memory/eval/__init__.py create mode 100644 src/paperbot/memory/eval/collector.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 22c21f1..4cda81a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -48,3 +48,10 @@ jobs: python evals/runners/run_track_pipeline_smoke.py python evals/runners/run_eventlog_replay_smoke.py + - name: Run memory scope and acceptance tests + env: + PYTHONPATH: src + run: | + python evals/memory/test_deletion_compliance.py + python evals/memory/test_retrieval_hit_rate.py + diff --git a/docs/P0_memory_scope_acceptance_design.md b/docs/P0_memory_scope_acceptance_design.md new file mode 100644 index 0000000..461f9a0 --- /dev/null +++ b/docs/P0_memory_scope_acceptance_design.md @@ -0,0 +1,739 @@ +# P0: Memory Scope and Acceptance Criteria - Technical Design Document + +> **Status**: Draft +> **Author**: Claude Code +> **Date**: 2025-12-27 +> **Estimated Effort**: 1-2 days (17h) + +--- + +## 0. Architecture Context: Where P0 Fits + +P0 spans **three layers** of the PaperBot 5-layer architecture, focusing on the **Memory subsystem's foundational definitions and quality metrics**. 
+ +### 0.1 PaperBot 5-Layer Architecture Overview + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ PaperBot Standard Architecture │ +│ (Offline Ingestion → Storage → Online Retrieval → Generation → Feedback)│ +└─────────────────────────────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Layer 1 · Ingestion (Async) │ +│ ┌──────────┐ ┌──────────────┐ ┌────────────┐ ┌──────────────────────┐ │ +│ │ Sources │──▶│ Parse/Norm │──▶│Chunk/Embed │──▶│ EventLog (evidence) │ │ +│ │ arXiv │ │ Connectors │ │ (optional) │ │ SqlAlchemyEventLog │ │ +│ │ Reddit │ │ │ │ │ │ │ │ +│ └──────────┘ └──────────────┘ └────────────┘ └──────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Layer 2 · Storage │ +│ ┌────────────────────────────────────────┐ ┌─────────────────────────────┐ │ +│ │ SQL 主库 (SQLite/Postgres) │ │ Vector Index (optional) │ │ +│ │ ┌─────────────────────────────────┐ │ │ Qdrant / Milvus │ │ +│ │ │ ╔═══════════════════════════╗ │ │ └─────────────────────────────┘ │ +│ │ │ ║ memory_items ◀──────────╠───┼───┼─── P0: Type Boundaries │ │ +│ │ │ ║ memory_audit_log ║ │ │ Namespace/Isolation │ │ +│ │ │ ╚═══════════════════════════╝ │ │ │ │ +│ │ │ research_tracks / tasks │ │ │ │ +│ │ │ paper_feedback │ │ │ │ +│ │ └─────────────────────────────────┘ │ │ │ +│ └────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Layer 3 · Retrieval (Online) │ +│ ┌────────────┐ ┌─────────────┐ ┌──────────────────────────────────────┐ │ +│ │ContextEng │──▶│ TrackRouter │──▶│ ╔════════════════════════════════╗ │ │ +│ │ine │ │ │ │ ║ MemoryStore ║ 
│ │ +│ │ │ │ keyword+ │ │ ║ search_memories() ◀──────────╠───┼─── P0 +│ │ │ │ mem+task+ │ │ ║ (scope-aware retrieval) ║ │ │ +│ │ │ │ embedding │ │ ╚════════════════════════════════╝ │ │ +│ └────────────┘ └─────────────┘ └──────────────────────────────────────┘ │ +│ │ +│ ┌────────────────┐ ┌────────────────┐ ┌────────────────────────────────┐ │ +│ │ Paper Searcher │──▶│ Rank/Filter/ │──▶│ Replay write │ │ +│ │ S2 / offline │ │ Dedupe │ │ context_run + impressions │ │ +│ └────────────────┘ └────────────────┘ └────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Layer 4 · Generation │ +│ ┌────────────────┐ ┌────────────────┐ ┌────────────────────────────────┐ │ +│ │ PromptComposer │──▶│ LLM + Tools │──▶│ Output Parser │ │ +│ │ budget+format │ │ GPT/Claude │ │ structured + citations │ │ +│ └────────────────┘ └────────────────┘ └────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Layer 5 · Feedback / Governance / Replay │ +│ ┌─────────────────────────────────┐ ┌────────────────┐ ┌────────────────┐ │ +│ │ ╔═════════════════════════════╗ │ │ Paper Feedback │ │ Evals/Replay │ │ +│ │ ║ Memory Inbox (governance) ║ │ │ like/save/ │ │ Summary │ │ +│ │ ║ suggest → pending ║ │ │ dislike │ │ │ │ +│ │ ║ approve/reject/move ◀──────╠─┼───┼────────────────┼───┼─── P0: Metrics │ │ +│ │ ╚═════════════════════════════╝ │ │ │ │ Evaluation │ │ +│ └─────────────────────────────────┘ └────────────────┘ └────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────────┘ + +Legend: + ╔═══╗ P0 Focus Area (Memory Scope & Acceptance) + ╚═══╝ + ───▶ Data Flow + ◀──── P0 Touch Point +``` + +### 0.2 P0 Components Mapped to Architecture 
Layers + +| Layer | Component | P0 Deliverable | +|-------|-----------|----------------| +| **Layer 2: Storage** | `memory_items` table | Memory type boundary definitions (taxonomy) | +| | Namespace fields | `user_id` / `workspace_id` / `scope_type:scope_id` isolation rules | +| **Layer 3: Retrieval** | `MemoryStore.search_memories()` | Scope-aware retrieval logic validation | +| | Query resolution | Isolation rule implementation verification | +| **Layer 5: Feedback** | Memory Inbox | Governance workflow (pending → approved) | +| | `memory_eval_metrics` (new) | Acceptance criteria measurement | +| | Audit log | Deletion compliance verification | + +### 0.3 P0 Focus: Cross-Cutting Concerns + +``` + ┌─────────────────────────────────────────┐ + │ P0: Scope & Acceptance │ + └─────────────────────────────────────────┘ + │ + ┌───────────────────────────┼───────────────────────────┐ + │ │ │ + ▼ ▼ ▼ +┌─────────────────────┐ ┌─────────────────────┐ ┌─────────────────────┐ +│ Type Boundaries │ │ Namespace/Isolation│ │ Acceptance Metrics │ +│ │ │ │ │ │ +│ - User Memory │ │ - user_id (L1) │ │ - Extraction │ +│ profile/pref/ │ │ - workspace_id(L2) │ │ Precision ≥85% │ +│ goal/constraint │ │ - scope_type:id(L3)│ │ - False Positive │ +│ - Episodic Memory │ │ - provider (meta) │ │ Rate ≤5% │ +│ fact/decision/ │ │ │ │ - Retrieval Hit │ +│ hypothesis │ │ Query Resolution: │ │ Rate ≥80% │ +│ - Workspace Memory │ │ scope > workspace │ │ - Injection │ +│ keyword_set/note │ │ > user (fallback) │ │ Pollution ≤2% │ +│ │ │ │ │ - Deletion │ +│ NOT Memory: │ │ │ │ Compliance 100% │ +│ - Context cache │ │ │ │ │ +│ - Code index │ │ │ │ │ +│ - Search results │ │ │ │ │ +└─────────────────────┘ └─────────────────────┘ └─────────────────────┘ + │ │ │ + └───────────────────────────┼───────────────────────────┘ + │ + ▼ + ┌─────────────────────────────────────────┐ + │ Implementation Artifacts │ + │ │ + │ docs/memory_types.md │ + │ docs/memory_isolation.md │ + │ 
src/paperbot/memory/eval/collector.py │ + │ evals/memory/test_*.py │ + └─────────────────────────────────────────┘ +``` + +### 0.4 Data Flow with P0 Touch Points + +``` + INGESTION + │ + ┌───────────────────────────────┼───────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ POST /api/memory/ingest │ │ + │ │ file + user_id + platform + workspace_id │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ │ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ Parsers (ChatGPT/Gemini/Plaintext) │ │ + │ │ → NormalizedMessage[] │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ │ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ Extractor (heuristic + optional LLM) │ │ + │ │ → MemoryCandidate[] │ │ + │ │ ╔═══════════════════════════════════════════════════╗ │ │ + │ │ ║ P0: kind must be in defined taxonomy ║ │ │ + │ │ ║ P0: confidence scoring for precision metrics ║ │ │ + │ │ ╚═══════════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ │ + └───────────────────────────────┼───────────────────────────────┘ + │ + STORAGE + │ + ┌───────────────────────────────┼───────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ MemoryStore.add_memories() │ │ + │ │ ╔═══════════════════════════════════════════════════╗ │ │ + │ │ ║ P0: scope-aware dedup (content_hash includes scope)║ │ │ + │ │ ║ P0: status = pending if confidence < 0.6 ║ │ │ + │ │ ║ P0: namespace isolation (user_id + workspace_id) ║ │ │ + │ │ ╚═══════════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ │ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ memory_items table │ │ + │ │ memory_audit_log table │ │ + │ 
└─────────────────────────────────────────────────────────┘ │ + │ │ + └───────────────────────────────────────────────────────────────┘ + │ + RETRIEVAL + │ + ┌───────────────────────────────┼───────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ POST /api/memory/context │ │ + │ │ query + user_id + workspace_id + scope │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ │ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ MemoryStore.search_memories() │ │ + │ │ ╔═══════════════════════════════════════════════════╗ │ │ + │ │ ║ P0: only approved, non-deleted, non-expired ║ │ │ + │ │ ║ P0: scope isolation enforced ║ │ │ + │ │ ║ P0: hit rate metrics collected ║ │ │ + │ │ ╚═══════════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ │ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ ContextPack (formatted for prompt injection) │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ + └───────────────────────────────────────────────────────────────┘ + │ + GOVERNANCE + │ + ┌───────────────────────────────┼───────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ Memory Inbox: /api/research/memory/inbox │ │ + │ │ ╔═══════════════════════════════════════════════════╗ │ │ + │ │ ║ P0: pending → approved/rejected workflow ║ │ │ + │ │ ║ P0: bulk_moderate, bulk_move, clear ║ │ │ + │ │ ╚═══════════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ │ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ DELETE /api/memory/items/{id} │ │ + │ │ ╔═══════════════════════════════════════════════════╗ │ │ + │ │ ║ P0: soft_delete / hard_delete ║ │ │ + │ │ ║ P0: deletion compliance = 100% (never retrieved) ║ │ │ + │ │ 
╚═══════════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ + └───────────────────────────────────────────────────────────────┘ + │ + EVALUATION + │ + ┌───────────────────────────────┼───────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ GET /api/memory/metrics (NEW in P0) │ │ + │ │ ╔═══════════════════════════════════════════════════╗ │ │ + │ │ ║ P0: MetricCollector aggregates 5 metrics ║ │ │ + │ │ ║ - extraction_precision ║ │ │ + │ │ ║ - false_positive_rate ║ │ │ + │ │ ║ - retrieval_hit_rate ║ │ │ + │ │ ║ - injection_pollution_rate ║ │ │ + │ │ ║ - deletion_compliance ║ │ │ + │ │ ╚═══════════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ │ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ memory_eval_metrics table (NEW in P0) │ │ + │ │ evals/memory/test_*.py (NEW in P0) │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ + └───────────────────────────────────────────────────────────────┘ +``` + +--- + +## 1. Executive Summary + +**Objective**: Define clear boundaries for memory types, establish namespace/isolation strategies, and implement measurable acceptance criteria for the cross-platform memory middleware. + +**Current State**: The infrastructure (database schema, CRUD APIs, basic extraction/search) is largely complete. P0 focuses on **formalizing definitions** and **implementing evaluation metrics** rather than building new features. + +**Scope**: This document covers the three main deliverables from `docs/memory_todo.md` P0 section: +1. Memory type boundary definitions +2. Namespace and isolation strategy +3. Acceptance criteria (5 metrics) + +--- + +## 2. 
Technical Solution Design + +### 2.1 Memory Type Boundaries + +Based on the existing `MemoryKind` enum and the TODO requirements, the following formal taxonomy is proposed: + +#### 2.1.1 Memory Type Taxonomy + +| Category | Kind | Definition | Persistence | Example | +|----------|------|------------|-------------|---------| +| **User Memory** | `profile` | Identity facts (name, role, affiliation) | Permanent until edited | "My name is Jerry" | +| | `preference` | Style/format preferences | Semi-permanent | "I prefer concise answers" | +| | `goal` | Long-term objectives | Session-spanning | "I'm researching LLM memory" | +| | `constraint` | Hard rules/requirements | Permanent | "Never use Chinese in responses" | +| | `project` | Project context/background | Project-scoped | "PaperBot is a research tool" | +| **Episodic Memory** | `fact` | Session-derived facts | Decaying | "User mentioned deadline is Friday" | +| | `decision` | Key decisions made | Project-scoped | "We chose SQLite over Postgres" | +| | `hypothesis` | Tentative assumptions | Low-priority | "User may be a graduate student" | +| | `todo` | Action items | Time-bound | "Need to implement FTS5" | +| **Workspace/Project** | `keyword_set` | Domain keywords | Project-scoped | "focus areas: RAG, memory, agents" | +| | `note` | Free-form annotations | Project-scoped | "Important: check GDPR compliance" | + +#### 2.1.2 Exclusions (NOT Memory) + +The following are explicitly **NOT** part of the memory system and should be handled elsewhere: + +| Item | Reason | Proper Location | +|------|--------|-----------------| +| **Context Caching** | Latency optimization, not long-term storage | LLM provider layer | +| **Code Index** | Repository structure, symbols, dependencies | `context_engine/` or future `code_index` subsystem | +| **Search Results** | Ephemeral retrieval results | Not persisted | +| **Embeddings** | Vector representations | Separate `memory_embeddings` table (P1) | +| **Conversation History** | Raw 
message logs | `memory_sources` table (input, not output) | + +### 2.2 Namespace and Isolation Strategy + +#### 2.2.1 Isolation Hierarchy + +``` +┌─────────────────────────────────────────────────────┐ +│ user_id (required) │ +│ ┌───────────────────────────────────────────────┐ │ +│ │ workspace_id (optional) │ │ +│ │ ┌─────────────────────────────────────────┐ │ │ +│ │ │ scope_type:scope_id (track/project) │ │ │ +│ │ │ ┌─────────────────────────────────────┐│ │ │ +│ │ │ │ provider (metadata only) ││ │ │ +│ │ │ └─────────────────────────────────────┘│ │ │ +│ │ └─────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────┘ +``` + +#### 2.2.2 Isolation Rules + +| Level | Field | Required | Behavior | +|-------|-------|----------|----------| +| **L1** | `user_id` | Yes | Primary boundary. Cross-user access is forbidden. | +| **L2** | `workspace_id` | No | Team/project isolation. Memories with `workspace_id=X` are only visible when querying with the same `workspace_id`. | +| **L3** | `scope_type:scope_id` | No | Track-level isolation for research contexts. | +| **Metadata** | `provider` | No | Provenance tracking only (ChatGPT/Gemini/Claude/...). Not used for query filtering. | + +#### 2.2.3 Scope Type Values + +| scope_type | Description | Visibility | +|------------|-------------|------------| +| `global` | User-wide memories | Visible across all tracks/workspaces for that user | +| `track` | Research direction specific | Only visible when that track is active | +| `project` | Project-specific context | Only visible within project scope | +| `paper` | Paper-specific notes | Only visible when analyzing that paper | + +#### 2.2.4 Query Resolution Logic + +```python +def resolve_visible_memories(user_id, workspace_id=None, scope_type=None, scope_id=None): + """ + Resolution order (most specific to least): + 1. Exact scope match (scope_type + scope_id) + 2. 
Global scope within workspace
+    3. Global scope for user (if no workspace specified)
+    """
+    base_filter = (item.user_id == user_id)
+
+    if workspace_id:
+        base_filter &= (item.workspace_id == workspace_id)
+
+    if scope_type and scope_id:
+        # Include both specific scope AND global
+        scope_filter = (
+            ((item.scope_type == scope_type) & (item.scope_id == scope_id)) |
+            (item.scope_type == 'global')
+        )
+    else:
+        scope_filter = (item.scope_type == 'global')
+
+    return base_filter & scope_filter
+```
+
+### 2.3 Acceptance Criteria Framework
+
+#### 2.3.1 Metric Definitions
+
+| Metric | Definition | Formula | Target |
+|--------|------------|---------|--------|
+| **Extraction Precision** | Fraction of extracted items that are correct | `correct_items / total_extracted_items` | ≥ 85% |
+| **False Positive Rate (Dirty Memory Rate)** | Fraction of approved items that are incorrect or harmful | `incorrect_approved / total_approved` | ≤ 5% |
+| **Retrieval Hit Rate** | Fraction of relevant memories retrieved when needed | `retrieved_relevant / total_relevant` | ≥ 80% |
+| **Injection Pollution Rate** | Fraction of responses negatively affected by wrong memory | `polluted_responses / total_responses_with_memory` | ≤ 2% |
+| **Deletion Compliance** | Deleted items must never be retrieved or injected | `deleted_items_retrieved == 0` | 100% |
+
+#### 2.3.2 Measurement Methods
+
+| Metric | Data Source | Frequency | Automation Level |
+|--------|-------------|-----------|------------------|
+| Extraction Precision | Human sampling (50 items/week) | Weekly | Manual with script support |
+| False Positive Rate | User corrections + manual audit | Weekly | Semi-automated |
+| Retrieval Hit Rate | Synthetic test queries + user feedback | Daily (CI) + Weekly (user) | Automated + Manual |
+| Injection Pollution Rate | A/B testing + user reports | Monthly | Manual |
+| Deletion Compliance | Automated regression test | Per-commit (CI) | Fully automated |
+
+#### 2.3.3 Metric Storage Schema
+
+```sql
+CREATE TABLE 
memory_eval_metrics ( + id INTEGER PRIMARY KEY, + metric_name TEXT NOT NULL, -- 'extraction_precision', 'retrieval_hit_rate', etc. + metric_value REAL NOT NULL, -- 0.0 to 1.0 + sample_size INTEGER, -- Number of items evaluated + evaluated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + evaluator_id TEXT, -- 'ci', 'human:', 'automated' + detail_json TEXT, -- Additional context (item IDs, query info, etc.) + + INDEX idx_metric_name_time (metric_name, evaluated_at) +); +``` + +--- + +## 3. Implementation Principles + +### 3.1 Core Design Principles + +| Principle | Description | Implementation | +|-----------|-------------|----------------| +| **Explicit over Implicit** | All memory types must have clear written definitions | Document in `docs/memory_types.md` + code docstrings | +| **Fail-Safe Defaults** | Low confidence or high PII risk → require review | `confidence < 0.6` → `status='pending'` | +| **Auditability First** | Every mutation must be logged | All changes create `memory_audit_log` entries | +| **Isolation by Default** | Scope to narrowest applicable boundary | Default `scope_type='global'`, require explicit promotion | +| **Separation of Concerns** | Each layer has single responsibility | Extraction → Storage → Retrieval → Injection | + +### 3.2 Fail-Safe Status Transitions + +``` + ┌──────────────┐ + │ Extracted │ + └──────┬───────┘ + │ + ┌────────────┴────────────┐ + │ │ + ▼ ▼ + ┌─────────────────┐ ┌─────────────────┐ + │ confidence ≥ 0.6│ │ confidence < 0.6│ + │ pii_risk < 2 │ │ OR pii_risk ≥ 2 │ + └────────┬────────┘ └────────┬────────┘ + │ │ + ▼ ▼ + ┌─────────────────┐ ┌─────────────────┐ + │ approved │ │ pending │ + │ (auto-active) │ │ (needs review) │ + └─────────────────┘ └────────┬────────┘ + │ + ┌─────────────┴─────────────┐ + │ │ + ▼ ▼ + ┌─────────────────┐ ┌─────────────────┐ + │ approved │ │ rejected │ + │ (human review) │ │ (discarded) │ + └─────────────────┘ └─────────────────┘ +``` + +### 3.3 Component Responsibilities + +| Component | 
Responsibility | Should NOT Do | +|-----------|----------------|---------------| +| **Extractor** | Parse input, produce candidates with confidence | Apply business rules, write to DB | +| **Store** | Dedup, apply status rules, persist, audit | Parse input, format output | +| **Retriever** | Find relevant approved items | Return pending/deleted items | +| **Injector** | Format for prompt, respect token budget | Store or modify memories | + +--- + +## 4. Technology Selection Rationale + +### 4.1 Metrics Storage: SQLite + +| Criterion | SQLite | InfluxDB | Prometheus | +|-----------|--------|----------|------------| +| Consistency with stack | ✅ Same DB | ❌ New dependency | ❌ New infra | +| Volume fit | ✅ Hundreds/day | Overkill | Overkill | +| Backup simplicity | ✅ Single file | ❌ Complex | ❌ Complex | +| Query flexibility | ✅ Full SQL | ⚠️ InfluxQL | ⚠️ PromQL | + +**Decision**: SQLite (same as main DB) + +### 4.2 Precision Evaluation: Human-in-the-Loop + +| Option | Pros | Cons | Decision | +|--------|------|------|----------| +| Human sampling | Ground truth quality | Labor-intensive | ✅ Selected | +| LLM-as-judge | Automated | Circular dependency | ❌ Rejected | +| Rule-based | Fast | Cannot judge semantic correctness | ❌ Rejected | + +**Decision**: Manual sampling (50 items/week) with script support. Build annotated dataset for future automation. + +### 4.3 Retrieval Evaluation: Synthetic Test Queries + +| Option | Pros | Cons | Decision | +|--------|------|------|----------| +| Synthetic fixtures | Reproducible, CI-friendly | May miss edge cases | ✅ Selected | +| Production sampling | Real-world coverage | Privacy concerns, non-deterministic | ⚠️ Future | +| User feedback | Direct signal | Sparse, delayed | ⚠️ Supplementary | + +**Decision**: Pre-defined query-answer pairs in `evals/memory/fixtures/`. Supplement with user feedback in P1. + +--- + +## 5. 
Best Practices and References + +### 5.1 Industry References + +| Source | Key Insight | Application | +|--------|-------------|-------------| +| **Generative Agents** (Stanford, 2023) | Memory streams with reflection layers | Validates episodic vs. long-term distinction | +| **MemGPT** (Berkeley, 2023) | Hierarchical memory with OS-like management | Validates scope-based isolation design | +| **ChatGPT Memory** (OpenAI, 2024) | User-controllable explicit memory | Justifies `pending/approved` workflow | +| **Claude Projects** (Anthropic, 2024) | Project-scoped context isolation | Validates workspace_id pattern | +| **GDPR Article 17** | Right to erasure | Mandates 100% deletion compliance | + +### 5.2 Academic Papers + +| Paper | Relevance | +|-------|-----------| +| Lewis et al., "RAG for Knowledge-Intensive NLP Tasks" (2020) | Retrieval + generation pattern for injection | +| Thakur et al., "BEIR: Zero-shot IR Evaluation" (2021) | Retrieval metric methodology | +| Park et al., "Generative Agents" (2023) | Memory stream architecture | +| Packer et al., "MemGPT" (2023) | Tiered memory management | + +### 5.3 Internal Documents + +| Document | Content | +|----------|---------| +| `docs/memory_survey.md` | Survey of memory architectures across products | +| `docs/memory_ui_controls_matrix.md` | UI patterns from ChatGPT, Claude, Gemini, etc. | +| `docs/memory.md` | Current API documentation | +| `src/paperbot/memory/schema.py` | Existing type definitions | + +--- + +## 6. 
Risks and Mitigations + +### 6.1 Technical Risks + +| Risk | Likelihood | Impact | Mitigation | +|------|------------|--------|------------| +| Metrics collection slows API | Medium | Medium | Async logging; batch writes; sampling | +| Human labeling bottleneck | High | Low | Start small (50/week); automate obvious cases | +| Deletion compliance gaps | Low | Critical | DB constraint; nightly audit; hard delete for GDPR | +| Scope confusion in queries | Medium | Medium | Clear error messages; API validation; docs | + +### 6.2 Process Risks + +| Risk | Likelihood | Impact | Mitigation | +|------|------------|--------|------------| +| Metric definitions drift | Medium | Medium | Document in code; review in PRs | +| Evaluation dataset staleness | High | Low | Quarterly refresh; add edge cases from production | +| Unrealistic targets | Medium | Low | Start with industry benchmarks; adjust after first month | + +### 6.3 Compliance Risks + +| Risk | Likelihood | Impact | Mitigation | +|------|------------|--------|------------| +| PII in metrics | Low | High | Never log raw content; only IDs and scores | +| Audit log tampering | Low | Critical | Append-only design; consider signed entries | + +--- + +## 7. 
Workload Estimation + +### 7.1 Task Breakdown + +| Task | Effort | Dependencies | +|------|--------|--------------| +| **Documentation** | | | +| Create `docs/memory_types.md` with taxonomy | 2h | None | +| Create `docs/memory_isolation.md` with rules | 1h | Taxonomy | +| Update `schema.py` docstrings | 1h | Taxonomy | +| **Evaluation Infrastructure** | | | +| Add `memory_eval_metrics` table | 1h | None | +| Implement `MemoryMetricCollector` class | 3h | Table | +| Hook collector into API endpoints | 2h | Collector | +| Add `/api/memory/metrics` endpoint | 1h | Collector | +| **Test Harness** | | | +| Create extraction precision test | 2h | Collector | +| Create retrieval hit rate test | 2h | Collector | +| Create deletion compliance test | 1h | Collector | +| Add CI job for regression | 1h | Tests | + +### 7.2 Summary + +| Category | Hours | +|----------|-------| +| Documentation | 4h | +| Evaluation Infrastructure | 7h | +| Test Harness | 6h | +| **Total** | **17h (~2 days)** | + +### 7.3 Suggested Timeline + +``` +Day 1 (Morning): Documentation (4h) + - memory_types.md + - memory_isolation.md + - schema.py docstrings + +Day 1 (Afternoon): Evaluation Infrastructure (4h) + - memory_eval_metrics table + - MetricCollector class (partial) + +Day 2 (Morning): Evaluation Infrastructure + Tests (5h) + - MetricCollector completion + - API integration + - Test harness + +Day 2 (Afternoon): CI Integration + Review (4h) + - CI job setup + - Code review + - Update memory_todo.md +``` + +--- + +## 8. 
Deliverables Checklist + +Upon completion of P0, the following artifacts will exist: + +### 8.1 Documentation +- [ ] `docs/memory_types.md` - Formal taxonomy with definitions +- [ ] `docs/memory_isolation.md` - Namespace and isolation rules +- [ ] `src/paperbot/memory/schema.py` - Updated docstrings + +### 8.2 Code +- [ ] `src/paperbot/memory/eval/__init__.py` - Evaluation module +- [ ] `src/paperbot/memory/eval/collector.py` - MetricCollector implementation +- [ ] `src/paperbot/infrastructure/stores/models.py` - `MemoryEvalMetricModel` added +- [ ] `src/paperbot/api/routes/memory.py` - `/api/memory/metrics` endpoint + +### 8.3 Tests +- [ ] `evals/memory/test_extraction_precision.py` +- [ ] `evals/memory/test_retrieval_hit_rate.py` +- [ ] `evals/memory/test_deletion_compliance.py` +- [ ] `evals/memory/fixtures/` - Synthetic test data + +### 8.4 CI/CD +- [ ] `.github/workflows/ci.yml` - Memory eval job added + +### 8.5 Tracking +- [ ] `docs/memory_todo.md` - P0 items marked complete + +--- + +## 9. Open Questions + +The following questions require user input before implementation: + +1. **Extraction language support**: Current heuristics are Chinese-focused. Should P0 include English patterns, or defer to later phase? + +2. **Human labeling workflow**: Should we build a simple web UI for labeling, or use external tools (Label Studio, spreadsheet)? + +3. **Metrics retention period**: How long should evaluation metrics be stored? (30 days / 90 days / indefinitely) + +4. **Deletion compliance scope**: Should 100% target apply to soft-delete only, or require hard-delete for GDPR? 
+ +--- + +## Appendix A: Existing Implementation Summary + +### A.1 Current Schema (Already Implemented) + +**`memory_items` table fields**: +- Core: `id`, `user_id`, `workspace_id`, `scope_type`, `scope_id` +- Content: `kind`, `content`, `content_hash`, `confidence` +- Status: `status` (pending/approved/rejected/superseded), `supersedes_id` +- Lifecycle: `expires_at`, `last_used_at`, `use_count` +- Security: `pii_risk`, `deleted_at`, `deleted_reason` +- Audit: `created_at`, `updated_at`, `source_id` + +**`memory_audit_log` table fields**: +- `id`, `ts`, `actor_id`, `user_id`, `workspace_id` +- `action` (create/update/approve/reject/delete/hard_delete/use) +- `item_id`, `source_id`, `detail_json` + +### A.2 Current API Endpoints + +| Endpoint | Method | Status | +|----------|--------|--------| +| `/api/memory/ingest` | POST | ✅ Implemented | +| `/api/memory/list` | GET | ✅ Implemented | +| `/api/memory/context` | POST | ✅ Implemented | +| `/api/memory/items/{id}` | PATCH | ✅ Implemented | +| `/api/memory/items/{id}` | DELETE | ✅ Implemented | +| `/api/memory/metrics` | GET | ❌ To be added | + +### A.3 Current Parsers + +| Platform | Format | Status | +|----------|--------|--------| +| ChatGPT | conversations.json | ✅ Working | +| Gemini | API logs (loose JSON) | ✅ Working | +| Generic | {"messages": [...]} | ✅ Working | +| Plain text | User:/Assistant: prefix | ✅ Working | +| Claude | Export format | ❌ Not implemented | +| Perplexity | Unknown | ❌ Not implemented | + + + Phase 1: Documentation + + | File | Status | + |--------------------------|---------| + | docs/memory_types.md | Created | + | docs/memory_isolation.md | Created | + + Phase 2: Evaluation Infrastructure + + | File | Status | + |----------------------------------------------|------------------------------------------------| + | src/paperbot/infrastructure/stores/models.py | Modified (added MemoryEvalMetricModel) | + | src/paperbot/memory/eval/__init__.py | Created | + | 
src/paperbot/memory/eval/collector.py | Created | + | src/paperbot/api/routes/memory.py | Modified (added /api/memory/metrics endpoints) | + + Phase 3: Test Harness + + | File | Status | + |----------------------------------------------|---------| + | evals/memory/__init__.py | Created | + | evals/memory/fixtures/sample_memories.json | Created | + | evals/memory/fixtures/retrieval_queries.json | Created | + | evals/memory/test_deletion_compliance.py | Created | + | evals/memory/test_retrieval_hit_rate.py | Created | + + Phase 4: CI Integration + + | File | Status | + |--------------------------|----------------------------------| + | .github/workflows/ci.yml | Modified (added memory P0 tests) | \ No newline at end of file diff --git a/docs/memory_isolation.md b/docs/memory_isolation.md new file mode 100644 index 0000000..043b572 --- /dev/null +++ b/docs/memory_isolation.md @@ -0,0 +1,144 @@ +# Memory Namespace and Isolation Strategy + +This document describes the isolation boundaries and query resolution rules implemented in the memory system. + +> **Reference Implementation:** `src/paperbot/infrastructure/stores/memory_store.py` + +## Isolation Hierarchy + +Memory items are isolated using a three-level namespace hierarchy, as defined in `memory_todo.md`: + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Level 1: user_id (required) │ +│ ┌───────────────────────────────────────────────────────────┐ │ +│ │ Level 2: workspace_id (optional) │ │ +│ │ ┌─────────────────────────────────────────────────────┐ │ │ +│ │ │ Level 3: scope_type:scope_id (optional) │ │ │ +│ │ └─────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Level 1: user_id (Required) + +The primary isolation boundary. Cross-user access is forbidden. 
+ +**Implementation:** Every query in `memory_store.py` starts with: +```python +stmt = select(MemoryItemModel).where(MemoryItemModel.user_id == user_id) +``` + +## Level 2: workspace_id (Optional) + +Team or project-level isolation within a user's memories. + +**Implementation:** `memory_store.py` line 293-294: +```python +if workspace_id is not None: + stmt = stmt.where(MemoryItemModel.workspace_id == workspace_id) +``` + +**Behavior:** +- If `workspace_id` is set on a memory, it is only visible when querying with the same `workspace_id` +- Memories with `workspace_id=NULL` are user-global + +## Level 3: scope_type:scope_id (Optional) + +Fine-grained isolation for research tracks, projects, or papers. + +**Implementation:** `memory_store.py` lines 296-301: +```python +if scope_type is not None: + if scope_type == "global": + stmt = stmt.where(or_(MemoryItemModel.scope_type == scope_type, MemoryItemModel.scope_type.is_(None))) + else: + stmt = stmt.where(MemoryItemModel.scope_type == scope_type) +if scope_id is not None: + stmt = stmt.where(MemoryItemModel.scope_id == scope_id) +``` + +**Scope Types:** + +| scope_type | scope_id | Usage | +|------------|----------|-------| +| `global` | NULL | User-wide memories, visible across all tracks | +| `track` | track_id | Research direction specific | +| `project` | project_id | Project-specific context | +| `paper` | paper_id | Paper-specific notes | + +## Content Hash Deduplication + +To prevent duplicates while allowing the same content in different scopes, the content hash includes scope information. + +**Implementation:** `memory_store.py` lines 168-171: +```python +# Dedup within a user's scope boundary (prevents cross-track pollution/dedup collisions). +content_hash = _sha256_text( + f"{effective_scope_type}:{effective_scope_id or ''}:{m.kind}:{content}" +) +``` + +## Status Filtering + +Queries only return approved, non-deleted, non-expired items. 
+ +**Implementation:** `memory_store.py` lines 303-305: +```python +stmt = stmt.where(MemoryItemModel.deleted_at.is_(None)) +stmt = stmt.where(MemoryItemModel.status == "approved") +stmt = stmt.where(or_(MemoryItemModel.expires_at.is_(None), MemoryItemModel.expires_at > now)) +``` + +## Auto-Status Assignment + +Status is automatically set based on confidence and PII risk. + +**Implementation:** `memory_store.py` lines 174-178: +```python +effective_status = "approved" if float(m.confidence) >= 0.60 else "pending" +pii_risk = _estimate_pii_risk(content) +if pii_risk >= 2 and effective_status == "approved": + effective_status = "pending" +``` + +**Rules:** +- `confidence >= 0.60` → `approved` +- `confidence < 0.60` → `pending` +- `pii_risk >= 2` → override to `pending` (requires human review) + +## Provider as Metadata + +The source platform (ChatGPT/Gemini/Claude) is stored in `memory_sources.platform`, not in `memory_items`. + +**Rationale:** From `memory_survey.md` section 6.1: +> 建议拆成三张逻辑表: +> - `memory_sources`:导入批次(平台、文件名、sha256、统计信息) +> - `memory_items`:稳定条目 + +Provider is for provenance tracking, not query filtering. + +## Audit Trail + +All mutations are logged in `memory_audit_log`. + +**Implementation:** `memory_store.py` lines 205-220: +```python +session.add( + MemoryAuditLogModel( + ts=now, + actor_id=actor_id, + user_id=user_id, + workspace_id=workspace_id, + action="create", + item_id=row.id, + ... + ) +) +``` + +## References + +- Requirements: `docs/memory_todo.md` P0 section +- Survey: `docs/memory_survey.md` section 6 +- Implementation: `src/paperbot/infrastructure/stores/memory_store.py` diff --git a/docs/memory_types.md b/docs/memory_types.md new file mode 100644 index 0000000..9e23b6c --- /dev/null +++ b/docs/memory_types.md @@ -0,0 +1,141 @@ +# Memory Type Taxonomy + +This document defines the formal taxonomy for memory types in PaperBot's cross-platform memory middleware. 
+ +## Overview + +Memory items are categorized into three main categories based on their purpose and lifecycle: + +1. **User Memory** - Stable, cross-session information about the user +2. **Episodic Memory** - Session-derived facts with potential decay +3. **Workspace/Project Memory** - Scope-specific context + +## Memory Type Definitions + +### User Memory + +Long-term, stable information about the user that persists across sessions. + +| Kind | Definition | Persistence | Confidence Threshold | Examples | +|------|------------|-------------|---------------------|----------| +| `profile` | Identity facts: name, role, affiliation, background | Permanent until edited | 0.85 | "My name is Jerry", "I'm a PhD student at MIT" | +| `preference` | Style, format, or interaction preferences | Semi-permanent | 0.72 | "I prefer concise answers", "Use Chinese for responses" | +| `goal` | Long-term objectives or research directions | Session-spanning | 0.68 | "I'm researching LLM memory systems", "Working on a paper about RAG" | +| `constraint` | Hard rules or requirements that must be followed | Permanent | 0.62 | "Never include code examples", "Always cite sources" | +| `project` | Project-level context and background | Project-scoped | 0.70 | "PaperBot is a research workflow tool", "The codebase uses FastAPI" | + +### Episodic Memory + +Information derived from specific sessions or interactions, subject to decay or update. 
+ +| Kind | Definition | Persistence | Confidence Threshold | Examples | +|------|------------|-------------|---------------------|----------| +| `fact` | Specific facts mentioned during interaction | Decaying (use_count tracked) | 0.65 | "User's deadline is Friday", "Paper was submitted to NeurIPS" | +| `decision` | Key decisions made during sessions | Project-scoped | 0.70 | "We chose SQLite over Postgres", "Using Docker for sandboxing" | +| `hypothesis` | Tentative assumptions or inferences | Low-priority, may expire | 0.50 | "User may be a graduate student", "Likely interested in NLP" | +| `todo` | Action items or pending tasks | Time-bound (expires_at) | 0.58 | "Need to implement FTS5", "Review the memory module" | + +### Workspace/Project Memory + +Scope-specific information tied to a particular workspace or research track. + +| Kind | Definition | Persistence | Confidence Threshold | Examples | +|------|------------|-------------|---------------------|----------| +| `keyword_set` | Domain-specific keywords or focus areas | Project-scoped | 0.75 | "Focus areas: RAG, memory, multi-agent" | +| `note` | Free-form annotations or observations | Project-scoped | 0.60 | "Important: check GDPR compliance", "Consider using vector DB" | + +## What is NOT Memory + +The following are explicitly **excluded** from the memory system: + +| Item | Reason | Where It Belongs | +|------|--------|------------------| +| **Context Caching** | Performance optimization, not long-term storage | LLM provider layer (prompt caching) | +| **Code Index** | Repository structure, symbols, dependencies | `context_engine/` or `code_index` subsystem | +| **Search Results** | Ephemeral retrieval results | Not persisted; used immediately | +| **Embeddings** | Vector representations of content | Separate `memory_embeddings` table (future) | +| **Raw Conversation History** | Unprocessed message logs | `memory_sources` table (input, not output) | +| **Temporary Variables** | Session-only working 
memory | In-memory only, not persisted | + +## Status Workflow + +Memory items follow a status workflow based on confidence and risk: + +``` +Extracted (MemoryCandidate) + │ + ├── confidence >= 0.60 AND pii_risk < 2 + │ │ + │ ▼ + │ ┌─────────────┐ + │ │ approved │ ──── Can be retrieved and injected + │ └─────────────┘ + │ + └── confidence < 0.60 OR pii_risk >= 2 + │ + ▼ + ┌─────────────┐ + │ pending │ ──── Requires human review + └─────────────┘ + │ + ┌─────────┴─────────┐ + │ │ + ▼ ▼ + ┌─────────────┐ ┌─────────────┐ + │ approved │ │ rejected │ + └─────────────┘ └─────────────┘ +``` + +## Confidence Score Guidelines + +When implementing extractors, use these confidence score ranges: + +| Score Range | Meaning | Action | +|-------------|---------|--------| +| 0.85 - 1.00 | High confidence, explicit statement | Auto-approve | +| 0.70 - 0.84 | Good confidence, clear inference | Auto-approve | +| 0.60 - 0.69 | Moderate confidence | Auto-approve (borderline) | +| 0.50 - 0.59 | Low confidence | Pending (needs review) | +| 0.00 - 0.49 | Very low confidence | Pending or reject | + +## PII Risk Levels + +| Level | Description | Behavior | +|-------|-------------|----------| +| 0 | No PII detected | Normal processing | +| 1 | Possible PII (redacted patterns found) | Normal, but flagged | +| 2 | High PII risk (raw PII detected) | Auto-pending, requires review | + +## Usage in Code + +```python +from paperbot.memory.schema import MemoryKind, MemoryCandidate + +# Valid memory kinds +valid_kinds = [ + # User Memory + "profile", "preference", "goal", "constraint", "project", + # Episodic Memory + "fact", "decision", "hypothesis", "todo", + # Workspace Memory + "keyword_set", "note" +] + +# Creating a memory candidate +candidate = MemoryCandidate( + kind="preference", + content="User prefers concise answers", + confidence=0.75, + tags=["style", "output"], + evidence="From conversation: 'Please keep responses brief'", + scope_type="global", + scope_id=None, + status="approved" # 
Will be set by store based on confidence +) +``` + +## References + +- Design Document: `docs/P0_memory_scope_acceptance_design.md` +- Schema Definition: `src/paperbot/memory/schema.py` +- Store Implementation: `src/paperbot/infrastructure/stores/memory_store.py` diff --git a/evals/memory/__init__.py b/evals/memory/__init__.py new file mode 100644 index 0000000..a42d48a --- /dev/null +++ b/evals/memory/__init__.py @@ -0,0 +1,7 @@ +""" +Memory evaluation tests for Scope and Acceptance criteria. + +Test files: +- test_deletion_compliance.py: Verify deleted items never retrieved +- test_retrieval_hit_rate.py: Verify relevant memories are found +""" diff --git a/evals/memory/fixtures/retrieval_queries.json b/evals/memory/fixtures/retrieval_queries.json new file mode 100644 index 0000000..3a7c97a --- /dev/null +++ b/evals/memory/fixtures/retrieval_queries.json @@ -0,0 +1,35 @@ +{ + "description": "Synthetic test queries for retrieval hit rate evaluation", + "test_cases": [ + { + "id": "pref_output_style", + "query": "output format preference", + "expected_kinds": ["preference"], + "expected_keywords": ["concise", "brief", "format"] + }, + { + "id": "project_context", + "query": "current project", + "expected_kinds": ["project"], + "expected_keywords": ["project", "working on", "building"] + }, + { + "id": "user_identity", + "query": "user name role", + "expected_kinds": ["profile"], + "expected_keywords": ["name", "role", "position"] + }, + { + "id": "constraints", + "query": "rules restrictions", + "expected_kinds": ["constraint"], + "expected_keywords": ["never", "always", "must", "don't"] + }, + { + "id": "goals", + "query": "goals objectives", + "expected_kinds": ["goal"], + "expected_keywords": ["goal", "want", "trying to", "working towards"] + } + ] +} diff --git a/evals/memory/fixtures/sample_memories.json b/evals/memory/fixtures/sample_memories.json new file mode 100644 index 0000000..6445177 --- /dev/null +++ b/evals/memory/fixtures/sample_memories.json @@ -0,0 
+1,42 @@ +{ + "description": "Sample memories for testing extraction and retrieval", + "user_id": "test_user", + "memories": [ + { + "kind": "profile", + "content": "User's name is Jerry, a PhD student at MIT", + "confidence": 0.85, + "tags": ["identity", "name", "role"] + }, + { + "kind": "preference", + "content": "User prefers concise, brief responses without excessive explanation", + "confidence": 0.75, + "tags": ["style", "output", "format"] + }, + { + "kind": "project", + "content": "Currently working on PaperBot, a research workflow tool", + "confidence": 0.80, + "tags": ["project", "paperbot"] + }, + { + "kind": "constraint", + "content": "Never include code examples unless explicitly requested", + "confidence": 0.70, + "tags": ["rules", "code"] + }, + { + "kind": "goal", + "content": "Working towards building a memory middleware for LLM applications", + "confidence": 0.72, + "tags": ["goal", "memory", "llm"] + }, + { + "kind": "fact", + "content": "Project deadline is end of Q1 2025", + "confidence": 0.65, + "tags": ["deadline", "timeline"] + } + ] +} diff --git a/evals/memory/test_deletion_compliance.py b/evals/memory/test_deletion_compliance.py new file mode 100644 index 0000000..8f7fa86 --- /dev/null +++ b/evals/memory/test_deletion_compliance.py @@ -0,0 +1,150 @@ +""" +Deletion Compliance Test (Acceptance Criteria) + +Target: 100% - Deleted items must NEVER be retrieved. + +This test: +1. Creates test memories +2. Soft-deletes some items +3. Attempts to retrieve them via search +4. Verifies deleted items are not returned +5. 
Records the metric
+"""
+
+import os
+import sys
+import tempfile
+
+# Add src to path for imports
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "src"))
+
+from paperbot.infrastructure.stores.memory_store import SqlAlchemyMemoryStore
+from paperbot.memory.schema import MemoryCandidate
+from paperbot.memory.eval.collector import MemoryMetricCollector
+
+
+def run_deletion_compliance_test() -> dict:
+    """
+    Run deletion compliance test and return results.
+
+    Returns:
+        dict with keys: passed, deleted_count, retrieved_count, compliance_rate
+    """
+    # Use temp database for isolation
+    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
+        db_path = f.name
+
+    try:
+        db_url = f"sqlite:///{db_path}"
+        store = SqlAlchemyMemoryStore(db_url=db_url)
+        collector = MemoryMetricCollector(store._provider)
+
+        user_id = "test_deletion_user"
+
+        # Step 1: Create test memories
+        test_memories = [
+            MemoryCandidate(kind="profile", content="Test user name is Alice", confidence=0.85),
+            MemoryCandidate(kind="preference", content="Prefers markdown format", confidence=0.75),
+            MemoryCandidate(kind="project", content="Working on test project", confidence=0.80),
+            MemoryCandidate(kind="goal", content="Goal is to test deletion", confidence=0.70),
+            MemoryCandidate(kind="fact", content="Test fact that will be deleted", confidence=0.65),
+        ]
+
+        created, skipped, _rows = store.add_memories(
+            user_id=user_id,
+            memories=test_memories,
+            actor_id="test",
+        )
+
+        # Query to get the created memory IDs (rows are detached from session)
+        all_items = store.list_memories(user_id=user_id, limit=100)
+        item_ids = [item["id"] for item in all_items]
+
+        # Step 2: Soft-delete some items (last 2 items)
+        deleted_ids = []
+        for i, item_id in enumerate(item_ids):
+            if i >= 3:  # Delete last 2 items
+                store.soft_delete_item(
+                    user_id=user_id,
+                    item_id=item_id,
+                    actor_id="test",
+                    reason="Test deletion",
+                )
+                deleted_ids.append(item_id)
+
+        deleted_count =
len(deleted_ids) + + # Step 3: Attempt to retrieve ALL items via search + # Use broad queries that should match deleted items if they were returned + queries = ["test", "goal", "deletion", "fact", "Alice", "project"] + + retrieved_deleted_count = 0 + for query in queries: + results = store.search_memories( + user_id=user_id, + query=query, + limit=100, + ) + # Check if any deleted ID appears in results + result_ids = {r.get("id") for r in results} + for deleted_id in deleted_ids: + if deleted_id in result_ids: + retrieved_deleted_count += 1 + print(f"FAIL: Deleted item {deleted_id} was retrieved for query '{query}'") + + # Step 4: Also check list_memories + all_items = store.list_memories(user_id=user_id, limit=100) + for item in all_items: + if item.get("id") in deleted_ids: + retrieved_deleted_count += 1 + print(f"FAIL: Deleted item {item.get('id')} was returned in list_memories") + + # Step 5: Calculate compliance + # Compliance = 1.0 means no deleted items were retrieved + compliance_rate = 1.0 if retrieved_deleted_count == 0 else 0.0 + + # Record metric + collector.record_deletion_compliance( + deleted_retrieved_count=retrieved_deleted_count, + deleted_total_count=deleted_count, + evaluator_id="test:deletion_compliance", + detail={ + "deleted_ids": deleted_ids, + "queries_tested": queries, + }, + ) + + passed = compliance_rate == 1.0 + + return { + "passed": passed, + "deleted_count": deleted_count, + "retrieved_count": retrieved_deleted_count, + "compliance_rate": compliance_rate, + } + + finally: + # Cleanup + if os.path.exists(db_path): + os.unlink(db_path) + + +def main(): + print("=" * 60) + print("P0 Deletion Compliance Test") + print("Target: 100% (deleted items must never be retrieved)") + print("=" * 60) + + result = run_deletion_compliance_test() + + print(f"\nResults:") + print(f" Deleted items: {result['deleted_count']}") + print(f" Retrieved (should be 0): {result['retrieved_count']}") + print(f" Compliance rate: {result['compliance_rate']:.1%}") 
+ print(f" Status: {'PASS' if result['passed'] else 'FAIL'}") + + return 0 if result["passed"] else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/evals/memory/test_retrieval_hit_rate.py b/evals/memory/test_retrieval_hit_rate.py new file mode 100644 index 0000000..7a23741 --- /dev/null +++ b/evals/memory/test_retrieval_hit_rate.py @@ -0,0 +1,161 @@ +""" +Retrieval Hit Rate Test (Acceptance Criteria) + +Target: >= 80% - Relevant memories should be retrieved when needed. + +This test: +1. Loads sample memories from fixtures +2. Runs queries from fixtures +3. Checks if expected memory kinds are retrieved +4. Calculates hit rate +5. Records the metric +""" + +import json +import os +import sys +import tempfile +from pathlib import Path + +# Add src to path for imports +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "src")) + +from paperbot.infrastructure.stores.memory_store import SqlAlchemyMemoryStore +from paperbot.memory.schema import MemoryCandidate +from paperbot.memory.eval.collector import MemoryMetricCollector + + +FIXTURES_DIR = Path(__file__).parent / "fixtures" + + +def load_fixtures(): + """Load test fixtures.""" + with open(FIXTURES_DIR / "sample_memories.json") as f: + memories_data = json.load(f) + + with open(FIXTURES_DIR / "retrieval_queries.json") as f: + queries_data = json.load(f) + + return memories_data, queries_data + + +def run_retrieval_hit_rate_test() -> dict: + """ + Run retrieval hit rate test and return results. 
+
+    Returns:
+        dict with keys: passed, total_queries, hits, hit_rate, details
+    """
+    # Use temp database for isolation
+    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
+        db_path = f.name
+
+    try:
+        db_url = f"sqlite:///{db_path}"
+        store = SqlAlchemyMemoryStore(db_url=db_url)
+        collector = MemoryMetricCollector(store._provider)
+
+        # Load fixtures
+        memories_data, queries_data = load_fixtures()
+        user_id = memories_data["user_id"]
+
+        # Step 1: Insert sample memories
+        candidates = [
+            MemoryCandidate(
+                kind=m["kind"],
+                content=m["content"],
+                confidence=m["confidence"],
+                tags=m.get("tags", []),
+            )
+            for m in memories_data["memories"]
+        ]
+
+        store.add_memories(
+            user_id=user_id,
+            memories=candidates,
+            actor_id="test",
+        )
+
+        # Step 2: Run queries and check hits
+        total_queries = len(queries_data["test_cases"])
+        hits = 0
+        details = []
+
+        for test_case in queries_data["test_cases"]:
+            query = test_case["query"]
+            expected_kinds = set(test_case["expected_kinds"])
+
+            # Search
+            results = store.search_memories(
+                user_id=user_id,
+                query=query,
+                limit=10,
+            )
+
+            # Check if expected kinds are found
+            result_kinds = {r.get("kind") for r in results}
+            found_expected = bool(expected_kinds & result_kinds)
+
+            if found_expected:
+                hits += 1
+                status = "HIT"
+            else:
+                status = "MISS"
+
+            detail = {
+                "id": test_case["id"],
+                "query": query,
+                "expected_kinds": list(expected_kinds),
+                "result_kinds": list(result_kinds),
+                "status": status,
+            }
+            details.append(detail)
+            print(f"  {test_case['id']}: {status} (expected {expected_kinds}, got {result_kinds})")
+
+        # Step 3: Calculate hit rate
+        hit_rate = hits / total_queries if total_queries > 0 else 0.0
+
+        # Step 4: Record metric
+        collector.record_retrieval_hit_rate(
+            hits=hits,
+            expected=total_queries,
+            evaluator_id="test:retrieval_hit_rate",
+            detail={"test_cases": details},
+        )
+
+        passed = hit_rate >= 0.80  # Target: >= 80%
+
+        return {
+            "passed": passed,
+            "total_queries": total_queries,
+ "hits": hits, + "hit_rate": hit_rate, + "details": details, + } + + finally: + # Cleanup + if os.path.exists(db_path): + os.unlink(db_path) + + +def main(): + print("=" * 60) + print("Retrieval Hit Rate Test") + print("Target: >= 80% (relevant memories should be retrieved)") + print("=" * 60) + + result = run_retrieval_hit_rate_test() + + print(f"\nResults:") + print(f" Total queries: {result['total_queries']}") + print(f" Hits: {result['hits']}") + print(f" Hit rate: {result['hit_rate']:.1%}") + print(f" Target: >= 80%") + print(f" Status: {'PASS' if result['passed'] else 'FAIL'}") + + return 0 if result["passed"] else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/paperbot/api/routes/memory.py b/src/paperbot/api/routes/memory.py index 9bbc9ba..c2a862c 100644 --- a/src/paperbot/api/routes/memory.py +++ b/src/paperbot/api/routes/memory.py @@ -10,6 +10,7 @@ from paperbot.memory import build_memory_context, extract_memories, parse_chat_log from paperbot.memory.schema import MemoryCandidate +from paperbot.memory.eval.collector import MemoryMetricCollector router = APIRouter() _store = SqlAlchemyMemoryStore() @@ -262,3 +263,46 @@ def delete_memory_item( if not ok: raise HTTPException(status_code=404, detail="memory item not found") return {"status": "ok"} + + +# --- Metrics Endpoints (Scope and Acceptance Criteria) --- + +_metric_collector = MemoryMetricCollector(_store._provider) + + +class MetricsSummaryResponse(BaseModel): + status: str # "pass" or "fail" + metrics: Dict[str, Any] + targets: Dict[str, float] + + +@router.get("/memory/metrics", response_model=MetricsSummaryResponse) +def get_memory_metrics(): + """ + Get summary of memory system evaluation metrics. 
+ + Returns the latest value for each P0 acceptance metric: + - extraction_precision (target: >= 85%) + - false_positive_rate (target: <= 5%) + - retrieval_hit_rate (target: >= 80%) + - injection_pollution_rate (target: <= 2%) + - deletion_compliance (target: 100%) + """ + return _metric_collector.get_metrics_summary() + + +class MetricHistoryResponse(BaseModel): + metric_name: str + history: List[Dict[str, Any]] + + +@router.get("/memory/metrics/{metric_name}", response_model=MetricHistoryResponse) +def get_metric_history(metric_name: str, limit: int = Query(30, ge=1, le=100)): + """Get historical values for a specific metric.""" + if metric_name not in MemoryMetricCollector.TARGETS: + raise HTTPException( + status_code=400, + detail=f"Unknown metric: {metric_name}. Valid: {list(MemoryMetricCollector.TARGETS.keys())}", + ) + history = _metric_collector.get_metric_history(metric_name, limit=limit) + return MetricHistoryResponse(metric_name=metric_name, history=history) diff --git a/src/paperbot/api/routes/research.py b/src/paperbot/api/routes/research.py index 11a24af..8d19649 100644 --- a/src/paperbot/api/routes/research.py +++ b/src/paperbot/api/routes/research.py @@ -9,6 +9,7 @@ from paperbot.context_engine.track_router import TrackRouter from paperbot.infrastructure.stores.memory_store import SqlAlchemyMemoryStore from paperbot.infrastructure.stores.research_store import SqlAlchemyResearchStore +from paperbot.memory.eval.collector import MemoryMetricCollector from paperbot.memory.extractor import extract_memories from paperbot.memory.schema import MemoryCandidate, NormalizedMessage @@ -17,6 +18,15 @@ _research_store = SqlAlchemyResearchStore() _memory_store = SqlAlchemyMemoryStore() _track_router = TrackRouter(research_store=_research_store, memory_store=_memory_store) +_metric_collector: Optional[MemoryMetricCollector] = None + + +def _get_metric_collector() -> MemoryMetricCollector: + """Lazy initialization of metric collector.""" + global _metric_collector + 
if _metric_collector is None: + _metric_collector = MemoryMetricCollector() + return _metric_collector def _schedule_embedding_precompute( @@ -361,6 +371,9 @@ class BulkModerateResponse(BaseModel): @router.post("/research/memory/bulk_moderate", response_model=BulkModerateResponse) def bulk_moderate(req: BulkModerateRequest, background_tasks: BackgroundTasks): + # Get items before update to check their confidence for P0 metrics + items_before = _memory_store.get_items_by_ids(user_id=req.user_id, item_ids=req.item_ids) + updated = _memory_store.bulk_update_items( user_id=req.user_id, item_ids=req.item_ids, @@ -373,6 +386,26 @@ def bulk_moderate(req: BulkModerateRequest, background_tasks: BackgroundTasks): if i.get("scope_type") == "track" and i.get("scope_id") ] _schedule_embedding_precompute(background_tasks, user_id=req.user_id, track_ids=affected_tracks) + + # P0 Hook: Record false positive rate when user rejects high-confidence items + # A rejection of an auto-approved (confidence >= 0.60) item is a false positive + if req.status == "rejected" and items_before: + high_confidence_rejected = sum( + 1 for item in items_before + if item.get("confidence", 0) >= 0.60 and item.get("status") == "approved" + ) + if high_confidence_rejected > 0: + collector = _get_metric_collector() + collector.record_false_positive_rate( + false_positive_count=high_confidence_rejected, + total_approved_count=len(items_before), + evaluator_id=f"user:{req.user_id}", + detail={ + "item_ids": req.item_ids, + "action": "bulk_moderate_reject", + }, + ) + return BulkModerateResponse(user_id=req.user_id, updated=updated) @@ -410,6 +443,72 @@ def bulk_move(req: BulkMoveRequest, background_tasks: BackgroundTasks): return BulkMoveResponse(user_id=req.user_id, updated=updated) +class MemoryFeedbackRequest(BaseModel): + """Request to record feedback on retrieved memories.""" + user_id: str = "default" + memory_ids: List[int] = Field(..., min_length=1, description="IDs of memories being rated") + 
helpful_ids: List[int] = Field(default_factory=list, description="IDs of memories that were helpful") + not_helpful_ids: List[int] = Field(default_factory=list, description="IDs of memories that were not helpful") + context_run_id: Optional[int] = None + query: Optional[str] = None + + +class MemoryFeedbackResponse(BaseModel): + user_id: str + total_rated: int + helpful_count: int + not_helpful_count: int + hit_rate: float + + +@router.post("/research/memory/feedback", response_model=MemoryFeedbackResponse) +def record_memory_feedback(req: MemoryFeedbackRequest): + """ + Record user feedback on retrieved memories. + + This endpoint allows users to indicate which memories were helpful vs not helpful + when they were retrieved for a query. This data feeds into the P0 retrieval_hit_rate metric. + + Usage: + - After building context, frontend shows retrieved memories + - User marks which memories were helpful + - Frontend calls this endpoint with the feedback + """ + helpful_set = set(req.helpful_ids) + not_helpful_set = set(req.not_helpful_ids) + + # Count hits (helpful) and misses (not helpful) + hits = len(helpful_set) + total = len(req.memory_ids) + + if total > 0: + hit_rate = hits / total + collector = _get_metric_collector() + collector.record_retrieval_hit_rate( + hits=hits, + expected=total, + evaluator_id=f"user:{req.user_id}", + detail={ + "memory_ids": req.memory_ids, + "helpful_ids": list(helpful_set), + "not_helpful_ids": list(not_helpful_set), + "context_run_id": req.context_run_id, + "query": req.query, + "action": "memory_feedback", + }, + ) + else: + hit_rate = 0.0 + + return MemoryFeedbackResponse( + user_id=req.user_id, + total_rated=total, + helpful_count=hits, + not_helpful_count=len(not_helpful_set), + hit_rate=hit_rate, + ) + + class ClearTrackMemoryResponse(BaseModel): user_id: str track_id: int @@ -433,6 +532,41 @@ def clear_track_memory( reason="clear_track_memory", ) _schedule_embedding_precompute(background_tasks, user_id=user_id, 
track_ids=[track_id]) + + # P0 Hook: Verify deletion compliance - deleted items should not be retrievable + if deleted > 0: + # Try to retrieve items from the cleared scope (should return empty) + retrieved_after_delete = _memory_store.list_memories( + user_id=user_id, + scope_type="track", + scope_id=str(track_id), + include_deleted=False, + include_pending=True, + limit=100, + ) + # Also try searching + search_results = _memory_store.search_memories( + user_id=user_id, + query="*", # broad query + scope_type="track", + scope_id=str(track_id), + limit=100, + ) + retrieved_count = len(retrieved_after_delete) + len(search_results) + + collector = _get_metric_collector() + collector.record_deletion_compliance( + deleted_retrieved_count=retrieved_count, + deleted_total_count=deleted, + evaluator_id=f"user:{user_id}", + detail={ + "track_id": track_id, + "deleted_count": deleted, + "retrieved_after_delete": retrieved_count, + "action": "clear_track_memory", + }, + ) + return ClearTrackMemoryResponse(user_id=user_id, track_id=track_id, deleted_count=deleted) diff --git a/src/paperbot/infrastructure/stores/memory_store.py b/src/paperbot/infrastructure/stores/memory_store.py index bc5e69e..5820a8f 100644 --- a/src/paperbot/infrastructure/stores/memory_store.py +++ b/src/paperbot/infrastructure/stores/memory_store.py @@ -268,6 +268,24 @@ def list_memories( rows = session.execute(stmt).scalars().all() return [self._row_to_dict(r) for r in rows] + def get_items_by_ids( + self, + *, + user_id: str, + item_ids: List[int], + ) -> List[Dict[str, Any]]: + """Get memory items by their IDs for a specific user.""" + if not item_ids: + return [] + with self._provider.session() as session: + stmt = ( + select(MemoryItemModel) + .where(MemoryItemModel.user_id == user_id) + .where(MemoryItemModel.id.in_(item_ids)) + ) + rows = session.execute(stmt).scalars().all() + return [self._row_to_dict(r) for r in rows] + def search_memories( self, *, diff --git 
a/src/paperbot/infrastructure/stores/models.py b/src/paperbot/infrastructure/stores/models.py index 646cb6f..3a1204e 100644 --- a/src/paperbot/infrastructure/stores/models.py +++ b/src/paperbot/infrastructure/stores/models.py @@ -319,6 +319,34 @@ class MemoryAuditLogModel(Base): detail_json: Mapped[str] = mapped_column(Text, default="{}") +class MemoryEvalMetricModel(Base): + """ + Evaluation metrics for memory system quality (acceptance criteria). + + Stores periodic measurements of: + - extraction_precision: % of extracted items that are correct + - false_positive_rate: % of approved items that are incorrect + - retrieval_hit_rate: % of relevant memories retrieved when needed + - injection_pollution_rate: % of responses negatively affected by wrong memory + - deletion_compliance: deleted items never retrieved (should be 1.0) + + See docs/memory_types.md for metric definitions. + """ + + __tablename__ = "memory_eval_metrics" + + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + + metric_name: Mapped[str] = mapped_column(String(64), index=True) + metric_value: Mapped[float] = mapped_column(Float) + sample_size: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + + evaluated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), index=True) + evaluator_id: Mapped[str] = mapped_column(String(64), default="system") + + detail_json: Mapped[str] = mapped_column(Text, default="{}") + + class ResearchTrackModel(Base): """ User research direction / track. diff --git a/src/paperbot/memory/eval/__init__.py b/src/paperbot/memory/eval/__init__.py new file mode 100644 index 0000000..0075104 --- /dev/null +++ b/src/paperbot/memory/eval/__init__.py @@ -0,0 +1,14 @@ +""" +Memory evaluation module for Scope and Acceptance criteria. 
+ +Provides metrics collection and evaluation for: +- extraction_precision +- false_positive_rate +- retrieval_hit_rate +- injection_pollution_rate +- deletion_compliance +""" + +from paperbot.memory.eval.collector import MemoryMetricCollector + +__all__ = ["MemoryMetricCollector"] diff --git a/src/paperbot/memory/eval/collector.py b/src/paperbot/memory/eval/collector.py new file mode 100644 index 0000000..ba1c7ae --- /dev/null +++ b/src/paperbot/memory/eval/collector.py @@ -0,0 +1,240 @@ +""" +Memory metric collector for Scope and Acceptance criteria. + +Collects and stores evaluation metrics defined in docs/memory_types.md: +- extraction_precision: >= 85% +- false_positive_rate: <= 5% +- retrieval_hit_rate: >= 80% +- injection_pollution_rate: <= 2% +- deletion_compliance: 100% +""" + +from __future__ import annotations + +import json +from datetime import datetime, timezone +from typing import Any, Dict, List, Optional + +from sqlalchemy import select, func + +from paperbot.infrastructure.stores.models import MemoryEvalMetricModel, MemoryItemModel +from paperbot.infrastructure.stores.db_provider import DbProvider + + +class MemoryMetricCollector: + """ + Collects and stores memory system evaluation metrics. 
+ + Usage: + collector = MemoryMetricCollector(db_provider) + + # Record extraction results + collector.record_extraction_precision( + correct_count=85, + total_count=100, + evaluator_id="human:reviewer1" + ) + + # Get summary + summary = collector.get_metrics_summary() + """ + + # P0 target thresholds + TARGETS = { + "extraction_precision": 0.85, + "false_positive_rate": 0.05, + "retrieval_hit_rate": 0.80, + "injection_pollution_rate": 0.02, + "deletion_compliance": 1.00, + } + + def __init__(self, db_provider: DbProvider): + self._provider = db_provider + + def record_extraction_precision( + self, + correct_count: int, + total_count: int, + evaluator_id: str = "system", + detail: Optional[Dict[str, Any]] = None, + ) -> None: + """Record extraction precision metric (correct / total extracted).""" + if total_count == 0: + return + value = correct_count / total_count + self._record_metric( + metric_name="extraction_precision", + metric_value=value, + sample_size=total_count, + evaluator_id=evaluator_id, + detail=detail, + ) + + def record_false_positive_rate( + self, + false_positive_count: int, + total_approved_count: int, + evaluator_id: str = "system", + detail: Optional[Dict[str, Any]] = None, + ) -> None: + """Record false positive rate (incorrect approved / total approved).""" + if total_approved_count == 0: + return + value = false_positive_count / total_approved_count + self._record_metric( + metric_name="false_positive_rate", + metric_value=value, + sample_size=total_approved_count, + evaluator_id=evaluator_id, + detail=detail, + ) + + def record_retrieval_hit_rate( + self, + hits: int, + expected: int, + evaluator_id: str = "system", + detail: Optional[Dict[str, Any]] = None, + ) -> None: + """Record retrieval hit rate (retrieved relevant / total relevant).""" + if expected == 0: + return + value = hits / expected + self._record_metric( + metric_name="retrieval_hit_rate", + metric_value=value, + sample_size=expected, + evaluator_id=evaluator_id, + 
detail=detail, + ) + + def record_injection_pollution_rate( + self, + polluted_count: int, + total_injections: int, + evaluator_id: str = "system", + detail: Optional[Dict[str, Any]] = None, + ) -> None: + """Record injection pollution rate (polluted responses / total with memory).""" + if total_injections == 0: + return + value = polluted_count / total_injections + self._record_metric( + metric_name="injection_pollution_rate", + metric_value=value, + sample_size=total_injections, + evaluator_id=evaluator_id, + detail=detail, + ) + + def record_deletion_compliance( + self, + deleted_retrieved_count: int, + deleted_total_count: int, + evaluator_id: str = "system", + detail: Optional[Dict[str, Any]] = None, + ) -> None: + """ + Record deletion compliance (should be 1.0 = 100% compliant). + + Compliance = 1.0 - (deleted_retrieved / deleted_total) + If any deleted item was retrieved, compliance < 1.0. + """ + if deleted_total_count == 0: + return + value = 1.0 - (deleted_retrieved_count / deleted_total_count) + self._record_metric( + metric_name="deletion_compliance", + metric_value=value, + sample_size=deleted_total_count, + evaluator_id=evaluator_id, + detail=detail, + ) + + def _record_metric( + self, + metric_name: str, + metric_value: float, + sample_size: int, + evaluator_id: str, + detail: Optional[Dict[str, Any]], + ) -> None: + """Store a metric record in the database.""" + now = datetime.now(timezone.utc) + with self._provider.session() as session: + row = MemoryEvalMetricModel( + metric_name=metric_name, + metric_value=metric_value, + sample_size=sample_size, + evaluated_at=now, + evaluator_id=evaluator_id, + detail_json=json.dumps(detail or {}, ensure_ascii=False), + ) + session.add(row) + session.commit() + + def get_latest_metrics(self) -> Dict[str, Dict[str, Any]]: + """Get the most recent value for each metric type.""" + result = {} + with self._provider.session() as session: + for metric_name in self.TARGETS.keys(): + stmt = ( + 
select(MemoryEvalMetricModel) + .where(MemoryEvalMetricModel.metric_name == metric_name) + .order_by(MemoryEvalMetricModel.evaluated_at.desc()) + .limit(1) + ) + row = session.execute(stmt).scalar_one_or_none() + if row: + result[metric_name] = { + "value": row.metric_value, + "target": self.TARGETS[metric_name], + "meets_target": self._meets_target(metric_name, row.metric_value), + "sample_size": row.sample_size, + "evaluated_at": row.evaluated_at.isoformat() if row.evaluated_at else None, + "evaluator_id": row.evaluator_id, + } + return result + + def get_metrics_summary(self) -> Dict[str, Any]: + """Get summary of all metrics with pass/fail status.""" + latest = self.get_latest_metrics() + all_pass = all(m.get("meets_target", False) for m in latest.values()) + return { + "status": "pass" if all_pass else "fail", + "metrics": latest, + "targets": self.TARGETS, + } + + def _meets_target(self, metric_name: str, value: float) -> bool: + """Check if a metric value meets its target.""" + target = self.TARGETS.get(metric_name) + if target is None: + return True + # For rates that should be LOW (false_positive, pollution), lower is better + if metric_name in ("false_positive_rate", "injection_pollution_rate"): + return value <= target + # For rates that should be HIGH (precision, hit_rate, compliance), higher is better + return value >= target + + def get_metric_history( + self, metric_name: str, limit: int = 30 + ) -> List[Dict[str, Any]]: + """Get historical values for a specific metric.""" + result = [] + with self._provider.session() as session: + stmt = ( + select(MemoryEvalMetricModel) + .where(MemoryEvalMetricModel.metric_name == metric_name) + .order_by(MemoryEvalMetricModel.evaluated_at.desc()) + .limit(limit) + ) + rows = session.execute(stmt).scalars().all() + for row in rows: + result.append({ + "value": row.metric_value, + "sample_size": row.sample_size, + "evaluated_at": row.evaluated_at.isoformat() if row.evaluated_at else None, + "evaluator_id": 
row.evaluator_id, + }) + return result From 78ed180d08a3a89d4f40f8a9244fdeab6e979480 Mon Sep 17 00:00:00 2001 From: boyu Date: Sat, 3 Jan 2026 23:30:28 +0100 Subject: [PATCH 2/9] fix(memory): use SessionProvider instead of DbProvider in collector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix import error: DbProvider doesn't exist, use SessionProvider from sqlalchemy_db. 🤖 Generated with [LIU BOYU] --- src/paperbot/memory/eval/collector.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/paperbot/memory/eval/collector.py b/src/paperbot/memory/eval/collector.py index ba1c7ae..e747f15 100644 --- a/src/paperbot/memory/eval/collector.py +++ b/src/paperbot/memory/eval/collector.py @@ -17,8 +17,8 @@ from sqlalchemy import select, func -from paperbot.infrastructure.stores.models import MemoryEvalMetricModel, MemoryItemModel -from paperbot.infrastructure.stores.db_provider import DbProvider +from paperbot.infrastructure.stores.models import Base, MemoryEvalMetricModel +from paperbot.infrastructure.stores.sqlalchemy_db import SessionProvider, get_db_url class MemoryMetricCollector: @@ -48,8 +48,10 @@ class MemoryMetricCollector: "deletion_compliance": 1.00, } - def __init__(self, db_provider: DbProvider): - self._provider = db_provider + def __init__(self, db_url: Optional[str] = None): + self._provider = SessionProvider(db_url or get_db_url()) + # Ensure table exists + Base.metadata.create_all(self._provider.engine) def record_extraction_precision( self, From 978fdd6c761b86f8125fa152b526508047d9cf33 Mon Sep 17 00:00:00 2001 From: boyu Date: Sun, 4 Jan 2026 09:03:10 +0100 Subject: [PATCH 3/9] fix(ci): add python-multipart and fix collector imports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add python-multipart to requirements.txt and requirements-ci.txt (required for FastAPI file uploads in /memory/ingest) 🤖 Generated with [LIU BOYU] --- 
requirements-ci.txt | 1 + requirements.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements-ci.txt b/requirements-ci.txt index 84399ba..5c91441 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -2,6 +2,7 @@ # Keep this smaller than requirements.txt to avoid heavy system deps (e.g. weasyprint). fastapi==0.115.0 +python-multipart>=0.0.6 starlette>=0.37.2,<0.39.0 sse-starlette>=2.1.0 httpx[http2]>=0.27.0,<0.28.0 diff --git a/requirements.txt b/requirements.txt index 4e54a17..f3036df 100644 --- a/requirements.txt +++ b/requirements.txt @@ -62,6 +62,7 @@ json-repair>=0.22.0 # FastAPI backend for CLI/Web fastapi==0.115.0 +python-multipart>=0.0.6 # Required for file uploads # Pin Starlette for Python 3.9 compatible TestClient (avoid metaclass conflicts in newer Starlette) starlette>=0.37.2,<0.39.0 uvicorn[standard]>=0.32.0 From c3bb7e3f2d2fa36a8e72ea1fbcd846447bf715e1 Mon Sep 17 00:00:00 2001 From: boyu Date: Sun, 4 Jan 2026 09:09:20 +0100 Subject: [PATCH 4/9] fix: remove SessionProvider arg from MemoryMetricCollector calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - memory.py: use MemoryMetricCollector() without args - collector.py: update docstring to show correct usage 🤖 Generated with [LIU BOYU] --- src/paperbot/api/routes/memory.py | 2 +- src/paperbot/memory/eval/collector.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/paperbot/api/routes/memory.py b/src/paperbot/api/routes/memory.py index c2a862c..670a169 100644 --- a/src/paperbot/api/routes/memory.py +++ b/src/paperbot/api/routes/memory.py @@ -267,7 +267,7 @@ def delete_memory_item( # --- Metrics Endpoints (Scope and Acceptance Criteria) --- -_metric_collector = MemoryMetricCollector(_store._provider) +_metric_collector = MemoryMetricCollector() class MetricsSummaryResponse(BaseModel): diff --git a/src/paperbot/memory/eval/collector.py b/src/paperbot/memory/eval/collector.py index e747f15..120cd12 100644 --- 
a/src/paperbot/memory/eval/collector.py +++ b/src/paperbot/memory/eval/collector.py @@ -26,7 +26,8 @@ class MemoryMetricCollector: Collects and stores memory system evaluation metrics. Usage: - collector = MemoryMetricCollector(db_provider) + collector = MemoryMetricCollector() # Uses default db_url + # Or: collector = MemoryMetricCollector(db_url="sqlite:///custom.db") # Record extraction results collector.record_extraction_precision( From a6e2ad3bc90e522f3a12e94991277ae38551d117 Mon Sep 17 00:00:00 2001 From: boyu Date: Sun, 4 Jan 2026 09:19:29 +0100 Subject: [PATCH 5/9] fix(ci): add gitpython to requirements-ci.txt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [LIU BOYU] --- requirements-ci.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/requirements-ci.txt b/requirements-ci.txt index 5c91441..d0ff9c6 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -29,3 +29,7 @@ redis>=5.0.0 openai>=1.0.0 anthropic>=0.3.0 +# 代码分析(可选) +gitpython>=3.1.0 + + From eaec917f35d9359a1a2c7f6f7054520d9ee6b5e9 Mon Sep 17 00:00:00 2001 From: boyu Date: Sun, 4 Jan 2026 09:24:39 +0100 Subject: [PATCH 6/9] fix(ci): add requests to requirements-ci.txt Generated with [LIU BOYU] --- requirements-ci.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-ci.txt b/requirements-ci.txt index d0ff9c6..5f4d738 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -31,5 +31,6 @@ anthropic>=0.3.0 # 代码分析(可选) gitpython>=3.1.0 +requests>=2.28.0 From 6d118c6a6f711ff387a0ad7886d05c9ffbad2268 Mon Sep 17 00:00:00 2001 From: boyu Date: Sun, 4 Jan 2026 09:29:48 +0100 Subject: [PATCH 7/9] fix(ci): add jinja2 to requirements-ci.txt Generated with [LIU BOYU] --- requirements-ci.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-ci.txt b/requirements-ci.txt index 5f4d738..02fc402 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -29,8 +29,8 @@ redis>=5.0.0 
openai>=1.0.0 anthropic>=0.3.0 -# 代码分析(可选) + gitpython>=3.1.0 requests>=2.28.0 - +jinja2>=3.1.0 From 995c0ce400d077a2feef6838e0151fd2ef6b3e94 Mon Sep 17 00:00:00 2001 From: boyu Date: Sun, 4 Jan 2026 12:13:12 +0100 Subject: [PATCH 8/9] fix(ci): add missing dependencies to requirements-ci.txt Add: markdown, aiofiles, python-dotenv, json-repair, uvicorn Generated with [LIU BOYU] --- requirements-ci.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/requirements-ci.txt b/requirements-ci.txt index 02fc402..bd6e363 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -33,4 +33,9 @@ anthropic>=0.3.0 gitpython>=3.1.0 requests>=2.28.0 jinja2>=3.1.0 +markdown>=3.4.0 +aiofiles>=22.1.0 +python-dotenv>=0.19.0 +json-repair>=0.22.0 +uvicorn>=0.32.0 From 982cb192c0aee6afad168e6e2a3e130e01cd8ac3 Mon Sep 17 00:00:00 2001 From: boyu Date: Sun, 4 Jan 2026 16:41:57 +0100 Subject: [PATCH 9/9] test(memory): add unit and integration tests for acceptance criteria Unit tests (tests/unit/test_memory_metric_collector.py): - Test all 5 metric recording methods - Test pass/fail target validation - Test metrics summary and history - Test edge cases (zero totals) Unit tests (tests/unit/test_memory_module.py): - Add test for get_items_by_ids method Integration tests (tests/integration/test_scope_and_acceptance_criteria_hooks.py): - Test memory feedback records retrieval_hit_rate - Test bulk_moderate rejection records false_positive_rate - Test clear_track_memory records deletion_compliance - Test /memory/metrics endpoints Generated with [LIU BOYU] --- ...est_scope_and_acceptance_criteria_hooks.py | 186 +++++++++++++++++ tests/unit/test_memory_metric_collector.py | 189 ++++++++++++++++++ tests/unit/test_memory_module.py | 34 ++++ 3 files changed, 409 insertions(+) create mode 100644 tests/integration/test_scope_and_acceptance_criteria_hooks.py create mode 100644 tests/unit/test_memory_metric_collector.py diff --git a/tests/integration/test_scope_and_acceptance_criteria_hooks.py 
b/tests/integration/test_scope_and_acceptance_criteria_hooks.py new file mode 100644 index 0000000..aba6f09 --- /dev/null +++ b/tests/integration/test_scope_and_acceptance_criteria_hooks.py @@ -0,0 +1,186 @@ +""" +Integration tests for Scope and Acceptance Criteria +memory hooks in API routes. + +Tests the Scope and Acceptance criteria hooks: +- false_positive_rate: recorded when user rejects high-confidence items +- deletion_compliance: recorded when clearing track memory +- retrieval_hit_rate: recorded when user provides memory feedback +""" +from __future__ import annotations + +import pytest +from fastapi.testclient import TestClient + + +@pytest.fixture +def test_client(tmp_path, monkeypatch): + """Create test client with isolated database.""" + monkeypatch.setenv("PAPERBOT_DB_URL", f"sqlite:///{tmp_path / 'test.db'}") + + from paperbot.api import main as api_main + + with TestClient(api_main.app) as client: + yield client + + +def test_memory_feedback_records_hit_rate(test_client): + """Test that memory feedback endpoint records retrieval_hit_rate metric.""" + # Record memory feedback + response = test_client.post( + "/api/research/memory/feedback", + json={ + "user_id": "test_user", + "memory_ids": [1, 2, 3, 4, 5], + "helpful_ids": [1, 2, 3, 4], + "not_helpful_ids": [5], + "query": "test query", + }, + ) + assert response.status_code == 200 + data = response.json() + assert data["total_rated"] == 5 + assert data["helpful_count"] == 4 + assert data["hit_rate"] == 0.8 + + # Verify metric was recorded + metrics_response = test_client.get("/api/memory/metrics") + assert metrics_response.status_code == 200 + metrics = metrics_response.json() + assert "retrieval_hit_rate" in metrics["metrics"] + assert metrics["metrics"]["retrieval_hit_rate"]["value"] == 0.8 + + +def test_bulk_moderate_reject_records_false_positive(test_client): + """Test that rejecting high-confidence items records false_positive_rate.""" + # First create a track + track_response = 
test_client.post( + "/api/research/tracks", + json={"user_id": "test_user", "name": "Test Track", "activate": True}, + ) + assert track_response.status_code == 200 + track_id = track_response.json()["track"]["id"] + + # Create a high-confidence approved memory + mem_response = test_client.post( + "/api/research/memory/items", + json={ + "user_id": "test_user", + "kind": "preference", + "content": "Test preference", + "confidence": 0.85, + "scope_type": "track", + "scope_id": str(track_id), + "status": "approved", + }, + ) + assert mem_response.status_code == 200 + item_id = mem_response.json()["item"]["id"] + + # Reject the high-confidence item (this should record false_positive_rate) + reject_response = test_client.post( + "/api/research/memory/bulk_moderate", + json={ + "user_id": "test_user", + "item_ids": [item_id], + "status": "rejected", + }, + ) + assert reject_response.status_code == 200 + + # Verify false_positive_rate metric was recorded + metrics_response = test_client.get("/api/memory/metrics") + assert metrics_response.status_code == 200 + metrics = metrics_response.json() + assert "false_positive_rate" in metrics["metrics"] + + +def test_clear_track_memory_records_deletion_compliance(test_client): + """Test that clearing track memory records deletion_compliance metric.""" + # Create a track + track_response = test_client.post( + "/api/research/tracks", + json={"user_id": "test_user", "name": "Delete Test Track", "activate": True}, + ) + assert track_response.status_code == 200 + track_id = track_response.json()["track"]["id"] + + # Create some memories in the track + for i in range(3): + test_client.post( + "/api/research/memory/items", + json={ + "user_id": "test_user", + "kind": "fact", + "content": f"Test fact {i}", + "confidence": 0.7, + "scope_type": "track", + "scope_id": str(track_id), + }, + ) + + # Clear track memory (triggers deletion_compliance check) + clear_response = test_client.post( + f"/api/research/tracks/{track_id}/memory/clear", + 
params={"user_id": "test_user", "confirm": True}, + ) + assert clear_response.status_code == 200 + assert clear_response.json()["deleted_count"] == 3 + + # Verify deletion_compliance metric was recorded + metrics_response = test_client.get("/api/memory/metrics") + assert metrics_response.status_code == 200 + metrics = metrics_response.json() + assert "deletion_compliance" in metrics["metrics"] + # Should be 1.0 (100% compliant) since no deleted items should be retrievable + assert metrics["metrics"]["deletion_compliance"]["value"] == 1.0 + + +def test_metrics_endpoints(test_client): + """Test the /memory/metrics endpoints.""" + # Get initial metrics (should be empty) + response = test_client.get("/api/memory/metrics") + assert response.status_code == 200 + data = response.json() + assert "status" in data + assert "metrics" in data + assert "targets" in data + + # Record a metric + test_client.post( + "/api/research/memory/feedback", + json={ + "user_id": "test_user", + "memory_ids": [1, 2], + "helpful_ids": [1, 2], + "not_helpful_ids": [], + }, + ) + + # Get metric history + history_response = test_client.get("/api/memory/metrics/retrieval_hit_rate") + assert history_response.status_code == 200 + data = history_response.json() + assert "history" in data + assert len(data["history"]) >= 1 + assert data["history"][0]["value"] == 1.0 # 100% hit rate + + +def test_metrics_summary_pass_fail(test_client): + """Test that metrics summary correctly reports pass/fail status.""" + # Record a metric that meets target + test_client.post( + "/api/research/memory/feedback", + json={ + "user_id": "test_user", + "memory_ids": [1, 2, 3, 4, 5], + "helpful_ids": [1, 2, 3, 4, 5], # 100% hit rate > 80% target + "not_helpful_ids": [], + }, + ) + + response = test_client.get("/api/memory/metrics") + data = response.json() + + # With only one metric recorded that meets target, status should be pass + assert data["metrics"]["retrieval_hit_rate"]["meets_target"] is True diff --git 
a/tests/unit/test_memory_metric_collector.py b/tests/unit/test_memory_metric_collector.py new file mode 100644 index 0000000..a8554cb --- /dev/null +++ b/tests/unit/test_memory_metric_collector.py @@ -0,0 +1,189 @@ +""" +Unit tests for MemoryMetricCollector (P0 Acceptance Criteria). +""" +from __future__ import annotations + +import pytest + +from paperbot.memory.eval.collector import MemoryMetricCollector + + +def test_collector_record_extraction_precision(tmp_path): + """Test recording extraction precision metric.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + collector.record_extraction_precision( + correct_count=85, + total_count=100, + evaluator_id="test:unit", + detail={"test": True}, + ) + + latest = collector.get_latest_metrics() + assert "extraction_precision" in latest + assert latest["extraction_precision"]["value"] == 0.85 + assert latest["extraction_precision"]["meets_target"] is True # target is 0.85 + + +def test_collector_record_false_positive_rate(tmp_path): + """Test recording false positive rate metric.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + collector.record_false_positive_rate( + false_positive_count=3, + total_approved_count=100, + evaluator_id="test:unit", + ) + + latest = collector.get_latest_metrics() + assert "false_positive_rate" in latest + assert latest["false_positive_rate"]["value"] == 0.03 + assert latest["false_positive_rate"]["meets_target"] is True # target is <= 0.05 + + +def test_collector_record_false_positive_rate_fails_target(tmp_path): + """Test false positive rate exceeding target.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + collector.record_false_positive_rate( + false_positive_count=10, + total_approved_count=100, + evaluator_id="test:unit", + ) + + latest = collector.get_latest_metrics() + assert 
latest["false_positive_rate"]["value"] == 0.10 + assert latest["false_positive_rate"]["meets_target"] is False # 10% > 5% + + +def test_collector_record_retrieval_hit_rate(tmp_path): + """Test recording retrieval hit rate metric.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + collector.record_retrieval_hit_rate( + hits=8, + expected=10, + evaluator_id="test:unit", + ) + + latest = collector.get_latest_metrics() + assert "retrieval_hit_rate" in latest + assert latest["retrieval_hit_rate"]["value"] == 0.80 + assert latest["retrieval_hit_rate"]["meets_target"] is True # target is >= 0.80 + + +def test_collector_record_injection_pollution_rate(tmp_path): + """Test recording injection pollution rate metric.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + collector.record_injection_pollution_rate( + polluted_count=1, + total_injections=100, + evaluator_id="test:unit", + ) + + latest = collector.get_latest_metrics() + assert "injection_pollution_rate" in latest + assert latest["injection_pollution_rate"]["value"] == 0.01 + assert latest["injection_pollution_rate"]["meets_target"] is True # target is <= 0.02 + + +def test_collector_record_deletion_compliance(tmp_path): + """Test recording deletion compliance metric.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + # Perfect compliance: no deleted items retrieved + collector.record_deletion_compliance( + deleted_retrieved_count=0, + deleted_total_count=10, + evaluator_id="test:unit", + ) + + latest = collector.get_latest_metrics() + assert "deletion_compliance" in latest + assert latest["deletion_compliance"]["value"] == 1.0 + assert latest["deletion_compliance"]["meets_target"] is True + + +def test_collector_record_deletion_compliance_fails(tmp_path): + """Test deletion compliance failure when deleted items are retrieved.""" + db_url = 
f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + # Bad compliance: 2 deleted items were retrieved + collector.record_deletion_compliance( + deleted_retrieved_count=2, + deleted_total_count=10, + evaluator_id="test:unit", + ) + + latest = collector.get_latest_metrics() + assert latest["deletion_compliance"]["value"] == 0.8 # 1 - 2/10 + assert latest["deletion_compliance"]["meets_target"] is False + + +def test_collector_get_metrics_summary(tmp_path): + """Test getting metrics summary with pass/fail status.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + # Record all metrics meeting targets + collector.record_extraction_precision(correct_count=90, total_count=100) + collector.record_false_positive_rate(false_positive_count=2, total_approved_count=100) + collector.record_retrieval_hit_rate(hits=85, expected=100) + collector.record_injection_pollution_rate(polluted_count=1, total_injections=100) + collector.record_deletion_compliance(deleted_retrieved_count=0, deleted_total_count=10) + + summary = collector.get_metrics_summary() + assert summary["status"] == "pass" + assert len(summary["metrics"]) == 5 + + +def test_collector_get_metrics_summary_fails(tmp_path): + """Test metrics summary fails when any metric doesn't meet target.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + # One metric fails + collector.record_extraction_precision(correct_count=50, total_count=100) # 50% < 85% + + summary = collector.get_metrics_summary() + assert summary["status"] == "fail" + + +def test_collector_get_metric_history(tmp_path): + """Test getting metric history.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + # Record multiple values + collector.record_extraction_precision(correct_count=80, total_count=100) + 
collector.record_extraction_precision(correct_count=85, total_count=100) + collector.record_extraction_precision(correct_count=90, total_count=100) + + history = collector.get_metric_history("extraction_precision", limit=10) + assert len(history) == 3 + # Most recent first + assert history[0]["value"] == 0.90 + assert history[1]["value"] == 0.85 + assert history[2]["value"] == 0.80 + + +def test_collector_skips_zero_totals(tmp_path): + """Test that collector skips recording when total is zero.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + collector.record_extraction_precision(correct_count=0, total_count=0) + collector.record_false_positive_rate(false_positive_count=0, total_approved_count=0) + collector.record_retrieval_hit_rate(hits=0, expected=0) + collector.record_deletion_compliance(deleted_retrieved_count=0, deleted_total_count=0) + + latest = collector.get_latest_metrics() + assert len(latest) == 0 # No metrics recorded diff --git a/tests/unit/test_memory_module.py b/tests/unit/test_memory_module.py index 8f204d9..0e64dee 100644 --- a/tests/unit/test_memory_module.py +++ b/tests/unit/test_memory_module.py @@ -125,3 +125,37 @@ def test_memory_store_soft_delete_and_status_filter(tmp_path): assert any("太长" in i["content"] for i in store.search_memories(user_id="u1", query="太长", limit=10)) store.soft_delete_item(user_id="u1", item_id=int(pref.id), reason="user request") assert not any("太长" in i["content"] for i in store.search_memories(user_id="u1", query="太长", limit=10)) + + +def test_memory_store_get_items_by_ids(tmp_path): + """Test get_items_by_ids method retrieves correct items.""" + from paperbot.memory.schema import MemoryCandidate + + db_url = f"sqlite:///{tmp_path / 'mem_ids.db'}" + store = SqlAlchemyMemoryStore(db_url=db_url, auto_create_schema=True) + + # Create test memories + candidates = [ + MemoryCandidate(kind="preference", content="Prefers Python", confidence=0.8), + 
MemoryCandidate(kind="goal", content="Learn Rust", confidence=0.7), + MemoryCandidate(kind="fact", content="Uses VSCode", confidence=0.6), + ] + created, _, rows = store.add_memories(user_id="u1", memories=candidates) + assert created == 3 + + # Get IDs from list_memories since rows are detached + all_items = store.list_memories(user_id="u1", limit=10) + item_ids = [item["id"] for item in all_items] + assert len(item_ids) == 3 + + # Test get_items_by_ids + retrieved = store.get_items_by_ids(user_id="u1", item_ids=item_ids[:2]) + assert len(retrieved) == 2 + + # Test with empty list + empty = store.get_items_by_ids(user_id="u1", item_ids=[]) + assert len(empty) == 0 + + # Test user isolation - u2 can't see u1's items + other_user = store.get_items_by_ids(user_id="u2", item_ids=item_ids) + assert len(other_user) == 0