From 34d7e2a218e1fdb39b5cc8744ac434f11ffd596c Mon Sep 17 00:00:00 2001 From: boyu Date: Sat, 3 Jan 2026 23:14:14 +0100 Subject: [PATCH 1/9] feat(memory): implement P0 scope and acceptance criteria Add memory type taxonomy, isolation strategy, and acceptance metrics: - docs: memory_types.md, memory_isolation.md, P0 design doc - feat: MemoryMetricCollector for tracking 5 P0 metrics - feat: MemoryEvalMetricModel for persisting metrics - feat: P0 hooks in research routes (false_positive, deletion_compliance) - feat: /research/memory/feedback endpoint for retrieval_hit_rate - feat: /memory/metrics endpoints for viewing recorded metrics - feat: get_items_by_ids() method in memory_store - test: deletion_compliance and retrieval_hit_rate eval scripts - ci: add memory P0 acceptance tests to CI workflow P0 Acceptance Metrics: - extraction_precision: >= 85% - false_positive_rate: <= 5% - retrieval_hit_rate: >= 80% - injection_pollution_rate: <= 2% - deletion_compliance: 100% Generated with [LIU BOYU] --- .github/workflows/ci.yml | 7 + docs/P0_memory_scope_acceptance_design.md | 739 ++++++++++++++++++ docs/memory_isolation.md | 144 ++++ docs/memory_types.md | 141 ++++ evals/memory/__init__.py | 7 + evals/memory/fixtures/retrieval_queries.json | 35 + evals/memory/fixtures/sample_memories.json | 42 + evals/memory/test_deletion_compliance.py | 150 ++++ evals/memory/test_retrieval_hit_rate.py | 161 ++++ src/paperbot/api/routes/memory.py | 44 ++ src/paperbot/api/routes/research.py | 134 ++++ .../infrastructure/stores/memory_store.py | 18 + src/paperbot/infrastructure/stores/models.py | 28 + src/paperbot/memory/eval/__init__.py | 14 + src/paperbot/memory/eval/collector.py | 240 ++++++ 15 files changed, 1904 insertions(+) create mode 100644 docs/P0_memory_scope_acceptance_design.md create mode 100644 docs/memory_isolation.md create mode 100644 docs/memory_types.md create mode 100644 evals/memory/__init__.py create mode 100644 evals/memory/fixtures/retrieval_queries.json create mode 
100644 evals/memory/fixtures/sample_memories.json create mode 100644 evals/memory/test_deletion_compliance.py create mode 100644 evals/memory/test_retrieval_hit_rate.py create mode 100644 src/paperbot/memory/eval/__init__.py create mode 100644 src/paperbot/memory/eval/collector.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 22c21f1..4cda81a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -48,3 +48,10 @@ jobs: python evals/runners/run_track_pipeline_smoke.py python evals/runners/run_eventlog_replay_smoke.py + - name: Run memory scope and acceptance tests + env: + PYTHONPATH: src + run: | + python evals/memory/test_deletion_compliance.py + python evals/memory/test_retrieval_hit_rate.py + diff --git a/docs/P0_memory_scope_acceptance_design.md b/docs/P0_memory_scope_acceptance_design.md new file mode 100644 index 0000000..461f9a0 --- /dev/null +++ b/docs/P0_memory_scope_acceptance_design.md @@ -0,0 +1,739 @@ +# P0: Memory Scope and Acceptance Criteria - Technical Design Document + +> **Status**: Draft +> **Author**: Claude Code +> **Date**: 2025-12-27 +> **Estimated Effort**: 1-2 days (17h) + +--- + +## 0. Architecture Context: Where P0 Fits + +P0 spans **three layers** of the PaperBot 5-layer architecture, focusing on the **Memory subsystem's foundational definitions and quality metrics**. 
+ +### 0.1 PaperBot 5-Layer Architecture Overview + +``` +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ PaperBot Standard Architecture │ +│ (Offline Ingestion → Storage → Online Retrieval → Generation → Feedback)│ +└─────────────────────────────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Layer 1 · Ingestion (Async) │ +│ ┌──────────┐ ┌──────────────┐ ┌────────────┐ ┌──────────────────────┐ │ +│ │ Sources │──▶│ Parse/Norm │──▶│Chunk/Embed │──▶│ EventLog (evidence) │ │ +│ │ arXiv │ │ Connectors │ │ (optional) │ │ SqlAlchemyEventLog │ │ +│ │ Reddit │ │ │ │ │ │ │ │ +│ └──────────┘ └──────────────┘ └────────────┘ └──────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Layer 2 · Storage │ +│ ┌────────────────────────────────────────┐ ┌─────────────────────────────┐ │ +│ │ SQL 主库 (SQLite/Postgres) │ │ Vector Index (optional) │ │ +│ │ ┌─────────────────────────────────┐ │ │ Qdrant / Milvus │ │ +│ │ │ ╔═══════════════════════════╗ │ │ └─────────────────────────────┘ │ +│ │ │ ║ memory_items ◀──────────╠───┼───┼─── P0: Type Boundaries │ │ +│ │ │ ║ memory_audit_log ║ │ │ Namespace/Isolation │ │ +│ │ │ ╚═══════════════════════════╝ │ │ │ │ +│ │ │ research_tracks / tasks │ │ │ │ +│ │ │ paper_feedback │ │ │ │ +│ │ └─────────────────────────────────┘ │ │ │ +│ └────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Layer 3 · Retrieval (Online) │ +│ ┌────────────┐ ┌─────────────┐ ┌──────────────────────────────────────┐ │ +│ │ContextEng │──▶│ TrackRouter │──▶│ ╔════════════════════════════════╗ │ │ +│ │ine │ │ │ │ ║ MemoryStore ║ 
│ │ +│ │ │ │ keyword+ │ │ ║ search_memories() ◀──────────╠───┼─── P0 +│ │ │ │ mem+task+ │ │ ║ (scope-aware retrieval) ║ │ │ +│ │ │ │ embedding │ │ ╚════════════════════════════════╝ │ │ +│ └────────────┘ └─────────────┘ └──────────────────────────────────────┘ │ +│ │ +│ ┌────────────────┐ ┌────────────────┐ ┌────────────────────────────────┐ │ +│ │ Paper Searcher │──▶│ Rank/Filter/ │──▶│ Replay write │ │ +│ │ S2 / offline │ │ Dedupe │ │ context_run + impressions │ │ +│ └────────────────┘ └────────────────┘ └────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Layer 4 · Generation │ +│ ┌────────────────┐ ┌────────────────┐ ┌────────────────────────────────┐ │ +│ │ PromptComposer │──▶│ LLM + Tools │──▶│ Output Parser │ │ +│ │ budget+format │ │ GPT/Claude │ │ structured + citations │ │ +│ └────────────────┘ └────────────────┘ └────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ Layer 5 · Feedback / Governance / Replay │ +│ ┌─────────────────────────────────┐ ┌────────────────┐ ┌────────────────┐ │ +│ │ ╔═════════════════════════════╗ │ │ Paper Feedback │ │ Evals/Replay │ │ +│ │ ║ Memory Inbox (governance) ║ │ │ like/save/ │ │ Summary │ │ +│ │ ║ suggest → pending ║ │ │ dislike │ │ │ │ +│ │ ║ approve/reject/move ◀──────╠─┼───┼────────────────┼───┼─── P0: Metrics │ │ +│ │ ╚═════════════════════════════╝ │ │ │ │ Evaluation │ │ +│ └─────────────────────────────────┘ └────────────────┘ └────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────────┘ + +Legend: + ╔═══╗ P0 Focus Area (Memory Scope & Acceptance) + ╚═══╝ + ───▶ Data Flow + ◀──── P0 Touch Point +``` + +### 0.2 P0 Components Mapped to Architecture 
Layers + +| Layer | Component | P0 Deliverable | +|-------|-----------|----------------| +| **Layer 2: Storage** | `memory_items` table | Memory type boundary definitions (taxonomy) | +| | Namespace fields | `user_id` / `workspace_id` / `scope_type:scope_id` isolation rules | +| **Layer 3: Retrieval** | `MemoryStore.search_memories()` | Scope-aware retrieval logic validation | +| | Query resolution | Isolation rule implementation verification | +| **Layer 5: Feedback** | Memory Inbox | Governance workflow (pending → approved) | +| | `memory_eval_metrics` (new) | Acceptance criteria measurement | +| | Audit log | Deletion compliance verification | + +### 0.3 P0 Focus: Cross-Cutting Concerns + +``` + ┌─────────────────────────────────────────┐ + │ P0: Scope & Acceptance │ + └─────────────────────────────────────────┘ + │ + ┌───────────────────────────┼───────────────────────────┐ + │ │ │ + ▼ ▼ ▼ +┌─────────────────────┐ ┌─────────────────────┐ ┌─────────────────────┐ +│ Type Boundaries │ │ Namespace/Isolation│ │ Acceptance Metrics │ +│ │ │ │ │ │ +│ - User Memory │ │ - user_id (L1) │ │ - Extraction │ +│ profile/pref/ │ │ - workspace_id(L2) │ │ Precision ≥85% │ +│ goal/constraint │ │ - scope_type:id(L3)│ │ - False Positive │ +│ - Episodic Memory │ │ - provider (meta) │ │ Rate ≤5% │ +│ fact/decision/ │ │ │ │ - Retrieval Hit │ +│ hypothesis │ │ Query Resolution: │ │ Rate ≥80% │ +│ - Workspace Memory │ │ scope > workspace │ │ - Injection │ +│ keyword_set/note │ │ > user (fallback) │ │ Pollution ≤2% │ +│ │ │ │ │ - Deletion │ +│ NOT Memory: │ │ │ │ Compliance 100% │ +│ - Context cache │ │ │ │ │ +│ - Code index │ │ │ │ │ +│ - Search results │ │ │ │ │ +└─────────────────────┘ └─────────────────────┘ └─────────────────────┘ + │ │ │ + └───────────────────────────┼───────────────────────────┘ + │ + ▼ + ┌─────────────────────────────────────────┐ + │ Implementation Artifacts │ + │ │ + │ docs/memory_types.md │ + │ docs/memory_isolation.md │ + │ 
src/paperbot/memory/eval/collector.py │ + │ evals/memory/test_*.py │ + └─────────────────────────────────────────┘ +``` + +### 0.4 Data Flow with P0 Touch Points + +``` + INGESTION + │ + ┌───────────────────────────────┼───────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ POST /api/memory/ingest │ │ + │ │ file + user_id + platform + workspace_id │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ │ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ Parsers (ChatGPT/Gemini/Plaintext) │ │ + │ │ → NormalizedMessage[] │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ │ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ Extractor (heuristic + optional LLM) │ │ + │ │ → MemoryCandidate[] │ │ + │ │ ╔═══════════════════════════════════════════════════╗ │ │ + │ │ ║ P0: kind must be in defined taxonomy ║ │ │ + │ │ ║ P0: confidence scoring for precision metrics ║ │ │ + │ │ ╚═══════════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ │ + └───────────────────────────────┼───────────────────────────────┘ + │ + STORAGE + │ + ┌───────────────────────────────┼───────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ MemoryStore.add_memories() │ │ + │ │ ╔═══════════════════════════════════════════════════╗ │ │ + │ │ ║ P0: scope-aware dedup (content_hash includes scope)║ │ │ + │ │ ║ P0: status = pending if confidence < 0.6 ║ │ │ + │ │ ║ P0: namespace isolation (user_id + workspace_id) ║ │ │ + │ │ ╚═══════════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ │ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ memory_items table │ │ + │ │ memory_audit_log table │ │ + │ 
└─────────────────────────────────────────────────────────┘ │ + │ │ + └───────────────────────────────────────────────────────────────┘ + │ + RETRIEVAL + │ + ┌───────────────────────────────┼───────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ POST /api/memory/context │ │ + │ │ query + user_id + workspace_id + scope │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ │ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ MemoryStore.search_memories() │ │ + │ │ ╔═══════════════════════════════════════════════════╗ │ │ + │ │ ║ P0: only approved, non-deleted, non-expired ║ │ │ + │ │ ║ P0: scope isolation enforced ║ │ │ + │ │ ║ P0: hit rate metrics collected ║ │ │ + │ │ ╚═══════════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ │ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ ContextPack (formatted for prompt injection) │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ + └───────────────────────────────────────────────────────────────┘ + │ + GOVERNANCE + │ + ┌───────────────────────────────┼───────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ Memory Inbox: /api/research/memory/inbox │ │ + │ │ ╔═══════════════════════════════════════════════════╗ │ │ + │ │ ║ P0: pending → approved/rejected workflow ║ │ │ + │ │ ║ P0: bulk_moderate, bulk_move, clear ║ │ │ + │ │ ╚═══════════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ │ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ DELETE /api/memory/items/{id} │ │ + │ │ ╔═══════════════════════════════════════════════════╗ │ │ + │ │ ║ P0: soft_delete / hard_delete ║ │ │ + │ │ ║ P0: deletion compliance = 100% (never retrieved) ║ │ │ + │ │ 
╚═══════════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ + └───────────────────────────────────────────────────────────────┘ + │ + EVALUATION + │ + ┌───────────────────────────────┼───────────────────────────────┐ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ GET /api/memory/metrics (NEW in P0) │ │ + │ │ ╔═══════════════════════════════════════════════════╗ │ │ + │ │ ║ P0: MetricCollector aggregates 5 metrics ║ │ │ + │ │ ║ - extraction_precision ║ │ │ + │ │ ║ - false_positive_rate ║ │ │ + │ │ ║ - retrieval_hit_rate ║ │ │ + │ │ ║ - injection_pollution_rate ║ │ │ + │ │ ║ - deletion_compliance ║ │ │ + │ │ ╚═══════════════════════════════════════════════════╝ │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ │ + │ ▼ │ + │ ┌─────────────────────────────────────────────────────────┐ │ + │ │ memory_eval_metrics table (NEW in P0) │ │ + │ │ evals/memory/test_*.py (NEW in P0) │ │ + │ └─────────────────────────────────────────────────────────┘ │ + │ │ + └───────────────────────────────────────────────────────────────┘ +``` + +--- + +## 1. Executive Summary + +**Objective**: Define clear boundaries for memory types, establish namespace/isolation strategies, and implement measurable acceptance criteria for the cross-platform memory middleware. + +**Current State**: The infrastructure (database schema, CRUD APIs, basic extraction/search) is largely complete. P0 focuses on **formalizing definitions** and **implementing evaluation metrics** rather than building new features. + +**Scope**: This document covers the three main deliverables from `docs/memory_todo.md` P0 section: +1. Memory type boundary definitions +2. Namespace and isolation strategy +3. Acceptance criteria (5 metrics) + +--- + +## 2. 
Technical Solution Design + +### 2.1 Memory Type Boundaries + +Based on the existing `MemoryKind` enum and the TODO requirements, the following formal taxonomy is proposed: + +#### 2.1.1 Memory Type Taxonomy + +| Category | Kind | Definition | Persistence | Example | +|----------|------|------------|-------------|---------| +| **User Memory** | `profile` | Identity facts (name, role, affiliation) | Permanent until edited | "My name is Jerry" | +| | `preference` | Style/format preferences | Semi-permanent | "I prefer concise answers" | +| | `goal` | Long-term objectives | Session-spanning | "I'm researching LLM memory" | +| | `constraint` | Hard rules/requirements | Permanent | "Never use Chinese in responses" | +| | `project` | Project context/background | Project-scoped | "PaperBot is a research tool" | +| **Episodic Memory** | `fact` | Session-derived facts | Decaying | "User mentioned deadline is Friday" | +| | `decision` | Key decisions made | Project-scoped | "We chose SQLite over Postgres" | +| | `hypothesis` | Tentative assumptions | Low-priority | "User may be a graduate student" | +| | `todo` | Action items | Time-bound | "Need to implement FTS5" | +| **Workspace/Project** | `keyword_set` | Domain keywords | Project-scoped | "focus areas: RAG, memory, agents" | +| | `note` | Free-form annotations | Project-scoped | "Important: check GDPR compliance" | + +#### 2.1.2 Exclusions (NOT Memory) + +The following are explicitly **NOT** part of the memory system and should be handled elsewhere: + +| Item | Reason | Proper Location | +|------|--------|-----------------| +| **Context Caching** | Latency optimization, not long-term storage | LLM provider layer | +| **Code Index** | Repository structure, symbols, dependencies | `context_engine/` or future `code_index` subsystem | +| **Search Results** | Ephemeral retrieval results | Not persisted | +| **Embeddings** | Vector representations | Separate `memory_embeddings` table (P1) | +| **Conversation History** | Raw 
message logs | `memory_sources` table (input, not output) | + +### 2.2 Namespace and Isolation Strategy + +#### 2.2.1 Isolation Hierarchy + +``` +┌─────────────────────────────────────────────────────┐ +│ user_id (required) │ +│ ┌───────────────────────────────────────────────┐ │ +│ │ workspace_id (optional) │ │ +│ │ ┌─────────────────────────────────────────┐ │ │ +│ │ │ scope_type:scope_id (track/project) │ │ │ +│ │ │ ┌─────────────────────────────────────┐│ │ │ +│ │ │ │ provider (metadata only) ││ │ │ +│ │ │ └─────────────────────────────────────┘│ │ │ +│ │ └─────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────┘ +``` + +#### 2.2.2 Isolation Rules + +| Level | Field | Required | Behavior | +|-------|-------|----------|----------| +| **L1** | `user_id` | Yes | Primary boundary. Cross-user access is forbidden. | +| **L2** | `workspace_id` | No | Team/project isolation. Memories with `workspace_id=X` are only visible when querying with the same `workspace_id`. | +| **L3** | `scope_type:scope_id` | No | Track-level isolation for research contexts. | +| **Metadata** | `provider` | No | Provenance tracking only (ChatGPT/Gemini/Claude/...). Not used for query filtering. | + +#### 2.2.3 Scope Type Values + +| scope_type | Description | Visibility | +|------------|-------------|------------| +| `global` | User-wide memories | Visible across all tracks/workspaces for that user | +| `track` | Research direction specific | Only visible when that track is active | +| `project` | Project-specific context | Only visible within project scope | +| `paper` | Paper-specific notes | Only visible when analyzing that paper | + +#### 2.2.4 Query Resolution Logic + +```python +def resolve_visible_memories(user_id, workspace_id=None, scope_type=None, scope_id=None): + """ + Resolution order (most specific to least): + 1. Exact scope match (scope_type + scope_id) + 2. 
Global scope within workspace
+    3. Global scope for user (if no workspace specified)
+    """
+    base_filter = (item.user_id == user_id)
+
+    if workspace_id:
+        base_filter &= (item.workspace_id == workspace_id)
+
+    if scope_type and scope_id:
+        # Include both specific scope AND global
+        scope_filter = (
+            ((item.scope_type == scope_type) & (item.scope_id == scope_id)) |
+            (item.scope_type == 'global')
+        )
+    else:
+        scope_filter = (item.scope_type == 'global')
+
+    return base_filter & scope_filter
+```
+
+### 2.3 Acceptance Criteria Framework
+
+#### 2.3.1 Metric Definitions
+
+| Metric | Definition | Formula | Target |
+|--------|------------|---------|--------|
+| **Extraction Precision** | Fraction of extracted items that are correct | `correct_items / total_extracted_items` | ≥ 85% |
+| **False Positive Rate (Dirty Memory Rate)** | Fraction of approved items that are incorrect or harmful | `incorrect_approved / total_approved` | ≤ 5% |
+| **Retrieval Hit Rate** | Fraction of relevant memories retrieved when needed | `retrieved_relevant / total_relevant` | ≥ 80% |
+| **Injection Pollution Rate** | Fraction of responses negatively affected by wrong memory | `polluted_responses / total_responses_with_memory` | ≤ 2% |
+| **Deletion Compliance** | Deleted items must never be retrieved or injected | `deleted_items_retrieved == 0` | 100% |
+
+#### 2.3.2 Measurement Methods
+
+| Metric | Data Source | Frequency | Automation Level |
+|--------|-------------|-----------|------------------|
+| Extraction Precision | Human sampling (50 items/week) | Weekly | Manual with script support |
+| False Positive Rate | User corrections + manual audit | Weekly | Semi-automated |
+| Retrieval Hit Rate | Synthetic test queries + user feedback | Daily (CI) + Weekly (user) | Automated + Manual |
+| Injection Pollution Rate | A/B testing + user reports | Monthly | Manual |
+| Deletion Compliance | Automated regression test | Per-commit (CI) | Fully automated |
+
+#### 2.3.3 Metric Storage Schema
+
+```sql
+CREATE TABLE 
memory_eval_metrics ( + id INTEGER PRIMARY KEY, + metric_name TEXT NOT NULL, -- 'extraction_precision', 'retrieval_hit_rate', etc. + metric_value REAL NOT NULL, -- 0.0 to 1.0 + sample_size INTEGER, -- Number of items evaluated + evaluated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + evaluator_id TEXT, -- 'ci', 'human:', 'automated' + detail_json TEXT, -- Additional context (item IDs, query info, etc.) + + INDEX idx_metric_name_time (metric_name, evaluated_at) +); +``` + +--- + +## 3. Implementation Principles + +### 3.1 Core Design Principles + +| Principle | Description | Implementation | +|-----------|-------------|----------------| +| **Explicit over Implicit** | All memory types must have clear written definitions | Document in `docs/memory_types.md` + code docstrings | +| **Fail-Safe Defaults** | Low confidence or high PII risk → require review | `confidence < 0.6` → `status='pending'` | +| **Auditability First** | Every mutation must be logged | All changes create `memory_audit_log` entries | +| **Isolation by Default** | Scope to narrowest applicable boundary | Default `scope_type='global'`, require explicit promotion | +| **Separation of Concerns** | Each layer has single responsibility | Extraction → Storage → Retrieval → Injection | + +### 3.2 Fail-Safe Status Transitions + +``` + ┌──────────────┐ + │ Extracted │ + └──────┬───────┘ + │ + ┌────────────┴────────────┐ + │ │ + ▼ ▼ + ┌─────────────────┐ ┌─────────────────┐ + │ confidence ≥ 0.6│ │ confidence < 0.6│ + │ pii_risk < 2 │ │ OR pii_risk ≥ 2 │ + └────────┬────────┘ └────────┬────────┘ + │ │ + ▼ ▼ + ┌─────────────────┐ ┌─────────────────┐ + │ approved │ │ pending │ + │ (auto-active) │ │ (needs review) │ + └─────────────────┘ └────────┬────────┘ + │ + ┌─────────────┴─────────────┐ + │ │ + ▼ ▼ + ┌─────────────────┐ ┌─────────────────┐ + │ approved │ │ rejected │ + │ (human review) │ │ (discarded) │ + └─────────────────┘ └─────────────────┘ +``` + +### 3.3 Component Responsibilities + +| Component | 
Responsibility | Should NOT Do | +|-----------|----------------|---------------| +| **Extractor** | Parse input, produce candidates with confidence | Apply business rules, write to DB | +| **Store** | Dedup, apply status rules, persist, audit | Parse input, format output | +| **Retriever** | Find relevant approved items | Return pending/deleted items | +| **Injector** | Format for prompt, respect token budget | Store or modify memories | + +--- + +## 4. Technology Selection Rationale + +### 4.1 Metrics Storage: SQLite + +| Criterion | SQLite | InfluxDB | Prometheus | +|-----------|--------|----------|------------| +| Consistency with stack | ✅ Same DB | ❌ New dependency | ❌ New infra | +| Volume fit | ✅ Hundreds/day | Overkill | Overkill | +| Backup simplicity | ✅ Single file | ❌ Complex | ❌ Complex | +| Query flexibility | ✅ Full SQL | ⚠️ InfluxQL | ⚠️ PromQL | + +**Decision**: SQLite (same as main DB) + +### 4.2 Precision Evaluation: Human-in-the-Loop + +| Option | Pros | Cons | Decision | +|--------|------|------|----------| +| Human sampling | Ground truth quality | Labor-intensive | ✅ Selected | +| LLM-as-judge | Automated | Circular dependency | ❌ Rejected | +| Rule-based | Fast | Cannot judge semantic correctness | ❌ Rejected | + +**Decision**: Manual sampling (50 items/week) with script support. Build annotated dataset for future automation. + +### 4.3 Retrieval Evaluation: Synthetic Test Queries + +| Option | Pros | Cons | Decision | +|--------|------|------|----------| +| Synthetic fixtures | Reproducible, CI-friendly | May miss edge cases | ✅ Selected | +| Production sampling | Real-world coverage | Privacy concerns, non-deterministic | ⚠️ Future | +| User feedback | Direct signal | Sparse, delayed | ⚠️ Supplementary | + +**Decision**: Pre-defined query-answer pairs in `evals/memory/fixtures/`. Supplement with user feedback in P1. + +--- + +## 5. 
Best Practices and References + +### 5.1 Industry References + +| Source | Key Insight | Application | +|--------|-------------|-------------| +| **Generative Agents** (Stanford, 2023) | Memory streams with reflection layers | Validates episodic vs. long-term distinction | +| **MemGPT** (Berkeley, 2023) | Hierarchical memory with OS-like management | Validates scope-based isolation design | +| **ChatGPT Memory** (OpenAI, 2024) | User-controllable explicit memory | Justifies `pending/approved` workflow | +| **Claude Projects** (Anthropic, 2024) | Project-scoped context isolation | Validates workspace_id pattern | +| **GDPR Article 17** | Right to erasure | Mandates 100% deletion compliance | + +### 5.2 Academic Papers + +| Paper | Relevance | +|-------|-----------| +| Lewis et al., "RAG for Knowledge-Intensive NLP Tasks" (2020) | Retrieval + generation pattern for injection | +| Thakur et al., "BEIR: Zero-shot IR Evaluation" (2021) | Retrieval metric methodology | +| Park et al., "Generative Agents" (2023) | Memory stream architecture | +| Packer et al., "MemGPT" (2023) | Tiered memory management | + +### 5.3 Internal Documents + +| Document | Content | +|----------|---------| +| `docs/memory_survey.md` | Survey of memory architectures across products | +| `docs/memory_ui_controls_matrix.md` | UI patterns from ChatGPT, Claude, Gemini, etc. | +| `docs/memory.md` | Current API documentation | +| `src/paperbot/memory/schema.py` | Existing type definitions | + +--- + +## 6. 
Risks and Mitigations + +### 6.1 Technical Risks + +| Risk | Likelihood | Impact | Mitigation | +|------|------------|--------|------------| +| Metrics collection slows API | Medium | Medium | Async logging; batch writes; sampling | +| Human labeling bottleneck | High | Low | Start small (50/week); automate obvious cases | +| Deletion compliance gaps | Low | Critical | DB constraint; nightly audit; hard delete for GDPR | +| Scope confusion in queries | Medium | Medium | Clear error messages; API validation; docs | + +### 6.2 Process Risks + +| Risk | Likelihood | Impact | Mitigation | +|------|------------|--------|------------| +| Metric definitions drift | Medium | Medium | Document in code; review in PRs | +| Evaluation dataset staleness | High | Low | Quarterly refresh; add edge cases from production | +| Unrealistic targets | Medium | Low | Start with industry benchmarks; adjust after first month | + +### 6.3 Compliance Risks + +| Risk | Likelihood | Impact | Mitigation | +|------|------------|--------|------------| +| PII in metrics | Low | High | Never log raw content; only IDs and scores | +| Audit log tampering | Low | Critical | Append-only design; consider signed entries | + +--- + +## 7. 
Workload Estimation + +### 7.1 Task Breakdown + +| Task | Effort | Dependencies | +|------|--------|--------------| +| **Documentation** | | | +| Create `docs/memory_types.md` with taxonomy | 2h | None | +| Create `docs/memory_isolation.md` with rules | 1h | Taxonomy | +| Update `schema.py` docstrings | 1h | Taxonomy | +| **Evaluation Infrastructure** | | | +| Add `memory_eval_metrics` table | 1h | None | +| Implement `MemoryMetricCollector` class | 3h | Table | +| Hook collector into API endpoints | 2h | Collector | +| Add `/api/memory/metrics` endpoint | 1h | Collector | +| **Test Harness** | | | +| Create extraction precision test | 2h | Collector | +| Create retrieval hit rate test | 2h | Collector | +| Create deletion compliance test | 1h | Collector | +| Add CI job for regression | 1h | Tests | + +### 7.2 Summary + +| Category | Hours | +|----------|-------| +| Documentation | 4h | +| Evaluation Infrastructure | 7h | +| Test Harness | 6h | +| **Total** | **17h (~2 days)** | + +### 7.3 Suggested Timeline + +``` +Day 1 (Morning): Documentation (4h) + - memory_types.md + - memory_isolation.md + - schema.py docstrings + +Day 1 (Afternoon): Evaluation Infrastructure (4h) + - memory_eval_metrics table + - MetricCollector class (partial) + +Day 2 (Morning): Evaluation Infrastructure + Tests (5h) + - MetricCollector completion + - API integration + - Test harness + +Day 2 (Afternoon): CI Integration + Review (4h) + - CI job setup + - Code review + - Update memory_todo.md +``` + +--- + +## 8. 
Deliverables Checklist + +Upon completion of P0, the following artifacts will exist: + +### 8.1 Documentation +- [ ] `docs/memory_types.md` - Formal taxonomy with definitions +- [ ] `docs/memory_isolation.md` - Namespace and isolation rules +- [ ] `src/paperbot/memory/schema.py` - Updated docstrings + +### 8.2 Code +- [ ] `src/paperbot/memory/eval/__init__.py` - Evaluation module +- [ ] `src/paperbot/memory/eval/collector.py` - MetricCollector implementation +- [ ] `src/paperbot/infrastructure/stores/models.py` - `MemoryEvalMetricModel` added +- [ ] `src/paperbot/api/routes/memory.py` - `/api/memory/metrics` endpoint + +### 8.3 Tests +- [ ] `evals/memory/test_extraction_precision.py` +- [ ] `evals/memory/test_retrieval_hit_rate.py` +- [ ] `evals/memory/test_deletion_compliance.py` +- [ ] `evals/memory/fixtures/` - Synthetic test data + +### 8.4 CI/CD +- [ ] `.github/workflows/ci.yml` - Memory eval job added + +### 8.5 Tracking +- [ ] `docs/memory_todo.md` - P0 items marked complete + +--- + +## 9. Open Questions + +The following questions require user input before implementation: + +1. **Extraction language support**: Current heuristics are Chinese-focused. Should P0 include English patterns, or defer to later phase? + +2. **Human labeling workflow**: Should we build a simple web UI for labeling, or use external tools (Label Studio, spreadsheet)? + +3. **Metrics retention period**: How long should evaluation metrics be stored? (30 days / 90 days / indefinitely) + +4. **Deletion compliance scope**: Should 100% target apply to soft-delete only, or require hard-delete for GDPR? 
+ +--- + +## Appendix A: Existing Implementation Summary + +### A.1 Current Schema (Already Implemented) + +**`memory_items` table fields**: +- Core: `id`, `user_id`, `workspace_id`, `scope_type`, `scope_id` +- Content: `kind`, `content`, `content_hash`, `confidence` +- Status: `status` (pending/approved/rejected/superseded), `supersedes_id` +- Lifecycle: `expires_at`, `last_used_at`, `use_count` +- Security: `pii_risk`, `deleted_at`, `deleted_reason` +- Audit: `created_at`, `updated_at`, `source_id` + +**`memory_audit_log` table fields**: +- `id`, `ts`, `actor_id`, `user_id`, `workspace_id` +- `action` (create/update/approve/reject/delete/hard_delete/use) +- `item_id`, `source_id`, `detail_json` + +### A.2 Current API Endpoints + +| Endpoint | Method | Status | +|----------|--------|--------| +| `/api/memory/ingest` | POST | ✅ Implemented | +| `/api/memory/list` | GET | ✅ Implemented | +| `/api/memory/context` | POST | ✅ Implemented | +| `/api/memory/items/{id}` | PATCH | ✅ Implemented | +| `/api/memory/items/{id}` | DELETE | ✅ Implemented | +| `/api/memory/metrics` | GET | ❌ To be added | + +### A.3 Current Parsers + +| Platform | Format | Status | +|----------|--------|--------| +| ChatGPT | conversations.json | ✅ Working | +| Gemini | API logs (loose JSON) | ✅ Working | +| Generic | {"messages": [...]} | ✅ Working | +| Plain text | User:/Assistant: prefix | ✅ Working | +| Claude | Export format | ❌ Not implemented | +| Perplexity | Unknown | ❌ Not implemented | + + + Phase 1: Documentation + + | File | Status | + |--------------------------|---------| + | docs/memory_types.md | Created | + | docs/memory_isolation.md | Created | + + Phase 2: Evaluation Infrastructure + + | File | Status | + |----------------------------------------------|------------------------------------------------| + | src/paperbot/infrastructure/stores/models.py | Modified (added MemoryEvalMetricModel) | + | src/paperbot/memory/eval/__init__.py | Created | + | 
src/paperbot/memory/eval/collector.py | Created | + | src/paperbot/api/routes/memory.py | Modified (added /api/memory/metrics endpoints) | + + Phase 3: Test Harness + + | File | Status | + |----------------------------------------------|---------| + | evals/memory/__init__.py | Created | + | evals/memory/fixtures/sample_memories.json | Created | + | evals/memory/fixtures/retrieval_queries.json | Created | + | evals/memory/test_deletion_compliance.py | Created | + | evals/memory/test_retrieval_hit_rate.py | Created | + + Phase 4: CI Integration + + | File | Status | + |--------------------------|----------------------------------| + | .github/workflows/ci.yml | Modified (added memory P0 tests) | \ No newline at end of file diff --git a/docs/memory_isolation.md b/docs/memory_isolation.md new file mode 100644 index 0000000..043b572 --- /dev/null +++ b/docs/memory_isolation.md @@ -0,0 +1,144 @@ +# Memory Namespace and Isolation Strategy + +This document describes the isolation boundaries and query resolution rules implemented in the memory system. + +> **Reference Implementation:** `src/paperbot/infrastructure/stores/memory_store.py` + +## Isolation Hierarchy + +Memory items are isolated using a three-level namespace hierarchy, as defined in `memory_todo.md`: + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Level 1: user_id (required) │ +│ ┌───────────────────────────────────────────────────────────┐ │ +│ │ Level 2: workspace_id (optional) │ │ +│ │ ┌─────────────────────────────────────────────────────┐ │ │ +│ │ │ Level 3: scope_type:scope_id (optional) │ │ │ +│ │ └─────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Level 1: user_id (Required) + +The primary isolation boundary. Cross-user access is forbidden. 
+ +**Implementation:** Every query in `memory_store.py` starts with: +```python +stmt = select(MemoryItemModel).where(MemoryItemModel.user_id == user_id) +``` + +## Level 2: workspace_id (Optional) + +Team or project-level isolation within a user's memories. + +**Implementation:** `memory_store.py` line 293-294: +```python +if workspace_id is not None: + stmt = stmt.where(MemoryItemModel.workspace_id == workspace_id) +``` + +**Behavior:** +- If `workspace_id` is set on a memory, it is only visible when querying with the same `workspace_id` +- Memories with `workspace_id=NULL` are user-global + +## Level 3: scope_type:scope_id (Optional) + +Fine-grained isolation for research tracks, projects, or papers. + +**Implementation:** `memory_store.py` lines 296-301: +```python +if scope_type is not None: + if scope_type == "global": + stmt = stmt.where(or_(MemoryItemModel.scope_type == scope_type, MemoryItemModel.scope_type.is_(None))) + else: + stmt = stmt.where(MemoryItemModel.scope_type == scope_type) +if scope_id is not None: + stmt = stmt.where(MemoryItemModel.scope_id == scope_id) +``` + +**Scope Types:** + +| scope_type | scope_id | Usage | +|------------|----------|-------| +| `global` | NULL | User-wide memories, visible across all tracks | +| `track` | track_id | Research direction specific | +| `project` | project_id | Project-specific context | +| `paper` | paper_id | Paper-specific notes | + +## Content Hash Deduplication + +To prevent duplicates while allowing the same content in different scopes, the content hash includes scope information. + +**Implementation:** `memory_store.py` lines 168-171: +```python +# Dedup within a user's scope boundary (prevents cross-track pollution/dedup collisions). +content_hash = _sha256_text( + f"{effective_scope_type}:{effective_scope_id or ''}:{m.kind}:{content}" +) +``` + +## Status Filtering + +Queries only return approved, non-deleted, non-expired items. 
+ +**Implementation:** `memory_store.py` lines 303-305: +```python +stmt = stmt.where(MemoryItemModel.deleted_at.is_(None)) +stmt = stmt.where(MemoryItemModel.status == "approved") +stmt = stmt.where(or_(MemoryItemModel.expires_at.is_(None), MemoryItemModel.expires_at > now)) +``` + +## Auto-Status Assignment + +Status is automatically set based on confidence and PII risk. + +**Implementation:** `memory_store.py` lines 174-178: +```python +effective_status = "approved" if float(m.confidence) >= 0.60 else "pending" +pii_risk = _estimate_pii_risk(content) +if pii_risk >= 2 and effective_status == "approved": + effective_status = "pending" +``` + +**Rules:** +- `confidence >= 0.60` → `approved` +- `confidence < 0.60` → `pending` +- `pii_risk >= 2` → override to `pending` (requires human review) + +## Provider as Metadata + +The source platform (ChatGPT/Gemini/Claude) is stored in `memory_sources.platform`, not in `memory_items`. + +**Rationale:** From `memory_survey.md` section 6.1: +> 建议拆成三张逻辑表: +> - `memory_sources`:导入批次(平台、文件名、sha256、统计信息) +> - `memory_items`:稳定条目 + +Provider is for provenance tracking, not query filtering. + +## Audit Trail + +All mutations are logged in `memory_audit_log`. + +**Implementation:** `memory_store.py` lines 205-220: +```python +session.add( + MemoryAuditLogModel( + ts=now, + actor_id=actor_id, + user_id=user_id, + workspace_id=workspace_id, + action="create", + item_id=row.id, + ... + ) +) +``` + +## References + +- Requirements: `docs/memory_todo.md` P0 section +- Survey: `docs/memory_survey.md` section 6 +- Implementation: `src/paperbot/infrastructure/stores/memory_store.py` diff --git a/docs/memory_types.md b/docs/memory_types.md new file mode 100644 index 0000000..9e23b6c --- /dev/null +++ b/docs/memory_types.md @@ -0,0 +1,141 @@ +# Memory Type Taxonomy + +This document defines the formal taxonomy for memory types in PaperBot's cross-platform memory middleware. 
+ +## Overview + +Memory items are categorized into three main categories based on their purpose and lifecycle: + +1. **User Memory** - Stable, cross-session information about the user +2. **Episodic Memory** - Session-derived facts with potential decay +3. **Workspace/Project Memory** - Scope-specific context + +## Memory Type Definitions + +### User Memory + +Long-term, stable information about the user that persists across sessions. + +| Kind | Definition | Persistence | Confidence Threshold | Examples | +|------|------------|-------------|---------------------|----------| +| `profile` | Identity facts: name, role, affiliation, background | Permanent until edited | 0.85 | "My name is Jerry", "I'm a PhD student at MIT" | +| `preference` | Style, format, or interaction preferences | Semi-permanent | 0.72 | "I prefer concise answers", "Use Chinese for responses" | +| `goal` | Long-term objectives or research directions | Session-spanning | 0.68 | "I'm researching LLM memory systems", "Working on a paper about RAG" | +| `constraint` | Hard rules or requirements that must be followed | Permanent | 0.62 | "Never include code examples", "Always cite sources" | +| `project` | Project-level context and background | Project-scoped | 0.70 | "PaperBot is a research workflow tool", "The codebase uses FastAPI" | + +### Episodic Memory + +Information derived from specific sessions or interactions, subject to decay or update. 
+ +| Kind | Definition | Persistence | Confidence Threshold | Examples | +|------|------------|-------------|---------------------|----------| +| `fact` | Specific facts mentioned during interaction | Decaying (use_count tracked) | 0.65 | "User's deadline is Friday", "Paper was submitted to NeurIPS" | +| `decision` | Key decisions made during sessions | Project-scoped | 0.70 | "We chose SQLite over Postgres", "Using Docker for sandboxing" | +| `hypothesis` | Tentative assumptions or inferences | Low-priority, may expire | 0.50 | "User may be a graduate student", "Likely interested in NLP" | +| `todo` | Action items or pending tasks | Time-bound (expires_at) | 0.58 | "Need to implement FTS5", "Review the memory module" | + +### Workspace/Project Memory + +Scope-specific information tied to a particular workspace or research track. + +| Kind | Definition | Persistence | Confidence Threshold | Examples | +|------|------------|-------------|---------------------|----------| +| `keyword_set` | Domain-specific keywords or focus areas | Project-scoped | 0.75 | "Focus areas: RAG, memory, multi-agent" | +| `note` | Free-form annotations or observations | Project-scoped | 0.60 | "Important: check GDPR compliance", "Consider using vector DB" | + +## What is NOT Memory + +The following are explicitly **excluded** from the memory system: + +| Item | Reason | Where It Belongs | +|------|--------|------------------| +| **Context Caching** | Performance optimization, not long-term storage | LLM provider layer (prompt caching) | +| **Code Index** | Repository structure, symbols, dependencies | `context_engine/` or `code_index` subsystem | +| **Search Results** | Ephemeral retrieval results | Not persisted; used immediately | +| **Embeddings** | Vector representations of content | Separate `memory_embeddings` table (future) | +| **Raw Conversation History** | Unprocessed message logs | `memory_sources` table (input, not output) | +| **Temporary Variables** | Session-only working 
memory | In-memory only, not persisted | + +## Status Workflow + +Memory items follow a status workflow based on confidence and risk: + +``` +Extracted (MemoryCandidate) + │ + ├── confidence >= 0.60 AND pii_risk < 2 + │ │ + │ ▼ + │ ┌─────────────┐ + │ │ approved │ ──── Can be retrieved and injected + │ └─────────────┘ + │ + └── confidence < 0.60 OR pii_risk >= 2 + │ + ▼ + ┌─────────────┐ + │ pending │ ──── Requires human review + └─────────────┘ + │ + ┌─────────┴─────────┐ + │ │ + ▼ ▼ + ┌─────────────┐ ┌─────────────┐ + │ approved │ │ rejected │ + └─────────────┘ └─────────────┘ +``` + +## Confidence Score Guidelines + +When implementing extractors, use these confidence score ranges: + +| Score Range | Meaning | Action | +|-------------|---------|--------| +| 0.85 - 1.00 | High confidence, explicit statement | Auto-approve | +| 0.70 - 0.84 | Good confidence, clear inference | Auto-approve | +| 0.60 - 0.69 | Moderate confidence | Auto-approve (borderline) | +| 0.50 - 0.59 | Low confidence | Pending (needs review) | +| 0.00 - 0.49 | Very low confidence | Pending or reject | + +## PII Risk Levels + +| Level | Description | Behavior | +|-------|-------------|----------| +| 0 | No PII detected | Normal processing | +| 1 | Possible PII (redacted patterns found) | Normal, but flagged | +| 2 | High PII risk (raw PII detected) | Auto-pending, requires review | + +## Usage in Code + +```python +from paperbot.memory.schema import MemoryKind, MemoryCandidate + +# Valid memory kinds +valid_kinds = [ + # User Memory + "profile", "preference", "goal", "constraint", "project", + # Episodic Memory + "fact", "decision", "hypothesis", "todo", + # Workspace Memory + "keyword_set", "note" +] + +# Creating a memory candidate +candidate = MemoryCandidate( + kind="preference", + content="User prefers concise answers", + confidence=0.75, + tags=["style", "output"], + evidence="From conversation: 'Please keep responses brief'", + scope_type="global", + scope_id=None, + status="approved" # 
Will be set by store based on confidence +) +``` + +## References + +- Design Document: `docs/P0_memory_scope_acceptance_design.md` +- Schema Definition: `src/paperbot/memory/schema.py` +- Store Implementation: `src/paperbot/infrastructure/stores/memory_store.py` diff --git a/evals/memory/__init__.py b/evals/memory/__init__.py new file mode 100644 index 0000000..a42d48a --- /dev/null +++ b/evals/memory/__init__.py @@ -0,0 +1,7 @@ +""" +Memory evaluation tests for Scope and Acceptance criteria. + +Test files: +- test_deletion_compliance.py: Verify deleted items never retrieved +- test_retrieval_hit_rate.py: Verify relevant memories are found +""" diff --git a/evals/memory/fixtures/retrieval_queries.json b/evals/memory/fixtures/retrieval_queries.json new file mode 100644 index 0000000..3a7c97a --- /dev/null +++ b/evals/memory/fixtures/retrieval_queries.json @@ -0,0 +1,35 @@ +{ + "description": "Synthetic test queries for retrieval hit rate evaluation", + "test_cases": [ + { + "id": "pref_output_style", + "query": "output format preference", + "expected_kinds": ["preference"], + "expected_keywords": ["concise", "brief", "format"] + }, + { + "id": "project_context", + "query": "current project", + "expected_kinds": ["project"], + "expected_keywords": ["project", "working on", "building"] + }, + { + "id": "user_identity", + "query": "user name role", + "expected_kinds": ["profile"], + "expected_keywords": ["name", "role", "position"] + }, + { + "id": "constraints", + "query": "rules restrictions", + "expected_kinds": ["constraint"], + "expected_keywords": ["never", "always", "must", "don't"] + }, + { + "id": "goals", + "query": "goals objectives", + "expected_kinds": ["goal"], + "expected_keywords": ["goal", "want", "trying to", "working towards"] + } + ] +} diff --git a/evals/memory/fixtures/sample_memories.json b/evals/memory/fixtures/sample_memories.json new file mode 100644 index 0000000..6445177 --- /dev/null +++ b/evals/memory/fixtures/sample_memories.json @@ -0,0 
+1,42 @@ +{ + "description": "Sample memories for testing extraction and retrieval", + "user_id": "test_user", + "memories": [ + { + "kind": "profile", + "content": "User's name is Jerry, a PhD student at MIT", + "confidence": 0.85, + "tags": ["identity", "name", "role"] + }, + { + "kind": "preference", + "content": "User prefers concise, brief responses without excessive explanation", + "confidence": 0.75, + "tags": ["style", "output", "format"] + }, + { + "kind": "project", + "content": "Currently working on PaperBot, a research workflow tool", + "confidence": 0.80, + "tags": ["project", "paperbot"] + }, + { + "kind": "constraint", + "content": "Never include code examples unless explicitly requested", + "confidence": 0.70, + "tags": ["rules", "code"] + }, + { + "kind": "goal", + "content": "Working towards building a memory middleware for LLM applications", + "confidence": 0.72, + "tags": ["goal", "memory", "llm"] + }, + { + "kind": "fact", + "content": "Project deadline is end of Q1 2025", + "confidence": 0.65, + "tags": ["deadline", "timeline"] + } + ] +} diff --git a/evals/memory/test_deletion_compliance.py b/evals/memory/test_deletion_compliance.py new file mode 100644 index 0000000..8f7fa86 --- /dev/null +++ b/evals/memory/test_deletion_compliance.py @@ -0,0 +1,150 @@ +""" +Deletion Compliance Test (Acceptance Criteria) + +Target: 100% - Deleted items must NEVER be retrieved. + +This test: +1. Creates test memories +2. Soft-deletes some items +3. Attempts to retrieve them via search +4. Verifies deleted items are not returned +5. 
Records the metric
+"""
+
+import os
+import sys
+import tempfile
+
+# Add src to path for imports
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "src"))
+
+from paperbot.infrastructure.stores.memory_store import SqlAlchemyMemoryStore
+from paperbot.memory.schema import MemoryCandidate
+from paperbot.memory.eval.collector import MemoryMetricCollector
+
+
+def run_deletion_compliance_test() -> dict:
+    """
+    Run deletion compliance test and return results.
+
+    Returns:
+        dict with keys: passed, deleted_count, retrieved_count, compliance_rate
+    """
+    # Use temp database for isolation
+    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
+        db_path = f.name
+
+    try:
+        db_url = f"sqlite:///{db_path}"
+        store = SqlAlchemyMemoryStore(db_url=db_url)
+        collector = MemoryMetricCollector(store._provider)
+
+        user_id = "test_deletion_user"
+
+        # Step 1: Create test memories
+        test_memories = [
+            MemoryCandidate(kind="profile", content="Test user name is Alice", confidence=0.85),
+            MemoryCandidate(kind="preference", content="Prefers markdown format", confidence=0.75),
+            MemoryCandidate(kind="project", content="Working on test project", confidence=0.80),
+            MemoryCandidate(kind="goal", content="Goal is to test deletion", confidence=0.70),
+            MemoryCandidate(kind="fact", content="Test fact that will be deleted", confidence=0.65),
+        ]
+
+        created, skipped, _rows = store.add_memories(
+            user_id=user_id,
+            memories=test_memories,
+            actor_id="test",
+        )
+
+        # Query to get the created memory IDs (rows are detached from session)
+        all_items = store.list_memories(user_id=user_id, limit=100)
+        item_ids = [item["id"] for item in all_items]
+
+        # Step 2: Soft-delete some items (last 2 items)
+        deleted_ids = []
+        for i, item_id in enumerate(item_ids):
+            if i >= 3:  # Delete last 2 items
+                store.soft_delete_item(
+                    user_id=user_id,
+                    item_id=item_id,
+                    actor_id="test",
+                    reason="Test deletion",
+                )
+                deleted_ids.append(item_id)
+
+        deleted_count =
len(deleted_ids) + + # Step 3: Attempt to retrieve ALL items via search + # Use broad queries that should match deleted items if they were returned + queries = ["test", "goal", "deletion", "fact", "Alice", "project"] + + retrieved_deleted_count = 0 + for query in queries: + results = store.search_memories( + user_id=user_id, + query=query, + limit=100, + ) + # Check if any deleted ID appears in results + result_ids = {r.get("id") for r in results} + for deleted_id in deleted_ids: + if deleted_id in result_ids: + retrieved_deleted_count += 1 + print(f"FAIL: Deleted item {deleted_id} was retrieved for query '{query}'") + + # Step 4: Also check list_memories + all_items = store.list_memories(user_id=user_id, limit=100) + for item in all_items: + if item.get("id") in deleted_ids: + retrieved_deleted_count += 1 + print(f"FAIL: Deleted item {item.get('id')} was returned in list_memories") + + # Step 5: Calculate compliance + # Compliance = 1.0 means no deleted items were retrieved + compliance_rate = 1.0 if retrieved_deleted_count == 0 else 0.0 + + # Record metric + collector.record_deletion_compliance( + deleted_retrieved_count=retrieved_deleted_count, + deleted_total_count=deleted_count, + evaluator_id="test:deletion_compliance", + detail={ + "deleted_ids": deleted_ids, + "queries_tested": queries, + }, + ) + + passed = compliance_rate == 1.0 + + return { + "passed": passed, + "deleted_count": deleted_count, + "retrieved_count": retrieved_deleted_count, + "compliance_rate": compliance_rate, + } + + finally: + # Cleanup + if os.path.exists(db_path): + os.unlink(db_path) + + +def main(): + print("=" * 60) + print("P0 Deletion Compliance Test") + print("Target: 100% (deleted items must never be retrieved)") + print("=" * 60) + + result = run_deletion_compliance_test() + + print(f"\nResults:") + print(f" Deleted items: {result['deleted_count']}") + print(f" Retrieved (should be 0): {result['retrieved_count']}") + print(f" Compliance rate: {result['compliance_rate']:.1%}") 
+ print(f" Status: {'PASS' if result['passed'] else 'FAIL'}") + + return 0 if result["passed"] else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/evals/memory/test_retrieval_hit_rate.py b/evals/memory/test_retrieval_hit_rate.py new file mode 100644 index 0000000..7a23741 --- /dev/null +++ b/evals/memory/test_retrieval_hit_rate.py @@ -0,0 +1,161 @@ +""" +Retrieval Hit Rate Test (Acceptance Criteria) + +Target: >= 80% - Relevant memories should be retrieved when needed. + +This test: +1. Loads sample memories from fixtures +2. Runs queries from fixtures +3. Checks if expected memory kinds are retrieved +4. Calculates hit rate +5. Records the metric +""" + +import json +import os +import sys +import tempfile +from pathlib import Path + +# Add src to path for imports +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "src")) + +from paperbot.infrastructure.stores.memory_store import SqlAlchemyMemoryStore +from paperbot.memory.schema import MemoryCandidate +from paperbot.memory.eval.collector import MemoryMetricCollector + + +FIXTURES_DIR = Path(__file__).parent / "fixtures" + + +def load_fixtures(): + """Load test fixtures.""" + with open(FIXTURES_DIR / "sample_memories.json") as f: + memories_data = json.load(f) + + with open(FIXTURES_DIR / "retrieval_queries.json") as f: + queries_data = json.load(f) + + return memories_data, queries_data + + +def run_retrieval_hit_rate_test() -> dict: + """ + Run retrieval hit rate test and return results. 
+
+    Returns:
+        dict with keys: passed, total_queries, hits, hit_rate, details
+    """
+    # Use temp database for isolation
+    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
+        db_path = f.name
+
+    try:
+        db_url = f"sqlite:///{db_path}"
+        store = SqlAlchemyMemoryStore(db_url=db_url)
+        collector = MemoryMetricCollector(store._provider)
+
+        # Load fixtures
+        memories_data, queries_data = load_fixtures()
+        user_id = memories_data["user_id"]
+
+        # Step 1: Insert sample memories
+        candidates = [
+            MemoryCandidate(
+                kind=m["kind"],
+                content=m["content"],
+                confidence=m["confidence"],
+                tags=m.get("tags", []),
+            )
+            for m in memories_data["memories"]
+        ]
+
+        store.add_memories(
+            user_id=user_id,
+            memories=candidates,
+            actor_id="test",
+        )
+
+        # Step 2: Run queries and check hits
+        total_queries = len(queries_data["test_cases"])
+        hits = 0
+        details = []
+
+        for test_case in queries_data["test_cases"]:
+            query = test_case["query"]
+            expected_kinds = set(test_case["expected_kinds"])
+
+            # Search
+            results = store.search_memories(
+                user_id=user_id,
+                query=query,
+                limit=10,
+            )
+
+            # Check if expected kinds are found
+            result_kinds = {r.get("kind") for r in results}
+            found_expected = bool(expected_kinds & result_kinds)
+
+            if found_expected:
+                hits += 1
+                status = "HIT"
+            else:
+                status = "MISS"
+
+            detail = {
+                "id": test_case["id"],
+                "query": query,
+                "expected_kinds": list(expected_kinds),
+                "result_kinds": list(result_kinds),
+                "status": status,
+            }
+            details.append(detail)
+            print(f"  {test_case['id']}: {status} (expected {expected_kinds}, got {result_kinds})")
+
+        # Step 3: Calculate hit rate
+        hit_rate = hits / total_queries if total_queries > 0 else 0.0
+
+        # Step 4: Record metric
+        collector.record_retrieval_hit_rate(
+            hits=hits,
+            expected=total_queries,
+            evaluator_id="test:retrieval_hit_rate",
+            detail={"test_cases": details},
+        )
+
+        passed = hit_rate >= 0.80  # Target: >= 80%
+
+        return {
+            "passed": passed,
+            "total_queries": total_queries,
+ "hits": hits, + "hit_rate": hit_rate, + "details": details, + } + + finally: + # Cleanup + if os.path.exists(db_path): + os.unlink(db_path) + + +def main(): + print("=" * 60) + print("Retrieval Hit Rate Test") + print("Target: >= 80% (relevant memories should be retrieved)") + print("=" * 60) + + result = run_retrieval_hit_rate_test() + + print(f"\nResults:") + print(f" Total queries: {result['total_queries']}") + print(f" Hits: {result['hits']}") + print(f" Hit rate: {result['hit_rate']:.1%}") + print(f" Target: >= 80%") + print(f" Status: {'PASS' if result['passed'] else 'FAIL'}") + + return 0 if result["passed"] else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/paperbot/api/routes/memory.py b/src/paperbot/api/routes/memory.py index 9bbc9ba..c2a862c 100644 --- a/src/paperbot/api/routes/memory.py +++ b/src/paperbot/api/routes/memory.py @@ -10,6 +10,7 @@ from paperbot.memory import build_memory_context, extract_memories, parse_chat_log from paperbot.memory.schema import MemoryCandidate +from paperbot.memory.eval.collector import MemoryMetricCollector router = APIRouter() _store = SqlAlchemyMemoryStore() @@ -262,3 +263,46 @@ def delete_memory_item( if not ok: raise HTTPException(status_code=404, detail="memory item not found") return {"status": "ok"} + + +# --- Metrics Endpoints (Scope and Acceptance Criteria) --- + +_metric_collector = MemoryMetricCollector(_store._provider) + + +class MetricsSummaryResponse(BaseModel): + status: str # "pass" or "fail" + metrics: Dict[str, Any] + targets: Dict[str, float] + + +@router.get("/memory/metrics", response_model=MetricsSummaryResponse) +def get_memory_metrics(): + """ + Get summary of memory system evaluation metrics. 
+ + Returns the latest value for each P0 acceptance metric: + - extraction_precision (target: >= 85%) + - false_positive_rate (target: <= 5%) + - retrieval_hit_rate (target: >= 80%) + - injection_pollution_rate (target: <= 2%) + - deletion_compliance (target: 100%) + """ + return _metric_collector.get_metrics_summary() + + +class MetricHistoryResponse(BaseModel): + metric_name: str + history: List[Dict[str, Any]] + + +@router.get("/memory/metrics/{metric_name}", response_model=MetricHistoryResponse) +def get_metric_history(metric_name: str, limit: int = Query(30, ge=1, le=100)): + """Get historical values for a specific metric.""" + if metric_name not in MemoryMetricCollector.TARGETS: + raise HTTPException( + status_code=400, + detail=f"Unknown metric: {metric_name}. Valid: {list(MemoryMetricCollector.TARGETS.keys())}", + ) + history = _metric_collector.get_metric_history(metric_name, limit=limit) + return MetricHistoryResponse(metric_name=metric_name, history=history) diff --git a/src/paperbot/api/routes/research.py b/src/paperbot/api/routes/research.py index 11a24af..8d19649 100644 --- a/src/paperbot/api/routes/research.py +++ b/src/paperbot/api/routes/research.py @@ -9,6 +9,7 @@ from paperbot.context_engine.track_router import TrackRouter from paperbot.infrastructure.stores.memory_store import SqlAlchemyMemoryStore from paperbot.infrastructure.stores.research_store import SqlAlchemyResearchStore +from paperbot.memory.eval.collector import MemoryMetricCollector from paperbot.memory.extractor import extract_memories from paperbot.memory.schema import MemoryCandidate, NormalizedMessage @@ -17,6 +18,15 @@ _research_store = SqlAlchemyResearchStore() _memory_store = SqlAlchemyMemoryStore() _track_router = TrackRouter(research_store=_research_store, memory_store=_memory_store) +_metric_collector: Optional[MemoryMetricCollector] = None + + +def _get_metric_collector() -> MemoryMetricCollector: + """Lazy initialization of metric collector.""" + global _metric_collector + 
if _metric_collector is None: + _metric_collector = MemoryMetricCollector() + return _metric_collector def _schedule_embedding_precompute( @@ -361,6 +371,9 @@ class BulkModerateResponse(BaseModel): @router.post("/research/memory/bulk_moderate", response_model=BulkModerateResponse) def bulk_moderate(req: BulkModerateRequest, background_tasks: BackgroundTasks): + # Get items before update to check their confidence for P0 metrics + items_before = _memory_store.get_items_by_ids(user_id=req.user_id, item_ids=req.item_ids) + updated = _memory_store.bulk_update_items( user_id=req.user_id, item_ids=req.item_ids, @@ -373,6 +386,26 @@ def bulk_moderate(req: BulkModerateRequest, background_tasks: BackgroundTasks): if i.get("scope_type") == "track" and i.get("scope_id") ] _schedule_embedding_precompute(background_tasks, user_id=req.user_id, track_ids=affected_tracks) + + # P0 Hook: Record false positive rate when user rejects high-confidence items + # A rejection of an auto-approved (confidence >= 0.60) item is a false positive + if req.status == "rejected" and items_before: + high_confidence_rejected = sum( + 1 for item in items_before + if item.get("confidence", 0) >= 0.60 and item.get("status") == "approved" + ) + if high_confidence_rejected > 0: + collector = _get_metric_collector() + collector.record_false_positive_rate( + false_positive_count=high_confidence_rejected, + total_approved_count=len(items_before), + evaluator_id=f"user:{req.user_id}", + detail={ + "item_ids": req.item_ids, + "action": "bulk_moderate_reject", + }, + ) + return BulkModerateResponse(user_id=req.user_id, updated=updated) @@ -410,6 +443,72 @@ def bulk_move(req: BulkMoveRequest, background_tasks: BackgroundTasks): return BulkMoveResponse(user_id=req.user_id, updated=updated) +class MemoryFeedbackRequest(BaseModel): + """Request to record feedback on retrieved memories.""" + user_id: str = "default" + memory_ids: List[int] = Field(..., min_length=1, description="IDs of memories being rated") + 
helpful_ids: List[int] = Field(default_factory=list, description="IDs of memories that were helpful") + not_helpful_ids: List[int] = Field(default_factory=list, description="IDs of memories that were not helpful") + context_run_id: Optional[int] = None + query: Optional[str] = None + + +class MemoryFeedbackResponse(BaseModel): + user_id: str + total_rated: int + helpful_count: int + not_helpful_count: int + hit_rate: float + + +@router.post("/research/memory/feedback", response_model=MemoryFeedbackResponse) +def record_memory_feedback(req: MemoryFeedbackRequest): + """ + Record user feedback on retrieved memories. + + This endpoint allows users to indicate which memories were helpful vs not helpful + when they were retrieved for a query. This data feeds into the P0 retrieval_hit_rate metric. + + Usage: + - After building context, frontend shows retrieved memories + - User marks which memories were helpful + - Frontend calls this endpoint with the feedback + """ + helpful_set = set(req.helpful_ids) + not_helpful_set = set(req.not_helpful_ids) + + # Count hits (helpful) and misses (not helpful) + hits = len(helpful_set) + total = len(req.memory_ids) + + if total > 0: + hit_rate = hits / total + collector = _get_metric_collector() + collector.record_retrieval_hit_rate( + hits=hits, + expected=total, + evaluator_id=f"user:{req.user_id}", + detail={ + "memory_ids": req.memory_ids, + "helpful_ids": list(helpful_set), + "not_helpful_ids": list(not_helpful_set), + "context_run_id": req.context_run_id, + "query": req.query, + "action": "memory_feedback", + }, + ) + else: + hit_rate = 0.0 + + return MemoryFeedbackResponse( + user_id=req.user_id, + total_rated=total, + helpful_count=hits, + not_helpful_count=len(not_helpful_set), + hit_rate=hit_rate, + ) + + class ClearTrackMemoryResponse(BaseModel): user_id: str track_id: int @@ -433,6 +532,41 @@ def clear_track_memory( reason="clear_track_memory", ) _schedule_embedding_precompute(background_tasks, user_id=user_id, 
track_ids=[track_id]) + + # P0 Hook: Verify deletion compliance - deleted items should not be retrievable + if deleted > 0: + # Try to retrieve items from the cleared scope (should return empty) + retrieved_after_delete = _memory_store.list_memories( + user_id=user_id, + scope_type="track", + scope_id=str(track_id), + include_deleted=False, + include_pending=True, + limit=100, + ) + # Also try searching + search_results = _memory_store.search_memories( + user_id=user_id, + query="*", # broad query + scope_type="track", + scope_id=str(track_id), + limit=100, + ) + retrieved_count = len(retrieved_after_delete) + len(search_results) + + collector = _get_metric_collector() + collector.record_deletion_compliance( + deleted_retrieved_count=retrieved_count, + deleted_total_count=deleted, + evaluator_id=f"user:{user_id}", + detail={ + "track_id": track_id, + "deleted_count": deleted, + "retrieved_after_delete": retrieved_count, + "action": "clear_track_memory", + }, + ) + return ClearTrackMemoryResponse(user_id=user_id, track_id=track_id, deleted_count=deleted) diff --git a/src/paperbot/infrastructure/stores/memory_store.py b/src/paperbot/infrastructure/stores/memory_store.py index bc5e69e..5820a8f 100644 --- a/src/paperbot/infrastructure/stores/memory_store.py +++ b/src/paperbot/infrastructure/stores/memory_store.py @@ -268,6 +268,24 @@ def list_memories( rows = session.execute(stmt).scalars().all() return [self._row_to_dict(r) for r in rows] + def get_items_by_ids( + self, + *, + user_id: str, + item_ids: List[int], + ) -> List[Dict[str, Any]]: + """Get memory items by their IDs for a specific user.""" + if not item_ids: + return [] + with self._provider.session() as session: + stmt = ( + select(MemoryItemModel) + .where(MemoryItemModel.user_id == user_id) + .where(MemoryItemModel.id.in_(item_ids)) + ) + rows = session.execute(stmt).scalars().all() + return [self._row_to_dict(r) for r in rows] + def search_memories( self, *, diff --git 
a/src/paperbot/infrastructure/stores/models.py b/src/paperbot/infrastructure/stores/models.py index 646cb6f..3a1204e 100644 --- a/src/paperbot/infrastructure/stores/models.py +++ b/src/paperbot/infrastructure/stores/models.py @@ -319,6 +319,34 @@ class MemoryAuditLogModel(Base): detail_json: Mapped[str] = mapped_column(Text, default="{}") +class MemoryEvalMetricModel(Base): + """ + Evaluation metrics for memory system quality (acceptance criteria). + + Stores periodic measurements of: + - extraction_precision: % of extracted items that are correct + - false_positive_rate: % of approved items that are incorrect + - retrieval_hit_rate: % of relevant memories retrieved when needed + - injection_pollution_rate: % of responses negatively affected by wrong memory + - deletion_compliance: deleted items never retrieved (should be 1.0) + + See docs/memory_types.md for metric definitions. + """ + + __tablename__ = "memory_eval_metrics" + + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + + metric_name: Mapped[str] = mapped_column(String(64), index=True) + metric_value: Mapped[float] = mapped_column(Float) + sample_size: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + + evaluated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), index=True) + evaluator_id: Mapped[str] = mapped_column(String(64), default="system") + + detail_json: Mapped[str] = mapped_column(Text, default="{}") + + class ResearchTrackModel(Base): """ User research direction / track. diff --git a/src/paperbot/memory/eval/__init__.py b/src/paperbot/memory/eval/__init__.py new file mode 100644 index 0000000..0075104 --- /dev/null +++ b/src/paperbot/memory/eval/__init__.py @@ -0,0 +1,14 @@ +""" +Memory evaluation module for Scope and Acceptance criteria. 
+ +Provides metrics collection and evaluation for: +- extraction_precision +- false_positive_rate +- retrieval_hit_rate +- injection_pollution_rate +- deletion_compliance +""" + +from paperbot.memory.eval.collector import MemoryMetricCollector + +__all__ = ["MemoryMetricCollector"] diff --git a/src/paperbot/memory/eval/collector.py b/src/paperbot/memory/eval/collector.py new file mode 100644 index 0000000..ba1c7ae --- /dev/null +++ b/src/paperbot/memory/eval/collector.py @@ -0,0 +1,240 @@ +""" +Memory metric collector for Scope and Acceptance criteria. + +Collects and stores evaluation metrics defined in docs/memory_types.md: +- extraction_precision: >= 85% +- false_positive_rate: <= 5% +- retrieval_hit_rate: >= 80% +- injection_pollution_rate: <= 2% +- deletion_compliance: 100% +""" + +from __future__ import annotations + +import json +from datetime import datetime, timezone +from typing import Any, Dict, List, Optional + +from sqlalchemy import select, func + +from paperbot.infrastructure.stores.models import MemoryEvalMetricModel, MemoryItemModel +from paperbot.infrastructure.stores.db_provider import DbProvider + + +class MemoryMetricCollector: + """ + Collects and stores memory system evaluation metrics. 
+ + Usage: + collector = MemoryMetricCollector(db_provider) + + # Record extraction results + collector.record_extraction_precision( + correct_count=85, + total_count=100, + evaluator_id="human:reviewer1" + ) + + # Get summary + summary = collector.get_metrics_summary() + """ + + # P0 target thresholds + TARGETS = { + "extraction_precision": 0.85, + "false_positive_rate": 0.05, + "retrieval_hit_rate": 0.80, + "injection_pollution_rate": 0.02, + "deletion_compliance": 1.00, + } + + def __init__(self, db_provider: DbProvider): + self._provider = db_provider + + def record_extraction_precision( + self, + correct_count: int, + total_count: int, + evaluator_id: str = "system", + detail: Optional[Dict[str, Any]] = None, + ) -> None: + """Record extraction precision metric (correct / total extracted).""" + if total_count == 0: + return + value = correct_count / total_count + self._record_metric( + metric_name="extraction_precision", + metric_value=value, + sample_size=total_count, + evaluator_id=evaluator_id, + detail=detail, + ) + + def record_false_positive_rate( + self, + false_positive_count: int, + total_approved_count: int, + evaluator_id: str = "system", + detail: Optional[Dict[str, Any]] = None, + ) -> None: + """Record false positive rate (incorrect approved / total approved).""" + if total_approved_count == 0: + return + value = false_positive_count / total_approved_count + self._record_metric( + metric_name="false_positive_rate", + metric_value=value, + sample_size=total_approved_count, + evaluator_id=evaluator_id, + detail=detail, + ) + + def record_retrieval_hit_rate( + self, + hits: int, + expected: int, + evaluator_id: str = "system", + detail: Optional[Dict[str, Any]] = None, + ) -> None: + """Record retrieval hit rate (retrieved relevant / total relevant).""" + if expected == 0: + return + value = hits / expected + self._record_metric( + metric_name="retrieval_hit_rate", + metric_value=value, + sample_size=expected, + evaluator_id=evaluator_id, + 
detail=detail, + ) + + def record_injection_pollution_rate( + self, + polluted_count: int, + total_injections: int, + evaluator_id: str = "system", + detail: Optional[Dict[str, Any]] = None, + ) -> None: + """Record injection pollution rate (polluted responses / total with memory).""" + if total_injections == 0: + return + value = polluted_count / total_injections + self._record_metric( + metric_name="injection_pollution_rate", + metric_value=value, + sample_size=total_injections, + evaluator_id=evaluator_id, + detail=detail, + ) + + def record_deletion_compliance( + self, + deleted_retrieved_count: int, + deleted_total_count: int, + evaluator_id: str = "system", + detail: Optional[Dict[str, Any]] = None, + ) -> None: + """ + Record deletion compliance (should be 1.0 = 100% compliant). + + Compliance = 1.0 - (deleted_retrieved / deleted_total) + If any deleted item was retrieved, compliance < 1.0. + """ + if deleted_total_count == 0: + return + value = 1.0 - (deleted_retrieved_count / deleted_total_count) + self._record_metric( + metric_name="deletion_compliance", + metric_value=value, + sample_size=deleted_total_count, + evaluator_id=evaluator_id, + detail=detail, + ) + + def _record_metric( + self, + metric_name: str, + metric_value: float, + sample_size: int, + evaluator_id: str, + detail: Optional[Dict[str, Any]], + ) -> None: + """Store a metric record in the database.""" + now = datetime.now(timezone.utc) + with self._provider.session() as session: + row = MemoryEvalMetricModel( + metric_name=metric_name, + metric_value=metric_value, + sample_size=sample_size, + evaluated_at=now, + evaluator_id=evaluator_id, + detail_json=json.dumps(detail or {}, ensure_ascii=False), + ) + session.add(row) + session.commit() + + def get_latest_metrics(self) -> Dict[str, Dict[str, Any]]: + """Get the most recent value for each metric type.""" + result = {} + with self._provider.session() as session: + for metric_name in self.TARGETS.keys(): + stmt = ( + 
select(MemoryEvalMetricModel) + .where(MemoryEvalMetricModel.metric_name == metric_name) + .order_by(MemoryEvalMetricModel.evaluated_at.desc()) + .limit(1) + ) + row = session.execute(stmt).scalar_one_or_none() + if row: + result[metric_name] = { + "value": row.metric_value, + "target": self.TARGETS[metric_name], + "meets_target": self._meets_target(metric_name, row.metric_value), + "sample_size": row.sample_size, + "evaluated_at": row.evaluated_at.isoformat() if row.evaluated_at else None, + "evaluator_id": row.evaluator_id, + } + return result + + def get_metrics_summary(self) -> Dict[str, Any]: + """Get summary of all metrics with pass/fail status.""" + latest = self.get_latest_metrics() + all_pass = all(m.get("meets_target", False) for m in latest.values()) + return { + "status": "pass" if all_pass else "fail", + "metrics": latest, + "targets": self.TARGETS, + } + + def _meets_target(self, metric_name: str, value: float) -> bool: + """Check if a metric value meets its target.""" + target = self.TARGETS.get(metric_name) + if target is None: + return True + # For rates that should be LOW (false_positive, pollution), lower is better + if metric_name in ("false_positive_rate", "injection_pollution_rate"): + return value <= target + # For rates that should be HIGH (precision, hit_rate, compliance), higher is better + return value >= target + + def get_metric_history( + self, metric_name: str, limit: int = 30 + ) -> List[Dict[str, Any]]: + """Get historical values for a specific metric.""" + result = [] + with self._provider.session() as session: + stmt = ( + select(MemoryEvalMetricModel) + .where(MemoryEvalMetricModel.metric_name == metric_name) + .order_by(MemoryEvalMetricModel.evaluated_at.desc()) + .limit(limit) + ) + rows = session.execute(stmt).scalars().all() + for row in rows: + result.append({ + "value": row.metric_value, + "sample_size": row.sample_size, + "evaluated_at": row.evaluated_at.isoformat() if row.evaluated_at else None, + "evaluator_id": 
row.evaluator_id, + }) + return result From 78ed180d08a3a89d4f40f8a9244fdeab6e979480 Mon Sep 17 00:00:00 2001 From: boyu Date: Sat, 3 Jan 2026 23:30:28 +0100 Subject: [PATCH 2/9] fix(memory): use SessionProvider instead of DbProvider in collector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix import error: DbProvider doesn't exist, use SessionProvider from sqlalchemy_db. 🤖 Generated with [LIU BOYU] --- src/paperbot/memory/eval/collector.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/paperbot/memory/eval/collector.py b/src/paperbot/memory/eval/collector.py index ba1c7ae..e747f15 100644 --- a/src/paperbot/memory/eval/collector.py +++ b/src/paperbot/memory/eval/collector.py @@ -17,8 +17,8 @@ from sqlalchemy import select, func -from paperbot.infrastructure.stores.models import MemoryEvalMetricModel, MemoryItemModel -from paperbot.infrastructure.stores.db_provider import DbProvider +from paperbot.infrastructure.stores.models import Base, MemoryEvalMetricModel +from paperbot.infrastructure.stores.sqlalchemy_db import SessionProvider, get_db_url class MemoryMetricCollector: @@ -48,8 +48,10 @@ class MemoryMetricCollector: "deletion_compliance": 1.00, } - def __init__(self, db_provider: DbProvider): - self._provider = db_provider + def __init__(self, db_url: Optional[str] = None): + self._provider = SessionProvider(db_url or get_db_url()) + # Ensure table exists + Base.metadata.create_all(self._provider.engine) def record_extraction_precision( self, From 978fdd6c761b86f8125fa152b526508047d9cf33 Mon Sep 17 00:00:00 2001 From: boyu Date: Sun, 4 Jan 2026 09:03:10 +0100 Subject: [PATCH 3/9] fix(ci): add python-multipart and fix collector imports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add python-multipart to requirements.txt and requirements-ci.txt (required for FastAPI file uploads in /memory/ingest) 🤖 Generated with [LIU BOYU] --- 
requirements-ci.txt | 1 + requirements.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements-ci.txt b/requirements-ci.txt index 84399ba..5c91441 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -2,6 +2,7 @@ # Keep this smaller than requirements.txt to avoid heavy system deps (e.g. weasyprint). fastapi==0.115.0 +python-multipart>=0.0.6 starlette>=0.37.2,<0.39.0 sse-starlette>=2.1.0 httpx[http2]>=0.27.0,<0.28.0 diff --git a/requirements.txt b/requirements.txt index 4e54a17..f3036df 100644 --- a/requirements.txt +++ b/requirements.txt @@ -62,6 +62,7 @@ json-repair>=0.22.0 # FastAPI backend for CLI/Web fastapi==0.115.0 +python-multipart>=0.0.6 # Required for file uploads # Pin Starlette for Python 3.9 compatible TestClient (avoid metaclass conflicts in newer Starlette) starlette>=0.37.2,<0.39.0 uvicorn[standard]>=0.32.0 From c3bb7e3f2d2fa36a8e72ea1fbcd846447bf715e1 Mon Sep 17 00:00:00 2001 From: boyu Date: Sun, 4 Jan 2026 09:09:20 +0100 Subject: [PATCH 4/9] fix: remove SessionProvider arg from MemoryMetricCollector calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - memory.py: use MemoryMetricCollector() without args - collector.py: update docstring to show correct usage 🤖 Generated with [LIU BOYU] --- src/paperbot/api/routes/memory.py | 2 +- src/paperbot/memory/eval/collector.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/paperbot/api/routes/memory.py b/src/paperbot/api/routes/memory.py index c2a862c..670a169 100644 --- a/src/paperbot/api/routes/memory.py +++ b/src/paperbot/api/routes/memory.py @@ -267,7 +267,7 @@ def delete_memory_item( # --- Metrics Endpoints (Scope and Acceptance Criteria) --- -_metric_collector = MemoryMetricCollector(_store._provider) +_metric_collector = MemoryMetricCollector() class MetricsSummaryResponse(BaseModel): diff --git a/src/paperbot/memory/eval/collector.py b/src/paperbot/memory/eval/collector.py index e747f15..120cd12 100644 --- 
a/src/paperbot/memory/eval/collector.py +++ b/src/paperbot/memory/eval/collector.py @@ -26,7 +26,8 @@ class MemoryMetricCollector: Collects and stores memory system evaluation metrics. Usage: - collector = MemoryMetricCollector(db_provider) + collector = MemoryMetricCollector() # Uses default db_url + # Or: collector = MemoryMetricCollector(db_url="sqlite:///custom.db") # Record extraction results collector.record_extraction_precision( From a6e2ad3bc90e522f3a12e94991277ae38551d117 Mon Sep 17 00:00:00 2001 From: boyu Date: Sun, 4 Jan 2026 09:19:29 +0100 Subject: [PATCH 5/9] fix(ci): add gitpython to requirements-ci.txt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [LIU BOYU] --- requirements-ci.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/requirements-ci.txt b/requirements-ci.txt index 5c91441..d0ff9c6 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -29,3 +29,7 @@ redis>=5.0.0 openai>=1.0.0 anthropic>=0.3.0 +# 代码分析(可选) +gitpython>=3.1.0 + + From eaec917f35d9359a1a2c7f6f7054520d9ee6b5e9 Mon Sep 17 00:00:00 2001 From: boyu Date: Sun, 4 Jan 2026 09:24:39 +0100 Subject: [PATCH 6/9] fix(ci): add requests to requirements-ci.txt Generated with [LIU BOYU] --- requirements-ci.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-ci.txt b/requirements-ci.txt index d0ff9c6..5f4d738 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -31,5 +31,6 @@ anthropic>=0.3.0 # 代码分析(可选) gitpython>=3.1.0 +requests>=2.28.0 From 6d118c6a6f711ff387a0ad7886d05c9ffbad2268 Mon Sep 17 00:00:00 2001 From: boyu Date: Sun, 4 Jan 2026 09:29:48 +0100 Subject: [PATCH 7/9] fix(ci): add jinja2 to requirements-ci.txt Generated with [LIU BOYU] --- requirements-ci.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-ci.txt b/requirements-ci.txt index 5f4d738..02fc402 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -29,8 +29,8 @@ redis>=5.0.0 
openai>=1.0.0 anthropic>=0.3.0 -# 代码分析(可选) + gitpython>=3.1.0 requests>=2.28.0 - +jinja2>=3.1.0 From 995c0ce400d077a2feef6838e0151fd2ef6b3e94 Mon Sep 17 00:00:00 2001 From: boyu Date: Sun, 4 Jan 2026 12:13:12 +0100 Subject: [PATCH 8/9] fix(ci): add missing dependencies to requirements-ci.txt Add: markdown, aiofiles, python-dotenv, json-repair, uvicorn Generated with [LIU BOYU] --- requirements-ci.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/requirements-ci.txt b/requirements-ci.txt index 02fc402..bd6e363 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -33,4 +33,9 @@ anthropic>=0.3.0 gitpython>=3.1.0 requests>=2.28.0 jinja2>=3.1.0 +markdown>=3.4.0 +aiofiles>=22.1.0 +python-dotenv>=0.19.0 +json-repair>=0.22.0 +uvicorn>=0.32.0 From 982cb192c0aee6afad168e6e2a3e130e01cd8ac3 Mon Sep 17 00:00:00 2001 From: boyu Date: Sun, 4 Jan 2026 16:41:57 +0100 Subject: [PATCH 9/9] test(memory): add unit and integration tests for acceptance criteria Unit tests (tests/unit/test_memory_metric_collector.py): - Test all 5 metric recording methods - Test pass/fail target validation - Test metrics summary and history - Test edge cases (zero totals) Unit tests (tests/unit/test_memory_module.py): - Add test for get_items_by_ids method Integration tests (tests/integration/test_scope_and_acceptance_criteria_hooks.py): - Test memory feedback records retrieval_hit_rate - Test bulk_moderate rejection records false_positive_rate - Test clear_track_memory records deletion_compliance - Test /memory/metrics endpoints Generated with [LIU BOYU] --- ...est_scope_and_acceptance_criteria_hooks.py | 186 +++++++++++++++++ tests/unit/test_memory_metric_collector.py | 189 ++++++++++++++++++ tests/unit/test_memory_module.py | 34 ++++ 3 files changed, 409 insertions(+) create mode 100644 tests/integration/test_scope_and_acceptance_criteria_hooks.py create mode 100644 tests/unit/test_memory_metric_collector.py diff --git a/tests/integration/test_scope_and_acceptance_criteria_hooks.py 
b/tests/integration/test_scope_and_acceptance_criteria_hooks.py new file mode 100644 index 0000000..aba6f09 --- /dev/null +++ b/tests/integration/test_scope_and_acceptance_criteria_hooks.py @@ -0,0 +1,186 @@ +""" +Integration tests for Scope and Acceptance Criteria +memory hooks in API routes. + +Tests the Scope and Acceptance criteria hooks: +- false_positive_rate: recorded when user rejects high-confidence items +- deletion_compliance: recorded when clearing track memory +- retrieval_hit_rate: recorded when user provides memory feedback +""" +from __future__ import annotations + +import pytest +from fastapi.testclient import TestClient + + +@pytest.fixture +def test_client(tmp_path, monkeypatch): + """Create test client with isolated database.""" + monkeypatch.setenv("PAPERBOT_DB_URL", f"sqlite:///{tmp_path / 'test.db'}") + + from paperbot.api import main as api_main + + with TestClient(api_main.app) as client: + yield client + + +def test_memory_feedback_records_hit_rate(test_client): + """Test that memory feedback endpoint records retrieval_hit_rate metric.""" + # Record memory feedback + response = test_client.post( + "/api/research/memory/feedback", + json={ + "user_id": "test_user", + "memory_ids": [1, 2, 3, 4, 5], + "helpful_ids": [1, 2, 3, 4], + "not_helpful_ids": [5], + "query": "test query", + }, + ) + assert response.status_code == 200 + data = response.json() + assert data["total_rated"] == 5 + assert data["helpful_count"] == 4 + assert data["hit_rate"] == 0.8 + + # Verify metric was recorded + metrics_response = test_client.get("/api/memory/metrics") + assert metrics_response.status_code == 200 + metrics = metrics_response.json() + assert "retrieval_hit_rate" in metrics["metrics"] + assert metrics["metrics"]["retrieval_hit_rate"]["value"] == 0.8 + + +def test_bulk_moderate_reject_records_false_positive(test_client): + """Test that rejecting high-confidence items records false_positive_rate.""" + # First create a track + track_response = 
test_client.post( + "/api/research/tracks", + json={"user_id": "test_user", "name": "Test Track", "activate": True}, + ) + assert track_response.status_code == 200 + track_id = track_response.json()["track"]["id"] + + # Create a high-confidence approved memory + mem_response = test_client.post( + "/api/research/memory/items", + json={ + "user_id": "test_user", + "kind": "preference", + "content": "Test preference", + "confidence": 0.85, + "scope_type": "track", + "scope_id": str(track_id), + "status": "approved", + }, + ) + assert mem_response.status_code == 200 + item_id = mem_response.json()["item"]["id"] + + # Reject the high-confidence item (this should record false_positive_rate) + reject_response = test_client.post( + "/api/research/memory/bulk_moderate", + json={ + "user_id": "test_user", + "item_ids": [item_id], + "status": "rejected", + }, + ) + assert reject_response.status_code == 200 + + # Verify false_positive_rate metric was recorded + metrics_response = test_client.get("/api/memory/metrics") + assert metrics_response.status_code == 200 + metrics = metrics_response.json() + assert "false_positive_rate" in metrics["metrics"] + + +def test_clear_track_memory_records_deletion_compliance(test_client): + """Test that clearing track memory records deletion_compliance metric.""" + # Create a track + track_response = test_client.post( + "/api/research/tracks", + json={"user_id": "test_user", "name": "Delete Test Track", "activate": True}, + ) + assert track_response.status_code == 200 + track_id = track_response.json()["track"]["id"] + + # Create some memories in the track + for i in range(3): + test_client.post( + "/api/research/memory/items", + json={ + "user_id": "test_user", + "kind": "fact", + "content": f"Test fact {i}", + "confidence": 0.7, + "scope_type": "track", + "scope_id": str(track_id), + }, + ) + + # Clear track memory (triggers deletion_compliance check) + clear_response = test_client.post( + f"/api/research/tracks/{track_id}/memory/clear", + 
params={"user_id": "test_user", "confirm": True}, + ) + assert clear_response.status_code == 200 + assert clear_response.json()["deleted_count"] == 3 + + # Verify deletion_compliance metric was recorded + metrics_response = test_client.get("/api/memory/metrics") + assert metrics_response.status_code == 200 + metrics = metrics_response.json() + assert "deletion_compliance" in metrics["metrics"] + # Should be 1.0 (100% compliant) since no deleted items should be retrievable + assert metrics["metrics"]["deletion_compliance"]["value"] == 1.0 + + +def test_metrics_endpoints(test_client): + """Test the /memory/metrics endpoints.""" + # Get initial metrics (should be empty) + response = test_client.get("/api/memory/metrics") + assert response.status_code == 200 + data = response.json() + assert "status" in data + assert "metrics" in data + assert "targets" in data + + # Record a metric + test_client.post( + "/api/research/memory/feedback", + json={ + "user_id": "test_user", + "memory_ids": [1, 2], + "helpful_ids": [1, 2], + "not_helpful_ids": [], + }, + ) + + # Get metric history + history_response = test_client.get("/api/memory/metrics/retrieval_hit_rate") + assert history_response.status_code == 200 + data = history_response.json() + assert "history" in data + assert len(data["history"]) >= 1 + assert data["history"][0]["value"] == 1.0 # 100% hit rate + + +def test_metrics_summary_pass_fail(test_client): + """Test that metrics summary correctly reports pass/fail status.""" + # Record a metric that meets target + test_client.post( + "/api/research/memory/feedback", + json={ + "user_id": "test_user", + "memory_ids": [1, 2, 3, 4, 5], + "helpful_ids": [1, 2, 3, 4, 5], # 100% hit rate > 80% target + "not_helpful_ids": [], + }, + ) + + response = test_client.get("/api/memory/metrics") + data = response.json() + + # With only one metric recorded that meets target, status should be pass + assert data["metrics"]["retrieval_hit_rate"]["meets_target"] is True diff --git 
a/tests/unit/test_memory_metric_collector.py b/tests/unit/test_memory_metric_collector.py new file mode 100644 index 0000000..a8554cb --- /dev/null +++ b/tests/unit/test_memory_metric_collector.py @@ -0,0 +1,189 @@ +""" +Unit tests for MemoryMetricCollector (P0 Acceptance Criteria). +""" +from __future__ import annotations + +import pytest + +from paperbot.memory.eval.collector import MemoryMetricCollector + + +def test_collector_record_extraction_precision(tmp_path): + """Test recording extraction precision metric.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + collector.record_extraction_precision( + correct_count=85, + total_count=100, + evaluator_id="test:unit", + detail={"test": True}, + ) + + latest = collector.get_latest_metrics() + assert "extraction_precision" in latest + assert latest["extraction_precision"]["value"] == 0.85 + assert latest["extraction_precision"]["meets_target"] is True # target is 0.85 + + +def test_collector_record_false_positive_rate(tmp_path): + """Test recording false positive rate metric.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + collector.record_false_positive_rate( + false_positive_count=3, + total_approved_count=100, + evaluator_id="test:unit", + ) + + latest = collector.get_latest_metrics() + assert "false_positive_rate" in latest + assert latest["false_positive_rate"]["value"] == 0.03 + assert latest["false_positive_rate"]["meets_target"] is True # target is <= 0.05 + + +def test_collector_record_false_positive_rate_fails_target(tmp_path): + """Test false positive rate exceeding target.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + collector.record_false_positive_rate( + false_positive_count=10, + total_approved_count=100, + evaluator_id="test:unit", + ) + + latest = collector.get_latest_metrics() + assert 
latest["false_positive_rate"]["value"] == 0.10 + assert latest["false_positive_rate"]["meets_target"] is False # 10% > 5% + + +def test_collector_record_retrieval_hit_rate(tmp_path): + """Test recording retrieval hit rate metric.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + collector.record_retrieval_hit_rate( + hits=8, + expected=10, + evaluator_id="test:unit", + ) + + latest = collector.get_latest_metrics() + assert "retrieval_hit_rate" in latest + assert latest["retrieval_hit_rate"]["value"] == 0.80 + assert latest["retrieval_hit_rate"]["meets_target"] is True # target is >= 0.80 + + +def test_collector_record_injection_pollution_rate(tmp_path): + """Test recording injection pollution rate metric.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + collector.record_injection_pollution_rate( + polluted_count=1, + total_injections=100, + evaluator_id="test:unit", + ) + + latest = collector.get_latest_metrics() + assert "injection_pollution_rate" in latest + assert latest["injection_pollution_rate"]["value"] == 0.01 + assert latest["injection_pollution_rate"]["meets_target"] is True # target is <= 0.02 + + +def test_collector_record_deletion_compliance(tmp_path): + """Test recording deletion compliance metric.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + # Perfect compliance: no deleted items retrieved + collector.record_deletion_compliance( + deleted_retrieved_count=0, + deleted_total_count=10, + evaluator_id="test:unit", + ) + + latest = collector.get_latest_metrics() + assert "deletion_compliance" in latest + assert latest["deletion_compliance"]["value"] == 1.0 + assert latest["deletion_compliance"]["meets_target"] is True + + +def test_collector_record_deletion_compliance_fails(tmp_path): + """Test deletion compliance failure when deleted items are retrieved.""" + db_url = 
f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + # Bad compliance: 2 deleted items were retrieved + collector.record_deletion_compliance( + deleted_retrieved_count=2, + deleted_total_count=10, + evaluator_id="test:unit", + ) + + latest = collector.get_latest_metrics() + assert latest["deletion_compliance"]["value"] == 0.8 # 1 - 2/10 + assert latest["deletion_compliance"]["meets_target"] is False + + +def test_collector_get_metrics_summary(tmp_path): + """Test getting metrics summary with pass/fail status.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + # Record all metrics meeting targets + collector.record_extraction_precision(correct_count=90, total_count=100) + collector.record_false_positive_rate(false_positive_count=2, total_approved_count=100) + collector.record_retrieval_hit_rate(hits=85, expected=100) + collector.record_injection_pollution_rate(polluted_count=1, total_injections=100) + collector.record_deletion_compliance(deleted_retrieved_count=0, deleted_total_count=10) + + summary = collector.get_metrics_summary() + assert summary["status"] == "pass" + assert len(summary["metrics"]) == 5 + + +def test_collector_get_metrics_summary_fails(tmp_path): + """Test metrics summary fails when any metric doesn't meet target.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + # One metric fails + collector.record_extraction_precision(correct_count=50, total_count=100) # 50% < 85% + + summary = collector.get_metrics_summary() + assert summary["status"] == "fail" + + +def test_collector_get_metric_history(tmp_path): + """Test getting metric history.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + # Record multiple values + collector.record_extraction_precision(correct_count=80, total_count=100) + 
collector.record_extraction_precision(correct_count=85, total_count=100) + collector.record_extraction_precision(correct_count=90, total_count=100) + + history = collector.get_metric_history("extraction_precision", limit=10) + assert len(history) == 3 + # Most recent first + assert history[0]["value"] == 0.90 + assert history[1]["value"] == 0.85 + assert history[2]["value"] == 0.80 + + +def test_collector_skips_zero_totals(tmp_path): + """Test that collector skips recording when total is zero.""" + db_url = f"sqlite:///{tmp_path / 'metrics.db'}" + collector = MemoryMetricCollector(db_url=db_url) + + collector.record_extraction_precision(correct_count=0, total_count=0) + collector.record_false_positive_rate(false_positive_count=0, total_approved_count=0) + collector.record_retrieval_hit_rate(hits=0, expected=0) + collector.record_deletion_compliance(deleted_retrieved_count=0, deleted_total_count=0) + + latest = collector.get_latest_metrics() + assert len(latest) == 0 # No metrics recorded diff --git a/tests/unit/test_memory_module.py b/tests/unit/test_memory_module.py index 8f204d9..0e64dee 100644 --- a/tests/unit/test_memory_module.py +++ b/tests/unit/test_memory_module.py @@ -125,3 +125,37 @@ def test_memory_store_soft_delete_and_status_filter(tmp_path): assert any("太长" in i["content"] for i in store.search_memories(user_id="u1", query="太长", limit=10)) store.soft_delete_item(user_id="u1", item_id=int(pref.id), reason="user request") assert not any("太长" in i["content"] for i in store.search_memories(user_id="u1", query="太长", limit=10)) + + +def test_memory_store_get_items_by_ids(tmp_path): + """Test get_items_by_ids method retrieves correct items.""" + from paperbot.memory.schema import MemoryCandidate + + db_url = f"sqlite:///{tmp_path / 'mem_ids.db'}" + store = SqlAlchemyMemoryStore(db_url=db_url, auto_create_schema=True) + + # Create test memories + candidates = [ + MemoryCandidate(kind="preference", content="Prefers Python", confidence=0.8), + 
MemoryCandidate(kind="goal", content="Learn Rust", confidence=0.7), + MemoryCandidate(kind="fact", content="Uses VSCode", confidence=0.6), + ] + created, _, rows = store.add_memories(user_id="u1", memories=candidates) + assert created == 3 + + # Get IDs from list_memories since rows are detached + all_items = store.list_memories(user_id="u1", limit=10) + item_ids = [item["id"] for item in all_items] + assert len(item_ids) == 3 + + # Test get_items_by_ids + retrieved = store.get_items_by_ids(user_id="u1", item_ids=item_ids[:2]) + assert len(retrieved) == 2 + + # Test with empty list + empty = store.get_items_by_ids(user_id="u1", item_ids=[]) + assert len(empty) == 0 + + # Test user isolation - u2 can't see u1's items + other_user = store.get_items_by_ids(user_id="u2", item_ids=item_ids) + assert len(other_user) == 0