diff --git a/README.md b/README.md index 5799689..2af9527 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ node dist/cli/index.js --help - `linkledger save [--note "..."] [--tags a,b] [--json]` - `linkledger annotate --highlight|--lowlight|--note "..." [--actor human|agent:name] [--confidence 0.0-1.0] [--pin] [--json]` - `linkledger tag --add a,b [--actor ...] [--json]` -- `linkledger find [--tags ...] [--type article|x|youtube|pdf|bluesky|linkedin] [--since YYYY-MM-DD] [--limit N] [--json]` +- `linkledger find [--tags ...] [--type article|x|youtube|pdf|bluesky|linkedin|reddit] [--since YYYY-MM-DD] [--limit N] [--json]` - `linkledger brief [--max-items N] [--expand-chunks] [--json]` - `linkledger related [--max-items N] [--json]` - `linkledger status [--json]` @@ -43,6 +43,7 @@ node dist/cli/index.js --help - `pdf`: text-native PDF extraction via content stream parsing. - `bluesky`: oEmbed + page metadata extraction with article fallback. - `linkedin`: page metadata extraction with LinkedIn-specific parsing. +- `reddit`: Reddit listing API extraction for post + top comments with article fallback. Retryable adapter failures are requeued with exponential backoff in `worker`. @@ -71,3 +72,12 @@ An agent-usage skill is included at: - Benchmarking: `docs/benchmarking.md` - Agent brief workflow: `docs/agent-brief-workflow.md` - Human validation playbook: `docs/human-test-plan.md` + +## Maintenance + +- Backfill legacy Reddit items previously saved as `article`: + +```bash +npm run backfill:reddit -- --dry-run +npm run backfill:reddit +``` diff --git a/TEST_PLAN.md b/TEST_PLAN.md index cfbe165..e73c5a5 100644 --- a/TEST_PLAN.md +++ b/TEST_PLAN.md @@ -36,6 +36,7 @@ - Keyword match in annotation text. - Combined filters (`--tags`, `--type`, `--since`). - Stable sort when scores tie. +- Reddit-specific source filtering (`--type reddit`) against post/comment text. 3. `annotate` - Reject missing confidence for agent actor. 
@@ -68,6 +69,10 @@ - text-native PDF - image-heavy/low-text PDF +- Reddit fixtures: + - post with self text + top comments + - fallback behavior when listing API fails + ## 5. Performance Tests ### Dataset sizes - 1,000 items @@ -105,3 +110,4 @@ 2. Add human and agent highlights; verify cap and confidence rules. 3. Run topic `brief` and verify evidence quality manually. 4. Re-run after 30+ day staleness simulation and verify revalidation behavior. +5. Validate Reddit URL canonicalization (`redd.it`, `old.reddit.com`) and backfill dry run. diff --git a/docs/human-test-plan.md b/docs/human-test-plan.md index 18bbe6e..4fad946 100644 --- a/docs/human-test-plan.md +++ b/docs/human-test-plan.md @@ -47,7 +47,7 @@ unset LINKLEDGER_DB_PATH ## 4. Phase A - Smoke Test (15-30 minutes) -Use 6-10 links across source types: article, X, YouTube, PDF, Bluesky, LinkedIn. +Use 8-12 links across source types: article, X, YouTube, PDF, Bluesky, LinkedIn, Reddit. ### A1. Save and ingest @@ -59,6 +59,7 @@ node --import tsx src/cli/index.ts worker --limit 20 --max-attempts 3 --base-bac Expected: - `save`: `ok=true`, item created or deduped. - `worker`: mostly `succeeded`; occasional `requeued` allowed. +- Reddit URLs from `redd.it` and `old.reddit.com` should canonicalize to `https://www.reddit.com/comments/`. ### A2. Check status and enrichment @@ -81,6 +82,35 @@ Expected: - `find` returns relevant results with `snippet` and `why_ranked` fields. - `brief` returns high-signal items with `summary`, `key_claims`, highlights/lowlights/notes. +### A4. Reddit-specific validation (real links) + +Save at least 3 real Reddit links: +1. `redd.it/` short link +2. `old.reddit.com/r//comments//...` +3. 
Standard `www.reddit.com/r//comments//...` + +For each: + +```bash +node --import tsx src/cli/index.ts save "" --tags reddit,smoke --json +node --import tsx src/cli/index.ts worker --limit 20 --max-attempts 3 --base-backoff-ms 2000 --json +node --import tsx src/cli/index.ts find "top comment text from the post" --type reddit --limit 5 --json +``` + +Expected: +- `save` output item `source_type` is `reddit`. +- `item.canonical_url` is normalized to `https://www.reddit.com/comments/`. +- `find --type reddit` returns the saved item when searching for post body or top-comment phrases. + +Optional backfill check for existing DBs with older data: + +```bash +npm run backfill:reddit -- --dry-run +``` + +Expected: +- Reports how many legacy `article` rows can be reclassified to `reddit`. + ## 5. Phase B - Real Workflow Validation (1-2 weeks) Use the tool in normal content production. diff --git a/package.json b/package.json index 036e48b..b9c440f 100644 --- a/package.json +++ b/package.json @@ -9,6 +9,7 @@ "scripts": { "build": "tsc -p tsconfig.json", "bench:seed": "node --import tsx scripts/seed-benchmark-data.ts", + "backfill:reddit": "node --import tsx scripts/backfill-reddit-source-type.ts", "bench:find": "node --import tsx scripts/bench-find.ts", "bench:brief": "node --import tsx scripts/bench-brief.ts", "bench": "npm run bench:find && npm run bench:brief", diff --git a/scripts/backfill-reddit-source-type.ts b/scripts/backfill-reddit-source-type.ts new file mode 100644 index 0000000..fd8123a --- /dev/null +++ b/scripts/backfill-reddit-source-type.ts @@ -0,0 +1,13 @@ +import { createServiceContext } from '../src/services/context.js'; +import { RedditBackfillService } from '../src/services/reddit-backfill-service.js'; + +const dryRun = process.argv.includes('--dry-run'); +const context = createServiceContext(); + +try { + const service = new RedditBackfillService(context); + const result = service.execute({ dryRun }); + process.stdout.write(`${JSON.stringify(result, 
null, 2)}\n`); +} finally { + context.db.close(); +} diff --git a/src/adapters/reddit-adapter.ts b/src/adapters/reddit-adapter.ts new file mode 100644 index 0000000..1f213cb --- /dev/null +++ b/src/adapters/reddit-adapter.ts @@ -0,0 +1,178 @@ +import { createHash } from 'node:crypto'; +import { AppError } from '../lib/errors.js'; +import { extractRedditPostId, isRedditHost } from '../lib/reddit.js'; +import { detectSourceType } from '../lib/url.js'; +import type { SourceType } from '../lib/types.js'; +import type { AdapterParseResult, SourceAdapter } from './source-adapter.js'; + +const MAX_POST_CHARS = 1800; +const MAX_COMMENT_CHARS = 900; +const MAX_COMMENTS = 5; + +interface RedditListingChild { + kind?: string; + data?: Record<string, unknown>; +} + +const toText = (value: unknown): string => { + if (typeof value !== 'string') { + return ''; + } + + return value + .replace(/\r\n/g, '\n') + .replace(/\r/g, '\n') + .replace(/\n{3,}/g, '\n\n') + .trim(); +}; + +const compact = (value: string): string => value.replace(/[ \t]+/g, ' ').replace(/\n{3,}/g, '\n\n').trim(); + +const truncate = (value: string, maxChars: number): string => { + if (value.length <= maxChars) { + return value; + } + + return `${value.slice(0, Math.max(0, maxChars - 3)).trimEnd()}...`; +}; + +const toTokenCount = (value: string): number => value.split(/\s+/).filter(Boolean).length; + +const asListingChildren = (payload: unknown, index: number): RedditListingChild[] => { + if (!Array.isArray(payload)) { + return []; + } + + const entry = payload[index] as { data?: { children?: RedditListingChild[] } } | undefined; + if (!entry?.data?.children || !Array.isArray(entry.data.children)) { + return []; + } + + return entry.data.children; +}; + +const toPublishedAt = (value: unknown): string | undefined => { + if (typeof value !== 'number' || !Number.isFinite(value)) { + return undefined; + } + + return new Date(value * 1000).toISOString(); +}; + +const isRedditPageUrl = (value: string): boolean => { + try { + const
parsed = new URL(value); + return isRedditHost(parsed.hostname.toLowerCase()); + } catch { + return false; + } +}; + +export class RedditAdapter implements SourceAdapter { + supports(url: string): boolean { + return this.detectType(url) === 'reddit'; + } + + detectType(url: string): SourceType { + return detectSourceType(url); + } + + async fetchAndParse(input: { url: string }): Promise<AdapterParseResult> { + const parsedUrl = new URL(input.url); + const postId = extractRedditPostId(parsedUrl.pathname); + if (!postId) { + throw new AppError('PARSE_FAILED', `Could not extract Reddit post id from ${input.url}`, false); + } + + const listingUrl = `https://www.reddit.com/comments/${encodeURIComponent(postId)}.json?raw_json=1&sort=top&limit=8`; + const response = await fetch(listingUrl, { + headers: { 'user-agent': 'linkledger-cli/0.1.0' } + }); + + if (!response.ok) { + throw new AppError( + 'FETCH_FAILED', + `Reddit listing fetch failed (${response.status}) for ${input.url}`, + response.status >= 500 || response.status === 429 + ); + } + + const payload = (await response.json()) as unknown; + const postChild = asListingChildren(payload, 0).find((entry) => entry.kind === 't3'); + const postData = postChild?.data; + if (!postData) { + throw new AppError('PARSE_FAILED', 'Reddit listing did not include post payload', false); + } + + const title = compact(toText(postData.title)); + const author = compact(toText(postData.author)) || undefined; + const subreddit = compact(toText(postData.subreddit)); + const selfText = compact(toText(postData.selftext)); + const linkedUrl = compact(toText(postData.url)); + + const postParts = [ + title ? `Title: ${title}` : '', + subreddit ? `Subreddit: r/${subreddit}` : '', + author ? `Author: u/${author}` : '', + selfText ? `Body:\n${selfText}` : '', + linkedUrl && !isRedditPageUrl(linkedUrl) ?
`Linked URL: ${linkedUrl}` : '' + ].filter(Boolean); + + const chunks: Array<{ text: string; tokenCount: number }> = []; + if (postParts.length > 0) { + const postChunk = truncate(postParts.join('\n\n'), MAX_POST_CHARS); + chunks.push({ + text: postChunk, + tokenCount: toTokenCount(postChunk) + }); + } + + const commentChildren = asListingChildren(payload, 1); + let commentCount = 0; + for (const comment of commentChildren) { + if (comment.kind !== 't1') { + continue; + } + + const data = comment.data ?? {}; + const body = compact(toText(data.body)); + if (!body) { + continue; + } + + const commentAuthor = compact(toText(data.author)) || 'unknown'; + const scoreValue = typeof data.score === 'number' && Number.isFinite(data.score) ? data.score : null; + const scoreLabel = scoreValue === null ? '' : ` (score ${scoreValue})`; + + commentCount += 1; + const commentText = truncate( + `Top comment ${commentCount} by u/${commentAuthor}${scoreLabel}:\n${body}`, + MAX_COMMENT_CHARS + ); + chunks.push({ + text: commentText, + tokenCount: toTokenCount(commentText) + }); + + if (commentCount >= MAX_COMMENTS) { + break; + } + } + + if (chunks.length === 0) { + throw new AppError('PARSE_FAILED', 'No Reddit text could be extracted', false); + } + + const fallbackTitle = `Reddit post ${postId}`; + return { + metadata: { + title: title || fallbackTitle, + author, + publishedAt: toPublishedAt(postData.created_utc) + }, + chunks, + checksum: createHash('sha256').update(chunks.map((chunk) => chunk.text).join('\n')).digest('hex'), + fetchedAt: new Date().toISOString() + }; + } +} diff --git a/src/cli/index.ts b/src/cli/index.ts index ee6f039..235d941 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -192,7 +192,7 @@ program .description('Find items by query with optional filters') .argument('<query>', 'Search query') .option('--tags <tags>', 'Filter by tags') - .option('--type <type>', 'Filter by source type (article|x|youtube|pdf|bluesky|linkedin)') + .option('--type <type>', 'Filter by source type
(article|x|youtube|pdf|bluesky|linkedin|reddit)') .option('--since <date>', 'Filter by creation date (inclusive)') .option('--limit <n>', 'Result limit', (value) => Number.parseInt(value, 10), 20) .option('--json', 'Output machine-readable JSON envelope') diff --git a/src/lib/reddit.ts b/src/lib/reddit.ts new file mode 100644 index 0000000..f9cd348 --- /dev/null +++ b/src/lib/reddit.ts @@ -0,0 +1,37 @@ +const REDDIT_HOST_SUFFIX = '.reddit.com'; +const REDDIT_SHORT_HOST_SUFFIX = '.redd.it'; + +export const isRedditHost = (host: string): boolean => { + const normalized = host.toLowerCase(); + return ( + normalized === 'reddit.com' || + normalized.endsWith(REDDIT_HOST_SUFFIX) || + normalized === 'redd.it' || + normalized.endsWith(REDDIT_SHORT_HOST_SUFFIX) + ); +}; + +export const extractRedditPostId = (pathname: string): string | undefined => { + const parts = pathname + .split('/') + .map((part) => part.trim()) + .filter(Boolean); + + if (parts.length === 0) { + return undefined; + } + + if (parts[0] === 'r' && parts[2] === 'comments' && parts[3]) { + return parts[3].toLowerCase(); + } + + if (parts[0] === 'comments' && parts[1]) { + return parts[1].toLowerCase(); + } + + if (parts[0] === 'gallery' && parts[1]) { + return parts[1].toLowerCase(); + } + + return undefined; +}; diff --git a/src/lib/types.ts b/src/lib/types.ts index 360942f..2160029 100644 --- a/src/lib/types.ts +++ b/src/lib/types.ts @@ -1,4 +1,4 @@ -export type SourceType = 'article' | 'x' | 'youtube' | 'pdf' | 'bluesky' | 'linkedin' | 'unknown'; +export type SourceType = 'article' | 'x' | 'youtube' | 'pdf' | 'bluesky' | 'linkedin' | 'reddit' | 'unknown'; export type IngestStatus = 'metadata_saved' | 'parsed' | 'enriched' | 'failed'; export type AnnotationType = 'highlight' | 'lowlight' | 'note'; diff --git a/src/lib/url.ts b/src/lib/url.ts index ac5faa3..f003180 100644 --- a/src/lib/url.ts +++ b/src/lib/url.ts @@ -1,4 +1,5 @@ import type { SourceType } from './types.js'; +import { extractRedditPostId,
isRedditHost } from './reddit.js'; const TRACKING_PARAMS = new Set([ 'utm_source', @@ -10,18 +11,58 @@ const TRACKING_PARAMS = new Set([ 'fbclid' ]); +const REDDIT_NOISE_PARAMS = new Set(['context', 'depth', 'sort', 'share_id', 'rdt', 'ref', 'ref_source']); + +const normalizeRedditUrl = (parsed: URL, originalHost: string): void => { + parsed.protocol = 'https:'; + if (originalHost === 'redd.it' || originalHost.endsWith('.redd.it')) { + const shortId = parsed.pathname + .split('/') + .map((part) => part.trim()) + .filter(Boolean)[0]; + if (shortId) { + parsed.pathname = `/comments/${shortId.toLowerCase()}`; + } + } else { + const postId = extractRedditPostId(parsed.pathname); + if (postId) { + parsed.pathname = `/comments/${postId}`; + } + } + + parsed.hostname = 'www.reddit.com'; + parsed.port = ''; +}; + export const canonicalizeUrl = (input: string): string => { const parsed = new URL(input.trim()); parsed.hash = ''; parsed.protocol = parsed.protocol.toLowerCase(); - parsed.hostname = parsed.hostname.toLowerCase(); + const originalHost = parsed.hostname.toLowerCase(); + parsed.hostname = originalHost; + + const isReddit = isRedditHost(originalHost); + if (isReddit) { + normalizeRedditUrl(parsed, originalHost); + } if ((parsed.protocol === 'https:' && parsed.port === '443') || (parsed.protocol === 'http:' && parsed.port === '80')) { parsed.port = ''; } const keptEntries = [...parsed.searchParams.entries()] - .filter(([key]) => !TRACKING_PARAMS.has(key.toLowerCase())) + .filter(([key]) => { + const normalized = key.toLowerCase(); + if (TRACKING_PARAMS.has(normalized)) { + return false; + } + + if (isReddit && REDDIT_NOISE_PARAMS.has(normalized)) { + return false; + } + + return true; + }) .sort(([a], [b]) => a.localeCompare(b)); parsed.search = ''; @@ -40,6 +81,10 @@ export const detectSourceType = (url: string): SourceType => { const parsed = new URL(url); const host = parsed.hostname.toLowerCase(); + if (isRedditHost(host)) { + return 'reddit'; + } + if (host === 
'x.com' || host.endsWith('.x.com') || host === 'twitter.com' || host.endsWith('.twitter.com')) { return 'x'; } diff --git a/src/services/ingest-worker-service.ts b/src/services/ingest-worker-service.ts index 719eccd..48aabd2 100644 --- a/src/services/ingest-worker-service.ts +++ b/src/services/ingest-worker-service.ts @@ -2,6 +2,7 @@ import { ArticleAdapter } from '../adapters/article-adapter.js'; import { BlueskyAdapter } from '../adapters/bluesky-adapter.js'; import { LinkedInAdapter } from '../adapters/linkedin-adapter.js'; import { PdfAdapter } from '../adapters/pdf-adapter.js'; +import { RedditAdapter } from '../adapters/reddit-adapter.js'; import type { SourceAdapter } from '../adapters/source-adapter.js'; import { XAdapter } from '../adapters/x-adapter.js'; import { YouTubeAdapter } from '../adapters/youtube-adapter.js'; @@ -52,6 +53,7 @@ export class IngestWorkerService { private readonly pdfAdapter = new PdfAdapter(); private readonly blueskyAdapter = new BlueskyAdapter(); private readonly linkedinAdapter = new LinkedInAdapter(); + private readonly redditAdapter = new RedditAdapter(); constructor(private readonly context: ServiceContext) { this.indexService = new SearchIndexService(context); @@ -233,6 +235,10 @@ export class IngestWorkerService { return [this.linkedinAdapter, this.articleAdapter]; } + if (item.source_type === 'reddit') { + return [this.redditAdapter, this.articleAdapter]; + } + if (item.source_type === 'pdf') { return [this.pdfAdapter]; } diff --git a/src/services/reddit-backfill-service.ts b/src/services/reddit-backfill-service.ts new file mode 100644 index 0000000..d9b5391 --- /dev/null +++ b/src/services/reddit-backfill-service.ts @@ -0,0 +1,136 @@ +import { canonicalizeUrl, detectSourceType } from '../lib/url.js'; +import { nowIso } from '../lib/time.js'; +import type { ServiceContext } from './context.js'; +import { SearchIndexService } from './search-index-service.js'; + +export interface RedditBackfillOptions { + dryRun?: boolean; 
+} + +export interface RedditBackfillResult { + dry_run: boolean; + scanned: number; + updated: number; + updated_canonical: number; + conflicts: number; + skipped: number; + conflict_item_ids: string[]; +} + +interface CandidateRow { + id: string; + canonical_url: string; + original_url: string; +} + +const detectRedditCanonical = (candidate: CandidateRow): string | undefined => { + const inputs = [candidate.canonical_url, candidate.original_url]; + + for (const input of inputs) { + try { + const canonical = canonicalizeUrl(input); + if (detectSourceType(canonical) === 'reddit') { + return canonical; + } + } catch { + continue; + } + } + + return undefined; +}; + +export class RedditBackfillService { + private readonly indexService: SearchIndexService; + + constructor(private readonly context: ServiceContext) { + this.indexService = new SearchIndexService(context); + } + + execute(options: RedditBackfillOptions = {}): RedditBackfillResult { + const dryRun = options.dryRun ?? false; + const rows = this.context.db + .prepare( + `SELECT id, canonical_url, original_url + FROM items + WHERE source_type = 'article' + AND ( + canonical_url LIKE '%reddit.com/%' + OR canonical_url LIKE '%redd.it/%' + OR original_url LIKE '%reddit.com/%' + OR original_url LIKE '%redd.it/%' + ) + ORDER BY created_at ASC, id ASC` + ) + .all() as CandidateRow[]; + + const result: RedditBackfillResult = { + dry_run: dryRun, + scanned: rows.length, + updated: 0, + updated_canonical: 0, + conflicts: 0, + skipped: 0, + conflict_item_ids: [] + }; + + const plannedCanonicalOwners = new Map(); + const now = nowIso(); + const updates: Array<{ id: string; canonical: string }> = []; + + for (const row of rows) { + const canonical = detectRedditCanonical(row); + if (!canonical) { + result.skipped += 1; + continue; + } + + const plannedOwner = plannedCanonicalOwners.get(canonical); + if (plannedOwner && plannedOwner !== row.id) { + result.conflicts += 1; + result.conflict_item_ids.push(row.id); + 
continue; + } + + const collision = this.context.itemRepository.findByCanonicalUrl(canonical); + if (collision && collision.id !== row.id) { + result.conflicts += 1; + result.conflict_item_ids.push(row.id); + continue; + } + + plannedCanonicalOwners.set(canonical, row.id); + result.updated += 1; + if (canonical !== row.canonical_url) { + result.updated_canonical += 1; + } + + updates.push({ id: row.id, canonical }); + } + + if (!dryRun) { + const tx = this.context.db.transaction(() => { + const updateStmt = this.context.db.prepare( + `UPDATE items + SET source_type = 'reddit', + canonical_url = @canonical, + updated_at = @updatedAt + WHERE id = @id` + ); + + for (const update of updates) { + updateStmt.run({ + canonical: update.canonical, + updatedAt: now, + id: update.id + }); + this.indexService.syncItem(update.id); + } + }); + + tx(); + } + + return result; + } +} diff --git a/test/fixtures/reddit/listing.json b/test/fixtures/reddit/listing.json new file mode 100644 index 0000000..029c36c --- /dev/null +++ b/test/fixtures/reddit/listing.json @@ -0,0 +1,50 @@ +[ + { + "kind": "Listing", + "data": { + "children": [ + { + "kind": "t3", + "data": { + "id": "abc123", + "title": "Shipping local-first memory for agents", + "selftext": "We shipped LinkLedger today.\n\nWould love feedback from folks building agent workflows.", + "author": "georgediab", + "subreddit": "programming", + "created_utc": 1766966400, + "url": "https://github.com/gd/linkledger-cli" + } + } + ] + } + }, + { + "kind": "Listing", + "data": { + "children": [ + { + "kind": "t1", + "data": { + "id": "c_1", + "author": "alice", + "score": 128, + "body": "This is exactly the tooling I wanted for reusable evidence in drafts." + } + }, + { + "kind": "t1", + "data": { + "id": "c_2", + "author": "bob", + "score": 64, + "body": "Please keep canonicalization for redd.it and old.reddit links." 
+ } + }, + { + "kind": "more", + "data": {} + } + ] + } + } +] diff --git a/test/integration/find-search.test.ts b/test/integration/find-search.test.ts index 5bfb488..c0a0ff4 100644 --- a/test/integration/find-search.test.ts +++ b/test/integration/find-search.test.ts @@ -1,4 +1,6 @@ import assert from 'node:assert/strict'; +import { readFileSync } from 'node:fs'; +import path from 'node:path'; import test from 'node:test'; import { AnnotationService } from '../../src/services/annotation-service.js'; import { createServiceContext } from '../../src/services/context.js'; @@ -7,6 +9,9 @@ import { IngestWorkerService } from '../../src/services/ingest-worker-service.js import { SaveService } from '../../src/services/save-service.js'; import { withTempDb } from '../helpers/temp-db.js'; +const fixture = (folder: string, name: string): string => + readFileSync(path.join(process.cwd(), 'test', 'fixtures', folder, name), 'utf8'); + test('find ranks pinned high-confidence annotations above low-confidence annotations', async () => { await withTempDb(async () => { const context = createServiceContext(); @@ -166,3 +171,47 @@ test('find queues stale revalidation for items older than threshold', async () = } }); }); + +test('find supports reddit source-type filtering', async () => { + await withTempDb(async () => { + const context = createServiceContext(); + const originalFetch = globalThis.fetch; + + globalThis.fetch = async (input) => { + const url = typeof input === 'string' ? input : input.url; + + if (url.includes('reddit.com/comments/abc123.json')) { + return new Response(fixture('reddit', 'listing.json'), { + status: 200, + headers: { 'content-type': 'application/json' } + }); + } + + return new Response( + 'Article

General article text for control item.

', + { status: 200, headers: { 'content-type': 'text/html' } } + ); + }; + + try { + const saveService = new SaveService(context); + const worker = new IngestWorkerService(context); + const find = new FindService(context); + + const reddit = saveService.execute({ + url: 'https://www.reddit.com/r/programming/comments/abc123/linkledger_release' + }).item; + saveService.execute({ url: 'https://example.com/non-reddit-item' }); + + await worker.runOnce({ limit: 20, maxAttempts: 3, baseBackoffMs: 0 }); + + const filtered = find.execute({ query: 'reusable evidence drafts', sourceType: 'reddit', limit: 10 }); + assert.equal(filtered.length, 1); + assert.equal(filtered[0]?.id, reddit.id); + assert.equal(filtered[0]?.source_type, 'reddit'); + } finally { + globalThis.fetch = originalFetch; + context.db.close(); + } + }); +}); diff --git a/test/integration/reddit-backfill.test.ts b/test/integration/reddit-backfill.test.ts new file mode 100644 index 0000000..2a41499 --- /dev/null +++ b/test/integration/reddit-backfill.test.ts @@ -0,0 +1,112 @@ +import assert from 'node:assert/strict'; +import test from 'node:test'; +import { createServiceContext } from '../../src/services/context.js'; +import { RedditBackfillService } from '../../src/services/reddit-backfill-service.js'; +import { withTempDb } from '../helpers/temp-db.js'; + +test('reddit backfill reclassifies article items and skips canonical conflicts', async () => { + await withTempDb(async () => { + const context = createServiceContext(); + + try { + const now = new Date().toISOString(); + context.db + .prepare( + `INSERT INTO items ( + id, canonical_url, original_url, source_type, + title, author, published_at, fetched_at, + ingest_status, ingest_error, checksum, created_at, updated_at + ) VALUES (?, ?, ?, ?, NULL, NULL, NULL, NULL, ?, NULL, NULL, ?, ?)` + ) + .run( + 'itm_reddit_old', + 'https://old.reddit.com/r/programming/comments/AbC123/linkledger_release/?context=3', + 
'https://old.reddit.com/r/programming/comments/AbC123/linkledger_release/?context=3', + 'article', + 'metadata_saved', + '2026-01-01T00:00:00.000Z', + now + ); + + context.db + .prepare( + `INSERT INTO items ( + id, canonical_url, original_url, source_type, + title, author, published_at, fetched_at, + ingest_status, ingest_error, checksum, created_at, updated_at + ) VALUES (?, ?, ?, ?, NULL, NULL, NULL, NULL, ?, NULL, NULL, ?, ?)` + ) + .run( + 'itm_reddit_short', + 'https://redd.it/abc123', + 'https://redd.it/abc123', + 'article', + 'metadata_saved', + '2026-01-02T00:00:00.000Z', + now + ); + + const service = new RedditBackfillService(context); + const result = service.execute(); + + assert.equal(result.scanned, 2); + assert.equal(result.updated, 1); + assert.equal(result.updated_canonical, 1); + assert.equal(result.conflicts, 1); + assert.deepEqual(result.conflict_item_ids, ['itm_reddit_short']); + + const updated = context.itemRepository.findById('itm_reddit_old'); + assert.ok(updated); + assert.equal(updated.source_type, 'reddit'); + assert.equal(updated.canonical_url, 'https://www.reddit.com/comments/abc123'); + + const conflicted = context.itemRepository.findById('itm_reddit_short'); + assert.ok(conflicted); + assert.equal(conflicted.source_type, 'article'); + assert.equal(conflicted.canonical_url, 'https://redd.it/abc123'); + } finally { + context.db.close(); + } + }); +}); + +test('reddit backfill dry-run reports changes without mutating rows', async () => { + await withTempDb(async () => { + const context = createServiceContext(); + + try { + const now = new Date().toISOString(); + context.db + .prepare( + `INSERT INTO items ( + id, canonical_url, original_url, source_type, + title, author, published_at, fetched_at, + ingest_status, ingest_error, checksum, created_at, updated_at + ) VALUES (?, ?, ?, ?, NULL, NULL, NULL, NULL, ?, NULL, NULL, ?, ?)` + ) + .run( + 'itm_dry_run', + 'https://www.reddit.com/r/programming/comments/abc123/linkledger_release', + 
'https://www.reddit.com/r/programming/comments/abc123/linkledger_release', + 'article', + 'metadata_saved', + '2026-01-01T00:00:00.000Z', + now + ); + + const service = new RedditBackfillService(context); + const result = service.execute({ dryRun: true }); + + assert.equal(result.dry_run, true); + assert.equal(result.updated, 1); + assert.equal(result.updated_canonical, 1); + + const row = context.itemRepository.findById('itm_dry_run'); + assert.ok(row); + assert.equal(row.source_type, 'article'); + assert.equal(row.canonical_url, 'https://www.reddit.com/r/programming/comments/abc123/linkledger_release'); + } finally { + context.db.close(); + } + }); +}); diff --git a/test/integration/worker-ingest.test.ts b/test/integration/worker-ingest.test.ts index 6c95674..d7b6ff5 100644 --- a/test/integration/worker-ingest.test.ts +++ b/test/integration/worker-ingest.test.ts @@ -116,7 +116,7 @@ test('worker requeues retryable failures with backoff and succeeds on next run', }); }); -test('worker uses first-class Bluesky and LinkedIn adapters', async () => { +test('worker uses first-class Bluesky, LinkedIn, and Reddit adapters', async () => { await withTempDb(async () => { const context = createServiceContext(); const save = new SaveService(context); @@ -140,6 +140,13 @@ test('worker uses first-class Bluesky and LinkedIn adapters', async () => { }); } + if (url.includes('reddit.com/comments/abc123.json')) { + return new Response(fixture('reddit', 'listing.json'), { + status: 200, + headers: { 'content-type': 'application/json' } + }); + } + return new Response('fallback article', { status: 200, headers: { 'content-type': 'text/html' } @@ -153,9 +160,12 @@ test('worker uses first-class Bluesky and LinkedIn adapters', async () => { const linkedinItem = save.execute({ url: 'https://www.linkedin.com/posts/gdiab_memory-layer-cli' }).item; + const redditItem = save.execute({ + url: 'https://old.reddit.com/r/programming/comments/abc123/linkledger_release/?context=3&utm_source=share' 
+ }).item; const run = await worker.runOnce({ limit: 20, maxAttempts: 3, baseBackoffMs: 0 }); - assert.equal(run.succeeded, 2); + assert.equal(run.succeeded, 3); assert.equal(run.failed, 0); assert.equal(run.requeued, 0); @@ -169,6 +179,13 @@ test('worker uses first-class Bluesky and LinkedIn adapters', async () => { assert.equal(linkedin.source_type, 'linkedin'); assert.equal(linkedin.ingest_status, 'enriched'); assert.equal(linkedin.author, 'George Diab'); + + const reddit = context.itemRepository.findById(redditItem.id); + assert.ok(reddit); + assert.equal(reddit.source_type, 'reddit'); + assert.equal(reddit.canonical_url, 'https://www.reddit.com/comments/abc123'); + assert.equal(reddit.ingest_status, 'enriched'); + assert.equal(reddit.title, 'Shipping local-first memory for agents'); } finally { globalThis.fetch = originalFetch; context.db.close(); diff --git a/test/unit/reddit-adapter.test.ts b/test/unit/reddit-adapter.test.ts new file mode 100644 index 0000000..aaff48b --- /dev/null +++ b/test/unit/reddit-adapter.test.ts @@ -0,0 +1,59 @@ +import assert from 'node:assert/strict'; +import { readFileSync } from 'node:fs'; +import path from 'node:path'; +import test from 'node:test'; +import { RedditAdapter } from '../../src/adapters/reddit-adapter.js'; + +const fixture = (name: string): string => + readFileSync(path.join(process.cwd(), 'test', 'fixtures', 'reddit', name), 'utf8'); + +test('RedditAdapter parses post text and top comments into chunks', async () => { + const adapter = new RedditAdapter(); + const originalFetch = globalThis.fetch; + + globalThis.fetch = async () => + new Response(fixture('listing.json'), { + status: 200, + headers: { 'content-type': 'application/json' } + }); + + try { + const result = await adapter.fetchAndParse({ + url: 'https://www.reddit.com/comments/abc123' + }); + + assert.equal(result.metadata.title, 'Shipping local-first memory for agents'); + assert.equal(result.metadata.author, 'georgediab'); + 
assert.equal(result.chunks.length, 3); + assert.equal(result.chunks[0]?.text.includes('Subreddit: r/programming'), true); + assert.equal(result.chunks[1]?.text.includes('Top comment 1 by u/alice'), true); + assert.equal(result.chunks[2]?.text.includes('Top comment 2 by u/bob'), true); + } finally { + globalThis.fetch = originalFetch; + } +}); + +test('RedditAdapter marks upstream failures as retryable for 5xx', async () => { + const adapter = new RedditAdapter(); + const originalFetch = globalThis.fetch; + + globalThis.fetch = async () => + new Response('server unavailable', { + status: 503, + headers: { 'content-type': 'text/plain' } + }); + + try { + await assert.rejects( + () => adapter.fetchAndParse({ url: 'https://www.reddit.com/comments/abc123' }), + (error: unknown) => { + const typed = error as { code?: string; retryable?: boolean }; + assert.equal(typed.code, 'FETCH_FAILED'); + assert.equal(typed.retryable, true); + return true; + } + ); + } finally { + globalThis.fetch = originalFetch; + } +}); diff --git a/test/unit/url.test.ts b/test/unit/url.test.ts index 82e53e8..3ef0d70 100644 --- a/test/unit/url.test.ts +++ b/test/unit/url.test.ts @@ -9,7 +9,18 @@ test('canonicalizeUrl strips tracking params and normalizes host/path', () => { assert.equal(output, 'https://example.com/path?a=1&b=2'); }); -test('detectSourceType detects x/youtube/pdf/bluesky/linkedin/article', () => { +test('canonicalizeUrl normalizes reddit short-links and host variants', () => { + const short = canonicalizeUrl('https://redd.it/AbC123?utm_source=share&share_id=abc'); + assert.equal(short, 'https://www.reddit.com/comments/abc123'); + + const hostVariant = canonicalizeUrl( + 'https://old.reddit.com/r/programming/comments/AbC123/linkledger_release/?context=3#details' + ); + assert.equal(hostVariant, 'https://www.reddit.com/comments/abc123'); +}); + +test('detectSourceType detects reddit/x/youtube/pdf/bluesky/linkedin/article', () => { + 
assert.equal(detectSourceType('https://www.reddit.com/r/programming/comments/abc123/post-title'), 'reddit'); assert.equal(detectSourceType('https://x.com/user/status/123'), 'x'); assert.equal(detectSourceType('https://www.youtube.com/watch?v=abc'), 'youtube'); assert.equal(detectSourceType('https://example.com/doc.pdf'), 'pdf');