12 changes: 11 additions & 1 deletion README.md
@@ -27,7 +27,7 @@ node dist/cli/index.js --help
- `linkledger save <url> [--note "..."] [--tags a,b] [--json]`
- `linkledger annotate <item-id> --highlight|--lowlight|--note "..." [--actor human|agent:name] [--confidence 0.0-1.0] [--pin] [--json]`
- `linkledger tag <item-id> --add a,b [--actor ...] [--json]`
- `linkledger find <query> [--tags ...] [--type article|x|youtube|pdf|bluesky|linkedin] [--since YYYY-MM-DD] [--limit N] [--json]`
- `linkledger find <query> [--tags ...] [--type article|x|youtube|pdf|bluesky|linkedin|reddit] [--since YYYY-MM-DD] [--limit N] [--json]`
- `linkledger brief <query> [--max-items N] [--expand-chunks] [--json]`
- `linkledger related <item-id> [--max-items N] [--json]`
- `linkledger status <item-id> [--json]`
@@ -43,6 +43,7 @@ node dist/cli/index.js --help
- `pdf`: text-native PDF extraction via content stream parsing.
- `bluesky`: oEmbed + page metadata extraction with article fallback.
- `linkedin`: page metadata extraction with LinkedIn-specific parsing.
- `reddit`: Reddit listing API extraction for post + top comments with article fallback.

Retryable adapter failures are requeued with exponential backoff in `worker`.
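For intuition, here is a minimal sketch of such a schedule, assuming the common `delay = base × 2^(attempt − 1)` formula; the worker's actual implementation may add jitter or caps, and `backoffDelayMs` is an illustrative name, not a function from this codebase:

```typescript
// Illustrative exponential-backoff schedule; the worker's real formula may differ.
// baseMs corresponds to the CLI's --base-backoff-ms flag; attempt is 1-based.
const backoffDelayMs = (attempt: number, baseMs: number): number =>
  baseMs * 2 ** (attempt - 1);

// With --base-backoff-ms 2000: retry 1 waits 2000 ms, retry 2 waits 4000 ms,
// retry 3 waits 8000 ms.
```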

@@ -71,3 +72,12 @@ An agent-usage skill is included at:
- Benchmarking: `docs/benchmarking.md`
- Agent brief workflow: `docs/agent-brief-workflow.md`
- Human validation playbook: `docs/human-test-plan.md`

## Maintenance

- Backfill legacy Reddit items previously saved as `article`:

```bash
npm run backfill:reddit -- --dry-run
npm run backfill:reddit
```
6 changes: 6 additions & 0 deletions TEST_PLAN.md
@@ -36,6 +36,7 @@
- Keyword match in annotation text.
- Combined filters (`--tags`, `--type`, `--since`).
- Stable sort when scores tie.
- Reddit-specific source filtering (`--type reddit`) against post/comment text.

3. `annotate`
- Reject missing confidence for agent actor.
@@ -68,6 +69,10 @@
- text-native PDF
- image-heavy/low-text PDF

- Reddit fixtures:
- post with self text + top comments
- fallback behavior when listing API fails

## 5. Performance Tests
### Dataset sizes
- 1,000 items
@@ -105,3 +110,4 @@
2. Add human and agent highlights; verify cap and confidence rules.
3. Run topic `brief` and verify evidence quality manually.
4. Re-run after 30+ day staleness simulation and verify revalidation behavior.
5. Validate Reddit URL canonicalization (`redd.it`, `old.reddit.com`) and backfill dry run.
32 changes: 31 additions & 1 deletion docs/human-test-plan.md
@@ -47,7 +47,7 @@ unset LINKLEDGER_DB_PATH

## 4. Phase A - Smoke Test (15-30 minutes)

Use 6-10 links across source types: article, X, YouTube, PDF, Bluesky, LinkedIn.
Use 8-12 links across source types: article, X, YouTube, PDF, Bluesky, LinkedIn, Reddit.

### A1. Save and ingest

@@ -59,6 +59,7 @@ node --import tsx src/cli/index.ts worker --limit 20 --max-attempts 3 --base-bac
Expected:
- `save`: `ok=true`, item created or deduped.
- `worker`: mostly `succeeded`; occasional `requeued` allowed.
- Reddit URLs from `redd.it` and `old.reddit.com` should canonicalize to `https://www.reddit.com/comments/<post-id>`.
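That canonical form can be sketched with the post-id extraction logic from `src/lib/reddit.ts`; the `canonicalizeRedditUrl` wrapper and its `redd.it` special case below are illustrative assumptions, not the pipeline's actual function:

```typescript
// Post-id extraction mirrors src/lib/reddit.ts; canonicalizeRedditUrl is a
// hypothetical sketch of how the saved URL might be normalized.
const extractRedditPostId = (pathname: string): string | undefined => {
  const parts = pathname.split('/').map((part) => part.trim()).filter(Boolean);
  if (parts[0] === 'r' && parts[2] === 'comments' && parts[3]) return parts[3].toLowerCase();
  if (parts[0] === 'comments' && parts[1]) return parts[1].toLowerCase();
  if (parts[0] === 'gallery' && parts[1]) return parts[1].toLowerCase();
  return undefined;
};

const canonicalizeRedditUrl = (raw: string): string | undefined => {
  const { hostname, pathname } = new URL(raw);
  // redd.it short links carry the post id directly in the path.
  const postId =
    hostname.toLowerCase() === 'redd.it'
      ? pathname.split('/').filter(Boolean)[0]?.toLowerCase()
      : extractRedditPostId(pathname);
  return postId ? `https://www.reddit.com/comments/${postId}` : undefined;
};

// canonicalizeRedditUrl('https://old.reddit.com/r/node/comments/abc123/title/')
//   -> 'https://www.reddit.com/comments/abc123'
```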

### A2. Check status and enrichment

@@ -81,6 +82,35 @@ Expected:
- `find` returns relevant results with `snippet` and `why_ranked` fields.
- `brief` returns high-signal items with `summary`, `key_claims`, highlights/lowlights/notes.

### A4. Reddit-specific validation (real links)

Save at least 3 real Reddit links:
1. `redd.it/<post-id>` short link
2. `old.reddit.com/r/<subreddit>/comments/<post-id>/...`
3. Standard `www.reddit.com/r/<subreddit>/comments/<post-id>/...`

For each:

```bash
node --import tsx src/cli/index.ts save "<reddit-url>" --tags reddit,smoke --json
node --import tsx src/cli/index.ts worker --limit 20 --max-attempts 3 --base-backoff-ms 2000 --json
node --import tsx src/cli/index.ts find "top comment text from the post" --type reddit --limit 5 --json
```

Expected:
- `save` output item `source_type` is `reddit`.
- `item.canonical_url` is normalized to `https://www.reddit.com/comments/<post-id>`.
- `find --type reddit` returns the saved item when searching for post body or top-comment phrases.

Optional backfill check for existing DBs with older data:

```bash
npm run backfill:reddit -- --dry-run
```

Expected:
- Reports how many legacy `article` rows can be reclassified to `reddit`.
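Eligibility for that reclassification can be sketched with the host check from `src/lib/reddit.ts`; the row shape and the `isLegacyRedditArticle` helper below are illustrative assumptions about the backfill service's internals, not its actual code:

```typescript
// isRedditHost mirrors src/lib/reddit.ts; ItemRow is a hypothetical row shape.
const isRedditHost = (host: string): boolean => {
  const normalized = host.toLowerCase();
  return (
    normalized === 'reddit.com' ||
    normalized.endsWith('.reddit.com') ||
    normalized === 'redd.it' ||
    normalized.endsWith('.redd.it')
  );
};

interface ItemRow {
  id: string;
  source_type: string;
  canonical_url: string;
}

// A legacy row is one saved as `article` whose URL actually lives on a Reddit host.
const isLegacyRedditArticle = (row: ItemRow): boolean => {
  if (row.source_type !== 'article') return false;
  try {
    return isRedditHost(new URL(row.canonical_url).hostname);
  } catch {
    return false; // malformed URLs are never reclassified
  }
};
```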

## 5. Phase B - Real Workflow Validation (1-2 weeks)

Use the tool in normal content production.
1 change: 1 addition & 0 deletions package.json
@@ -9,6 +9,7 @@
"scripts": {
"build": "tsc -p tsconfig.json",
"bench:seed": "node --import tsx scripts/seed-benchmark-data.ts",
"backfill:reddit": "node --import tsx scripts/backfill-reddit-source-type.ts",
"bench:find": "node --import tsx scripts/bench-find.ts",
"bench:brief": "node --import tsx scripts/bench-brief.ts",
"bench": "npm run bench:find && npm run bench:brief",
13 changes: 13 additions & 0 deletions scripts/backfill-reddit-source-type.ts
@@ -0,0 +1,13 @@
import { createServiceContext } from '../src/services/context.js';
import { RedditBackfillService } from '../src/services/reddit-backfill-service.js';

const dryRun = process.argv.includes('--dry-run');
const context = createServiceContext();

try {
const service = new RedditBackfillService(context);
const result = service.execute({ dryRun });
process.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
} finally {
context.db.close();
}
178 changes: 178 additions & 0 deletions src/adapters/reddit-adapter.ts
@@ -0,0 +1,178 @@
import { createHash } from 'node:crypto';
import { AppError } from '../lib/errors.js';
import { extractRedditPostId, isRedditHost } from '../lib/reddit.js';
import { detectSourceType } from '../lib/url.js';
import type { SourceType } from '../lib/types.js';
import type { AdapterParseResult, SourceAdapter } from './source-adapter.js';

const MAX_POST_CHARS = 1800;
const MAX_COMMENT_CHARS = 900;
const MAX_COMMENTS = 5;

interface RedditListingChild {
kind?: string;
data?: Record<string, unknown>;
}

const toText = (value: unknown): string => {
if (typeof value !== 'string') {
return '';
}

return value
.replace(/\r\n/g, '\n')
.replace(/\r/g, '\n')
.replace(/\n{3,}/g, '\n\n')
.trim();
};

const compact = (value: string): string => value.replace(/[ \t]+/g, ' ').replace(/\n{3,}/g, '\n\n').trim();

const truncate = (value: string, maxChars: number): string => {
if (value.length <= maxChars) {
return value;
}

return `${value.slice(0, Math.max(0, maxChars - 3)).trimEnd()}...`;
};

const toTokenCount = (value: string): number => value.split(/\s+/).filter(Boolean).length;

const asListingChildren = (payload: unknown, index: number): RedditListingChild[] => {
if (!Array.isArray(payload)) {
return [];
}

const entry = payload[index] as { data?: { children?: RedditListingChild[] } } | undefined;
if (!entry?.data?.children || !Array.isArray(entry.data.children)) {
return [];
}

return entry.data.children;
};

const toPublishedAt = (value: unknown): string | undefined => {
if (typeof value !== 'number' || !Number.isFinite(value)) {
return undefined;
}

return new Date(value * 1000).toISOString();
};

const isRedditPageUrl = (value: string): boolean => {
try {
const parsed = new URL(value);
return isRedditHost(parsed.hostname.toLowerCase());
} catch {
return false;
}
};

export class RedditAdapter implements SourceAdapter {
supports(url: string): boolean {
return this.detectType(url) === 'reddit';
}

detectType(url: string): SourceType {
return detectSourceType(url);
}

async fetchAndParse(input: { url: string }): Promise<AdapterParseResult> {
const parsedUrl = new URL(input.url);
const postId = extractRedditPostId(parsedUrl.pathname);
if (!postId) {
throw new AppError('PARSE_FAILED', `Could not extract Reddit post id from ${input.url}`, false);
}

const listingUrl = `https://www.reddit.com/comments/${encodeURIComponent(postId)}.json?raw_json=1&sort=top&limit=8`;
const response = await fetch(listingUrl, {
headers: { 'user-agent': 'linkledger-cli/0.1.0' }
});

if (!response.ok) {
throw new AppError(
'FETCH_FAILED',
`Reddit listing fetch failed (${response.status}) for ${input.url}`,
response.status >= 500 || response.status === 429
);
}

const payload = (await response.json()) as unknown;
const postChild = asListingChildren(payload, 0).find((entry) => entry.kind === 't3');
const postData = postChild?.data;
if (!postData) {
throw new AppError('PARSE_FAILED', 'Reddit listing did not include post payload', false);
}

const title = compact(toText(postData.title));
const author = compact(toText(postData.author)) || undefined;
const subreddit = compact(toText(postData.subreddit));
const selfText = compact(toText(postData.selftext));
const linkedUrl = compact(toText(postData.url));

const postParts = [
title ? `Title: ${title}` : '',
subreddit ? `Subreddit: r/${subreddit}` : '',
author ? `Author: u/${author}` : '',
selfText ? `Body:\n${selfText}` : '',
linkedUrl && !isRedditPageUrl(linkedUrl) ? `Linked URL: ${linkedUrl}` : ''
].filter(Boolean);

const chunks: Array<{ text: string; tokenCount: number }> = [];
if (postParts.length > 0) {
const postChunk = truncate(postParts.join('\n\n'), MAX_POST_CHARS);
chunks.push({
text: postChunk,
tokenCount: toTokenCount(postChunk)
});
}

const commentChildren = asListingChildren(payload, 1);
let commentCount = 0;
for (const comment of commentChildren) {
if (comment.kind !== 't1') {
continue;
}

const data = comment.data ?? {};
const body = compact(toText(data.body));
if (!body) {
continue;
}

const commentAuthor = compact(toText(data.author)) || 'unknown';
const scoreValue = typeof data.score === 'number' && Number.isFinite(data.score) ? data.score : null;
const scoreLabel = scoreValue === null ? '' : ` (score ${scoreValue})`;

commentCount += 1;
const commentText = truncate(
`Top comment ${commentCount} by u/${commentAuthor}${scoreLabel}:\n${body}`,
MAX_COMMENT_CHARS
);
chunks.push({
text: commentText,
tokenCount: toTokenCount(commentText)
});

if (commentCount >= MAX_COMMENTS) {
break;
}
}

if (chunks.length === 0) {
throw new AppError('PARSE_FAILED', 'No Reddit text could be extracted', false);
}

const fallbackTitle = `Reddit post ${postId}`;
return {
metadata: {
title: title || fallbackTitle,
author,
publishedAt: toPublishedAt(postData.created_utc)
},
chunks,
checksum: createHash('sha256').update(chunks.map((chunk) => chunk.text).join('\n')).digest('hex'),
fetchedAt: new Date().toISOString()
};
}
}
2 changes: 1 addition & 1 deletion src/cli/index.ts
@@ -192,7 +192,7 @@ program
.description('Find items by query with optional filters')
.argument('<query>', 'Search query')
.option('--tags <csv>', 'Filter by tags')
.option('--type <source-type>', 'Filter by source type (article|x|youtube|pdf|bluesky|linkedin)')
.option('--type <source-type>', 'Filter by source type (article|x|youtube|pdf|bluesky|linkedin|reddit)')
.option('--since <yyyy-mm-dd>', 'Filter by creation date (inclusive)')
.option('--limit <n>', 'Result limit', (value) => Number.parseInt(value, 10), 20)
.option('--json', 'Output machine-readable JSON envelope')
37 changes: 37 additions & 0 deletions src/lib/reddit.ts
@@ -0,0 +1,37 @@
const REDDIT_HOST_SUFFIX = '.reddit.com';
const REDDIT_SHORT_HOST_SUFFIX = '.redd.it';

export const isRedditHost = (host: string): boolean => {
const normalized = host.toLowerCase();
return (
normalized === 'reddit.com' ||
normalized.endsWith(REDDIT_HOST_SUFFIX) ||
normalized === 'redd.it' ||
normalized.endsWith(REDDIT_SHORT_HOST_SUFFIX)
);
};

export const extractRedditPostId = (pathname: string): string | undefined => {
const parts = pathname
.split('/')
.map((part) => part.trim())
.filter(Boolean);

if (parts.length === 0) {
return undefined;
}

if (parts[0] === 'r' && parts[2] === 'comments' && parts[3]) {
return parts[3].toLowerCase();
}

if (parts[0] === 'comments' && parts[1]) {
return parts[1].toLowerCase();
}

if (parts[0] === 'gallery' && parts[1]) {
return parts[1].toLowerCase();
}

return undefined;
};
2 changes: 1 addition & 1 deletion src/lib/types.ts
@@ -1,4 +1,4 @@
export type SourceType = 'article' | 'x' | 'youtube' | 'pdf' | 'bluesky' | 'linkedin' | 'unknown';
export type SourceType = 'article' | 'x' | 'youtube' | 'pdf' | 'bluesky' | 'linkedin' | 'reddit' | 'unknown';
export type IngestStatus = 'metadata_saved' | 'parsed' | 'enriched' | 'failed';
export type AnnotationType = 'highlight' | 'lowlight' | 'note';
