From 4935e1a07f75195f7e862f36c59f5bbcd6c322ae Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge" <gewenyu99@gmail.com>
Date: Wed, 11 Feb 2026 15:32:10 -0500
Subject: [PATCH 1/2] Instrument the loop

---
 bin.ts                                    |   6 +
 src/lib/__tests__/agent-interface.test.ts |   1 +
 src/lib/agent-interface.ts                | 674 +++++++++++++++++++++-
 src/lib/agent-runner.ts                   | 190 +++++-
 src/run.ts                                |   2 +
 src/utils/types.ts                        |   7 +
 6 files changed, 864 insertions(+), 16 deletions(-)

diff --git a/bin.ts b/bin.ts
index 7c82ae0..e8e1341 100644
--- a/bin.ts
+++ b/bin.ts
@@ -115,6 +115,12 @@ yargs(hideBin(process.argv))
             'Show menu for manual integration selection instead of auto-detecting\nenv: POSTHOG_WIZARD_MENU',
           type: 'boolean',
         },
+        benchmark: {
+          default: false,
+          describe:
+            'Run in benchmark mode with per-phase token tracking\nenv: POSTHOG_WIZARD_BENCHMARK',
+          type: 'boolean',
+        },
       });
     },
     (argv) => {
diff --git a/src/lib/__tests__/agent-interface.test.ts b/src/lib/__tests__/agent-interface.test.ts
index beed455..d11a257 100644
--- a/src/lib/__tests__/agent-interface.test.ts
+++ b/src/lib/__tests__/agent-interface.test.ts
@@ -32,6 +32,7 @@ describe('runAgent', () => {
     localMcp: false,
     ci: false,
     menu: false,
+    benchmark: false,
   };
 
   const defaultAgentConfig = {
diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts
index 2c315fe..a06acfd 100644
--- a/src/lib/agent-interface.ts
+++ b/src/lib/agent-interface.ts
@@ -3,7 +3,9 @@
  * Uses Claude Agent SDK directly with PostHog LLM gateway
  */
 
+import fs from 'fs';
 import path from 'path';
+import chalk from 'chalk';
 import clack from '../utils/clack';
 import { debug, logToFile, initLogFile, LOG_FILE_PATH } from '../utils/debug';
 import type { WizardOptions } from '../utils/types';
@@ -84,6 +86,43 @@ type AgentRunConfig = {
   model: string;
 };
 
+export const BENCHMARK_FILE_PATH = '/tmp/posthog-wizard-benchmark.json';
+
+export interface StepUsage {
+  name: string;
+  usage: {
+    input_tokens: number;
+    output_tokens: number;
+    cache_creation_input_tokens: number;
+    cache_read_input_tokens: number;
+  };
+  modelUsage: Record<string, unknown>;
+  totalCostUsd: number;
+  durationMs: number;
+  durationApiMs: number;
+  numTurns: number;
+  /** Conversation context size (tokens) entering this step */
+  contextTokensIn: number;
+  /** Conversation context size (tokens) exiting this step */
+  contextTokensOut: number;
+  /** Number of auto-compactions that occurred during this step */
+  compactions?: number;
+  /** Token count before each compaction (from SDK compact_boundary messages) */
+  compactionPreTokens?: number[];
+}
+
+export interface BenchmarkData {
+  timestamp: string;
+  steps: StepUsage[];
+  totals: {
+    totalCostUsd: number;
+    durationMs: number;
+    inputTokens: number;
+    outputTokens: number;
+    numTurns: number;
+  };
+}
+
 /**
  * Package managers that can be used to run commands.
  */
@@ -380,7 +419,11 @@ export async function runAgent(
     successMessage?: string;
     errorMessage?: string;
   },
-): Promise<{ error?: AgentErrorType; message?: string }> {
+): Promise<{
+  error?: AgentErrorType;
+  message?: string;
+  benchmark?: BenchmarkData;
+}> {
   const {
     estimatedDurationMinutes = 8,
     spinnerMessage = 'Customizing your PostHog setup...',
@@ -405,6 +448,8 @@ export async function runAgent(
   const collectedText: string[] = [];
   // Track if we received a successful result (before any cleanup errors)
   let receivedSuccessResult = false;
+  // Track the result message for benchmark data extraction
+  let resultMessage: SDKMessage = null;
 
   // Workaround for SDK bug: stdin closes before canUseTool responses can be sent.
   // The fix is to use an async generator for the prompt that stays open until
@@ -429,7 +474,11 @@ export async function runAgent(
   // Helper to handle successful completion (used in normal path and race condition recovery)
   const completeWithSuccess = (
     suppressedError?: Error,
-  ): { error?: AgentErrorType; message?: string } => {
+  ): {
+    error?: AgentErrorType;
+    message?: string;
+    benchmark?: BenchmarkData;
+  } => {
     const durationMs = Date.now() - startTime;
     const durationSeconds = Math.round(durationMs / 1000);
 
@@ -465,7 +514,19 @@ export async function runAgent(
       duration_seconds: durationSeconds,
     });
     spinner.stop(successMessage);
-    return {};
+
+    // Write benchmark data from the single-query result if available
+    let benchmark: BenchmarkData | undefined;
+    if (resultMessage && options.benchmark) {
+      benchmark = extractBenchmarkFromResult(
+        'single-run',
+        resultMessage,
+        durationMs,
+      );
+      writeBenchmarkData(benchmark);
+    }
+
+    return { benchmark };
   };
 
   try {
@@ -568,6 +629,7 @@ export async function runAgent(
         // The SDK may emit a second error result during cleanup due to a race condition
         if (message.subtype === 'success' && !message.is_error) {
           receivedSuccessResult = true;
+          resultMessage = message;
         }
         signalDone!();
       }
@@ -644,6 +706,611 @@ export async function runAgent(
   }
 }
 
+/**
+ * Format milliseconds into a human-readable duration string (e.g., "2m 34s", "45s").
+ */
+function formatDuration(ms: number): string {
+  const totalSeconds = Math.round(ms / 1000);
+  const minutes = Math.floor(totalSeconds / 60);
+  const seconds = totalSeconds % 60;
+  if (minutes > 0) {
+    return `${minutes}m ${seconds}s`;
+  }
+  return `${seconds}s`;
+}
+
+/**
+ * Format token count into a human-readable string (e.g., "1.2M", "345K", "1,234").
+ */
+function formatTokenCount(tokens: number): string {
+  if (tokens >= 1_000_000) {
+    return `${(tokens / 1_000_000).toFixed(1)}M`;
+  }
+  if (tokens >= 10_000) {
+    return `${Math.round(tokens / 1000)}K`;
+  }
+  return tokens.toLocaleString();
+}
+
+/**
+ * Sum token usage across all models from the SDK's modelUsage field.
+ * The top-level `usage` field only has the last API call's tokens;
+ * `modelUsage` has the accurate per-model aggregates (camelCase fields).
+ */
+function sumModelUsage(modelUsage: Record<string, any>): {
+  input_tokens: number;
+  output_tokens: number;
+  cache_creation_input_tokens: number;
+  cache_read_input_tokens: number;
+} {
+  let input_tokens = 0;
+  let output_tokens = 0;
+  let cache_creation_input_tokens = 0;
+  let cache_read_input_tokens = 0;
+
+  for (const model of Object.values(modelUsage)) {
+    input_tokens += model.inputTokens ?? 0;
+    output_tokens += model.outputTokens ?? 0;
+    cache_creation_input_tokens += model.cacheCreationInputTokens ?? 0;
+    cache_read_input_tokens += model.cacheReadInputTokens ?? 0;
+  }
+
+  return {
+    input_tokens,
+    output_tokens,
+    cache_creation_input_tokens,
+    cache_read_input_tokens,
+  };
+}
+
+/**
+ * Extract benchmark data from a single SDK result message.
+ */
+function extractBenchmarkFromResult(
+  stepName: string,
+  message: SDKMessage,
+  wallDurationMs: number,
+): BenchmarkData {
+  const modelUsage = message.modelUsage ?? {};
+  const usage = sumModelUsage(modelUsage);
+  const lastCallUsage = message.usage ?? {};
+  const contextTokensOut =
+    Number(lastCallUsage.input_tokens ?? 0) +
+    Number(lastCallUsage.cache_read_input_tokens ?? 0) +
+    Number(lastCallUsage.cache_creation_input_tokens ?? 0);
+  const step: StepUsage = {
+    name: stepName,
+    usage,
+    modelUsage,
+    totalCostUsd: message.total_cost_usd ?? 0,
+    durationMs: message.duration_ms ?? wallDurationMs,
+    durationApiMs: message.duration_api_ms ?? 0,
+    numTurns: message.num_turns ?? 0,
+    contextTokensIn: 0,
+    contextTokensOut,
+  };
+
+  return {
+    timestamp: new Date().toISOString(),
+    steps: [step],
+    totals: {
+      totalCostUsd: step.totalCostUsd,
+      durationMs: step.durationMs,
+      inputTokens: step.usage.input_tokens,
+      outputTokens: step.usage.output_tokens,
+      numTurns: step.numTurns,
+    },
+  };
+}
+
+/**
+ * Write benchmark data to the benchmark file.
+ */
+function writeBenchmarkData(data: BenchmarkData): void {
+  try {
+    fs.writeFileSync(BENCHMARK_FILE_PATH, JSON.stringify(data, null, 2));
+    logToFile(`Benchmark data written to ${BENCHMARK_FILE_PATH}`);
+  } catch (error) {
+    logToFile('Failed to write benchmark data:', error);
+  }
+}
+
+/**
+ * Execute multiple agent steps in a single conversation with per-step usage tracking.
+ * Uses one query() call with multiple user messages, so conversation context is preserved
+ * across steps (identical behavior to normal non-benchmark mode).
+ *
+ * Steps can be discovered dynamically via the onAfterStep callback — e.g., after the
+ * setup step installs a skill, onAfterStep discovers the workflow files and returns
+ * them as additional steps to run in the same conversation.
+ *
+ * Per-step usage is computed as deltas between consecutive SDK result messages.
+ *
+ * Writes benchmark data to BENCHMARK_FILE_PATH when all steps complete.
+ */
+export async function runAgentSteps(
+  agentConfig: AgentRunConfig,
+  initialSteps: Array<{ name: string; prompt: string }>,
+  options: WizardOptions,
+  spinner: ReturnType<typeof clack.spinner>,
+  config?: {
+    estimatedDurationMinutes?: number;
+    spinnerMessage?: string;
+    successMessage?: string;
+    errorMessage?: string;
+    /** Called after each step completes. Return additional steps to append to the queue. */
+    onAfterStep?: (
+      stepIndex: number,
+      stepName: string,
+    ) => Array<{ name: string; prompt: string }>;
+  },
+): Promise<{
+  error?: AgentErrorType;
+  message?: string;
+  benchmark?: BenchmarkData;
+}> {
+  const {
+    estimatedDurationMinutes = 8,
+    spinnerMessage = 'Customizing your PostHog setup...',
+    successMessage = 'PostHog integration complete',
+    errorMessage = 'Integration failed',
+    onAfterStep,
+  } = config ?? {};
+
+  const { query } = await getSDKModule();
+
+  clack.log.step(
+    `This whole process should take about ${estimatedDurationMinutes} minutes including error checking and fixes.\n\nGrab some coffee!`,
+  );
+  clack.log.info(`${chalk.cyan('[BENCHMARK]')} Verbose logs: ${LOG_FILE_PATH}`);
+  clack.log.info(
+    `${chalk.cyan(
+      '[BENCHMARK]',
+    )} Benchmark data will be written to: ${BENCHMARK_FILE_PATH}`,
+  );
+
+  spinner.start(spinnerMessage);
+
+  const overallStartTime = Date.now();
+  const stepUsages: StepUsage[] = [];
+  const collectedText: string[] = [];
+  let receivedSuccessResult = false;
+
+  // Dynamic steps list — grows as onAfterStep discovers more
+  const allSteps = [...initialSteps];
+  const stepStartTimes: number[] = [];
+  let completedStepCount = 0;
+
+  // Per-step compaction tracking (reset after each step)
+  let stepCompactions = 0;
+  let stepCompactionPreTokens: number[] = [];
+
+  // Previous cumulative values for delta computation
+  let prevCumulative = {
+    usage: {
+      input_tokens: 0,
+      output_tokens: 0,
+      cache_creation_input_tokens: 0,
+      cache_read_input_tokens: 0,
+    },
+    modelUsage: {} as Record<string, any>,
+    costUsd: 0,
+    durationMs: 0,
+    durationApiMs: 0,
+    numTurns: 0,
+  };
+
+  // Step completion synchronization: resolves with `true` on success, `false` on error
+  // eslint-disable-next-line @typescript-eslint/no-empty-function
+  let resolveStepDone: (success: boolean) => void = () => {};
+  function waitForStepDone(): Promise<boolean> {
+    return new Promise((resolve) => {
+      resolveStepDone = resolve;
+    });
+  }
+
+  // Final cleanup signal for SDK stdin workaround
+  let signalAllDone: () => void;
+  const allDone = new Promise<void>((resolve) => {
+    signalAllDone = resolve;
+  });
+
+  // Prompt stream generator — yields user messages for each step in order,
+  // pausing between steps to wait for the result and discover more steps.
+  const promptStream = async function* () {
+    let i = 0;
+    while (i < allSteps.length) {
+      const step = allSteps[i];
+      stepStartTimes[i] = Date.now();
+
+      logToFile(`Yielding benchmark step ${i + 1}: ${step.name}`);
+      spinner.stop(
+        `${chalk.cyan('[BENCHMARK]')} Starting step ${i + 1}/${
+          allSteps.length
+        }: ${chalk.bold(step.name)}`,
+      );
+      spinner.start(
+        `Running step ${i + 1}/${allSteps.length}: ${step.name}...`,
+      );
+
+      yield {
+        type: 'user',
+        session_id: '',
+        message: { role: 'user', content: step.prompt },
+        parent_tool_use_id: null,
+      };
+
+      // Wait for this step's result before yielding the next prompt
+      const success = await waitForStepDone();
+      if (!success) {
+        // Step failed — stop yielding, let the generator end
+        break;
+      }
+
+      // Discover more steps after this one completes
+      if (onAfterStep) {
+        const moreSteps = onAfterStep(i, step.name);
+        if (moreSteps.length > 0) {
+          allSteps.push(...moreSteps);
+          clack.log.info(
+            `${chalk.cyan('[BENCHMARK]')} Discovered ${
+              moreSteps.length
+            } more phases: ${moreSteps.map((s) => s.name).join(', ')}`,
+          );
+        }
+      }
+
+      i++;
+    }
+
+    // Keep generator alive for SDK cleanup (stdin workaround)
+    await allDone;
+  };
+
+  const allowedTools = [
+    'Read',
+    'Write',
+    'Edit',
+    'Glob',
+    'Grep',
+    'Bash',
+    'ListMcpResourcesTool',
+    'Skill',
+  ];
+
+  try {
+    const response = query({
+      prompt: promptStream(),
+      options: {
+        model: agentConfig.model,
+        cwd: agentConfig.workingDirectory,
+        permissionMode: 'acceptEdits',
+        mcpServers: agentConfig.mcpServers,
+        settingSources: ['project'],
+        allowedTools,
+        env: {
+          ...process.env,
+          ANTHROPIC_API_KEY: undefined,
+        },
+        canUseTool: (toolName: string, input: unknown) => {
+          logToFile('canUseTool called:', { toolName, input });
+          const result = wizardCanUseTool(
+            toolName,
+            input as Record<string, unknown>,
+          );
+          logToFile('canUseTool result:', result);
+          return Promise.resolve(result);
+        },
+        tools: { type: 'preset', preset: 'claude_code' },
+        stderr: (data: string) => {
+          logToFile('CLI stderr:', data);
+          if (options.debug) {
+            debug('CLI stderr:', data);
+          }
+        },
+      },
+    });
+
+    for await (const message of response) {
+      handleSDKMessage(
+        message,
+        options,
+        spinner,
+        collectedText,
+        receivedSuccessResult,
+      );
+
+      // Track compaction events from the SDK
+      if (message.type === 'system' && message.subtype === 'compact_boundary') {
+        const preTokens = message.compact_metadata?.pre_tokens ?? 0;
+        const trigger = message.compact_metadata?.trigger ?? 'unknown';
+        stepCompactions++;
+        stepCompactionPreTokens.push(preTokens);
+        logToFile(
+          `[COMPACTION] Context compacted (trigger: ${trigger}, pre_tokens: ${formatTokenCount(
+            preTokens,
+          )})`,
+        );
+        clack.log.info(
+          `${chalk.yellow('[COMPACTION]')} Context compacted during step "${
+            allSteps[completedStepCount]?.name
+          }" (trigger: ${trigger}, pre_tokens: ${formatTokenCount(preTokens)})`,
+        );
+      }
+
+      if (message.type === 'result') {
+        if (message.subtype === 'success' && !message.is_error) {
+          receivedSuccessResult = true;
+
+          const stepIndex = completedStepCount;
+          const stepDurationMs = Date.now() - stepStartTimes[stepIndex];
+
+          // Compute delta usage from cumulative SDK values
+          const modelUsageData = message.modelUsage ?? {};
+          const cumulativeUsage = sumModelUsage(modelUsageData);
+          const cumulativeCost = message.total_cost_usd ?? 0;
+          const cumulativeDuration = message.duration_ms ?? 0;
+          const cumulativeDurationApi = message.duration_api_ms ?? 0;
+          const cumulativeTurns = message.num_turns ?? 0;
+
+          const deltaUsage = {
+            input_tokens:
+              cumulativeUsage.input_tokens - prevCumulative.usage.input_tokens,
+            output_tokens:
+              cumulativeUsage.output_tokens -
+              prevCumulative.usage.output_tokens,
+            cache_creation_input_tokens:
+              cumulativeUsage.cache_creation_input_tokens -
+              prevCumulative.usage.cache_creation_input_tokens,
+            cache_read_input_tokens:
+              cumulativeUsage.cache_read_input_tokens -
+              prevCumulative.usage.cache_read_input_tokens,
+          };
+          const deltaCost = cumulativeCost - prevCumulative.costUsd;
+          // num_turns is per-response (not cumulative), so use directly
+          const stepTurns = cumulativeTurns;
+          const deltaDurationApi =
+            cumulativeDurationApi - prevCumulative.durationApiMs;
+          const deltaModelUsage = computeModelUsageDelta(
+            modelUsageData,
+            prevCumulative.modelUsage,
+          );
+
+          // Context size from the last API call's usage (not cumulative modelUsage).
+          // The last call's input represents the actual conversation window at that point.
+          const lastCallUsage = message.usage ?? {};
+          const contextTokensOut =
+            Number(lastCallUsage.input_tokens ?? 0) +
+            Number(lastCallUsage.cache_read_input_tokens ?? 0) +
+            Number(lastCallUsage.cache_creation_input_tokens ?? 0);
+          const contextTokensIn =
+            stepUsages.length > 0
+              ? stepUsages[stepUsages.length - 1].contextTokensOut
+              : 0;
+
+          stepUsages.push({
+            name: allSteps[stepIndex].name,
+            usage: deltaUsage,
+            modelUsage: deltaModelUsage,
+            totalCostUsd: deltaCost,
+            durationMs: stepDurationMs,
+            durationApiMs: deltaDurationApi,
+            numTurns: stepTurns,
+            contextTokensIn,
+            contextTokensOut,
+            ...(stepCompactions > 0 && {
+              compactions: stepCompactions,
+              compactionPreTokens: stepCompactionPreTokens,
+            }),
+          });
+
+          // Reset per-step compaction tracking
+          stepCompactions = 0;
+          stepCompactionPreTokens = [];
+
+          // Update cumulative tracking
+          prevCumulative = {
+            usage: cumulativeUsage,
+            modelUsage: modelUsageData,
+            costUsd: cumulativeCost,
+            durationMs: cumulativeDuration,
+            durationApiMs: cumulativeDurationApi,
+            numTurns: cumulativeTurns,
+          };
+
+          spinner.stop(
+            `${chalk.cyan('[BENCHMARK]')} Completed step ${stepIndex + 1}/${
+              allSteps.length
+            }: ${chalk.bold(allSteps[stepIndex].name)} ${chalk.dim(
+              `(${formatDuration(stepDurationMs)}, $${deltaCost.toFixed(
+                4,
+              )}, ${stepTurns} turns, ctx: ${formatTokenCount(
+                contextTokensIn,
+              )} → ${formatTokenCount(contextTokensOut)})`,
+            )}`,
+          );
+          logToFile(
+            `Step "${allSteps[stepIndex].name}" completed in ${Math.round(
+              stepDurationMs / 1000,
+            )}s`,
+          );
+
+          completedStepCount++;
+          resolveStepDone(true);
+        } else {
+          // Error result — signal generator to stop yielding
+          resolveStepDone(false);
+        }
+
+        // Signal generator cleanup when all done
+        if (completedStepCount >= allSteps.length) {
+          signalAllDone!();
+        }
+      }
+    }
+
+    // Check for error signals in collected output
+    const outputText = collectedText.join('\n');
+    if (outputText.includes(AgentSignals.ERROR_MCP_MISSING)) {
+      spinner.stop('Agent could not access PostHog MCP');
+      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+      writeBenchmarkData(benchmark);
+      return { error: AgentErrorType.MCP_MISSING, benchmark };
+    }
+    if (outputText.includes(AgentSignals.ERROR_RESOURCE_MISSING)) {
+      spinner.stop('Agent could not access setup resource');
+      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+      writeBenchmarkData(benchmark);
+      return { error: AgentErrorType.RESOURCE_MISSING, benchmark };
+    }
+    if (outputText.includes('API Error: 429')) {
+      spinner.stop('Rate limit exceeded');
+      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+      writeBenchmarkData(benchmark);
+      return {
+        error: AgentErrorType.RATE_LIMIT,
+        message: outputText,
+        benchmark,
+      };
+    }
+    if (outputText.includes('API Error:')) {
+      spinner.stop('API error occurred');
+      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+      writeBenchmarkData(benchmark);
+      return {
+        error: AgentErrorType.API_ERROR,
+        message: outputText,
+        benchmark,
+      };
+    }
+
+    const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+    writeBenchmarkData(benchmark);
+
+    const totalDurationSeconds = Math.round(
+      (Date.now() - overallStartTime) / 1000,
+    );
+    const totalCost = stepUsages.reduce((sum, s) => sum + s.totalCostUsd, 0);
+    clack.log.success(
+      `${chalk.cyan(
+        '[BENCHMARK]',
+      )} All ${completedStepCount} steps completed in ${formatDuration(
+        totalDurationSeconds * 1000,
+      )}, total cost: $${totalCost.toFixed(4)}`,
+    );
+    clack.log.info(
+      `${chalk.cyan('[BENCHMARK]')} Results written to ${BENCHMARK_FILE_PATH}`,
+    );
+    logToFile(
+      `All ${completedStepCount} benchmark steps completed in ${totalDurationSeconds}s`,
+    );
+
+    analytics.capture(WIZARD_INTERACTION_EVENT_NAME, {
+      action: 'agent integration completed',
+      duration_ms: Date.now() - overallStartTime,
+      duration_seconds: totalDurationSeconds,
+      benchmark_steps: completedStepCount,
+    });
+
+    spinner.stop(successMessage);
+    return { benchmark };
+  } catch (error) {
+    signalAllDone!();
+
+    if (receivedSuccessResult) {
+      logToFile('Ignoring post-completion error, agent completed successfully');
+      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+      writeBenchmarkData(benchmark);
+      spinner.stop(successMessage);
+      return { benchmark };
+    }
+
+    spinner.stop(errorMessage);
+    const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+    writeBenchmarkData(benchmark);
+
+    const outputText = collectedText.join('\n');
+    const apiErrorMatch = outputText.match(/API Error: [^\n]+/g);
+    const apiErrorMessage = apiErrorMatch
+      ? apiErrorMatch.join('\n')
+      : undefined;
+
+    if (outputText.includes('API Error: 429')) {
+      return {
+        error: AgentErrorType.RATE_LIMIT,
+        message: apiErrorMessage,
+        benchmark,
+      };
+    }
+    if (outputText.includes('API Error:')) {
+      return {
+        error: AgentErrorType.API_ERROR,
+        message: apiErrorMessage,
+        benchmark,
+      };
+    }
+
+    throw error;
+  }
+}
+
+/**
+ * Compute per-model usage deltas between current and previous cumulative modelUsage.
+ */
+function computeModelUsageDelta(
+  current: Record<string, any>,
+  previous: Record<string, any>,
+): Record<string, any> {
+  const result: Record<string, any> = {};
+  for (const [model, data] of Object.entries(current)) {
+    const prev = previous[model] ?? {};
+    result[model] = {
+      inputTokens: (data.inputTokens ?? 0) - (prev.inputTokens ?? 0),
+      outputTokens: (data.outputTokens ?? 0) - (prev.outputTokens ?? 0),
+      cacheReadInputTokens:
+        (data.cacheReadInputTokens ?? 0) - (prev.cacheReadInputTokens ?? 0),
+      cacheCreationInputTokens:
+        (data.cacheCreationInputTokens ?? 0) -
+        (prev.cacheCreationInputTokens ?? 0),
+      webSearchRequests:
+        (data.webSearchRequests ?? 0) - (prev.webSearchRequests ?? 0),
+      costUSD: (data.costUSD ?? 0) - (prev.costUSD ?? 0),
+      contextWindow: data.contextWindow ?? 0,
+    };
+  }
+  return result;
+}
+
+/**
+ * Build BenchmarkData from collected step usages.
+ */
+function buildBenchmarkData(
+  stepUsages: StepUsage[],
+  overallStartTime: number,
+): BenchmarkData {
+  return {
+    timestamp: new Date().toISOString(),
+    steps: stepUsages,
+    totals: {
+      totalCostUsd: stepUsages.reduce((sum, s) => sum + s.totalCostUsd, 0),
+      durationMs: Date.now() - overallStartTime,
+      inputTokens: stepUsages.reduce(
+        (sum, s) =>
+          sum +
+          s.usage.input_tokens +
+          s.usage.cache_read_input_tokens +
+          s.usage.cache_creation_input_tokens,
+        0,
+      ),
+      outputTokens: stepUsages.reduce(
+        (sum, s) => sum + s.usage.output_tokens,
+        0,
+      ),
+      numTurns: stepUsages.reduce((sum, s) => sum + s.numTurns, 0),
+    },
+  };
+}
+
 /**
  * Handle SDK messages and provide user feedback
  *
@@ -739,7 +1406,6 @@ function handleSDKMessage(
     }
 
     default:
-      // Log other message types for debugging
       if (options.debug) {
         debug(`Unhandled message type: ${message.type}`);
       }
diff --git a/src/lib/agent-runner.ts b/src/lib/agent-runner.ts
index d319df2..bd75599 100644
--- a/src/lib/agent-runner.ts
+++ b/src/lib/agent-runner.ts
@@ -22,9 +22,13 @@ import clack from '../utils/clack';
 import {
   initializeAgent,
   runAgent,
+  runAgentSteps,
   AgentSignals,
   AgentErrorType,
 } from './agent-interface';
+import { logToFile } from '../utils/debug';
+import fs from 'fs';
+import path from 'path';
 import { getCloudUrlFromRegion } from '../utils/urls';
 import chalk from 'chalk';
 import * as semver from 'semver';
@@ -193,18 +197,90 @@ export async function runAgentWizard(
     options,
   );
 
-  const agentResult = await runAgent(
-    agent,
-    integrationPrompt,
-    options,
-    spinner,
-    {
-      estimatedDurationMinutes: config.ui.estimatedDurationMinutes,
-      spinnerMessage: SPINNER_MESSAGE,
-      successMessage: config.ui.successMessage,
-      errorMessage: 'Integration failed',
-    },
-  );
+  const agentRunConfig = {
+    estimatedDurationMinutes: config.ui.estimatedDurationMinutes,
+    spinnerMessage: SPINNER_MESSAGE,
+    successMessage: config.ui.successMessage,
+    errorMessage: 'Integration failed',
+  };
+
+  let agentResult;
+
+  if (options.benchmark) {
+    clack.log.info(
+      `${chalk.cyan(
+        '[BENCHMARK]',
+      )} Running in benchmark mode — each workflow phase will be tracked separately`,
+    );
+
+    // Benchmark mode: run setup + workflow phases in a single conversation,
+    // with per-step tracking. Context is preserved across steps (identical to normal mode).
+    const additionalLines = config.prompts.getAdditionalContextLines
+      ? config.prompts.getAdditionalContextLines(frameworkContext)
+      : [];
+    const additionalContext =
+      additionalLines.length > 0
+        ? '\n' + additionalLines.map((line) => `- ${line}`).join('\n')
+        : '';
+
+    const projectContext = `Project context:
+- Framework: ${config.metadata.name} ${frameworkVersion || 'latest'}
+- TypeScript: ${typeScriptDetected ? 'Yes' : 'No'}
+- PostHog API Key: ${projectApiKey}
+- PostHog Host: ${host}${additionalContext}`;
+
+    const setupPrompt = `You have access to the PostHog MCP server which provides skills to integrate PostHog into this ${config.metadata.name} project.
+
+${projectContext}
+
+Instructions (follow these steps IN ORDER - do not skip or reorder):
+
+STEP 1: List available skills from the PostHog MCP server using ListMcpResourcesTool. If this tool is not available or you cannot access the MCP server, you must emit: ${AgentSignals.ERROR_MCP_MISSING} Could not access the PostHog MCP server and halt.
+
+   Review the skill descriptions and choose the one that best matches this project's framework and configuration.
+   If no suitable skill is found, or you cannot access the MCP server, you emit: ${AgentSignals.ERROR_RESOURCE_MISSING} Could not find a suitable skill for this project.
+
+STEP 2: Fetch the chosen skill resource (e.g., posthog://skills/{skill-id}).
+   The resource returns a shell command to install the skill.
+
+STEP 3: Run the installation command using Bash:
+   - Execute the EXACT command returned by the resource (do not modify it)
+   - This will download and extract the skill to .claude/skills/{skill-id}/
+
+STEP 4: Load the installed skill's SKILL.md file to understand what references are available.
+
+STEP 5: Set up environment variables for PostHog in a .env file with the API key and host provided above, using the appropriate naming convention for ${config.metadata.name}. Make sure to use these environment variables in the code files you create instead of hardcoding the API key and host.
+
+Important: Look for lockfiles (pnpm-lock.yaml, package-lock.json, yarn.lock, bun.lockb) to determine the package manager (excluding the contents of node_modules). Do not manually edit package.json. Always install packages as a background task. Don't await completion; proceed with other work immediately after starting the installation. You must read a file immediately before attempting to write it, even if you have previously read it; failure to do so will cause a tool failure.
+
+`;
+
+    const setupStep = { name: 'setup', prompt: setupPrompt };
+
+    // Run all steps in a single conversation. After the setup step installs the skill,
+    // onAfterStep discovers workflow files on disk and adds them as additional steps.
+    agentResult = await runAgentSteps(agent, [setupStep], options, spinner, {
+      ...agentRunConfig,
+      onAfterStep: (stepIndex, stepName) => {
+        if (stepName === 'setup') {
+          return discoverBenchmarkSteps(
+            options.installDir,
+            config,
+            projectContext,
+          );
+        }
+        return [];
+      },
+    });
+  } else {
+    agentResult = await runAgent(
+      agent,
+      integrationPrompt,
+      options,
+      spinner,
+      agentRunConfig,
+    );
+  }
 
   // Handle error cases detected in agent output
   if (agentResult.error === AgentErrorType.MCP_MISSING) {
@@ -412,3 +488,93 @@ Important: Look for lockfiles (pnpm-lock.yaml, package-lock.json, yarn.lock, bun
 
 `;
 }
+
+/**
+ * Discover installed skill workflow files and build benchmark step prompts.
+ * Scans .claude/skills/ in the install directory for workflow files (1.0-*, 1.1-*, etc.).
+ */
+function discoverBenchmarkSteps(
+  installDir: string,
+  config: FrameworkConfig,
+  projectContext: string,
+): Array<{ name: string; prompt: string }> {
+  const skillsDir = path.join(installDir, '.claude', 'skills');
+
+  if (!fs.existsSync(skillsDir)) {
+    logToFile('No .claude/skills/ directory found for benchmark discovery');
+    return [];
+  }
+
+  // Find installed skill directory
+  const skillDirs = fs.readdirSync(skillsDir).filter((entry) => {
+    const fullPath = path.join(skillsDir, entry);
+    return fs.statSync(fullPath).isDirectory();
+  });
+
+  if (skillDirs.length === 0) {
+    logToFile('No skill directories found in .claude/skills/');
+    return [];
+  }
+
+  // Use the first skill directory found
+  const skillId = skillDirs[0];
+  const skillPath = path.join(skillsDir, skillId);
+  logToFile(`Discovered skill for benchmark: ${skillId}`);
+
+  // Workflow files live in the references/ subdirectory
+  const referencesDir = path.join(skillPath, 'references');
+  if (!fs.existsSync(referencesDir)) {
+    logToFile('No references/ directory found in skill directory');
+    return [];
+  }
+
+  // Find workflow files matching pattern like "basic-integration-1.0-begin.md"
+  // The naming convention is {category}-{number}.{step}-{name}.md
+  const allFiles = fs.readdirSync(referencesDir);
+  const workflowFiles = allFiles
+    .filter((f) => /\d+\.\d+-\w+\.md$/.test(f))
+    .sort();
+
+  if (workflowFiles.length === 0) {
+    logToFile(
+      `No workflow files found in references/ directory. Files present: ${allFiles.join(
+        ', ',
+      )}`,
+    );
+    return [];
+  }
+
+  logToFile(
+    `Found ${workflowFiles.length} workflow files: ${workflowFiles.join(', ')}`,
+  );
+
+  const steps: Array<{ name: string; prompt: string }> = [];
+
+  for (const workflowFile of workflowFiles) {
+    // Extract phase name from filename
+    // e.g., "basic-integration-1.0-begin.md" -> "1.0-begin"
+    const phaseMatch = workflowFile.match(/(\d+\.\d+-.+)\.md$/);
+    const phaseName = phaseMatch
+      ? phaseMatch[1]
+      : workflowFile.replace(/\.md$/, '');
+
+    const prompt = `You are performing phase "${phaseName}" of a PostHog integration for a ${config.metadata.name} project.
+
+${projectContext}
+
+The PostHog skill is installed at .claude/skills/${skillId}/.
+Read SKILL.md in that directory for available reference files.
+
+Follow the instructions in the workflow file: .claude/skills/${skillId}/references/${workflowFile}
+
+Important: Read files before writing. Use environment variables, not hardcoded keys.
+Do not manually edit package.json. Use lockfiles to determine the package manager.
+You must read a file immediately before attempting to write it, even if you have previously read it; failure to do so will cause a tool failure.
+Always install packages as a background task. Don't await completion; proceed with other work immediately after starting the installation.
+`;
+
+    steps.push({ name: phaseName, prompt });
+  }
+
+  return steps;
+}
diff --git a/src/run.ts b/src/run.ts
index e5e5311..42cc336 100644
--- a/src/run.ts
+++ b/src/run.ts
@@ -27,6 +27,7 @@ type Args = {
   ci?: boolean;
   apiKey?: string;
   menu?: boolean;
+  benchmark?: boolean;
 };
 
 export async function runWizard(argv: Args) {
@@ -57,6 +58,7 @@ export async function runWizard(argv: Args) {
     ci: finalArgs.ci ?? false,
     apiKey: finalArgs.apiKey,
     menu: finalArgs.menu ?? false,
+    benchmark: finalArgs.benchmark ?? false,
   };
 
   clack.intro(`Welcome to the PostHog setup wizard ✨`);
diff --git a/src/utils/types.ts b/src/utils/types.ts
index 7379156..5c0b8d6 100644
--- a/src/utils/types.ts
+++ b/src/utils/types.ts
@@ -60,6 +60,13 @@ export type WizardOptions = {
    * Whether to show the menu for manual integration selection instead of auto-detecting.
    */
   menu: boolean;
+
+  /**
+   * Whether to run in benchmark mode with per-phase token tracking.
+   * When enabled, the wizard runs each workflow phase as a separate agent call
+   * and writes detailed usage data to /tmp/posthog-wizard-benchmark.json.
+   */
+  benchmark: boolean;
 };
 
 export interface Feature {

From 9731e3d76e1412b88243df7b7df3208ed96bb171 Mon Sep 17 00:00:00 2001
From: "Vincent (Wen Yu) Ge" <gewenyu99@gmail.com>
Date: Wed, 11 Feb 2026 16:21:18 -0500
Subject: [PATCH 2/2] Clean up

---
 src/lib/agent-interface.ts | 662 +------------------------------------
 src/lib/agent-runner.ts    | 178 +---------
 src/lib/benchmark.ts       | 371 +++++++++++++++++++++
 3 files changed, 385 insertions(+), 826 deletions(-)
 create mode 100644 src/lib/benchmark.ts

diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts
index a06acfd..f5310f9 100644
--- a/src/lib/agent-interface.ts
+++ b/src/lib/agent-interface.ts
@@ -3,9 +3,7 @@
  * Uses Claude Agent SDK directly with PostHog LLM gateway
  */
 
-import fs from 'fs';
 import path from 'path';
-import chalk from 'chalk';
 import clack from '../utils/clack';
 import { debug, logToFile, initLogFile, LOG_FILE_PATH } from '../utils/debug';
 import type { WizardOptions } from '../utils/types';
@@ -16,6 +14,8 @@ import {
 } from './constants';
 import { getLlmGatewayUrlFromHost } from '../utils/urls';
 import { LINTING_TOOLS } from './safe-tools';
+import type { BenchmarkData } from './benchmark';
+import { BenchmarkTracker } from './benchmark';
 
 // Dynamic import cache for ESM module
 let _sdkModule: any = null;
@@ -86,43 +86,6 @@ type AgentRunConfig = {
   model: string;
 };
 
-export const BENCHMARK_FILE_PATH = '/tmp/posthog-wizard-benchmark.json';
-
-export interface StepUsage {
-  name: string;
-  usage: {
-    input_tokens: number;
-    output_tokens: number;
-    cache_creation_input_tokens: number;
-    cache_read_input_tokens: number;
-  };
-  modelUsage: Record<string, unknown>;
-  totalCostUsd: number;
-  durationMs: number;
-  durationApiMs: number;
-  numTurns: number;
-  /** Conversation context size (tokens) entering this step */
-  contextTokensIn: number;
-  /** Conversation context size (tokens) exiting this step */
-  contextTokensOut: number;
-  /** Number of auto-compactions that occurred during this step */
-  compactions?: number;
-  /** Token count before each compaction (from SDK compact_boundary messages) */
-  compactionPreTokens?: number[];
-}
-
-export interface BenchmarkData {
-  timestamp: string;
-  steps: StepUsage[];
-  totals: {
-    totalCostUsd: number;
-    durationMs: number;
-    inputTokens: number;
-    outputTokens: number;
-    numTurns: number;
-  };
-}
-
 /**
  * Package managers that can be used to run commands.
  */
@@ -437,6 +400,9 @@ export async function runAgent(
     `This whole process should take about ${estimatedDurationMinutes} minutes including error checking and fixes.\n\nGrab some coffee!`,
   );
 
+  // Create benchmark tracker before spinner starts so its log output is visible
+  const tracker = options.benchmark ? new BenchmarkTracker(spinner) : null;
+
   spinner.start(spinnerMessage);
 
   const cliPath = getClaudeCodeExecutablePath();
@@ -515,17 +481,7 @@ export async function runAgent(
     });
     spinner.stop(successMessage);
 
-    // Write benchmark data from the single-query result if available
-    let benchmark: BenchmarkData | undefined;
-    if (resultMessage && options.benchmark) {
-      benchmark = extractBenchmarkFromResult(
-        'single-run',
-        resultMessage,
-        durationMs,
-      );
-      writeBenchmarkData(benchmark);
-    }
-
+    const benchmark = tracker?.finalize(resultMessage, durationMs);
     return { benchmark };
   };
 
@@ -622,6 +578,7 @@ export async function runAgent(
         collectedText,
         receivedSuccessResult,
       );
+      tracker?.onMessage(message);
 
       // Signal completion when result received
       if (message.type === 'result') {
@@ -706,611 +663,6 @@ export async function runAgent(
   }
 }
 
-/**
- * Format milliseconds into a human-readable duration string (e.g., "2m 34s", "45s").
- */
-function formatDuration(ms: number): string {
-  const totalSeconds = Math.round(ms / 1000);
-  const minutes = Math.floor(totalSeconds / 60);
-  const seconds = totalSeconds % 60;
-  if (minutes > 0) {
-    return `${minutes}m ${seconds}s`;
-  }
-  return `${seconds}s`;
-}
-
-/**
- * Format token count into a human-readable string (e.g., "1.2M", "345K", "1,234").
- */
-function formatTokenCount(tokens: number): string {
-  if (tokens >= 1_000_000) {
-    return `${(tokens / 1_000_000).toFixed(1)}M`;
-  }
-  if (tokens >= 10_000) {
-    return `${Math.round(tokens / 1000)}K`;
-  }
-  return tokens.toLocaleString();
-}
-
-/**
- * Sum token usage across all models from the SDK's modelUsage field.
- * The top-level `usage` field only has the last API call's tokens;
- * `modelUsage` has the accurate per-model aggregates (camelCase fields).
- */
-function sumModelUsage(modelUsage: Record<string, any>): {
-  input_tokens: number;
-  output_tokens: number;
-  cache_creation_input_tokens: number;
-  cache_read_input_tokens: number;
-} {
-  let input_tokens = 0;
-  let output_tokens = 0;
-  let cache_creation_input_tokens = 0;
-  let cache_read_input_tokens = 0;
-
-  for (const model of Object.values(modelUsage)) {
-    input_tokens += model.inputTokens ?? 0;
-    output_tokens += model.outputTokens ?? 0;
-    cache_creation_input_tokens += model.cacheCreationInputTokens ?? 0;
-    cache_read_input_tokens += model.cacheReadInputTokens ?? 0;
-  }
-
-  return {
-    input_tokens,
-    output_tokens,
-    cache_creation_input_tokens,
-    cache_read_input_tokens,
-  };
-}
-
-/**
- * Extract benchmark data from a single SDK result message.
- */
-function extractBenchmarkFromResult(
-  stepName: string,
-  message: SDKMessage,
-  wallDurationMs: number,
-): BenchmarkData {
-  const modelUsage = message.modelUsage ?? {};
-  const usage = sumModelUsage(modelUsage);
-  const lastCallUsage = message.usage ?? {};
-  const contextTokensOut =
-    Number(lastCallUsage.input_tokens ?? 0) +
-    Number(lastCallUsage.cache_read_input_tokens ?? 0) +
-    Number(lastCallUsage.cache_creation_input_tokens ?? 0);
-  const step: StepUsage = {
-    name: stepName,
-    usage,
-    modelUsage,
-    totalCostUsd: message.total_cost_usd ?? 0,
-    durationMs: message.duration_ms ?? wallDurationMs,
-    durationApiMs: message.duration_api_ms ?? 0,
-    numTurns: message.num_turns ?? 0,
-    contextTokensIn: 0,
-    contextTokensOut,
-  };
-
-  return {
-    timestamp: new Date().toISOString(),
-    steps: [step],
-    totals: {
-      totalCostUsd: step.totalCostUsd,
-      durationMs: step.durationMs,
-      inputTokens: step.usage.input_tokens,
-      outputTokens: step.usage.output_tokens,
-      numTurns: step.numTurns,
-    },
-  };
-}
-
-/**
- * Write benchmark data to the benchmark file.
- */
-function writeBenchmarkData(data: BenchmarkData): void {
-  try {
-    fs.writeFileSync(BENCHMARK_FILE_PATH, JSON.stringify(data, null, 2));
-    logToFile(`Benchmark data written to ${BENCHMARK_FILE_PATH}`);
-  } catch (error) {
-    logToFile('Failed to write benchmark data:', error);
-  }
-}
-
-/**
- * Execute multiple agent steps in a single conversation with per-step usage tracking.
- * Uses one query() call with multiple user messages, so conversation context is preserved
- * across steps (identical behavior to normal non-benchmark mode).
- *
- * Steps can be discovered dynamically via the onAfterStep callback — e.g., after the
- * setup step installs a skill, onAfterStep discovers the workflow files and returns
- * them as additional steps to run in the same conversation.
- *
- * Per-step usage is computed as deltas between consecutive SDK result messages.
- *
- * Writes benchmark data to BENCHMARK_FILE_PATH when all steps complete.
- */
-export async function runAgentSteps(
-  agentConfig: AgentRunConfig,
-  initialSteps: Array<{ name: string; prompt: string }>,
-  options: WizardOptions,
-  spinner: ReturnType<typeof clack.spinner>,
-  config?: {
-    estimatedDurationMinutes?: number;
-    spinnerMessage?: string;
-    successMessage?: string;
-    errorMessage?: string;
-    /** Called after each step completes. Return additional steps to append to the queue. */
-    onAfterStep?: (
-      stepIndex: number,
-      stepName: string,
-    ) => Array<{ name: string; prompt: string }>;
-  },
-): Promise<{
-  error?: AgentErrorType;
-  message?: string;
-  benchmark?: BenchmarkData;
-}> {
-  const {
-    estimatedDurationMinutes = 8,
-    spinnerMessage = 'Customizing your PostHog setup...',
-    successMessage = 'PostHog integration complete',
-    errorMessage = 'Integration failed',
-    onAfterStep,
-  } = config ?? {};
-
-  const { query } = await getSDKModule();
-
-  clack.log.step(
-    `This whole process should take about ${estimatedDurationMinutes} minutes including error checking and fixes.\n\nGrab some coffee!`,
-  );
-  clack.log.info(`${chalk.cyan('[BENCHMARK]')} Verbose logs: ${LOG_FILE_PATH}`);
-  clack.log.info(
-    `${chalk.cyan(
-      '[BENCHMARK]',
-    )} Benchmark data will be written to: ${BENCHMARK_FILE_PATH}`,
-  );
-
-  spinner.start(spinnerMessage);
-
-  const overallStartTime = Date.now();
-  const stepUsages: StepUsage[] = [];
-  const collectedText: string[] = [];
-  let receivedSuccessResult = false;
-
-  // Dynamic steps list — grows as onAfterStep discovers more
-  const allSteps = [...initialSteps];
-  const stepStartTimes: number[] = [];
-  let completedStepCount = 0;
-
-  // Per-step compaction tracking (reset after each step)
-  let stepCompactions = 0;
-  let stepCompactionPreTokens: number[] = [];
-
-  // Previous cumulative values for delta computation
-  let prevCumulative = {
-    usage: {
-      input_tokens: 0,
-      output_tokens: 0,
-      cache_creation_input_tokens: 0,
-      cache_read_input_tokens: 0,
-    },
-    modelUsage: {} as Record<string, any>,
-    costUsd: 0,
-    durationMs: 0,
-    durationApiMs: 0,
-    numTurns: 0,
-  };
-
-  // Step completion synchronization: resolves with `true` on success, `false` on error
-  // eslint-disable-next-line @typescript-eslint/no-empty-function
-  let resolveStepDone: (success: boolean) => void = () => {};
-  function waitForStepDone(): Promise<boolean> {
-    return new Promise((resolve) => {
-      resolveStepDone = resolve;
-    });
-  }
-
-  // Final cleanup signal for SDK stdin workaround
-  let signalAllDone: () => void;
-  const allDone = new Promise<void>((resolve) => {
-    signalAllDone = resolve;
-  });
-
-  // Prompt stream generator — yields user messages for each step in order,
-  // pausing between steps to wait for the result and discover more steps.
-  const promptStream = async function* () {
-    let i = 0;
-    while (i < allSteps.length) {
-      const step = allSteps[i];
-      stepStartTimes[i] = Date.now();
-
-      logToFile(`Yielding benchmark step ${i + 1}: ${step.name}`);
-      spinner.stop(
-        `${chalk.cyan('[BENCHMARK]')} Starting step ${i + 1}/${
-          allSteps.length
-        }: ${chalk.bold(step.name)}`,
-      );
-      spinner.start(
-        `Running step ${i + 1}/${allSteps.length}: ${step.name}...`,
-      );
-
-      yield {
-        type: 'user',
-        session_id: '',
-        message: { role: 'user', content: step.prompt },
-        parent_tool_use_id: null,
-      };
-
-      // Wait for this step's result before yielding the next prompt
-      const success = await waitForStepDone();
-      if (!success) {
-        // Step failed — stop yielding, let the generator end
-        break;
-      }
-
-      // Discover more steps after this one completes
-      if (onAfterStep) {
-        const moreSteps = onAfterStep(i, step.name);
-        if (moreSteps.length > 0) {
-          allSteps.push(...moreSteps);
-          clack.log.info(
-            `${chalk.cyan('[BENCHMARK]')} Discovered ${
-              moreSteps.length
-            } more phases: ${moreSteps.map((s) => s.name).join(', ')}`,
-          );
-        }
-      }
-
-      i++;
-    }
-
-    // Keep generator alive for SDK cleanup (stdin workaround)
-    await allDone;
-  };
-
-  const allowedTools = [
-    'Read',
-    'Write',
-    'Edit',
-    'Glob',
-    'Grep',
-    'Bash',
-    'ListMcpResourcesTool',
-    'Skill',
-  ];
-
-  try {
-    const response = query({
-      prompt: promptStream(),
-      options: {
-        model: agentConfig.model,
-        cwd: agentConfig.workingDirectory,
-        permissionMode: 'acceptEdits',
-        mcpServers: agentConfig.mcpServers,
-        settingSources: ['project'],
-        allowedTools,
-        env: {
-          ...process.env,
-          ANTHROPIC_API_KEY: undefined,
-        },
-        canUseTool: (toolName: string, input: unknown) => {
-          logToFile('canUseTool called:', { toolName, input });
-          const result = wizardCanUseTool(
-            toolName,
-            input as Record<string, unknown>,
-          );
-          logToFile('canUseTool result:', result);
-          return Promise.resolve(result);
-        },
-        tools: { type: 'preset', preset: 'claude_code' },
-        stderr: (data: string) => {
-          logToFile('CLI stderr:', data);
-          if (options.debug) {
-            debug('CLI stderr:', data);
-          }
-        },
-      },
-    });
-
-    for await (const message of response) {
-      handleSDKMessage(
-        message,
-        options,
-        spinner,
-        collectedText,
-        receivedSuccessResult,
-      );
-
-      // Track compaction events from the SDK
-      if (message.type === 'system' && message.subtype === 'compact_boundary') {
-        const preTokens = message.compact_metadata?.pre_tokens ?? 0;
-        const trigger = message.compact_metadata?.trigger ?? 'unknown';
-        stepCompactions++;
-        stepCompactionPreTokens.push(preTokens);
-        logToFile(
-          `[COMPACTION] Context compacted (trigger: ${trigger}, pre_tokens: ${formatTokenCount(
-            preTokens,
-          )})`,
-        );
-        clack.log.info(
-          `${chalk.yellow('[COMPACTION]')} Context compacted during step "${
-            allSteps[completedStepCount]?.name
-          }" (trigger: ${trigger}, pre_tokens: ${formatTokenCount(preTokens)})`,
-        );
-      }
-
-      if (message.type === 'result') {
-        if (message.subtype === 'success' && !message.is_error) {
-          receivedSuccessResult = true;
-
-          const stepIndex = completedStepCount;
-          const stepDurationMs = Date.now() - stepStartTimes[stepIndex];
-
-          // Compute delta usage from cumulative SDK values
-          const modelUsageData = message.modelUsage ?? {};
-          const cumulativeUsage = sumModelUsage(modelUsageData);
-          const cumulativeCost = message.total_cost_usd ?? 0;
-          const cumulativeDuration = message.duration_ms ?? 0;
-          const cumulativeDurationApi = message.duration_api_ms ?? 0;
-          const cumulativeTurns = message.num_turns ?? 0;
-
-          const deltaUsage = {
-            input_tokens:
-              cumulativeUsage.input_tokens - prevCumulative.usage.input_tokens,
-            output_tokens:
-              cumulativeUsage.output_tokens -
-              prevCumulative.usage.output_tokens,
-            cache_creation_input_tokens:
-              cumulativeUsage.cache_creation_input_tokens -
-              prevCumulative.usage.cache_creation_input_tokens,
-            cache_read_input_tokens:
-              cumulativeUsage.cache_read_input_tokens -
-              prevCumulative.usage.cache_read_input_tokens,
-          };
-          const deltaCost = cumulativeCost - prevCumulative.costUsd;
-          // num_turns is per-response (not cumulative), so use directly
-          const stepTurns = cumulativeTurns;
-          const deltaDurationApi =
-            cumulativeDurationApi - prevCumulative.durationApiMs;
-          const deltaModelUsage = computeModelUsageDelta(
-            modelUsageData,
-            prevCumulative.modelUsage,
-          );
-
-          // Context size from the last API call's usage (not cumulative modelUsage).
-          // The last call's input represents the actual conversation window at that point.
-          const lastCallUsage = message.usage ?? {};
-          const contextTokensOut =
-            Number(lastCallUsage.input_tokens ?? 0) +
-            Number(lastCallUsage.cache_read_input_tokens ?? 0) +
-            Number(lastCallUsage.cache_creation_input_tokens ?? 0);
-          const contextTokensIn =
-            stepUsages.length > 0
-              ? stepUsages[stepUsages.length - 1].contextTokensOut
-              : 0;
-
-          stepUsages.push({
-            name: allSteps[stepIndex].name,
-            usage: deltaUsage,
-            modelUsage: deltaModelUsage,
-            totalCostUsd: deltaCost,
-            durationMs: stepDurationMs,
-            durationApiMs: deltaDurationApi,
-            numTurns: stepTurns,
-            contextTokensIn,
-            contextTokensOut,
-            ...(stepCompactions > 0 && {
-              compactions: stepCompactions,
-              compactionPreTokens: stepCompactionPreTokens,
-            }),
-          });
-
-          // Reset per-step compaction tracking
-          stepCompactions = 0;
-          stepCompactionPreTokens = [];
-
-          // Update cumulative tracking
-          prevCumulative = {
-            usage: cumulativeUsage,
-            modelUsage: modelUsageData,
-            costUsd: cumulativeCost,
-            durationMs: cumulativeDuration,
-            durationApiMs: cumulativeDurationApi,
-            numTurns: cumulativeTurns,
-          };
-
-          spinner.stop(
-            `${chalk.cyan('[BENCHMARK]')} Completed step ${stepIndex + 1}/${
-              allSteps.length
-            }: ${chalk.bold(allSteps[stepIndex].name)} ${chalk.dim(
-              `(${formatDuration(stepDurationMs)}, $${deltaCost.toFixed(
-                4,
-              )}, ${stepTurns} turns, ctx: ${formatTokenCount(
-                contextTokensIn,
-              )} → ${formatTokenCount(contextTokensOut)})`,
-            )}`,
-          );
-          logToFile(
-            `Step "${allSteps[stepIndex].name}" completed in ${Math.round(
-              stepDurationMs / 1000,
-            )}s`,
-          );
-
-          completedStepCount++;
-          resolveStepDone(true);
-        } else {
-          // Error result — signal generator to stop yielding
-          resolveStepDone(false);
-        }
-
-        // Signal generator cleanup when all done
-        if (completedStepCount >= allSteps.length) {
-          signalAllDone!();
-        }
-      }
-    }
-
-    // Check for error signals in collected output
-    const outputText = collectedText.join('\n');
-    if (outputText.includes(AgentSignals.ERROR_MCP_MISSING)) {
-      spinner.stop('Agent could not access PostHog MCP');
-      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
-      writeBenchmarkData(benchmark);
-      return { error: AgentErrorType.MCP_MISSING, benchmark };
-    }
-    if (outputText.includes(AgentSignals.ERROR_RESOURCE_MISSING)) {
-      spinner.stop('Agent could not access setup resource');
-      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
-      writeBenchmarkData(benchmark);
-      return { error: AgentErrorType.RESOURCE_MISSING, benchmark };
-    }
-    if (outputText.includes('API Error: 429')) {
-      spinner.stop('Rate limit exceeded');
-      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
-      writeBenchmarkData(benchmark);
-      return {
-        error: AgentErrorType.RATE_LIMIT,
-        message: outputText,
-        benchmark,
-      };
-    }
-    if (outputText.includes('API Error:')) {
-      spinner.stop('API error occurred');
-      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
-      writeBenchmarkData(benchmark);
-      return {
-        error: AgentErrorType.API_ERROR,
-        message: outputText,
-        benchmark,
-      };
-    }
-
-    const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
-    writeBenchmarkData(benchmark);
-
-    const totalDurationSeconds = Math.round(
-      (Date.now() - overallStartTime) / 1000,
-    );
-    const totalCost = stepUsages.reduce((sum, s) => sum + s.totalCostUsd, 0);
-    clack.log.success(
-      `${chalk.cyan(
-        '[BENCHMARK]',
-      )} All ${completedStepCount} steps completed in ${formatDuration(
-        totalDurationSeconds * 1000,
-      )}, total cost: $${totalCost.toFixed(4)}`,
-    );
-    clack.log.info(
-      `${chalk.cyan('[BENCHMARK]')} Results written to ${BENCHMARK_FILE_PATH}`,
-    );
-    logToFile(
-      `All ${completedStepCount} benchmark steps completed in ${totalDurationSeconds}s`,
-    );
-
-    analytics.capture(WIZARD_INTERACTION_EVENT_NAME, {
-      action: 'agent integration completed',
-      duration_ms: Date.now() - overallStartTime,
-      duration_seconds: totalDurationSeconds,
-      benchmark_steps: completedStepCount,
-    });
-
-    spinner.stop(successMessage);
-    return { benchmark };
-  } catch (error) {
-    signalAllDone!();
-
-    if (receivedSuccessResult) {
-      logToFile('Ignoring post-completion error, agent completed successfully');
-      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
-      writeBenchmarkData(benchmark);
-      spinner.stop(successMessage);
-      return { benchmark };
-    }
-
-    spinner.stop(errorMessage);
-    const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
-    writeBenchmarkData(benchmark);
-
-    const outputText = collectedText.join('\n');
-    const apiErrorMatch = outputText.match(/API Error: [^\n]+/g);
-    const apiErrorMessage = apiErrorMatch
-      ? apiErrorMatch.join('\n')
-      : undefined;
-
-    if (outputText.includes('API Error: 429')) {
-      return {
-        error: AgentErrorType.RATE_LIMIT,
-        message: apiErrorMessage,
-        benchmark,
-      };
-    }
-    if (outputText.includes('API Error:')) {
-      return {
-        error: AgentErrorType.API_ERROR,
-        message: apiErrorMessage,
-        benchmark,
-      };
-    }
-
-    throw error;
-  }
-}
-
-/**
- * Compute per-model usage deltas between current and previous cumulative modelUsage.
- */
-function computeModelUsageDelta(
-  current: Record<string, any>,
-  previous: Record<string, any>,
-): Record<string, any> {
-  const result: Record<string, any> = {};
-  for (const [model, data] of Object.entries(current)) {
-    const prev = previous[model] ?? {};
-    result[model] = {
-      inputTokens: (data.inputTokens ?? 0) - (prev.inputTokens ?? 0),
-      outputTokens: (data.outputTokens ?? 0) - (prev.outputTokens ?? 0),
-      cacheReadInputTokens:
-        (data.cacheReadInputTokens ?? 0) - (prev.cacheReadInputTokens ?? 0),
-      cacheCreationInputTokens:
-        (data.cacheCreationInputTokens ?? 0) -
-        (prev.cacheCreationInputTokens ?? 0),
-      webSearchRequests:
-        (data.webSearchRequests ?? 0) - (prev.webSearchRequests ?? 0),
-      costUSD: (data.costUSD ?? 0) - (prev.costUSD ?? 0),
-      contextWindow: data.contextWindow ?? 0,
-    };
-  }
-  return result;
-}
-
-/**
- * Build BenchmarkData from collected step usages.
- */
-function buildBenchmarkData(
-  stepUsages: StepUsage[],
-  overallStartTime: number,
-): BenchmarkData {
-  return {
-    timestamp: new Date().toISOString(),
-    steps: stepUsages,
-    totals: {
-      totalCostUsd: stepUsages.reduce((sum, s) => sum + s.totalCostUsd, 0),
-      durationMs: Date.now() - overallStartTime,
-      inputTokens: stepUsages.reduce(
-        (sum, s) =>
-          sum +
-          s.usage.input_tokens +
-          s.usage.cache_read_input_tokens +
-          s.usage.cache_creation_input_tokens,
-        0,
-      ),
-      outputTokens: stepUsages.reduce(
-        (sum, s) => sum + s.usage.output_tokens,
-        0,
-      ),
-      numTurns: stepUsages.reduce((sum, s) => sum + s.numTurns, 0),
-    },
-  };
-}
-
 /**
  * Handle SDK messages and provide user feedback
  *
diff --git a/src/lib/agent-runner.ts b/src/lib/agent-runner.ts
index bd75599..483f6e0 100644
--- a/src/lib/agent-runner.ts
+++ b/src/lib/agent-runner.ts
@@ -22,13 +22,9 @@ import clack from '../utils/clack';
 import {
   initializeAgent,
   runAgent,
-  runAgentSteps,
   AgentSignals,
   AgentErrorType,
 } from './agent-interface';
-import { logToFile } from '../utils/debug';
-import fs from 'fs';
-import path from 'path';
 import { getCloudUrlFromRegion } from '../utils/urls';
 import chalk from 'chalk';
 import * as semver from 'semver';
@@ -204,83 +200,13 @@ export async function runAgentWizard(
     errorMessage: 'Integration failed',
   };
 
-  let agentResult;
-
-  if (options.benchmark) {
-    clack.log.info(
-      `${chalk.cyan(
-        '[BENCHMARK]',
-      )} Running in benchmark mode — each workflow phase will be tracked separately`,
-    );
-
-    // Benchmark mode: run setup + workflow phases in a single conversation,
-    // with per-step tracking. Context is preserved across steps (identical to normal mode).
-    const additionalLines = config.prompts.getAdditionalContextLines
-      ? config.prompts.getAdditionalContextLines(frameworkContext)
-      : [];
-    const additionalContext =
-      additionalLines.length > 0
-        ? '\n' + additionalLines.map((line) => `- ${line}`).join('\n')
-        : '';
-
-    const projectContext = `Project context:
-- Framework: ${config.metadata.name} ${frameworkVersion || 'latest'}
-- TypeScript: ${typeScriptDetected ? 'Yes' : 'No'}
-- PostHog API Key: ${projectApiKey}
-- PostHog Host: ${host}${additionalContext}`;
-
-    const setupPrompt = `You have access to the PostHog MCP server which provides skills to integrate PostHog into this ${config.metadata.name} project.
-
-${projectContext}
-
-Instructions (follow these steps IN ORDER - do not skip or reorder):
-
-STEP 1: List available skills from the PostHog MCP server using ListMcpResourcesTool. If this tool is not available or you cannot access the MCP server, you must emit: ${AgentSignals.ERROR_MCP_MISSING} Could not access the PostHog MCP server and halt.
-
-   Review the skill descriptions and choose the one that best matches this project's framework and configuration.
-   If no suitable skill is found, or you cannot access the MCP server, you emit: ${AgentSignals.ERROR_RESOURCE_MISSING} Could not find a suitable skill for this project.
-
-STEP 2: Fetch the chosen skill resource (e.g., posthog://skills/{skill-id}).
-   The resource returns a shell command to install the skill.
-
-STEP 3: Run the installation command using Bash:
-   - Execute the EXACT command returned by the resource (do not modify it)
-   - This will download and extract the skill to .claude/skills/{skill-id}/
-
-STEP 4: Load the installed skill's SKILL.md file to understand what references are available.
-
-STEP 5: Set up environment variables for PostHog in a .env file with the API key and host provided above, using the appropriate naming convention for ${config.metadata.name}. Make sure to use these environment variables in the code files you create instead of hardcoding the API key and host.
-
-Important: Look for lockfiles (pnpm-lock.yaml, package-lock.json, yarn.lock, bun.lockb) to determine the package manager (excluding the contents of node_modules). Do not manually edit package.json. Always install packages as a background task. Don't await completion; proceed with other work immediately after starting the installation. You must read a file immediately before attempting to write it, even if you have previously read it; failure to do so will cause a tool failure.
-
-`;
-
-    const setupStep = { name: 'setup', prompt: setupPrompt };
-
-    // Run all steps in a single conversation. After the setup step installs the skill,
-    // onAfterStep discovers workflow files on disk and adds them as additional steps.
-    agentResult = await runAgentSteps(agent, [setupStep], options, spinner, {
-      ...agentRunConfig,
-      onAfterStep: (stepIndex, stepName) => {
-        if (stepName === 'setup') {
-          return discoverBenchmarkSteps(
-            options.installDir,
-            config,
-            projectContext,
-          );
-        }
-        return [];
-      },
-    });
-  } else {
-    agentResult = await runAgent(
-      agent,
-      integrationPrompt,
-      options,
-      spinner,
-      agentRunConfig,
-    );
-  }
+  const agentResult = await runAgent(
+    agent,
+    integrationPrompt,
+    options,
+    spinner,
+    agentRunConfig,
+  );
 
   // Handle error cases detected in agent output
   if (agentResult.error === AgentErrorType.MCP_MISSING) {
@@ -488,93 +414,3 @@ Important: Look for lockfiles (pnpm-lock.yaml, package-lock.json, yarn.lock, bun
 
 `;
 }
-
-/**
- * Discover installed skill workflow files and build benchmark step prompts.
- * Scans .claude/skills/ in the install directory for workflow files (1.0-*, 1.1-*, etc.).
- */
-function discoverBenchmarkSteps(
-  installDir: string,
-  config: FrameworkConfig,
-  projectContext: string,
-): Array<{ name: string; prompt: string }> {
-  const skillsDir = path.join(installDir, '.claude', 'skills');
-
-  if (!fs.existsSync(skillsDir)) {
-    logToFile('No .claude/skills/ directory found for benchmark discovery');
-    return [];
-  }
-
-  // Find installed skill directory
-  const skillDirs = fs.readdirSync(skillsDir).filter((entry) => {
-    const fullPath = path.join(skillsDir, entry);
-    return fs.statSync(fullPath).isDirectory();
-  });
-
-  if (skillDirs.length === 0) {
-    logToFile('No skill directories found in .claude/skills/');
-    return [];
-  }
-
-  // Use the first skill directory found
-  const skillId = skillDirs[0];
-  const skillPath = path.join(skillsDir, skillId);
-  logToFile(`Discovered skill for benchmark: ${skillId}`);
-
-  // Workflow files live in the references/ subdirectory
-  const referencesDir = path.join(skillPath, 'references');
-  if (!fs.existsSync(referencesDir)) {
-    logToFile('No references/ directory found in skill directory');
-    return [];
-  }
-
-  // Find workflow files matching pattern like "basic-integration-1.0-begin.md"
-  // The naming convention is {category}-{number}.{step}-{name}.md
-  const allFiles = fs.readdirSync(referencesDir);
-  const workflowFiles = allFiles
-    .filter((f) => /\d+\.\d+-\w+\.md$/.test(f))
-    .sort();
-
-  if (workflowFiles.length === 0) {
-    logToFile(
-      `No workflow files found in references/ directory. Files present: ${allFiles.join(
-        ', ',
-      )}`,
-    );
-    return [];
-  }
-
-  logToFile(
-    `Found ${workflowFiles.length} workflow files: ${workflowFiles.join(', ')}`,
-  );
-
-  const steps: Array<{ name: string; prompt: string }> = [];
-
-  for (const workflowFile of workflowFiles) {
-    // Extract phase name from filename
-    // e.g., "basic-integration-1.0-begin.md" -> "1.0-begin"
-    const phaseMatch = workflowFile.match(/(\d+\.\d+-.+)\.md$/);
-    const phaseName = phaseMatch
-      ? phaseMatch[1]
-      : workflowFile.replace(/\.md$/, '');
-
-    const prompt = `You are performing phase "${phaseName}" of a PostHog integration for a ${config.metadata.name} project.
-
-${projectContext}
-
-The PostHog skill is installed at .claude/skills/${skillId}/.
-Read SKILL.md in that directory for available reference files.
-
-Follow the instructions in the workflow file: .claude/skills/${skillId}/references/${workflowFile}
-
-Important: Read files before writing. Use environment variables, not hardcoded keys.
-Do not manually edit package.json. Use lockfiles to determine the package manager.
-You must read a file immediately before attempting to write it, even if you have previously read it; failure to do so will cause a tool failure.
-Always install packages as a background task. Don't await completion; proceed with other work immediately after starting the installation.
-`;
-
-    steps.push({ name: phaseName, prompt });
-  }
-
-  return steps;
-}
diff --git a/src/lib/benchmark.ts b/src/lib/benchmark.ts
new file mode 100644
index 0000000..278ed22
--- /dev/null
+++ b/src/lib/benchmark.ts
@@ -0,0 +1,371 @@
+/**
+ * Benchmark tracking for wizard runs.
+ *
+ * Detects workflow phase transitions by watching for Read tool calls on
+ * workflow files (e.g., basic-integration-1.0-begin.md). Tracks per-phase
+ * timing, turns, and token usage. Writes results to a JSON file for CI.
+ *
+ * Usage in runAgent():
+ *   const tracker = options.benchmark ? new BenchmarkTracker() : null;
+ *   // in message loop:
+ *   tracker?.onMessage(message);
+ *   // on success:
+ *   const benchmark = tracker?.finalize(resultMessage, durationMs);
+ */
+
+import fs from 'fs';
+import chalk from 'chalk';
+import clack from '../utils/clack';
+import { logToFile, LOG_FILE_PATH } from '../utils/debug';
+
+export const BENCHMARK_FILE_PATH = '/tmp/posthog-wizard-benchmark.json';
+
+// ── Types ──────────────────────────────────────────────────────────────
+
+export interface StepUsage {
+  name: string;
+  usage: {
+    input_tokens: number;
+    output_tokens: number;
+    cache_creation_input_tokens: number;
+    cache_read_input_tokens: number;
+  };
+  modelUsage: Record<string, unknown>;
+  totalCostUsd: number;
+  durationMs: number;
+  durationApiMs: number;
+  numTurns: number;
+  contextTokensIn?: number;
+  contextTokensOut?: number;
+  compactions?: number;
+  compactionPreTokens?: number[];
+}
+
+export interface BenchmarkData {
+  timestamp: string;
+  steps: StepUsage[];
+  totals: {
+    totalCostUsd: number;
+    durationMs: number;
+    inputTokens: number;
+    outputTokens: number;
+    numTurns: number;
+  };
+}
+
+// ── Formatting helpers ─────────────────────────────────────────────────
+
+function formatDuration(ms: number): string {
+  const totalSeconds = Math.round(ms / 1000);
+  const minutes = Math.floor(totalSeconds / 60);
+  const seconds = totalSeconds % 60;
+  if (minutes > 0) return `${minutes}m ${seconds}s`;
+  return `${seconds}s`;
+}
+
+function formatTokenCount(tokens: number): string {
+  if (tokens >= 1_000_000) return `${(tokens / 1_000_000).toFixed(1)}M`;
+  if (tokens >= 10_000) return `${Math.round(tokens / 1000)}K`;
+  return tokens.toLocaleString();
+}
+
+// ── Internal helpers ───────────────────────────────────────────────────
+
+/**
+ * Sum token usage across all models from the SDK's modelUsage field.
+ * modelUsage has per-model aggregates with camelCase field names.
+ */
+function sumModelUsage(modelUsage: Record<string, any>): {
+  input_tokens: number;
+  output_tokens: number;
+  cache_creation_input_tokens: number;
+  cache_read_input_tokens: number;
+} {
+  let input_tokens = 0;
+  let output_tokens = 0;
+  let cache_creation_input_tokens = 0;
+  let cache_read_input_tokens = 0;
+
+  for (const model of Object.values(modelUsage)) {
+    input_tokens += model.inputTokens ?? 0;
+    output_tokens += model.outputTokens ?? 0;
+    cache_creation_input_tokens += model.cacheCreationInputTokens ?? 0;
+    cache_read_input_tokens += model.cacheReadInputTokens ?? 0;
+  }
+
+  return {
+    input_tokens,
+    output_tokens,
+    cache_creation_input_tokens,
+    cache_read_input_tokens,
+  };
+}
+
+/** Regex to detect workflow file references: matches "1.0-begin" from file paths/text */
+const WORKFLOW_FILE_RE = /(\d+\.\d+-[a-z]+)(?:\.md)?/;
+
+// ── BenchmarkTracker ───────────────────────────────────────────────────
+
+interface PhaseRecord {
+  name: string;
+  startTime: number;
+  endTime: number;
+  turns: number;
+  inputTokens: number;
+  outputTokens: number;
+  compactions: number;
+  compactionPreTokens: number[];
+}
+
+/**
+ * Observes the SDK message stream and tracks per-phase metrics.
+ *
+ * Phase transitions are detected by watching for Read tool calls on
+ * workflow files (matching the pattern `*-1.0-begin.md`). Everything
+ * before the first workflow file is tracked as the "setup" phase.
+ */
+export class BenchmarkTracker {
+  private spinner: ReturnType<typeof clack.spinner>;
+  private phases: PhaseRecord[] = [];
+  private currentPhase = 'setup';
+  private phaseStartTime: number;
+  private phaseTurns = 0;
+  private phaseInputTokens = 0;
+  private phaseOutputTokens = 0;
+  private phaseCompactions = 0;
+  private phaseCompactionPreTokens: number[] = [];
+  private seenPhases = new Set<string>();
+
+  constructor(spinner: ReturnType<typeof clack.spinner>) {
+    this.spinner = spinner;
+    this.phaseStartTime = Date.now();
+    clack.log.info(
+      `${chalk.cyan('[BENCHMARK]')} Verbose logs: ${LOG_FILE_PATH}`,
+    );
+    clack.log.info(
+      `${chalk.cyan(
+        '[BENCHMARK]',
+      )} Benchmark data will be written to: ${BENCHMARK_FILE_PATH}`,
+    );
+    clack.log.info(
+      `${chalk.cyan('[BENCHMARK]')} Starting phase: ${chalk.bold('setup')}`,
+    );
+    logToFile('[BENCHMARK] Starting phase: setup');
+  }
+
+  /**
+   * Feed every SDK message into the tracker.
+   */
+  onMessage(message: any): void {
+    if (message.type === 'assistant') {
+      this.phaseTurns++;
+
+      // Accumulate per-turn token usage if the API response includes it
+      const usage = message.message?.usage;
+      if (usage) {
+        this.phaseInputTokens += usage.input_tokens ?? 0;
+        this.phaseOutputTokens += usage.output_tokens ?? 0;
+      }
+
+      // Scan all content blocks for workflow phase references
+      const content = message.message?.content;
+      if (Array.isArray(content)) {
+        for (const block of content) {
+          // Detect from text content (agent mentioning/quoting workflow files)
+          if (block.type === 'text' && typeof block.text === 'string') {
+            this.detectPhaseFromText(block.text);
+          }
+          // Detect from tool_use blocks (Read tool on workflow files)
+          if (block.type === 'tool_use') {
+            const filePath = block.input?.file_path ?? block.input?.path ?? '';
+            if (typeof filePath === 'string') {
+              this.detectPhaseFromText(filePath);
+            }
+          }
+        }
+      }
+    }
+
+    // Track compaction events (SDK compact_boundary messages)
+    if (message.type === 'system' && message.subtype === 'compact_boundary') {
+      const preTokens = message.compact_metadata?.pre_tokens ?? 0;
+      const trigger = message.compact_metadata?.trigger ?? 'unknown';
+      this.phaseCompactions++;
+      this.phaseCompactionPreTokens.push(preTokens);
+      logToFile(
+        `[BENCHMARK] [COMPACTION] Context compacted during "${
+          this.currentPhase
+        }" (trigger: ${trigger}, pre_tokens: ${formatTokenCount(preTokens)})`,
+      );
+      clack.log.info(
+        `${chalk.yellow('[COMPACTION]')} Context compacted during "${
+          this.currentPhase
+        }" (trigger: ${trigger}, pre_tokens: ${formatTokenCount(preTokens)})`,
+      );
+    }
+  }
+
+  private detectPhaseFromText(text: string): void {
+    const match = text.match(WORKFLOW_FILE_RE);
+    if (match && !this.seenPhases.has(match[1])) {
+      this.transitionTo(match[1]);
+    }
+  }
+
+  /**
+   * Close tracking and build the final BenchmarkData.
+   * Call this when the agent result message is received.
+   */
+  finalize(resultMessage: any, totalDurationMs: number): BenchmarkData {
+    // Close the current (last) phase
+    this.closeCurrentPhase();
+
+    // Build per-phase StepUsage from tracked phases + aggregate from result
+    const modelUsage = resultMessage?.modelUsage ?? {};
+    const aggregateUsage = sumModelUsage(modelUsage);
+    const lastCallUsage = resultMessage?.usage ?? {};
+    const contextTokensOut =
+      Number(lastCallUsage.input_tokens ?? 0) +
+      Number(lastCallUsage.cache_read_input_tokens ?? 0) +
+      Number(lastCallUsage.cache_creation_input_tokens ?? 0);
+
+    const totalTurns = this.phases.reduce((s, p) => s + p.turns, 0);
+    const totalCost = resultMessage?.total_cost_usd ?? 0;
+
+    const steps: StepUsage[] = this.phases.map((phase) => ({
+      name: phase.name,
+      // Per-phase token usage from assistant message usage fields
+      usage: {
+        input_tokens: phase.inputTokens,
+        output_tokens: phase.outputTokens,
+        cache_creation_input_tokens: 0,
+        cache_read_input_tokens: 0,
+      },
+      modelUsage: {},
+      // Proportional cost estimate based on turns
+      totalCostUsd: totalTurns > 0 ? totalCost * (phase.turns / totalTurns) : 0,
+      durationMs: phase.endTime - phase.startTime,
+      durationApiMs: 0,
+      numTurns: phase.turns,
+      ...(phase.compactions > 0 && {
+        compactions: phase.compactions,
+        compactionPreTokens: phase.compactionPreTokens,
+      }),
+    }));
+
+    // Stamp context size on the last step
+    if (steps.length > 0) {
+      steps[steps.length - 1].contextTokensOut = contextTokensOut;
+    }
+
+    const benchmark: BenchmarkData = {
+      timestamp: new Date().toISOString(),
+      steps,
+      totals: {
+        totalCostUsd: totalCost,
+        durationMs: totalDurationMs,
+        inputTokens:
+          aggregateUsage.input_tokens +
+          aggregateUsage.cache_read_input_tokens +
+          aggregateUsage.cache_creation_input_tokens,
+        outputTokens: aggregateUsage.output_tokens,
+        numTurns: resultMessage?.num_turns ?? totalTurns,
+      },
+    };
+
+    // Log summary
+    const totalDurationStr = formatDuration(totalDurationMs);
+    clack.log.success(
+      `${chalk.cyan('[BENCHMARK]')} ${
+        this.phases.length
+      } phases completed in ${totalDurationStr}, cost: $${totalCost.toFixed(
+        4,
+      )}`,
+    );
+    clack.log.info(
+      `${chalk.cyan('[BENCHMARK]')} Results written to ${BENCHMARK_FILE_PATH}`,
+    );
+
+    writeBenchmarkData(benchmark);
+    return benchmark;
+  }
+
+  // ── Private ────────────────────────────────────────────────────────
+
+  private transitionTo(newPhase: string): void {
+    // Stop spinner so log output is visible
+    this.spinner.stop(
+      `${chalk.cyan('[BENCHMARK]')} Completed phase: ${chalk.bold(
+        this.currentPhase,
+      )} ${chalk.dim(`(${this.formatPhaseStats()})`)}`,
+    );
+    this.closeCurrentPhase();
+
+    this.seenPhases.add(newPhase);
+    this.currentPhase = newPhase;
+    this.phaseStartTime = Date.now();
+    this.phaseTurns = 0;
+    this.phaseInputTokens = 0;
+    this.phaseOutputTokens = 0;
+    this.phaseCompactions = 0;
+    this.phaseCompactionPreTokens = [];
+
+    clack.log.info(
+      `${chalk.cyan('[BENCHMARK]')} Starting phase: ${chalk.bold(newPhase)}`,
+    );
+    logToFile(`[BENCHMARK] Starting phase: ${newPhase}`);
+
+    // Restart spinner
+    this.spinner.start(`Integrating PostHog (${newPhase})...`);
+  }
+
+  private closeCurrentPhase(): void {
+    const now = Date.now();
+
+    this.phases.push({
+      name: this.currentPhase,
+      startTime: this.phaseStartTime,
+      endTime: now,
+      turns: this.phaseTurns,
+      inputTokens: this.phaseInputTokens,
+      outputTokens: this.phaseOutputTokens,
+      compactions: this.phaseCompactions,
+      compactionPreTokens: [...this.phaseCompactionPreTokens],
+    });
+
+    logToFile(
+      `[BENCHMARK] Completed phase: ${
+        this.currentPhase
+      } (${this.formatPhaseStats()})`,
+    );
+  }
+
+  private formatPhaseStats(): string {
+    const duration = Date.now() - this.phaseStartTime;
+    const parts = [formatDuration(duration), `${this.phaseTurns} turns`];
+    if (this.phaseInputTokens > 0 || this.phaseOutputTokens > 0) {
+      parts.push(
+        `in: ${formatTokenCount(this.phaseInputTokens)}`,
+        `out: ${formatTokenCount(this.phaseOutputTokens)}`,
+      );
+    }
+    if (this.phaseCompactions > 0) {
+      parts.push(`${this.phaseCompactions} compaction(s)`);
+    }
+    return parts.join(', ');
+  }
+}
+
+// ── File I/O ───────────────────────────────────────────────────────────
+
+/**
+ * Write benchmark data to the benchmark file.
+ */
+export function writeBenchmarkData(data: BenchmarkData): void {
+  try {
+    fs.writeFileSync(BENCHMARK_FILE_PATH, JSON.stringify(data, null, 2));
+    logToFile(`Benchmark data written to ${BENCHMARK_FILE_PATH}`);
+  } catch (error) {
+    logToFile('Failed to write benchmark data:', error);
+  }
+}