diff --git a/bin.ts b/bin.ts
index 7c82ae0..e8e1341 100644
--- a/bin.ts
+++ b/bin.ts
@@ -115,6 +115,12 @@ yargs(hideBin(process.argv))
         'Show menu for manual integration selection instead of auto-detecting\nenv: POSTHOG_WIZARD_MENU',
       type: 'boolean',
     },
+    benchmark: {
+      default: false,
+      describe:
+        'Run in benchmark mode with per-phase token tracking\nenv: POSTHOG_WIZARD_BENCHMARK',
+      type: 'boolean',
+    },
   });
 },
 (argv) => {
diff --git a/src/lib/__tests__/agent-interface.test.ts b/src/lib/__tests__/agent-interface.test.ts
index beed455..d11a257 100644
--- a/src/lib/__tests__/agent-interface.test.ts
+++ b/src/lib/__tests__/agent-interface.test.ts
@@ -32,6 +32,7 @@ describe('runAgent', () => {
     localMcp: false,
     ci: false,
     menu: false,
+    benchmark: false,
   };
 
   const defaultAgentConfig = {
diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts
index 2c315fe..a06acfd 100644
--- a/src/lib/agent-interface.ts
+++ b/src/lib/agent-interface.ts
@@ -3,7 +3,9 @@
  * Uses Claude Agent SDK directly with PostHog LLM gateway
  */
 
+import fs from 'fs';
 import path from 'path';
+import chalk from 'chalk';
 import clack from '../utils/clack';
 import { debug, logToFile, initLogFile, LOG_FILE_PATH } from '../utils/debug';
 import type { WizardOptions } from '../utils/types';
@@ -84,6 +86,43 @@ type AgentRunConfig = {
   model: string;
 };
 
+export const BENCHMARK_FILE_PATH = '/tmp/posthog-wizard-benchmark.json';
+
+export interface StepUsage {
+  name: string;
+  usage: {
+    input_tokens: number;
+    output_tokens: number;
+    cache_creation_input_tokens: number;
+    cache_read_input_tokens: number;
+  };
+  modelUsage: Record<string, Record<string, number>>;
+  totalCostUsd: number;
+  durationMs: number;
+  durationApiMs: number;
+  numTurns: number;
+  /** Conversation context size (tokens) entering this step */
+  contextTokensIn: number;
+  /** Conversation context size (tokens) exiting this step */
+  contextTokensOut: number;
+  /** Number of auto-compactions that occurred during this step */
+  compactions?: number;
+  /** Token count before each compaction (from SDK compact_boundary messages) */
+  compactionPreTokens?: number[];
+}
+
+export interface BenchmarkData {
+  timestamp: string;
+  steps: StepUsage[];
+  totals: {
+    totalCostUsd: number;
+    durationMs: number;
+    inputTokens: number;
+    outputTokens: number;
+    numTurns: number;
+  };
+}
+
 /**
  * Package managers that can be used to run commands.
  */
@@ -380,7 +419,11 @@ export async function runAgent(
     successMessage?: string;
     errorMessage?: string;
   },
-): Promise<{ error?: AgentErrorType; message?: string }> {
+): Promise<{
+  error?: AgentErrorType;
+  message?: string;
+  benchmark?: BenchmarkData;
+}> {
   const {
     estimatedDurationMinutes = 8,
     spinnerMessage = 'Customizing your PostHog setup...',
@@ -405,6 +448,8 @@
   const collectedText: string[] = [];
   // Track if we received a successful result (before any cleanup errors)
   let receivedSuccessResult = false;
+  // Track the result message for benchmark data extraction
+  let resultMessage: SDKMessage | null = null;
 
   // Workaround for SDK bug: stdin closes before canUseTool responses can be sent.
   // The fix is to use an async generator for the prompt that stays open until
@@ -429,7 +474,11 @@
   // Helper to handle successful completion (used in normal path and race condition recovery)
   const completeWithSuccess = (
     suppressedError?: Error,
-  ): { error?: AgentErrorType; message?: string } => {
+  ): {
+    error?: AgentErrorType;
+    message?: string;
+    benchmark?: BenchmarkData;
+  } => {
     const durationMs = Date.now() - startTime;
     const durationSeconds = Math.round(durationMs / 1000);
 
@@ -465,7 +514,19 @@
       duration_seconds: durationSeconds,
     });
     spinner.stop(successMessage);
-    return {};
+
+    // Write benchmark data from the single-query result if available
+    let benchmark: BenchmarkData | undefined;
+    if (resultMessage && options.benchmark) {
+      benchmark = extractBenchmarkFromResult(
+        'single-run',
+        resultMessage,
+        durationMs,
+      );
+      writeBenchmarkData(benchmark);
+    }
+
+    return { benchmark };
   };
 
   try {
@@ -568,6 +629,7 @@
         // The SDK may emit a second error result during cleanup due to a race condition
         if (message.subtype === 'success' && !message.is_error) {
           receivedSuccessResult = true;
+          resultMessage = message;
         }
         signalDone!();
       }
@@ -644,6 +706,611 @@
   }
 }
 
+/**
+ * Format milliseconds into a human-readable duration string (e.g., "2m 34s", "45s").
+ */
+function formatDuration(ms: number): string {
+  const totalSeconds = Math.round(ms / 1000);
+  const minutes = Math.floor(totalSeconds / 60);
+  const seconds = totalSeconds % 60;
+  if (minutes > 0) {
+    return `${minutes}m ${seconds}s`;
+  }
+  return `${seconds}s`;
+}
+
+/**
+ * Format a token count into a human-readable string (e.g., "1.2M", "345K", "1,234").
+ */
+function formatTokenCount(tokens: number): string {
+  if (tokens >= 1_000_000) {
+    return `${(tokens / 1_000_000).toFixed(1)}M`;
+  }
+  if (tokens >= 10_000) {
+    return `${Math.round(tokens / 1000)}K`;
+  }
+  return tokens.toLocaleString();
+}
+
+/**
+ * Sum token usage across all models from the SDK's modelUsage field.
+ * The top-level `usage` field only has the last API call's tokens;
+ * `modelUsage` has the accurate per-model aggregates (camelCase fields).
+ */
+function sumModelUsage(modelUsage: Record<string, Record<string, number>>): {
+  input_tokens: number;
+  output_tokens: number;
+  cache_creation_input_tokens: number;
+  cache_read_input_tokens: number;
+} {
+  let input_tokens = 0;
+  let output_tokens = 0;
+  let cache_creation_input_tokens = 0;
+  let cache_read_input_tokens = 0;
+
+  for (const model of Object.values(modelUsage)) {
+    input_tokens += model.inputTokens ?? 0;
+    output_tokens += model.outputTokens ?? 0;
+    cache_creation_input_tokens += model.cacheCreationInputTokens ?? 0;
+    cache_read_input_tokens += model.cacheReadInputTokens ?? 0;
+  }
+
+  return {
+    input_tokens,
+    output_tokens,
+    cache_creation_input_tokens,
+    cache_read_input_tokens,
+  };
+}
+
+/**
+ * Extract benchmark data from a single SDK result message.
+ */
+function extractBenchmarkFromResult(
+  stepName: string,
+  message: SDKMessage,
+  wallDurationMs: number,
+): BenchmarkData {
+  const modelUsage = message.modelUsage ?? {};
+  const usage = sumModelUsage(modelUsage);
+  const lastCallUsage = message.usage ?? {};
+  const contextTokensOut =
+    Number(lastCallUsage.input_tokens ?? 0) +
+    Number(lastCallUsage.cache_read_input_tokens ?? 0) +
+    Number(lastCallUsage.cache_creation_input_tokens ?? 0);
+  const step: StepUsage = {
+    name: stepName,
+    usage,
+    modelUsage,
+    totalCostUsd: message.total_cost_usd ?? 0,
+    durationMs: message.duration_ms ?? wallDurationMs,
+    durationApiMs: message.duration_api_ms ?? 0,
+    numTurns: message.num_turns ?? 0,
+    contextTokensIn: 0,
+    contextTokensOut,
+  };
+
+  return {
+    timestamp: new Date().toISOString(),
+    steps: [step],
+    totals: {
+      totalCostUsd: step.totalCostUsd,
+      durationMs: step.durationMs,
+      inputTokens: step.usage.input_tokens,
+      outputTokens: step.usage.output_tokens,
+      numTurns: step.numTurns,
+    },
+  };
+}
+
+/**
+ * Write benchmark data to the benchmark file.
+ */
+function writeBenchmarkData(data: BenchmarkData): void {
+  try {
+    fs.writeFileSync(BENCHMARK_FILE_PATH, JSON.stringify(data, null, 2));
+    logToFile(`Benchmark data written to ${BENCHMARK_FILE_PATH}`);
+  } catch (error) {
+    logToFile('Failed to write benchmark data:', error);
+  }
+}
+
+/**
+ * Execute multiple agent steps in a single conversation with per-step usage tracking.
+ * Uses one query() call with multiple user messages, so conversation context is preserved
+ * across steps (identical behavior to normal non-benchmark mode).
+ *
+ * Steps can be discovered dynamically via the onAfterStep callback — e.g., after the
+ * setup step installs a skill, onAfterStep discovers the workflow files and returns
+ * them as additional steps to run in the same conversation.
+ *
+ * Per-step usage is computed as deltas between consecutive SDK result messages.
+ *
+ * Writes benchmark data to BENCHMARK_FILE_PATH when all steps complete.
+ */
+export async function runAgentSteps(
+  agentConfig: AgentRunConfig,
+  initialSteps: Array<{ name: string; prompt: string }>,
+  options: WizardOptions,
+  spinner: ReturnType<typeof clack.spinner>,
+  config?: {
+    estimatedDurationMinutes?: number;
+    spinnerMessage?: string;
+    successMessage?: string;
+    errorMessage?: string;
+    /** Called after each step completes. Return additional steps to append to the queue. */
+    onAfterStep?: (
+      stepIndex: number,
+      stepName: string,
+    ) => Array<{ name: string; prompt: string }>;
+  },
+): Promise<{
+  error?: AgentErrorType;
+  message?: string;
+  benchmark?: BenchmarkData;
+}> {
+  const {
+    estimatedDurationMinutes = 8,
+    spinnerMessage = 'Customizing your PostHog setup...',
+    successMessage = 'PostHog integration complete',
+    errorMessage = 'Integration failed',
+    onAfterStep,
+  } = config ?? {};
+
+  const { query } = await getSDKModule();
+
+  clack.log.step(
+    `This whole process should take about ${estimatedDurationMinutes} minutes including error checking and fixes.\n\nGrab some coffee!`,
+  );
+  clack.log.info(`${chalk.cyan('[BENCHMARK]')} Verbose logs: ${LOG_FILE_PATH}`);
+  clack.log.info(
+    `${chalk.cyan(
+      '[BENCHMARK]',
+    )} Benchmark data will be written to: ${BENCHMARK_FILE_PATH}`,
+  );
+
+  spinner.start(spinnerMessage);
+
+  const overallStartTime = Date.now();
+  const stepUsages: StepUsage[] = [];
+  const collectedText: string[] = [];
+  let receivedSuccessResult = false;
+
+  // Dynamic steps list — grows as onAfterStep discovers more
+  const allSteps = [...initialSteps];
+  const stepStartTimes: number[] = [];
+  let completedStepCount = 0;
+
+  // Per-step compaction tracking (reset after each step)
+  let stepCompactions = 0;
+  let stepCompactionPreTokens: number[] = [];
+
+  // Previous cumulative values for delta computation
+  let prevCumulative = {
+    usage: {
+      input_tokens: 0,
+      output_tokens: 0,
+      cache_creation_input_tokens: 0,
+      cache_read_input_tokens: 0,
+    },
+    modelUsage: {} as Record<string, Record<string, number>>,
+    costUsd: 0,
+    durationMs: 0,
+    durationApiMs: 0,
+    numTurns: 0,
+  };
+
+  // Step completion synchronization: resolves with `true` on success, `false` on error
+  // eslint-disable-next-line @typescript-eslint/no-empty-function
+  let resolveStepDone: (success: boolean) => void = () => {};
+  function waitForStepDone(): Promise<boolean> {
+    return new Promise((resolve) => {
+      resolveStepDone = resolve;
+    });
+  }
+
+  // Final cleanup signal for SDK stdin workaround
+  let signalAllDone: () => void;
+  const allDone = new Promise<void>((resolve) => {
+    signalAllDone = resolve;
+  });
+
+  // Prompt stream generator — yields user messages for each step in order,
+  // pausing between steps to wait for the result and discover more steps.
+  const promptStream = async function* () {
+    let i = 0;
+    while (i < allSteps.length) {
+      const step = allSteps[i];
+      stepStartTimes[i] = Date.now();
+
+      logToFile(`Yielding benchmark step ${i + 1}: ${step.name}`);
+      spinner.stop(
+        `${chalk.cyan('[BENCHMARK]')} Starting step ${i + 1}/${
+          allSteps.length
+        }: ${chalk.bold(step.name)}`,
+      );
+      spinner.start(
+        `Running step ${i + 1}/${allSteps.length}: ${step.name}...`,
+      );
+
+      yield {
+        type: 'user',
+        session_id: '',
+        message: { role: 'user', content: step.prompt },
+        parent_tool_use_id: null,
+      };
+
+      // Wait for this step's result before yielding the next prompt
+      const success = await waitForStepDone();
+      if (!success) {
+        // Step failed — stop yielding, let the generator end
+        break;
+      }
+
+      // Discover more steps after this one completes
+      if (onAfterStep) {
+        const moreSteps = onAfterStep(i, step.name);
+        if (moreSteps.length > 0) {
+          allSteps.push(...moreSteps);
+          clack.log.info(
+            `${chalk.cyan('[BENCHMARK]')} Discovered ${
+              moreSteps.length
+            } more phases: ${moreSteps.map((s) => s.name).join(', ')}`,
+          );
+        }
+      }
+
+      i++;
+    }
+
+    // Keep generator alive for SDK cleanup (stdin workaround)
+    await allDone;
+  };
+
+  const allowedTools = [
+    'Read',
+    'Write',
+    'Edit',
+    'Glob',
+    'Grep',
+    'Bash',
+    'ListMcpResourcesTool',
+    'Skill',
+  ];
+
+  try {
+    const response = query({
+      prompt: promptStream(),
+      options: {
+        model: agentConfig.model,
+        cwd: agentConfig.workingDirectory,
+        permissionMode: 'acceptEdits',
+        mcpServers: agentConfig.mcpServers,
+        settingSources: ['project'],
+        allowedTools,
+        env: {
+          ...process.env,
+          ANTHROPIC_API_KEY: undefined,
+        },
+        canUseTool: (toolName: string, input: unknown) => {
+          logToFile('canUseTool called:', { toolName, input });
+          const result = wizardCanUseTool(
+            toolName,
+            input as Record<string, unknown>,
+          );
+          logToFile('canUseTool result:', result);
+          return Promise.resolve(result);
+        },
+        tools: { type: 'preset', preset: 'claude_code' },
+        stderr: (data: string) => {
+          logToFile('CLI stderr:', data);
+          if (options.debug) {
+            debug('CLI stderr:', data);
+          }
+        },
+      },
+    });
+
+    for await (const message of response) {
+      handleSDKMessage(
+        message,
+        options,
+        spinner,
+        collectedText,
+        receivedSuccessResult,
+      );
+
+      // Track compaction events from the SDK
+      if (message.type === 'system' && message.subtype === 'compact_boundary') {
+        const preTokens = message.compact_metadata?.pre_tokens ?? 0;
+        const trigger = message.compact_metadata?.trigger ?? 'unknown';
+        stepCompactions++;
+        stepCompactionPreTokens.push(preTokens);
+        logToFile(
+          `[COMPACTION] Context compacted (trigger: ${trigger}, pre_tokens: ${formatTokenCount(
+            preTokens,
+          )})`,
+        );
+        clack.log.info(
+          `${chalk.yellow('[COMPACTION]')} Context compacted during step "${
+            allSteps[completedStepCount]?.name
+          }" (trigger: ${trigger}, pre_tokens: ${formatTokenCount(preTokens)})`,
+        );
+      }
+
+      if (message.type === 'result') {
+        if (message.subtype === 'success' && !message.is_error) {
+          receivedSuccessResult = true;
+
+          const stepIndex = completedStepCount;
+          const stepDurationMs = Date.now() - stepStartTimes[stepIndex];
+
+          // Compute delta usage from cumulative SDK values
+          const modelUsageData = message.modelUsage ?? {};
+          const cumulativeUsage = sumModelUsage(modelUsageData);
+          const cumulativeCost = message.total_cost_usd ?? 0;
+          const cumulativeDuration = message.duration_ms ?? 0;
+          const cumulativeDurationApi = message.duration_api_ms ?? 0;
+          const cumulativeTurns = message.num_turns ?? 0;
+
+          const deltaUsage = {
+            input_tokens:
+              cumulativeUsage.input_tokens - prevCumulative.usage.input_tokens,
+            output_tokens:
+              cumulativeUsage.output_tokens -
+              prevCumulative.usage.output_tokens,
+            cache_creation_input_tokens:
+              cumulativeUsage.cache_creation_input_tokens -
+              prevCumulative.usage.cache_creation_input_tokens,
+            cache_read_input_tokens:
+              cumulativeUsage.cache_read_input_tokens -
+              prevCumulative.usage.cache_read_input_tokens,
+          };
+          const deltaCost = cumulativeCost - prevCumulative.costUsd;
+          // num_turns is per-response (not cumulative), so use directly
+          const stepTurns = cumulativeTurns;
+          const deltaDurationApi =
+            cumulativeDurationApi - prevCumulative.durationApiMs;
+          const deltaModelUsage = computeModelUsageDelta(
+            modelUsageData,
+            prevCumulative.modelUsage,
+          );
+
+          // Context size from the last API call's usage (not cumulative modelUsage).
+          // The last call's input represents the actual conversation window at that point.
+          const lastCallUsage = message.usage ?? {};
+          const contextTokensOut =
+            Number(lastCallUsage.input_tokens ?? 0) +
+            Number(lastCallUsage.cache_read_input_tokens ?? 0) +
+            Number(lastCallUsage.cache_creation_input_tokens ?? 0);
+          const contextTokensIn =
+            stepUsages.length > 0
+              ? stepUsages[stepUsages.length - 1].contextTokensOut
+              : 0;
+
+          stepUsages.push({
+            name: allSteps[stepIndex].name,
+            usage: deltaUsage,
+            modelUsage: deltaModelUsage,
+            totalCostUsd: deltaCost,
+            durationMs: stepDurationMs,
+            durationApiMs: deltaDurationApi,
+            numTurns: stepTurns,
+            contextTokensIn,
+            contextTokensOut,
+            ...(stepCompactions > 0 && {
+              compactions: stepCompactions,
+              compactionPreTokens: stepCompactionPreTokens,
+            }),
+          });
+
+          // Reset per-step compaction tracking
+          stepCompactions = 0;
+          stepCompactionPreTokens = [];
+
+          // Update cumulative tracking
+          prevCumulative = {
+            usage: cumulativeUsage,
+            modelUsage: modelUsageData,
+            costUsd: cumulativeCost,
+            durationMs: cumulativeDuration,
+            durationApiMs: cumulativeDurationApi,
+            numTurns: cumulativeTurns,
+          };
+
+          spinner.stop(
+            `${chalk.cyan('[BENCHMARK]')} Completed step ${stepIndex + 1}/${
+              allSteps.length
+            }: ${chalk.bold(allSteps[stepIndex].name)} ${chalk.dim(
+              `(${formatDuration(stepDurationMs)}, $${deltaCost.toFixed(
+                4,
+              )}, ${stepTurns} turns, ctx: ${formatTokenCount(
+                contextTokensIn,
+              )} → ${formatTokenCount(contextTokensOut)})`,
+            )}`,
+          );
+          logToFile(
+            `Step "${allSteps[stepIndex].name}" completed in ${Math.round(
+              stepDurationMs / 1000,
+            )}s`,
+          );
+
+          completedStepCount++;
+          resolveStepDone(true);
+        } else {
+          // Error result — signal generator to stop yielding
+          resolveStepDone(false);
+        }
+
+        // Signal generator cleanup when all done
+        if (completedStepCount >= allSteps.length) {
+          signalAllDone!();
+        }
+      }
+    }
+
+    // Check for error signals in collected output
+    const outputText = collectedText.join('\n');
+    if (outputText.includes(AgentSignals.ERROR_MCP_MISSING)) {
+      spinner.stop('Agent could not access PostHog MCP');
+      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+      writeBenchmarkData(benchmark);
+      return { error: AgentErrorType.MCP_MISSING, benchmark };
+    }
+    if (outputText.includes(AgentSignals.ERROR_RESOURCE_MISSING)) {
+      spinner.stop('Agent could not access setup resource');
+      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+      writeBenchmarkData(benchmark);
+      return { error: AgentErrorType.RESOURCE_MISSING, benchmark };
+    }
+    if (outputText.includes('API Error: 429')) {
+      spinner.stop('Rate limit exceeded');
+      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+      writeBenchmarkData(benchmark);
+      return {
+        error: AgentErrorType.RATE_LIMIT,
+        message: outputText,
+        benchmark,
+      };
+    }
+    if (outputText.includes('API Error:')) {
+      spinner.stop('API error occurred');
+      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+      writeBenchmarkData(benchmark);
+      return {
+        error: AgentErrorType.API_ERROR,
+        message: outputText,
+        benchmark,
+      };
+    }
+
+    const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+    writeBenchmarkData(benchmark);
+
+    const totalDurationSeconds = Math.round(
+      (Date.now() - overallStartTime) / 1000,
+    );
+    const totalCost = stepUsages.reduce((sum, s) => sum + s.totalCostUsd, 0);
+    clack.log.success(
+      `${chalk.cyan(
+        '[BENCHMARK]',
+      )} All ${completedStepCount} steps completed in ${formatDuration(
+        totalDurationSeconds * 1000,
+      )}, total cost: $${totalCost.toFixed(4)}`,
+    );
+    clack.log.info(
+      `${chalk.cyan('[BENCHMARK]')} Results written to ${BENCHMARK_FILE_PATH}`,
+    );
+    logToFile(
+      `All ${completedStepCount} benchmark steps completed in ${totalDurationSeconds}s`,
+    );
+
+    analytics.capture(WIZARD_INTERACTION_EVENT_NAME, {
+      action: 'agent integration completed',
+      duration_ms: Date.now() - overallStartTime,
+      duration_seconds: totalDurationSeconds,
+      benchmark_steps: completedStepCount,
+    });
+
+    spinner.stop(successMessage);
+    return { benchmark };
+  } catch (error) {
+    signalAllDone!();
+
+    if (receivedSuccessResult) {
+      logToFile('Ignoring post-completion error, agent completed successfully');
+      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+      writeBenchmarkData(benchmark);
+      spinner.stop(successMessage);
+      return { benchmark };
+    }
+
+    spinner.stop(errorMessage);
+    const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+    writeBenchmarkData(benchmark);
+
+    const outputText = collectedText.join('\n');
+    const apiErrorMatch = outputText.match(/API Error: [^\n]+/g);
+    const apiErrorMessage = apiErrorMatch
+      ? apiErrorMatch.join('\n')
+      : undefined;
+
+    if (outputText.includes('API Error: 429')) {
+      return {
+        error: AgentErrorType.RATE_LIMIT,
+        message: apiErrorMessage,
+        benchmark,
+      };
+    }
+    if (outputText.includes('API Error:')) {
+      return {
+        error: AgentErrorType.API_ERROR,
+        message: apiErrorMessage,
+        benchmark,
+      };
+    }
+
+    throw error;
+  }
+}
+
+/**
+ * Compute per-model usage deltas between current and previous cumulative modelUsage.
+ */
+function computeModelUsageDelta(
+  current: Record<string, Record<string, number>>,
+  previous: Record<string, Record<string, number>>,
+): Record<string, Record<string, number>> {
+  const result: Record<string, Record<string, number>> = {};
+  for (const [model, data] of Object.entries(current)) {
+    const prev = previous[model] ?? {};
+    result[model] = {
+      inputTokens: (data.inputTokens ?? 0) - (prev.inputTokens ?? 0),
+      outputTokens: (data.outputTokens ?? 0) - (prev.outputTokens ?? 0),
+      cacheReadInputTokens:
+        (data.cacheReadInputTokens ?? 0) - (prev.cacheReadInputTokens ?? 0),
+      cacheCreationInputTokens:
+        (data.cacheCreationInputTokens ?? 0) -
+        (prev.cacheCreationInputTokens ?? 0),
+      webSearchRequests:
+        (data.webSearchRequests ?? 0) - (prev.webSearchRequests ?? 0),
+      costUSD: (data.costUSD ?? 0) - (prev.costUSD ?? 0),
+      contextWindow: data.contextWindow ?? 0,
+    };
+  }
+  return result;
+}
+
+/**
+ * Build BenchmarkData from collected step usages.
+ */
+function buildBenchmarkData(
+  stepUsages: StepUsage[],
+  overallStartTime: number,
+): BenchmarkData {
+  return {
+    timestamp: new Date().toISOString(),
+    steps: stepUsages,
+    totals: {
+      totalCostUsd: stepUsages.reduce((sum, s) => sum + s.totalCostUsd, 0),
+      durationMs: Date.now() - overallStartTime,
+      inputTokens: stepUsages.reduce(
+        (sum, s) =>
+          sum +
+          s.usage.input_tokens +
+          s.usage.cache_read_input_tokens +
+          s.usage.cache_creation_input_tokens,
+        0,
+      ),
+      outputTokens: stepUsages.reduce(
+        (sum, s) => sum + s.usage.output_tokens,
+        0,
+      ),
+      numTurns: stepUsages.reduce((sum, s) => sum + s.numTurns, 0),
+    },
+  };
+}
+
 /**
  * Handle SDK messages and provide user feedback
  *
@@ -739,7 +1406,6 @@ function handleSDKMessage(
     }
 
     default:
-      // Log other message types for debugging
       if (options.debug) {
        debug(`Unhandled message type: ${message.type}`);
       }
diff --git a/src/lib/agent-runner.ts b/src/lib/agent-runner.ts
index d319df2..bd75599 100644
--- a/src/lib/agent-runner.ts
+++ b/src/lib/agent-runner.ts
@@ -22,9 +22,13 @@ import clack from '../utils/clack';
 import {
   initializeAgent,
   runAgent,
+  runAgentSteps,
   AgentSignals,
   AgentErrorType,
 } from './agent-interface';
+import { logToFile } from '../utils/debug';
+import fs from 'fs';
+import path from 'path';
 import { getCloudUrlFromRegion } from '../utils/urls';
 import chalk from 'chalk';
 import * as semver from 'semver';
@@ -193,18 +197,90 @@
     options,
   );
 
-  const agentResult = await runAgent(
-    agent,
-    integrationPrompt,
-    options,
-    spinner,
-    {
-      estimatedDurationMinutes: config.ui.estimatedDurationMinutes,
-      spinnerMessage: SPINNER_MESSAGE,
-      successMessage: config.ui.successMessage,
-      errorMessage: 'Integration failed',
-    },
-  );
+  const agentRunConfig = {
+    estimatedDurationMinutes: config.ui.estimatedDurationMinutes,
+    spinnerMessage: SPINNER_MESSAGE,
+    successMessage: config.ui.successMessage,
+    errorMessage: 'Integration failed',
+  };
+
+  let agentResult;
+
+  if (options.benchmark) {
+    clack.log.info(
+      `${chalk.cyan(
+        '[BENCHMARK]',
+      )} Running in benchmark mode — each workflow phase will be tracked separately`,
+    );
+
+    // Benchmark mode: run setup + workflow phases in a single conversation,
+    // with per-step tracking. Context is preserved across steps (identical to normal mode).
+    const additionalLines = config.prompts.getAdditionalContextLines
+      ? config.prompts.getAdditionalContextLines(frameworkContext)
+      : [];
+    const additionalContext =
+      additionalLines.length > 0
+        ? '\n' + additionalLines.map((line) => `- ${line}`).join('\n')
+        : '';
+
+    const projectContext = `Project context:
+- Framework: ${config.metadata.name} ${frameworkVersion || 'latest'}
+- TypeScript: ${typeScriptDetected ? 'Yes' : 'No'}
+- PostHog API Key: ${projectApiKey}
+- PostHog Host: ${host}${additionalContext}`;
+
+    const setupPrompt = `You have access to the PostHog MCP server which provides skills to integrate PostHog into this ${config.metadata.name} project.
+
+${projectContext}
+
+Instructions (follow these steps IN ORDER - do not skip or reorder):
+
+STEP 1: List available skills from the PostHog MCP server using ListMcpResourcesTool. If this tool is not available or you cannot access the MCP server, you must emit: ${AgentSignals.ERROR_MCP_MISSING} Could not access the PostHog MCP server and halt.
+   Review the skill descriptions and choose the one that best matches this project's framework and configuration.
+   If no suitable skill is found, or you cannot access the MCP server, you must emit: ${AgentSignals.ERROR_RESOURCE_MISSING} Could not find a suitable skill for this project.
+
+STEP 2: Fetch the chosen skill resource (e.g., posthog://skills/{skill-id}).
+   The resource returns a shell command to install the skill.
+
+STEP 3: Run the installation command using Bash:
+   - Execute the EXACT command returned by the resource (do not modify it)
+   - This will download and extract the skill to .claude/skills/{skill-id}/
+
+STEP 4: Load the installed skill's SKILL.md file to understand what references are available.
+
+STEP 5: Set up environment variables for PostHog in a .env file with the API key and host provided above, using the appropriate naming convention for ${config.metadata.name}. Make sure to use these environment variables in the code files you create instead of hardcoding the API key and host.
+
+Important: Look for lockfiles (pnpm-lock.yaml, package-lock.json, yarn.lock, bun.lockb) to determine the package manager (excluding the contents of node_modules). Do not manually edit package.json. Always install packages as a background task. Don't await completion; proceed with other work immediately after starting the installation. You must read a file immediately before attempting to write it, even if you have previously read it; failure to do so will cause a tool failure.
+
+`;
+
+    const setupStep = { name: 'setup', prompt: setupPrompt };
+
+    // Run all steps in a single conversation. After the setup step installs the skill,
+    // onAfterStep discovers workflow files on disk and adds them as additional steps.
+    agentResult = await runAgentSteps(agent, [setupStep], options, spinner, {
+      ...agentRunConfig,
+      onAfterStep: (stepIndex, stepName) => {
+        if (stepName === 'setup') {
+          return discoverBenchmarkSteps(
+            options.installDir,
+            config,
+            projectContext,
+          );
+        }
+        return [];
+      },
+    });
+  } else {
+    agentResult = await runAgent(
+      agent,
+      integrationPrompt,
+      options,
+      spinner,
+      agentRunConfig,
+    );
+  }
 
   // Handle error cases detected in agent output
   if (agentResult.error === AgentErrorType.MCP_MISSING) {
@@ -412,3 +488,93 @@ Important: Look for lockfiles (pnpm-lock.yaml, package-lock.json, yarn.lock, bun
 `;
 }
+
+/**
+ * Discover installed skill workflow files and build benchmark step prompts.
+ * Scans .claude/skills/ in the install directory for workflow files (1.0-*, 1.1-*, etc.).
+ */
+function discoverBenchmarkSteps(
+  installDir: string,
+  config: FrameworkConfig,
+  projectContext: string,
+): Array<{ name: string; prompt: string }> {
+  const skillsDir = path.join(installDir, '.claude', 'skills');
+
+  if (!fs.existsSync(skillsDir)) {
+    logToFile('No .claude/skills/ directory found for benchmark discovery');
+    return [];
+  }
+
+  // Find installed skill directory
+  const skillDirs = fs.readdirSync(skillsDir).filter((entry) => {
+    const fullPath = path.join(skillsDir, entry);
+    return fs.statSync(fullPath).isDirectory();
+  });
+
+  if (skillDirs.length === 0) {
+    logToFile('No skill directories found in .claude/skills/');
+    return [];
+  }
+
+  // Use the first skill directory found
+  const skillId = skillDirs[0];
+  const skillPath = path.join(skillsDir, skillId);
+  logToFile(`Discovered skill for benchmark: ${skillId}`);
+
+  // Workflow files live in the references/ subdirectory
+  const referencesDir = path.join(skillPath, 'references');
+  if (!fs.existsSync(referencesDir)) {
+    logToFile('No references/ directory found in skill directory');
+    return [];
+  }
+
+  // Find workflow files matching a pattern like "basic-integration-1.0-begin.md"
+  // The naming convention is {category}-{number}.{step}-{name}.md
+  const allFiles = fs.readdirSync(referencesDir);
+  const workflowFiles = allFiles
+    .filter((f) => /\d+\.\d+-\w+\.md$/.test(f))
+    .sort();
+
+  if (workflowFiles.length === 0) {
+    logToFile(
+      `No workflow files found in references/ directory. Files present: ${allFiles.join(
+        ', ',
+      )}`,
+    );
+    return [];
+  }
+
+  logToFile(
+    `Found ${workflowFiles.length} workflow files: ${workflowFiles.join(', ')}`,
+  );
+
+  const steps: Array<{ name: string; prompt: string }> = [];
+
+  for (const workflowFile of workflowFiles) {
+    // Extract the phase name from the filename,
+    // e.g., "basic-integration-1.0-begin.md" -> "1.0-begin"
+    const phaseMatch = workflowFile.match(/(\d+\.\d+-.+)\.md$/);
+    const phaseName = phaseMatch
+      ? phaseMatch[1]
+      : workflowFile.replace(/\.md$/, '');
+
+    const prompt = `You are performing phase "${phaseName}" of a PostHog integration for a ${config.metadata.name} project.
+
+${projectContext}
+
+The PostHog skill is installed at .claude/skills/${skillId}/.
+Read SKILL.md in that directory for available reference files.
+
+Follow the instructions in the workflow file: .claude/skills/${skillId}/references/${workflowFile}
+
+Important: Read files before writing. Use environment variables, not hardcoded keys.
+Do not manually edit package.json. Use lockfiles to determine the package manager.
+You must read a file immediately before attempting to write it, even if you have previously read it; failure to do so will cause a tool failure.
+Always install packages as a background task. Don't await completion; proceed with other work immediately after starting the installation.
+`;
+
+    steps.push({ name: phaseName, prompt });
+  }
+
+  return steps;
+}
diff --git a/src/run.ts b/src/run.ts
index e5e5311..42cc336 100644
--- a/src/run.ts
+++ b/src/run.ts
@@ -27,6 +27,7 @@ type Args = {
   ci?: boolean;
   apiKey?: string;
   menu?: boolean;
+  benchmark?: boolean;
 };
 
 export async function runWizard(argv: Args) {
@@ -57,6 +58,7 @@ export async function runWizard(argv: Args) {
     ci: finalArgs.ci ?? false,
     apiKey: finalArgs.apiKey,
     menu: finalArgs.menu ?? false,
+    benchmark: finalArgs.benchmark ?? false,
   };
 
   clack.intro(`Welcome to the PostHog setup wizard ✨`);
diff --git a/src/utils/types.ts b/src/utils/types.ts
index 7379156..5c0b8d6 100644
--- a/src/utils/types.ts
+++ b/src/utils/types.ts
@@ -60,6 +60,13 @@ export type WizardOptions = {
    * Whether to show the menu for manual integration selection instead of auto-detecting.
    */
   menu: boolean;
+
+  /**
+   * Whether to run in benchmark mode with per-phase token tracking.
+   * When enabled, the wizard tracks each workflow phase as a separate step
+   * and writes detailed usage data to /tmp/posthog-wizard-benchmark.json.
+   */
+  benchmark: boolean;
 };
 
 export interface Feature {
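Note for reviewers: below is a minimal sketch of how the benchmark output could be consumed after a `--benchmark` run. The file path constant and the `BenchmarkData` / `StepUsage` shapes are taken from this diff; the helper script itself and its import path are hypothetical and not part of the change.

// summarize-benchmark.ts (hypothetical helper, not included in this PR)
import fs from 'fs';
import { BENCHMARK_FILE_PATH } from './src/lib/agent-interface';
import type { BenchmarkData } from './src/lib/agent-interface';

const data = JSON.parse(
  fs.readFileSync(BENCHMARK_FILE_PATH, 'utf8'),
) as BenchmarkData;

// Per-step cost, turns, and context growth as recorded by runAgentSteps
for (const step of data.steps) {
  console.log(
    `${step.name}: ${step.numTurns} turns, $${step.totalCostUsd.toFixed(4)}, ` +
      `ctx ${step.contextTokensIn} -> ${step.contextTokensOut}` +
      (step.compactions ? `, ${step.compactions} compaction(s)` : ''),
  );
}
console.log(
  `total: $${data.totals.totalCostUsd.toFixed(4)}, ${data.totals.numTurns} turns, ` +
    `${Math.round(data.totals.durationMs / 1000)}s`,
);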