From 4935e1a07f75195f7e862f36c59f5bbcd6c322ae Mon Sep 17 00:00:00 2001 From: "Vincent (Wen Yu) Ge" Date: Wed, 11 Feb 2026 15:32:10 -0500 Subject: [PATCH 1/2] Instrument the loop --- bin.ts | 6 + src/lib/__tests__/agent-interface.test.ts | 1 + src/lib/agent-interface.ts | 674 +++++++++++++++++++++- src/lib/agent-runner.ts | 190 +++++- src/run.ts | 2 + src/utils/types.ts | 7 + 6 files changed, 864 insertions(+), 16 deletions(-) diff --git a/bin.ts b/bin.ts index 7c82ae0..e8e1341 100644 --- a/bin.ts +++ b/bin.ts @@ -115,6 +115,12 @@ yargs(hideBin(process.argv)) 'Show menu for manual integration selection instead of auto-detecting\nenv: POSTHOG_WIZARD_MENU', type: 'boolean', }, + benchmark: { + default: false, + describe: + 'Run in benchmark mode with per-phase token tracking\nenv: POSTHOG_WIZARD_BENCHMARK', + type: 'boolean', + }, }); }, (argv) => { diff --git a/src/lib/__tests__/agent-interface.test.ts b/src/lib/__tests__/agent-interface.test.ts index beed455..d11a257 100644 --- a/src/lib/__tests__/agent-interface.test.ts +++ b/src/lib/__tests__/agent-interface.test.ts @@ -32,6 +32,7 @@ describe('runAgent', () => { localMcp: false, ci: false, menu: false, + benchmark: false, }; const defaultAgentConfig = { diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts index 2c315fe..a06acfd 100644 --- a/src/lib/agent-interface.ts +++ b/src/lib/agent-interface.ts @@ -3,7 +3,9 @@ * Uses Claude Agent SDK directly with PostHog LLM gateway */ +import fs from 'fs'; import path from 'path'; +import chalk from 'chalk'; import clack from '../utils/clack'; import { debug, logToFile, initLogFile, LOG_FILE_PATH } from '../utils/debug'; import type { WizardOptions } from '../utils/types'; @@ -84,6 +86,43 @@ type AgentRunConfig = { model: string; }; +export const BENCHMARK_FILE_PATH = '/tmp/posthog-wizard-benchmark.json'; + +export interface StepUsage { + name: string; + usage: { + input_tokens: number; + output_tokens: number; + cache_creation_input_tokens: number; + cache_read_input_tokens: number; + }; + modelUsage: Record; + totalCostUsd: number; + durationMs: number; + durationApiMs: number; + numTurns: number; + /** Conversation context size (tokens) entering this step */ + contextTokensIn: number; + /** Conversation context size (tokens) exiting this step */ + contextTokensOut: number; + /** Number of auto-compactions that occurred during this step */ + compactions?: number; + /** Token count before each compaction (from SDK compact_boundary messages) */ + compactionPreTokens?: number[]; +} + +export interface BenchmarkData { + timestamp: string; + steps: StepUsage[]; + totals: { + totalCostUsd: number; + durationMs: number; + inputTokens: number; + outputTokens: number; + numTurns: number; + }; +} + /** * Package managers that can be used to run commands. */ @@ -380,7 +419,11 @@ export async function runAgent( successMessage?: string; errorMessage?: string; }, -): Promise<{ error?: AgentErrorType; message?: string }> { +): Promise<{ + error?: AgentErrorType; + message?: string; + benchmark?: BenchmarkData; +}> { const { estimatedDurationMinutes = 8, spinnerMessage = 'Customizing your PostHog setup...', @@ -405,6 +448,8 @@ export async function runAgent( const collectedText: string[] = []; // Track if we received a successful result (before any cleanup errors) let receivedSuccessResult = false; + // Track the result message for benchmark data extraction + let resultMessage: SDKMessage = null; // Workaround for SDK bug: stdin closes before canUseTool responses can be sent. // The fix is to use an async generator for the prompt that stays open until @@ -429,7 +474,11 @@ export async function runAgent( // Helper to handle successful completion (used in normal path and race condition recovery) const completeWithSuccess = ( suppressedError?: Error, - ): { error?: AgentErrorType; message?: string } => { + ): { + error?: AgentErrorType; + message?: string; + benchmark?: BenchmarkData; + } => { const durationMs = Date.now() - startTime; const durationSeconds = Math.round(durationMs / 1000); @@ -465,7 +514,19 @@ export async function runAgent( duration_seconds: durationSeconds, }); spinner.stop(successMessage); - return {}; + + // Write benchmark data from the single-query result if available + let benchmark: BenchmarkData | undefined; + if (resultMessage && options.benchmark) { + benchmark = extractBenchmarkFromResult( + 'single-run', + resultMessage, + durationMs, + ); + writeBenchmarkData(benchmark); + } + + return { benchmark }; }; try { @@ -568,6 +629,7 @@ export async function runAgent( // The SDK may emit a second error result during cleanup due to a race condition if (message.subtype === 'success' && !message.is_error) { receivedSuccessResult = true; + resultMessage = message; } signalDone!(); } @@ -644,6 +706,611 @@ export async function runAgent( } } +/** + * Format milliseconds into a human-readable duration string (e.g., "2m 34s", "45s"). + */ +function formatDuration(ms: number): string { + const totalSeconds = Math.round(ms / 1000); + const minutes = Math.floor(totalSeconds / 60); + const seconds = totalSeconds % 60; + if (minutes > 0) { + return `${minutes}m ${seconds}s`; + } + return `${seconds}s`; +} + +/** + * Format token count into a human-readable string (e.g., "1.2M", "345K", "1,234"). + */ +function formatTokenCount(tokens: number): string { + if (tokens >= 1_000_000) { + return `${(tokens / 1_000_000).toFixed(1)}M`; + } + if (tokens >= 10_000) { + return `${Math.round(tokens / 1000)}K`; + } + return tokens.toLocaleString(); +} + +/** + * Sum token usage across all models from the SDK's modelUsage field. + * The top-level `usage` field only has the last API call's tokens; + * `modelUsage` has the accurate per-model aggregates (camelCase fields). + */ +function sumModelUsage(modelUsage: Record): { + input_tokens: number; + output_tokens: number; + cache_creation_input_tokens: number; + cache_read_input_tokens: number; +} { + let input_tokens = 0; + let output_tokens = 0; + let cache_creation_input_tokens = 0; + let cache_read_input_tokens = 0; + + for (const model of Object.values(modelUsage)) { + input_tokens += model.inputTokens ?? 0; + output_tokens += model.outputTokens ?? 0; + cache_creation_input_tokens += model.cacheCreationInputTokens ?? 0; + cache_read_input_tokens += model.cacheReadInputTokens ?? 0; + } + + return { + input_tokens, + output_tokens, + cache_creation_input_tokens, + cache_read_input_tokens, + }; +} + +/** + * Extract benchmark data from a single SDK result message. + */ +function extractBenchmarkFromResult( + stepName: string, + message: SDKMessage, + wallDurationMs: number, +): BenchmarkData { + const modelUsage = message.modelUsage ?? {}; + const usage = sumModelUsage(modelUsage); + const lastCallUsage = message.usage ?? {}; + const contextTokensOut = + Number(lastCallUsage.input_tokens ?? 0) + + Number(lastCallUsage.cache_read_input_tokens ?? 0) + + Number(lastCallUsage.cache_creation_input_tokens ?? 0); + const step: StepUsage = { + name: stepName, + usage, + modelUsage, + totalCostUsd: message.total_cost_usd ?? 0, + durationMs: message.duration_ms ?? wallDurationMs, + durationApiMs: message.duration_api_ms ?? 0, + numTurns: message.num_turns ?? 0, + contextTokensIn: 0, + contextTokensOut, + }; + + return { + timestamp: new Date().toISOString(), + steps: [step], + totals: { + totalCostUsd: step.totalCostUsd, + durationMs: step.durationMs, + inputTokens: step.usage.input_tokens, + outputTokens: step.usage.output_tokens, + numTurns: step.numTurns, + }, + }; +} + +/** + * Write benchmark data to the benchmark file. + */ +function writeBenchmarkData(data: BenchmarkData): void { + try { + fs.writeFileSync(BENCHMARK_FILE_PATH, JSON.stringify(data, null, 2)); + logToFile(`Benchmark data written to ${BENCHMARK_FILE_PATH}`); + } catch (error) { + logToFile('Failed to write benchmark data:', error); + } +} + +/** + * Execute multiple agent steps in a single conversation with per-step usage tracking. + * Uses one query() call with multiple user messages, so conversation context is preserved + * across steps (identical behavior to normal non-benchmark mode). + * + * Steps can be discovered dynamically via the onAfterStep callback — e.g., after the + * setup step installs a skill, onAfterStep discovers the workflow files and returns + * them as additional steps to run in the same conversation. + * + * Per-step usage is computed as deltas between consecutive SDK result messages. + * + * Writes benchmark data to BENCHMARK_FILE_PATH when all steps complete. + */ +export async function runAgentSteps( + agentConfig: AgentRunConfig, + initialSteps: Array<{ name: string; prompt: string }>, + options: WizardOptions, + spinner: ReturnType, + config?: { + estimatedDurationMinutes?: number; + spinnerMessage?: string; + successMessage?: string; + errorMessage?: string; + /** Called after each step completes. Return additional steps to append to the queue. */ + onAfterStep?: ( + stepIndex: number, + stepName: string, + ) => Array<{ name: string; prompt: string }>; + }, +): Promise<{ + error?: AgentErrorType; + message?: string; + benchmark?: BenchmarkData; +}> { + const { + estimatedDurationMinutes = 8, + spinnerMessage = 'Customizing your PostHog setup...', + successMessage = 'PostHog integration complete', + errorMessage = 'Integration failed', + onAfterStep, + } = config ?? {}; + + const { query } = await getSDKModule(); + + clack.log.step( + `This whole process should take about ${estimatedDurationMinutes} minutes including error checking and fixes.\n\nGrab some coffee!`, + ); + clack.log.info(`${chalk.cyan('[BENCHMARK]')} Verbose logs: ${LOG_FILE_PATH}`); + clack.log.info( + `${chalk.cyan( + '[BENCHMARK]', + )} Benchmark data will be written to: ${BENCHMARK_FILE_PATH}`, + ); + + spinner.start(spinnerMessage); + + const overallStartTime = Date.now(); + const stepUsages: StepUsage[] = []; + const collectedText: string[] = []; + let receivedSuccessResult = false; + + // Dynamic steps list — grows as onAfterStep discovers more + const allSteps = [...initialSteps]; + const stepStartTimes: number[] = []; + let completedStepCount = 0; + + // Per-step compaction tracking (reset after each step) + let stepCompactions = 0; + let stepCompactionPreTokens: number[] = []; + + // Previous cumulative values for delta computation + let prevCumulative = { + usage: { + input_tokens: 0, + output_tokens: 0, + cache_creation_input_tokens: 0, + cache_read_input_tokens: 0, + }, + modelUsage: {} as Record, + costUsd: 0, + durationMs: 0, + durationApiMs: 0, + numTurns: 0, + }; + + // Step completion synchronization: resolves with `true` on success, `false` on error + // eslint-disable-next-line @typescript-eslint/no-empty-function + let resolveStepDone: (success: boolean) => void = () => {}; + function waitForStepDone(): Promise { + return new Promise((resolve) => { + resolveStepDone = resolve; + }); + } + + // Final cleanup signal for SDK stdin workaround + let signalAllDone: () => void; + const allDone = new Promise((resolve) => { + signalAllDone = resolve; + }); + + // Prompt stream generator — yields user messages for each step in order, + // pausing between steps to wait for the result and discover more steps. + const promptStream = async function* () { + let i = 0; + while (i < allSteps.length) { + const step = allSteps[i]; + stepStartTimes[i] = Date.now(); + + logToFile(`Yielding benchmark step ${i + 1}: ${step.name}`); + spinner.stop( + `${chalk.cyan('[BENCHMARK]')} Starting step ${i + 1}/${ + allSteps.length + }: ${chalk.bold(step.name)}`, + ); + spinner.start( + `Running step ${i + 1}/${allSteps.length}: ${step.name}...`, + ); + + yield { + type: 'user', + session_id: '', + message: { role: 'user', content: step.prompt }, + parent_tool_use_id: null, + }; + + // Wait for this step's result before yielding the next prompt + const success = await waitForStepDone(); + if (!success) { + // Step failed — stop yielding, let the generator end + break; + } + + // Discover more steps after this one completes + if (onAfterStep) { + const moreSteps = onAfterStep(i, step.name); + if (moreSteps.length > 0) { + allSteps.push(...moreSteps); + clack.log.info( + `${chalk.cyan('[BENCHMARK]')} Discovered ${ + moreSteps.length + } more phases: ${moreSteps.map((s) => s.name).join(', ')}`, + ); + } + } + + i++; + } + + // Keep generator alive for SDK cleanup (stdin workaround) + await allDone; + }; + + const allowedTools = [ + 'Read', + 'Write', + 'Edit', + 'Glob', + 'Grep', + 'Bash', + 'ListMcpResourcesTool', + 'Skill', + ]; + + try { + const response = query({ + prompt: promptStream(), + options: { + model: agentConfig.model, + cwd: agentConfig.workingDirectory, + permissionMode: 'acceptEdits', + mcpServers: agentConfig.mcpServers, + settingSources: ['project'], + allowedTools, + env: { + ...process.env, + ANTHROPIC_API_KEY: undefined, + }, + canUseTool: (toolName: string, input: unknown) => { + logToFile('canUseTool called:', { toolName, input }); + const result = wizardCanUseTool( + toolName, + input as Record, + ); + logToFile('canUseTool result:', result); + return Promise.resolve(result); + }, + tools: { type: 'preset', preset: 'claude_code' }, + stderr: (data: string) => { + logToFile('CLI stderr:', data); + if (options.debug) { + debug('CLI stderr:', data); + } + }, + }, + }); + + for await (const message of response) { + handleSDKMessage( + message, + options, + spinner, + collectedText, + receivedSuccessResult, + ); + + // Track compaction events from the SDK + if (message.type === 'system' && message.subtype === 'compact_boundary') { + const preTokens = message.compact_metadata?.pre_tokens ?? 0; + const trigger = message.compact_metadata?.trigger ?? 'unknown'; + stepCompactions++; + stepCompactionPreTokens.push(preTokens); + logToFile( + `[COMPACTION] Context compacted (trigger: ${trigger}, pre_tokens: ${formatTokenCount( + preTokens, + )})`, + ); + clack.log.info( + `${chalk.yellow('[COMPACTION]')} Context compacted during step "${ + allSteps[completedStepCount]?.name + }" (trigger: ${trigger}, pre_tokens: ${formatTokenCount(preTokens)})`, + ); + } + + if (message.type === 'result') { + if (message.subtype === 'success' && !message.is_error) { + receivedSuccessResult = true; + + const stepIndex = completedStepCount; + const stepDurationMs = Date.now() - stepStartTimes[stepIndex]; + + // Compute delta usage from cumulative SDK values + const modelUsageData = message.modelUsage ?? {}; + const cumulativeUsage = sumModelUsage(modelUsageData); + const cumulativeCost = message.total_cost_usd ?? 0; + const cumulativeDuration = message.duration_ms ?? 0; + const cumulativeDurationApi = message.duration_api_ms ?? 0; + const cumulativeTurns = message.num_turns ?? 0; + + const deltaUsage = { + input_tokens: + cumulativeUsage.input_tokens - prevCumulative.usage.input_tokens, + output_tokens: + cumulativeUsage.output_tokens - + prevCumulative.usage.output_tokens, + cache_creation_input_tokens: + cumulativeUsage.cache_creation_input_tokens - + prevCumulative.usage.cache_creation_input_tokens, + cache_read_input_tokens: + cumulativeUsage.cache_read_input_tokens - + prevCumulative.usage.cache_read_input_tokens, + }; + const deltaCost = cumulativeCost - prevCumulative.costUsd; + // num_turns is per-response (not cumulative), so use directly + const stepTurns = cumulativeTurns; + const deltaDurationApi = + cumulativeDurationApi - prevCumulative.durationApiMs; + const deltaModelUsage = computeModelUsageDelta( + modelUsageData, + prevCumulative.modelUsage, + ); + + // Context size from the last API call's usage (not cumulative modelUsage). + // The last call's input represents the actual conversation window at that point. + const lastCallUsage = message.usage ?? {}; + const contextTokensOut = + Number(lastCallUsage.input_tokens ?? 0) + + Number(lastCallUsage.cache_read_input_tokens ?? 0) + + Number(lastCallUsage.cache_creation_input_tokens ?? 0); + const contextTokensIn = + stepUsages.length > 0 + ? stepUsages[stepUsages.length - 1].contextTokensOut + : 0; + + stepUsages.push({ + name: allSteps[stepIndex].name, + usage: deltaUsage, + modelUsage: deltaModelUsage, + totalCostUsd: deltaCost, + durationMs: stepDurationMs, + durationApiMs: deltaDurationApi, + numTurns: stepTurns, + contextTokensIn, + contextTokensOut, + ...(stepCompactions > 0 && { + compactions: stepCompactions, + compactionPreTokens: stepCompactionPreTokens, + }), + }); + + // Reset per-step compaction tracking + stepCompactions = 0; + stepCompactionPreTokens = []; + + // Update cumulative tracking + prevCumulative = { + usage: cumulativeUsage, + modelUsage: modelUsageData, + costUsd: cumulativeCost, + durationMs: cumulativeDuration, + durationApiMs: cumulativeDurationApi, + numTurns: cumulativeTurns, + }; + + spinner.stop( + `${chalk.cyan('[BENCHMARK]')} Completed step ${stepIndex + 1}/${ + allSteps.length + }: ${chalk.bold(allSteps[stepIndex].name)} ${chalk.dim( + `(${formatDuration(stepDurationMs)}, $${deltaCost.toFixed( + 4, + )}, ${stepTurns} turns, ctx: ${formatTokenCount( + contextTokensIn, + )} → ${formatTokenCount(contextTokensOut)})`, + )}`, + ); + logToFile( + `Step "${allSteps[stepIndex].name}" completed in ${Math.round( + stepDurationMs / 1000, + )}s`, + ); + + completedStepCount++; + resolveStepDone(true); + } else { + // Error result — signal generator to stop yielding + resolveStepDone(false); + } + + // Signal generator cleanup when all done + if (completedStepCount >= allSteps.length) { + signalAllDone!(); + } + } + } + + // Check for error signals in collected output + const outputText = collectedText.join('\n'); + if (outputText.includes(AgentSignals.ERROR_MCP_MISSING)) { + spinner.stop('Agent could not access PostHog MCP'); + const benchmark = buildBenchmarkData(stepUsages, overallStartTime); + writeBenchmarkData(benchmark); + return { error: AgentErrorType.MCP_MISSING, benchmark }; + } + if (outputText.includes(AgentSignals.ERROR_RESOURCE_MISSING)) { + spinner.stop('Agent could not access setup resource'); + const benchmark = buildBenchmarkData(stepUsages, overallStartTime); + writeBenchmarkData(benchmark); + return { error: AgentErrorType.RESOURCE_MISSING, benchmark }; + } + if (outputText.includes('API Error: 429')) { + spinner.stop('Rate limit exceeded'); + const benchmark = buildBenchmarkData(stepUsages, overallStartTime); + writeBenchmarkData(benchmark); + return { + error: AgentErrorType.RATE_LIMIT, + message: outputText, + benchmark, + }; + } + if (outputText.includes('API Error:')) { + spinner.stop('API error occurred'); + const benchmark = buildBenchmarkData(stepUsages, overallStartTime); + writeBenchmarkData(benchmark); + return { + error: AgentErrorType.API_ERROR, + message: outputText, + benchmark, + }; + } + + const benchmark = buildBenchmarkData(stepUsages, overallStartTime); + writeBenchmarkData(benchmark); + + const totalDurationSeconds = Math.round( + (Date.now() - overallStartTime) / 1000, + ); + const totalCost = stepUsages.reduce((sum, s) => sum + s.totalCostUsd, 0); + clack.log.success( + `${chalk.cyan( + '[BENCHMARK]', + )} All ${completedStepCount} steps completed in ${formatDuration( + totalDurationSeconds * 1000, + )}, total cost: $${totalCost.toFixed(4)}`, + ); + clack.log.info( + `${chalk.cyan('[BENCHMARK]')} Results written to ${BENCHMARK_FILE_PATH}`, + ); + logToFile( + `All ${completedStepCount} benchmark steps completed in ${totalDurationSeconds}s`, + ); + + analytics.capture(WIZARD_INTERACTION_EVENT_NAME, { + action: 'agent integration completed', + duration_ms: Date.now() - overallStartTime, + duration_seconds: totalDurationSeconds, + benchmark_steps: completedStepCount, + }); + + spinner.stop(successMessage); + return { benchmark }; + } catch (error) { + signalAllDone!(); + + if (receivedSuccessResult) { + logToFile('Ignoring post-completion error, agent completed successfully'); + const benchmark = buildBenchmarkData(stepUsages, overallStartTime); + writeBenchmarkData(benchmark); + spinner.stop(successMessage); + return { benchmark }; + } + + spinner.stop(errorMessage); + const benchmark = buildBenchmarkData(stepUsages, overallStartTime); + writeBenchmarkData(benchmark); + + const outputText = collectedText.join('\n'); + const apiErrorMatch = outputText.match(/API Error: [^\n]+/g); + const apiErrorMessage = apiErrorMatch + ? apiErrorMatch.join('\n') + : undefined; + + if (outputText.includes('API Error: 429')) { + return { + error: AgentErrorType.RATE_LIMIT, + message: apiErrorMessage, + benchmark, + }; + } + if (outputText.includes('API Error:')) { + return { + error: AgentErrorType.API_ERROR, + message: apiErrorMessage, + benchmark, + }; + } + + throw error; + } +} + +/** + * Compute per-model usage deltas between current and previous cumulative modelUsage. + */ +function computeModelUsageDelta( + current: Record, + previous: Record, +): Record { + const result: Record = {}; + for (const [model, data] of Object.entries(current)) { + const prev = previous[model] ?? {}; + result[model] = { + inputTokens: (data.inputTokens ?? 0) - (prev.inputTokens ?? 0), + outputTokens: (data.outputTokens ?? 0) - (prev.outputTokens ?? 0), + cacheReadInputTokens: + (data.cacheReadInputTokens ?? 0) - (prev.cacheReadInputTokens ?? 0), + cacheCreationInputTokens: + (data.cacheCreationInputTokens ?? 0) - + (prev.cacheCreationInputTokens ?? 0), + webSearchRequests: + (data.webSearchRequests ?? 0) - (prev.webSearchRequests ?? 0), + costUSD: (data.costUSD ?? 0) - (prev.costUSD ?? 0), + contextWindow: data.contextWindow ?? 0, + }; + } + return result; +} + +/** + * Build BenchmarkData from collected step usages. + */ +function buildBenchmarkData( + stepUsages: StepUsage[], + overallStartTime: number, +): BenchmarkData { + return { + timestamp: new Date().toISOString(), + steps: stepUsages, + totals: { + totalCostUsd: stepUsages.reduce((sum, s) => sum + s.totalCostUsd, 0), + durationMs: Date.now() - overallStartTime, + inputTokens: stepUsages.reduce( + (sum, s) => + sum + + s.usage.input_tokens + + s.usage.cache_read_input_tokens + + s.usage.cache_creation_input_tokens, + 0, + ), + outputTokens: stepUsages.reduce( + (sum, s) => sum + s.usage.output_tokens, + 0, + ), + numTurns: stepUsages.reduce((sum, s) => sum + s.numTurns, 0), + }, + }; +} + /** * Handle SDK messages and provide user feedback * @@ -739,7 +1406,6 @@ function handleSDKMessage( } default: - // Log other message types for debugging if (options.debug) { debug(`Unhandled message type: ${message.type}`); } diff --git a/src/lib/agent-runner.ts b/src/lib/agent-runner.ts index d319df2..bd75599 100644 --- a/src/lib/agent-runner.ts +++ b/src/lib/agent-runner.ts @@ -22,9 +22,13 @@ import clack from '../utils/clack'; import { initializeAgent, runAgent, + runAgentSteps, AgentSignals, AgentErrorType, } from './agent-interface'; +import { logToFile } from '../utils/debug'; +import fs from 'fs'; +import path from 'path'; import { getCloudUrlFromRegion } from '../utils/urls'; import chalk from 'chalk'; import * as semver from 'semver'; @@ -193,18 +197,90 @@ export async function runAgentWizard( options, ); - const agentResult = await runAgent( - agent, - integrationPrompt, - options, - spinner, - { - estimatedDurationMinutes: config.ui.estimatedDurationMinutes, - spinnerMessage: SPINNER_MESSAGE, - successMessage: config.ui.successMessage, - errorMessage: 'Integration failed', - }, - ); + const agentRunConfig = { + estimatedDurationMinutes: config.ui.estimatedDurationMinutes, + spinnerMessage: SPINNER_MESSAGE, + successMessage: config.ui.successMessage, + errorMessage: 'Integration failed', + }; + + let agentResult; + + if (options.benchmark) { + clack.log.info( + `${chalk.cyan( + '[BENCHMARK]', + )} Running in benchmark mode — each workflow phase will be tracked separately`, + ); + + // Benchmark mode: run setup + workflow phases in a single conversation, + // with per-step tracking. Context is preserved across steps (identical to normal mode). + const additionalLines = config.prompts.getAdditionalContextLines + ? config.prompts.getAdditionalContextLines(frameworkContext) + : []; + const additionalContext = + additionalLines.length > 0 + ? '\n' + additionalLines.map((line) => `- ${line}`).join('\n') + : ''; + + const projectContext = `Project context: +- Framework: ${config.metadata.name} ${frameworkVersion || 'latest'} +- TypeScript: ${typeScriptDetected ? 'Yes' : 'No'} +- PostHog API Key: ${projectApiKey} +- PostHog Host: ${host}${additionalContext}`; + + const setupPrompt = `You have access to the PostHog MCP server which provides skills to integrate PostHog into this ${config.metadata.name} project. + +${projectContext} + +Instructions (follow these steps IN ORDER - do not skip or reorder): + +STEP 1: List available skills from the PostHog MCP server using ListMcpResourcesTool. If this tool is not available or you cannot access the MCP server, you must emit: ${AgentSignals.ERROR_MCP_MISSING} Could not access the PostHog MCP server and halt. + + Review the skill descriptions and choose the one that best matches this project's framework and configuration. + If no suitable skill is found, or you cannot access the MCP server, you emit: ${AgentSignals.ERROR_RESOURCE_MISSING} Could not find a suitable skill for this project. + +STEP 2: Fetch the chosen skill resource (e.g., posthog://skills/{skill-id}). + The resource returns a shell command to install the skill. + +STEP 3: Run the installation command using Bash: + - Execute the EXACT command returned by the resource (do not modify it) + - This will download and extract the skill to .claude/skills/{skill-id}/ + +STEP 4: Load the installed skill's SKILL.md file to understand what references are available. + +STEP 5: Set up environment variables for PostHog in a .env file with the API key and host provided above, using the appropriate naming convention for ${config.metadata.name}. Make sure to use these environment variables in the code files you create instead of hardcoding the API key and host. + +Important: Look for lockfiles (pnpm-lock.yaml, package-lock.json, yarn.lock, bun.lockb) to determine the package manager (excluding the contents of node_modules). Do not manually edit package.json. Always install packages as a background task. Don't await completion; proceed with other work immediately after starting the installation. You must read a file immediately before attempting to write it, even if you have previously read it; failure to do so will cause a tool failure. + +`; + + const setupStep = { name: 'setup', prompt: setupPrompt }; + + // Run all steps in a single conversation. After the setup step installs the skill, + // onAfterStep discovers workflow files on disk and adds them as additional steps. + agentResult = await runAgentSteps(agent, [setupStep], options, spinner, { + ...agentRunConfig, + onAfterStep: (stepIndex, stepName) => { + if (stepName === 'setup') { + return discoverBenchmarkSteps( + options.installDir, + config, + projectContext, + ); + } + return []; + }, + }); + } else { + agentResult = await runAgent( + agent, + integrationPrompt, + options, + spinner, + agentRunConfig, + ); + } // Handle error cases detected in agent output if (agentResult.error === AgentErrorType.MCP_MISSING) { @@ -412,3 +488,93 @@ Important: Look for lockfiles (pnpm-lock.yaml, package-lock.json, yarn.lock, bun `; } + +/** + * Discover installed skill workflow files and build benchmark step prompts. + * Scans .claude/skills/ in the install directory for workflow files (1.0-*, 1.1-*, etc.). + */ +function discoverBenchmarkSteps( + installDir: string, + config: FrameworkConfig, + projectContext: string, +): Array<{ name: string; prompt: string }> { + const skillsDir = path.join(installDir, '.claude', 'skills'); + + if (!fs.existsSync(skillsDir)) { + logToFile('No .claude/skills/ directory found for benchmark discovery'); + return []; + } + + // Find installed skill directory + const skillDirs = fs.readdirSync(skillsDir).filter((entry) => { + const fullPath = path.join(skillsDir, entry); + return fs.statSync(fullPath).isDirectory(); + }); + + if (skillDirs.length === 0) { + logToFile('No skill directories found in .claude/skills/'); + return []; + } + + // Use the first skill directory found + const skillId = skillDirs[0]; + const skillPath = path.join(skillsDir, skillId); + logToFile(`Discovered skill for benchmark: ${skillId}`); + + // Workflow files live in the references/ subdirectory + const referencesDir = path.join(skillPath, 'references'); + if (!fs.existsSync(referencesDir)) { + logToFile('No references/ directory found in skill directory'); + return []; + } + + // Find workflow files matching pattern like "basic-integration-1.0-begin.md" + // The naming convention is {category}-{number}.{step}-{name}.md + const allFiles = fs.readdirSync(referencesDir); + const workflowFiles = allFiles + .filter((f) => /\d+\.\d+-\w+\.md$/.test(f)) + .sort(); + + if (workflowFiles.length === 0) { + logToFile( + `No workflow files found in references/ directory. Files present: ${allFiles.join( + ', ', + )}`, + ); + return []; + } + + logToFile( + `Found ${workflowFiles.length} workflow files: ${workflowFiles.join(', ')}`, + ); + + const steps: Array<{ name: string; prompt: string }> = []; + + for (const workflowFile of workflowFiles) { + // Extract phase name from filename + // e.g., "basic-integration-1.0-begin.md" -> "1.0-begin" + const phaseMatch = workflowFile.match(/(\d+\.\d+-.+)\.md$/); + const phaseName = phaseMatch + ? phaseMatch[1] + : workflowFile.replace(/\.md$/, ''); + + const prompt = `You are performing phase "${phaseName}" of a PostHog integration for a ${config.metadata.name} project. + +${projectContext} + +The PostHog skill is installed at .claude/skills/${skillId}/. +Read SKILL.md in that directory for available reference files. + +Follow the instructions in the workflow file: .claude/skills/${skillId}/references/${workflowFile} + +Important: Read files before writing. Use environment variables, not hardcoded keys. +Do not manually edit package.json. Use lockfiles to determine the package manager. +You must read a file immediately before attempting to write it, even if you have previously read it; failure to do so will cause a tool failure. +Always install packages as a background task. Don't await completion; proceed with other work immediately after starting the installation. +`; + + steps.push({ name: phaseName, prompt }); + } + + return steps; +} diff --git a/src/run.ts b/src/run.ts index e5e5311..42cc336 100644 --- a/src/run.ts +++ b/src/run.ts @@ -27,6 +27,7 @@ type Args = { ci?: boolean; apiKey?: string; menu?: boolean; + benchmark?: boolean; }; export async function runWizard(argv: Args) { @@ -57,6 +58,7 @@ export async function runWizard(argv: Args) { ci: finalArgs.ci ?? false, apiKey: finalArgs.apiKey, menu: finalArgs.menu ?? false, + benchmark: finalArgs.benchmark ?? false, }; clack.intro(`Welcome to the PostHog setup wizard ✨`); diff --git a/src/utils/types.ts b/src/utils/types.ts index 7379156..5c0b8d6 100644 --- a/src/utils/types.ts +++ b/src/utils/types.ts @@ -60,6 +60,13 @@ export type WizardOptions = { * Whether to show the menu for manual integration selection instead of auto-detecting. */ menu: boolean; + + /** + * Whether to run in benchmark mode with per-phase token tracking. + * When enabled, the wizard runs each workflow phase as a separate agent call + * and writes detailed usage data to /tmp/posthog-wizard-benchmark.json. + */ + benchmark: boolean; }; export interface Feature { From 9731e3d76e1412b88243df7b7df3208ed96bb171 Mon Sep 17 00:00:00 2001 From: "Vincent (Wen Yu) Ge" Date: Wed, 11 Feb 2026 16:21:18 -0500 Subject: [PATCH 2/2] Clean up --- src/lib/agent-interface.ts | 662 +------------------------------------ src/lib/agent-runner.ts | 178 +--------- src/lib/benchmark.ts | 371 +++++++++++++++++++++ 3 files changed, 385 insertions(+), 826 deletions(-) create mode 100644 src/lib/benchmark.ts diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts index a06acfd..f5310f9 100644 --- a/src/lib/agent-interface.ts +++ b/src/lib/agent-interface.ts @@ -3,9 +3,7 @@ * Uses Claude Agent SDK directly with PostHog LLM gateway */ -import fs from 'fs'; import path from 'path'; -import chalk from 'chalk'; import clack from '../utils/clack'; import { debug, logToFile, initLogFile, LOG_FILE_PATH } from '../utils/debug'; import type { WizardOptions } from '../utils/types'; @@ -16,6 +14,8 @@ import { } from './constants'; import { getLlmGatewayUrlFromHost } from '../utils/urls'; import { LINTING_TOOLS } from './safe-tools'; +import type { BenchmarkData } from './benchmark'; +import { BenchmarkTracker } from './benchmark'; // Dynamic import cache for ESM module let _sdkModule: any = null; @@ -86,43 +86,6 @@ type AgentRunConfig = { model: string; }; -export const BENCHMARK_FILE_PATH = '/tmp/posthog-wizard-benchmark.json'; - -export interface StepUsage { - name: string; - usage: { - input_tokens: number; - output_tokens: number; - cache_creation_input_tokens: number; - cache_read_input_tokens: number; - }; - modelUsage: Record; - totalCostUsd: number; - durationMs: number; - durationApiMs: number; - numTurns: number; - /** Conversation context size (tokens) entering this step */ - contextTokensIn: number; - /** Conversation context size (tokens) exiting this step */ - contextTokensOut: number; - /** Number of auto-compactions that occurred during this step */ - compactions?: number; - /** Token count before each compaction (from SDK compact_boundary messages) */ - compactionPreTokens?: number[]; -} - -export interface BenchmarkData { - timestamp: string; - steps: StepUsage[]; - totals: { - totalCostUsd: number; - durationMs: number; - inputTokens: number; - outputTokens: number; - numTurns: number; - }; -} - /** * Package managers that can be used to run commands. */ @@ -437,6 +400,9 @@ export async function runAgent( `This whole process should take about ${estimatedDurationMinutes} minutes including error checking and fixes.\n\nGrab some coffee!`, ); + // Create benchmark tracker before spinner starts so its log output is visible + const tracker = options.benchmark ? new BenchmarkTracker(spinner) : null; + spinner.start(spinnerMessage); const cliPath = getClaudeCodeExecutablePath(); @@ -515,17 +481,7 @@ export async function runAgent( }); spinner.stop(successMessage); - // Write benchmark data from the single-query result if available - let benchmark: BenchmarkData | undefined; - if (resultMessage && options.benchmark) { - benchmark = extractBenchmarkFromResult( - 'single-run', - resultMessage, - durationMs, - ); - writeBenchmarkData(benchmark); - } - + const benchmark = tracker?.finalize(resultMessage, durationMs); return { benchmark }; }; @@ -622,6 +578,7 @@ export async function runAgent( collectedText, receivedSuccessResult, ); + tracker?.onMessage(message); // Signal completion when result received if (message.type === 'result') { @@ -706,611 +663,6 @@ export async function runAgent( } } -/** - * Format milliseconds into a human-readable duration string (e.g., "2m 34s", "45s"). - */ -function formatDuration(ms: number): string { - const totalSeconds = Math.round(ms / 1000); - const minutes = Math.floor(totalSeconds / 60); - const seconds = totalSeconds % 60; - if (minutes > 0) { - return `${minutes}m ${seconds}s`; - } - return `${seconds}s`; -} - -/** - * Format token count into a human-readable string (e.g., "1.2M", "345K", "1,234"). - */ -function formatTokenCount(tokens: number): string { - if (tokens >= 1_000_000) { - return `${(tokens / 1_000_000).toFixed(1)}M`; - } - if (tokens >= 10_000) { - return `${Math.round(tokens / 1000)}K`; - } - return tokens.toLocaleString(); -} - -/** - * Sum token usage across all models from the SDK's modelUsage field. - * The top-level `usage` field only has the last API call's tokens; - * `modelUsage` has the accurate per-model aggregates (camelCase fields). - */ -function sumModelUsage(modelUsage: Record): { - input_tokens: number; - output_tokens: number; - cache_creation_input_tokens: number; - cache_read_input_tokens: number; -} { - let input_tokens = 0; - let output_tokens = 0; - let cache_creation_input_tokens = 0; - let cache_read_input_tokens = 0; - - for (const model of Object.values(modelUsage)) { - input_tokens += model.inputTokens ?? 0; - output_tokens += model.outputTokens ?? 0; - cache_creation_input_tokens += model.cacheCreationInputTokens ?? 0; - cache_read_input_tokens += model.cacheReadInputTokens ?? 0; - } - - return { - input_tokens, - output_tokens, - cache_creation_input_tokens, - cache_read_input_tokens, - }; -} - -/** - * Extract benchmark data from a single SDK result message. - */ -function extractBenchmarkFromResult( - stepName: string, - message: SDKMessage, - wallDurationMs: number, -): BenchmarkData { - const modelUsage = message.modelUsage ?? {}; - const usage = sumModelUsage(modelUsage); - const lastCallUsage = message.usage ?? {}; - const contextTokensOut = - Number(lastCallUsage.input_tokens ?? 0) + - Number(lastCallUsage.cache_read_input_tokens ?? 0) + - Number(lastCallUsage.cache_creation_input_tokens ?? 0); - const step: StepUsage = { - name: stepName, - usage, - modelUsage, - totalCostUsd: message.total_cost_usd ?? 0, - durationMs: message.duration_ms ?? wallDurationMs, - durationApiMs: message.duration_api_ms ?? 0, - numTurns: message.num_turns ?? 0, - contextTokensIn: 0, - contextTokensOut, - }; - - return { - timestamp: new Date().toISOString(), - steps: [step], - totals: { - totalCostUsd: step.totalCostUsd, - durationMs: step.durationMs, - inputTokens: step.usage.input_tokens, - outputTokens: step.usage.output_tokens, - numTurns: step.numTurns, - }, - }; -} - -/** - * Write benchmark data to the benchmark file. - */ -function writeBenchmarkData(data: BenchmarkData): void { - try { - fs.writeFileSync(BENCHMARK_FILE_PATH, JSON.stringify(data, null, 2)); - logToFile(`Benchmark data written to ${BENCHMARK_FILE_PATH}`); - } catch (error) { - logToFile('Failed to write benchmark data:', error); - } -} - -/** - * Execute multiple agent steps in a single conversation with per-step usage tracking. - * Uses one query() call with multiple user messages, so conversation context is preserved - * across steps (identical behavior to normal non-benchmark mode). - * - * Steps can be discovered dynamically via the onAfterStep callback — e.g., after the - * setup step installs a skill, onAfterStep discovers the workflow files and returns - * them as additional steps to run in the same conversation. - * - * Per-step usage is computed as deltas between consecutive SDK result messages. - * - * Writes benchmark data to BENCHMARK_FILE_PATH when all steps complete. - */ -export async function runAgentSteps( - agentConfig: AgentRunConfig, - initialSteps: Array<{ name: string; prompt: string }>, - options: WizardOptions, - spinner: ReturnType, - config?: { - estimatedDurationMinutes?: number; - spinnerMessage?: string; - successMessage?: string; - errorMessage?: string; - /** Called after each step completes. Return additional steps to append to the queue. */ - onAfterStep?: ( - stepIndex: number, - stepName: string, - ) => Array<{ name: string; prompt: string }>; - }, -): Promise<{ - error?: AgentErrorType; - message?: string; - benchmark?: BenchmarkData; -}> { - const { - estimatedDurationMinutes = 8, - spinnerMessage = 'Customizing your PostHog setup...', - successMessage = 'PostHog integration complete', - errorMessage = 'Integration failed', - onAfterStep, - } = config ?? {}; - - const { query } = await getSDKModule(); - - clack.log.step( - `This whole process should take about ${estimatedDurationMinutes} minutes including error checking and fixes.\n\nGrab some coffee!`, - ); - clack.log.info(`${chalk.cyan('[BENCHMARK]')} Verbose logs: ${LOG_FILE_PATH}`); - clack.log.info( - `${chalk.cyan( - '[BENCHMARK]', - )} Benchmark data will be written to: ${BENCHMARK_FILE_PATH}`, - ); - - spinner.start(spinnerMessage); - - const overallStartTime = Date.now(); - const stepUsages: StepUsage[] = []; - const collectedText: string[] = []; - let receivedSuccessResult = false; - - // Dynamic steps list — grows as onAfterStep discovers more - const allSteps = [...initialSteps]; - const stepStartTimes: number[] = []; - let completedStepCount = 0; - - // Per-step compaction tracking (reset after each step) - let stepCompactions = 0; - let stepCompactionPreTokens: number[] = []; - - // Previous cumulative values for delta computation - let prevCumulative = { - usage: { - input_tokens: 0, - output_tokens: 0, - cache_creation_input_tokens: 0, - cache_read_input_tokens: 0, - }, - modelUsage: {} as Record, - costUsd: 0, - durationMs: 0, - durationApiMs: 0, - numTurns: 0, - }; - - // Step completion synchronization: resolves with `true` on success, `false` on error - // eslint-disable-next-line @typescript-eslint/no-empty-function - let resolveStepDone: (success: boolean) => void = () => {}; - function waitForStepDone(): Promise { - return new Promise((resolve) => { - resolveStepDone = resolve; - }); - } - - // Final cleanup signal for SDK stdin workaround - let signalAllDone: () => void; - const allDone = new Promise((resolve) => { - signalAllDone = resolve; - }); - - // Prompt stream generator — yields user messages for each step in order, - // pausing between steps to wait for the result and discover more steps. - const promptStream = async function* () { - let i = 0; - while (i < allSteps.length) { - const step = allSteps[i]; - stepStartTimes[i] = Date.now(); - - logToFile(`Yielding benchmark step ${i + 1}: ${step.name}`); - spinner.stop( - `${chalk.cyan('[BENCHMARK]')} Starting step ${i + 1}/${ - allSteps.length - }: ${chalk.bold(step.name)}`, - ); - spinner.start( - `Running step ${i + 1}/${allSteps.length}: ${step.name}...`, - ); - - yield { - type: 'user', - session_id: '', - message: { role: 'user', content: step.prompt }, - parent_tool_use_id: null, - }; - - // Wait for this step's result before yielding the next prompt - const success = await waitForStepDone(); - if (!success) { - // Step failed — stop yielding, let the generator end - break; - } - - // Discover more steps after this one completes - if (onAfterStep) { - const moreSteps = onAfterStep(i, step.name); - if (moreSteps.length > 0) { - allSteps.push(...moreSteps); - clack.log.info( - `${chalk.cyan('[BENCHMARK]')} Discovered ${ - moreSteps.length - } more phases: ${moreSteps.map((s) => s.name).join(', ')}`, - ); - } - } - - i++; - } - - // Keep generator alive for SDK cleanup (stdin workaround) - await allDone; - }; - - const allowedTools = [ - 'Read', - 'Write', - 'Edit', - 'Glob', - 'Grep', - 'Bash', - 'ListMcpResourcesTool', - 'Skill', - ]; - - try { - const response = query({ - prompt: promptStream(), - options: { - model: agentConfig.model, - cwd: agentConfig.workingDirectory, - permissionMode: 'acceptEdits', - mcpServers: agentConfig.mcpServers, - settingSources: ['project'], - allowedTools, - env: { - ...process.env, - ANTHROPIC_API_KEY: undefined, - }, - canUseTool: (toolName: string, input: unknown) => { - logToFile('canUseTool called:', { toolName, input }); - const result = wizardCanUseTool( - toolName, - input as Record, - ); - logToFile('canUseTool result:', result); - return Promise.resolve(result); - }, - tools: { type: 'preset', preset: 'claude_code' }, - stderr: (data: string) => { - logToFile('CLI stderr:', data); - if (options.debug) { - debug('CLI stderr:', data); - } - }, - }, - }); - - for await (const message of response) { - handleSDKMessage( - message, - options, - spinner, - collectedText, - receivedSuccessResult, - ); - - // Track compaction events from the SDK - if (message.type === 'system' && message.subtype === 'compact_boundary') { - const preTokens = message.compact_metadata?.pre_tokens ?? 0; - const trigger = message.compact_metadata?.trigger ?? 'unknown'; - stepCompactions++; - stepCompactionPreTokens.push(preTokens); - logToFile( - `[COMPACTION] Context compacted (trigger: ${trigger}, pre_tokens: ${formatTokenCount( - preTokens, - )})`, - ); - clack.log.info( - `${chalk.yellow('[COMPACTION]')} Context compacted during step "${ - allSteps[completedStepCount]?.name - }" (trigger: ${trigger}, pre_tokens: ${formatTokenCount(preTokens)})`, - ); - } - - if (message.type === 'result') { - if (message.subtype === 'success' && !message.is_error) { - receivedSuccessResult = true; - - const stepIndex = completedStepCount; - const stepDurationMs = Date.now() - stepStartTimes[stepIndex]; - - // Compute delta usage from cumulative SDK values - const modelUsageData = message.modelUsage ?? {}; - const cumulativeUsage = sumModelUsage(modelUsageData); - const cumulativeCost = message.total_cost_usd ?? 0; - const cumulativeDuration = message.duration_ms ?? 0; - const cumulativeDurationApi = message.duration_api_ms ?? 0; - const cumulativeTurns = message.num_turns ?? 0; - - const deltaUsage = { - input_tokens: - cumulativeUsage.input_tokens - prevCumulative.usage.input_tokens, - output_tokens: - cumulativeUsage.output_tokens - - prevCumulative.usage.output_tokens, - cache_creation_input_tokens: - cumulativeUsage.cache_creation_input_tokens - - prevCumulative.usage.cache_creation_input_tokens, - cache_read_input_tokens: - cumulativeUsage.cache_read_input_tokens - - prevCumulative.usage.cache_read_input_tokens, - }; - const deltaCost = cumulativeCost - prevCumulative.costUsd; - // num_turns is per-response (not cumulative), so use directly - const stepTurns = cumulativeTurns; - const deltaDurationApi = - cumulativeDurationApi - prevCumulative.durationApiMs; - const deltaModelUsage = computeModelUsageDelta( - modelUsageData, - prevCumulative.modelUsage, - ); - - // Context size from the last API call's usage (not cumulative modelUsage). - // The last call's input represents the actual conversation window at that point. - const lastCallUsage = message.usage ?? {}; - const contextTokensOut = - Number(lastCallUsage.input_tokens ?? 0) + - Number(lastCallUsage.cache_read_input_tokens ?? 0) + - Number(lastCallUsage.cache_creation_input_tokens ?? 0); - const contextTokensIn = - stepUsages.length > 0 - ? stepUsages[stepUsages.length - 1].contextTokensOut - : 0; - - stepUsages.push({ - name: allSteps[stepIndex].name, - usage: deltaUsage, - modelUsage: deltaModelUsage, - totalCostUsd: deltaCost, - durationMs: stepDurationMs, - durationApiMs: deltaDurationApi, - numTurns: stepTurns, - contextTokensIn, - contextTokensOut, - ...(stepCompactions > 0 && { - compactions: stepCompactions, - compactionPreTokens: stepCompactionPreTokens, - }), - }); - - // Reset per-step compaction tracking - stepCompactions = 0; - stepCompactionPreTokens = []; - - // Update cumulative tracking - prevCumulative = { - usage: cumulativeUsage, - modelUsage: modelUsageData, - costUsd: cumulativeCost, - durationMs: cumulativeDuration, - durationApiMs: cumulativeDurationApi, - numTurns: cumulativeTurns, - }; - - spinner.stop( - `${chalk.cyan('[BENCHMARK]')} Completed step ${stepIndex + 1}/${ - allSteps.length - }: ${chalk.bold(allSteps[stepIndex].name)} ${chalk.dim( - `(${formatDuration(stepDurationMs)}, $${deltaCost.toFixed( - 4, - )}, ${stepTurns} turns, ctx: ${formatTokenCount( - contextTokensIn, - )} → ${formatTokenCount(contextTokensOut)})`, - )}`, - ); - logToFile( - `Step "${allSteps[stepIndex].name}" completed in ${Math.round( - stepDurationMs / 1000, - )}s`, - ); - - completedStepCount++; - resolveStepDone(true); - } else { - // Error result — signal generator to stop yielding - resolveStepDone(false); - } - - // Signal generator cleanup when all done - if (completedStepCount >= allSteps.length) { - signalAllDone!(); - } - } - } - - // Check for error signals in collected output - const outputText = collectedText.join('\n'); - if (outputText.includes(AgentSignals.ERROR_MCP_MISSING)) { - spinner.stop('Agent could not access PostHog MCP'); - const benchmark = buildBenchmarkData(stepUsages, overallStartTime); - writeBenchmarkData(benchmark); - return { error: AgentErrorType.MCP_MISSING, benchmark }; - } - if (outputText.includes(AgentSignals.ERROR_RESOURCE_MISSING)) { - spinner.stop('Agent could not access setup resource'); - const benchmark = buildBenchmarkData(stepUsages, overallStartTime); - writeBenchmarkData(benchmark); - return { error: AgentErrorType.RESOURCE_MISSING, benchmark }; - } - if (outputText.includes('API Error: 429')) { - spinner.stop('Rate limit exceeded'); - const benchmark = buildBenchmarkData(stepUsages, overallStartTime); - writeBenchmarkData(benchmark); - return { - error: AgentErrorType.RATE_LIMIT, - message: outputText, - benchmark, - }; - } - if (outputText.includes('API Error:')) { - spinner.stop('API error occurred'); - const benchmark = buildBenchmarkData(stepUsages, overallStartTime); - writeBenchmarkData(benchmark); - return { - error: AgentErrorType.API_ERROR, - message: outputText, - benchmark, - }; - } - - const benchmark = buildBenchmarkData(stepUsages, overallStartTime); - writeBenchmarkData(benchmark); - - const totalDurationSeconds = Math.round( - (Date.now() - overallStartTime) / 1000, - ); - const totalCost = stepUsages.reduce((sum, s) => sum + s.totalCostUsd, 0); - clack.log.success( - `${chalk.cyan( - '[BENCHMARK]', - )} All ${completedStepCount} steps completed in ${formatDuration( - totalDurationSeconds * 1000, - )}, total cost: $${totalCost.toFixed(4)}`, - ); - clack.log.info( - `${chalk.cyan('[BENCHMARK]')} Results written to ${BENCHMARK_FILE_PATH}`, - ); - logToFile( - `All ${completedStepCount} benchmark steps completed in ${totalDurationSeconds}s`, - ); - - analytics.capture(WIZARD_INTERACTION_EVENT_NAME, { - action: 'agent integration completed', - duration_ms: Date.now() - overallStartTime, - duration_seconds: totalDurationSeconds, - benchmark_steps: completedStepCount, - }); - - spinner.stop(successMessage); - return { benchmark }; - } catch (error) { - signalAllDone!(); - - if (receivedSuccessResult) { - logToFile('Ignoring post-completion error, agent completed successfully'); - const benchmark = buildBenchmarkData(stepUsages, overallStartTime); - writeBenchmarkData(benchmark); - spinner.stop(successMessage); - return { benchmark }; - } - - spinner.stop(errorMessage); - const benchmark = buildBenchmarkData(stepUsages, overallStartTime); - writeBenchmarkData(benchmark); - - const outputText = collectedText.join('\n'); - const apiErrorMatch = outputText.match(/API Error: [^\n]+/g); - const apiErrorMessage = apiErrorMatch - ? apiErrorMatch.join('\n') - : undefined; - - if (outputText.includes('API Error: 429')) { - return { - error: AgentErrorType.RATE_LIMIT, - message: apiErrorMessage, - benchmark, - }; - } - if (outputText.includes('API Error:')) { - return { - error: AgentErrorType.API_ERROR, - message: apiErrorMessage, - benchmark, - }; - } - - throw error; - } -} - -/** - * Compute per-model usage deltas between current and previous cumulative modelUsage. - */ -function computeModelUsageDelta( - current: Record, - previous: Record, -): Record { - const result: Record = {}; - for (const [model, data] of Object.entries(current)) { - const prev = previous[model] ?? {}; - result[model] = { - inputTokens: (data.inputTokens ?? 0) - (prev.inputTokens ?? 0), - outputTokens: (data.outputTokens ?? 0) - (prev.outputTokens ?? 0), - cacheReadInputTokens: - (data.cacheReadInputTokens ?? 0) - (prev.cacheReadInputTokens ?? 0), - cacheCreationInputTokens: - (data.cacheCreationInputTokens ?? 0) - - (prev.cacheCreationInputTokens ?? 0), - webSearchRequests: - (data.webSearchRequests ?? 0) - (prev.webSearchRequests ?? 0), - costUSD: (data.costUSD ?? 0) - (prev.costUSD ?? 0), - contextWindow: data.contextWindow ?? 0, - }; - } - return result; -} - -/** - * Build BenchmarkData from collected step usages. - */ -function buildBenchmarkData( - stepUsages: StepUsage[], - overallStartTime: number, -): BenchmarkData { - return { - timestamp: new Date().toISOString(), - steps: stepUsages, - totals: { - totalCostUsd: stepUsages.reduce((sum, s) => sum + s.totalCostUsd, 0), - durationMs: Date.now() - overallStartTime, - inputTokens: stepUsages.reduce( - (sum, s) => - sum + - s.usage.input_tokens + - s.usage.cache_read_input_tokens + - s.usage.cache_creation_input_tokens, - 0, - ), - outputTokens: stepUsages.reduce( - (sum, s) => sum + s.usage.output_tokens, - 0, - ), - numTurns: stepUsages.reduce((sum, s) => sum + s.numTurns, 0), - }, - }; -} - /** * Handle SDK messages and provide user feedback * diff --git a/src/lib/agent-runner.ts b/src/lib/agent-runner.ts index bd75599..483f6e0 100644 --- a/src/lib/agent-runner.ts +++ b/src/lib/agent-runner.ts @@ -22,13 +22,9 @@ import clack from '../utils/clack'; import { initializeAgent, runAgent, - runAgentSteps, AgentSignals, AgentErrorType, } from './agent-interface'; -import { logToFile } from '../utils/debug'; -import fs from 'fs'; -import path from 'path'; import { getCloudUrlFromRegion } from '../utils/urls'; import chalk from 'chalk'; import * as semver from 'semver'; @@ -204,83 +200,13 @@ export async function runAgentWizard( errorMessage: 'Integration failed', }; - let agentResult; - - if (options.benchmark) { - clack.log.info( - `${chalk.cyan( - '[BENCHMARK]', - )} Running in benchmark mode — each workflow phase will be tracked separately`, - ); - - // Benchmark mode: run setup + workflow phases in a single conversation, - // with per-step tracking. Context is preserved across steps (identical to normal mode). - const additionalLines = config.prompts.getAdditionalContextLines - ? config.prompts.getAdditionalContextLines(frameworkContext) - : []; - const additionalContext = - additionalLines.length > 0 - ? '\n' + additionalLines.map((line) => `- ${line}`).join('\n') - : ''; - - const projectContext = `Project context: -- Framework: ${config.metadata.name} ${frameworkVersion || 'latest'} -- TypeScript: ${typeScriptDetected ? 'Yes' : 'No'} -- PostHog API Key: ${projectApiKey} -- PostHog Host: ${host}${additionalContext}`; - - const setupPrompt = `You have access to the PostHog MCP server which provides skills to integrate PostHog into this ${config.metadata.name} project. - -${projectContext} - -Instructions (follow these steps IN ORDER - do not skip or reorder): - -STEP 1: List available skills from the PostHog MCP server using ListMcpResourcesTool. If this tool is not available or you cannot access the MCP server, you must emit: ${AgentSignals.ERROR_MCP_MISSING} Could not access the PostHog MCP server and halt. - - Review the skill descriptions and choose the one that best matches this project's framework and configuration. - If no suitable skill is found, or you cannot access the MCP server, you emit: ${AgentSignals.ERROR_RESOURCE_MISSING} Could not find a suitable skill for this project. - -STEP 2: Fetch the chosen skill resource (e.g., posthog://skills/{skill-id}). - The resource returns a shell command to install the skill. - -STEP 3: Run the installation command using Bash: - - Execute the EXACT command returned by the resource (do not modify it) - - This will download and extract the skill to .claude/skills/{skill-id}/ - -STEP 4: Load the installed skill's SKILL.md file to understand what references are available. - -STEP 5: Set up environment variables for PostHog in a .env file with the API key and host provided above, using the appropriate naming convention for ${config.metadata.name}. Make sure to use these environment variables in the code files you create instead of hardcoding the API key and host. - -Important: Look for lockfiles (pnpm-lock.yaml, package-lock.json, yarn.lock, bun.lockb) to determine the package manager (excluding the contents of node_modules). Do not manually edit package.json. Always install packages as a background task. Don't await completion; proceed with other work immediately after starting the installation. You must read a file immediately before attempting to write it, even if you have previously read it; failure to do so will cause a tool failure. - -`; - - const setupStep = { name: 'setup', prompt: setupPrompt }; - - // Run all steps in a single conversation. After the setup step installs the skill, - // onAfterStep discovers workflow files on disk and adds them as additional steps. - agentResult = await runAgentSteps(agent, [setupStep], options, spinner, { - ...agentRunConfig, - onAfterStep: (stepIndex, stepName) => { - if (stepName === 'setup') { - return discoverBenchmarkSteps( - options.installDir, - config, - projectContext, - ); - } - return []; - }, - }); - } else { - agentResult = await runAgent( - agent, - integrationPrompt, - options, - spinner, - agentRunConfig, - ); - } + const agentResult = await runAgent( + agent, + integrationPrompt, + options, + spinner, + agentRunConfig, + ); // Handle error cases detected in agent output if (agentResult.error === AgentErrorType.MCP_MISSING) { @@ -488,93 +414,3 @@ Important: Look for lockfiles (pnpm-lock.yaml, package-lock.json, yarn.lock, bun `; } - -/** - * Discover installed skill workflow files and build benchmark step prompts. - * Scans .claude/skills/ in the install directory for workflow files (1.0-*, 1.1-*, etc.). - */ -function discoverBenchmarkSteps( - installDir: string, - config: FrameworkConfig, - projectContext: string, -): Array<{ name: string; prompt: string }> { - const skillsDir = path.join(installDir, '.claude', 'skills'); - - if (!fs.existsSync(skillsDir)) { - logToFile('No .claude/skills/ directory found for benchmark discovery'); - return []; - } - - // Find installed skill directory - const skillDirs = fs.readdirSync(skillsDir).filter((entry) => { - const fullPath = path.join(skillsDir, entry); - return fs.statSync(fullPath).isDirectory(); - }); - - if (skillDirs.length === 0) { - logToFile('No skill directories found in .claude/skills/'); - return []; - } - - // Use the first skill directory found - const skillId = skillDirs[0]; - const skillPath = path.join(skillsDir, skillId); - logToFile(`Discovered skill for benchmark: ${skillId}`); - - // Workflow files live in the references/ subdirectory - const referencesDir = path.join(skillPath, 'references'); - if (!fs.existsSync(referencesDir)) { - logToFile('No references/ directory found in skill directory'); - return []; - } - - // Find workflow files matching pattern like "basic-integration-1.0-begin.md" - // The naming convention is {category}-{number}.{step}-{name}.md - const allFiles = fs.readdirSync(referencesDir); - const workflowFiles = allFiles - .filter((f) => /\d+\.\d+-\w+\.md$/.test(f)) - .sort(); - - if (workflowFiles.length === 0) { - logToFile( - `No workflow files found in references/ directory. Files present: ${allFiles.join( - ', ', - )}`, - ); - return []; - } - - logToFile( - `Found ${workflowFiles.length} workflow files: ${workflowFiles.join(', ')}`, - ); - - const steps: Array<{ name: string; prompt: string }> = []; - - for (const workflowFile of workflowFiles) { - // Extract phase name from filename - // e.g., "basic-integration-1.0-begin.md" -> "1.0-begin" - const phaseMatch = workflowFile.match(/(\d+\.\d+-.+)\.md$/); - const phaseName = phaseMatch - ? phaseMatch[1] - : workflowFile.replace(/\.md$/, ''); - - const prompt = `You are performing phase "${phaseName}" of a PostHog integration for a ${config.metadata.name} project. - -${projectContext} - -The PostHog skill is installed at .claude/skills/${skillId}/. -Read SKILL.md in that directory for available reference files. - -Follow the instructions in the workflow file: .claude/skills/${skillId}/references/${workflowFile} - -Important: Read files before writing. Use environment variables, not hardcoded keys. -Do not manually edit package.json. Use lockfiles to determine the package manager. -You must read a file immediately before attempting to write it, even if you have previously read it; failure to do so will cause a tool failure. -Always install packages as a background task. Don't await completion; proceed with other work immediately after starting the installation. -`; - - steps.push({ name: phaseName, prompt }); - } - - return steps; -} diff --git a/src/lib/benchmark.ts b/src/lib/benchmark.ts new file mode 100644 index 0000000..278ed22 --- /dev/null +++ b/src/lib/benchmark.ts @@ -0,0 +1,371 @@ +/** + * Benchmark tracking for wizard runs. + * + * Detects workflow phase transitions by watching for Read tool calls on + * workflow files (e.g., basic-integration-1.0-begin.md). Tracks per-phase + * timing, turns, and token usage. Writes results to a JSON file for CI. + * + * Usage in runAgent(): + * const tracker = options.benchmark ? new BenchmarkTracker() : null; + * // in message loop: + * tracker?.onMessage(message); + * // on success: + * const benchmark = tracker?.finalize(resultMessage, durationMs); + */ + +import fs from 'fs'; +import chalk from 'chalk'; +import clack from '../utils/clack'; +import { logToFile, LOG_FILE_PATH } from '../utils/debug'; + +export const BENCHMARK_FILE_PATH = '/tmp/posthog-wizard-benchmark.json'; + +// ── Types ────────────────────────────────────────────────────────────── + +export interface StepUsage { + name: string; + usage: { + input_tokens: number; + output_tokens: number; + cache_creation_input_tokens: number; + cache_read_input_tokens: number; + }; + modelUsage: Record; + totalCostUsd: number; + durationMs: number; + durationApiMs: number; + numTurns: number; + contextTokensIn?: number; + contextTokensOut?: number; + compactions?: number; + compactionPreTokens?: number[]; +} + +export interface BenchmarkData { + timestamp: string; + steps: StepUsage[]; + totals: { + totalCostUsd: number; + durationMs: number; + inputTokens: number; + outputTokens: number; + numTurns: number; + }; +} + +// ── Formatting helpers ───────────────────────────────────────────────── + +function formatDuration(ms: number): string { + const totalSeconds = Math.round(ms / 1000); + const minutes = Math.floor(totalSeconds / 60); + const seconds = totalSeconds % 60; + if (minutes > 0) return `${minutes}m ${seconds}s`; + return `${seconds}s`; +} + +function formatTokenCount(tokens: number): string { + if (tokens >= 1_000_000) return `${(tokens / 1_000_000).toFixed(1)}M`; + if (tokens >= 10_000) return `${Math.round(tokens / 1000)}K`; + return tokens.toLocaleString(); +} + +// ── Internal helpers ─────────────────────────────────────────────────── + +/** + * Sum token usage across all models from the SDK's modelUsage field. + * modelUsage has per-model aggregates with camelCase field names. + */ +function sumModelUsage(modelUsage: Record): { + input_tokens: number; + output_tokens: number; + cache_creation_input_tokens: number; + cache_read_input_tokens: number; +} { + let input_tokens = 0; + let output_tokens = 0; + let cache_creation_input_tokens = 0; + let cache_read_input_tokens = 0; + + for (const model of Object.values(modelUsage)) { + input_tokens += model.inputTokens ?? 0; + output_tokens += model.outputTokens ?? 0; + cache_creation_input_tokens += model.cacheCreationInputTokens ?? 0; + cache_read_input_tokens += model.cacheReadInputTokens ?? 0; + } + + return { + input_tokens, + output_tokens, + cache_creation_input_tokens, + cache_read_input_tokens, + }; +} + +/** Regex to detect workflow file references: matches "1.0-begin" from file paths/text */ +const WORKFLOW_FILE_RE = /(\d+\.\d+-[a-z]+)(?:\.md)?/; + +// ── BenchmarkTracker ─────────────────────────────────────────────────── + +interface PhaseRecord { + name: string; + startTime: number; + endTime: number; + turns: number; + inputTokens: number; + outputTokens: number; + compactions: number; + compactionPreTokens: number[]; +} + +/** + * Observes the SDK message stream and tracks per-phase metrics. + * + * Phase transitions are detected by watching for Read tool calls on + * workflow files (matching the pattern `*-1.0-begin.md`). Everything + * before the first workflow file is tracked as the "setup" phase. + */ +export class BenchmarkTracker { + private spinner: ReturnType; + private phases: PhaseRecord[] = []; + private currentPhase = 'setup'; + private phaseStartTime: number; + private phaseTurns = 0; + private phaseInputTokens = 0; + private phaseOutputTokens = 0; + private phaseCompactions = 0; + private phaseCompactionPreTokens: number[] = []; + private seenPhases = new Set(); + + constructor(spinner: ReturnType) { + this.spinner = spinner; + this.phaseStartTime = Date.now(); + clack.log.info( + `${chalk.cyan('[BENCHMARK]')} Verbose logs: ${LOG_FILE_PATH}`, + ); + clack.log.info( + `${chalk.cyan( + '[BENCHMARK]', + )} Benchmark data will be written to: ${BENCHMARK_FILE_PATH}`, + ); + clack.log.info( + `${chalk.cyan('[BENCHMARK]')} Starting phase: ${chalk.bold('setup')}`, + ); + logToFile('[BENCHMARK] Starting phase: setup'); + } + + /** + * Feed every SDK message into the tracker. + */ + onMessage(message: any): void { + if (message.type === 'assistant') { + this.phaseTurns++; + + // Accumulate per-turn token usage if the API response includes it + const usage = message.message?.usage; + if (usage) { + this.phaseInputTokens += usage.input_tokens ?? 0; + this.phaseOutputTokens += usage.output_tokens ?? 0; + } + + // Scan all content blocks for workflow phase references + const content = message.message?.content; + if (Array.isArray(content)) { + for (const block of content) { + // Detect from text content (agent mentioning/quoting workflow files) + if (block.type === 'text' && typeof block.text === 'string') { + this.detectPhaseFromText(block.text); + } + // Detect from tool_use blocks (Read tool on workflow files) + if (block.type === 'tool_use') { + const filePath = block.input?.file_path ?? block.input?.path ?? ''; + if (typeof filePath === 'string') { + this.detectPhaseFromText(filePath); + } + } + } + } + } + + // Track compaction events (SDK compact_boundary messages) + if (message.type === 'system' && message.subtype === 'compact_boundary') { + const preTokens = message.compact_metadata?.pre_tokens ?? 0; + const trigger = message.compact_metadata?.trigger ?? 'unknown'; + this.phaseCompactions++; + this.phaseCompactionPreTokens.push(preTokens); + logToFile( + `[BENCHMARK] [COMPACTION] Context compacted during "${ + this.currentPhase + }" (trigger: ${trigger}, pre_tokens: ${formatTokenCount(preTokens)})`, + ); + clack.log.info( + `${chalk.yellow('[COMPACTION]')} Context compacted during "${ + this.currentPhase + }" (trigger: ${trigger}, pre_tokens: ${formatTokenCount(preTokens)})`, + ); + } + } + + private detectPhaseFromText(text: string): void { + const match = text.match(WORKFLOW_FILE_RE); + if (match && !this.seenPhases.has(match[1])) { + this.transitionTo(match[1]); + } + } + + /** + * Close tracking and build the final BenchmarkData. + * Call this when the agent result message is received. + */ + finalize(resultMessage: any, totalDurationMs: number): BenchmarkData { + // Close the current (last) phase + this.closeCurrentPhase(); + + // Build per-phase StepUsage from tracked phases + aggregate from result + const modelUsage = resultMessage?.modelUsage ?? {}; + const aggregateUsage = sumModelUsage(modelUsage); + const lastCallUsage = resultMessage?.usage ?? {}; + const contextTokensOut = + Number(lastCallUsage.input_tokens ?? 0) + + Number(lastCallUsage.cache_read_input_tokens ?? 0) + + Number(lastCallUsage.cache_creation_input_tokens ?? 0); + + const totalTurns = this.phases.reduce((s, p) => s + p.turns, 0); + const totalCost = resultMessage?.total_cost_usd ?? 0; + + const steps: StepUsage[] = this.phases.map((phase) => ({ + name: phase.name, + // Per-phase token usage from assistant message usage fields + usage: { + input_tokens: phase.inputTokens, + output_tokens: phase.outputTokens, + cache_creation_input_tokens: 0, + cache_read_input_tokens: 0, + }, + modelUsage: {}, + // Proportional cost estimate based on turns + totalCostUsd: totalTurns > 0 ? totalCost * (phase.turns / totalTurns) : 0, + durationMs: phase.endTime - phase.startTime, + durationApiMs: 0, + numTurns: phase.turns, + ...(phase.compactions > 0 && { + compactions: phase.compactions, + compactionPreTokens: phase.compactionPreTokens, + }), + })); + + // Stamp context size on the last step + if (steps.length > 0) { + steps[steps.length - 1].contextTokensOut = contextTokensOut; + } + + const benchmark: BenchmarkData = { + timestamp: new Date().toISOString(), + steps, + totals: { + totalCostUsd: totalCost, + durationMs: totalDurationMs, + inputTokens: + aggregateUsage.input_tokens + + aggregateUsage.cache_read_input_tokens + + aggregateUsage.cache_creation_input_tokens, + outputTokens: aggregateUsage.output_tokens, + numTurns: resultMessage?.num_turns ?? totalTurns, + }, + }; + + // Log summary + const totalDurationStr = formatDuration(totalDurationMs); + clack.log.success( + `${chalk.cyan('[BENCHMARK]')} ${ + this.phases.length + } phases completed in ${totalDurationStr}, cost: $${totalCost.toFixed( + 4, + )}`, + ); + clack.log.info( + `${chalk.cyan('[BENCHMARK]')} Results written to ${BENCHMARK_FILE_PATH}`, + ); + + writeBenchmarkData(benchmark); + return benchmark; + } + + // ── Private ──────────────────────────────────────────────────────── + + private transitionTo(newPhase: string): void { + // Stop spinner so log output is visible + this.spinner.stop( + `${chalk.cyan('[BENCHMARK]')} Completed phase: ${chalk.bold( + this.currentPhase, + )} ${chalk.dim(`(${this.formatPhaseStats()})`)}`, + ); + this.closeCurrentPhase(); + + this.seenPhases.add(newPhase); + this.currentPhase = newPhase; + this.phaseStartTime = Date.now(); + this.phaseTurns = 0; + this.phaseInputTokens = 0; + this.phaseOutputTokens = 0; + this.phaseCompactions = 0; + this.phaseCompactionPreTokens = []; + + clack.log.info( + `${chalk.cyan('[BENCHMARK]')} Starting phase: ${chalk.bold(newPhase)}`, + ); + logToFile(`[BENCHMARK] Starting phase: ${newPhase}`); + + // Restart spinner + this.spinner.start(`Integrating PostHog (${newPhase})...`); + } + + private closeCurrentPhase(): void { + const now = Date.now(); + + this.phases.push({ + name: this.currentPhase, + startTime: this.phaseStartTime, + endTime: now, + turns: this.phaseTurns, + inputTokens: this.phaseInputTokens, + outputTokens: this.phaseOutputTokens, + compactions: this.phaseCompactions, + compactionPreTokens: [...this.phaseCompactionPreTokens], + }); + + logToFile( + `[BENCHMARK] Completed phase: ${ + this.currentPhase + } (${this.formatPhaseStats()})`, + ); + } + + private formatPhaseStats(): string { + const duration = Date.now() - this.phaseStartTime; + const parts = [formatDuration(duration), `${this.phaseTurns} turns`]; + if (this.phaseInputTokens > 0 || this.phaseOutputTokens > 0) { + parts.push( + `in: ${formatTokenCount(this.phaseInputTokens)}`, + `out: ${formatTokenCount(this.phaseOutputTokens)}`, + ); + } + if (this.phaseCompactions > 0) { + parts.push(`${this.phaseCompactions} compaction(s)`); + } + return parts.join(', '); + } +} + +// ── File I/O ─────────────────────────────────────────────────────────── + +/** + * Write benchmark data to the benchmark file. + */ +export function writeBenchmarkData(data: BenchmarkData): void { + try { + fs.writeFileSync(BENCHMARK_FILE_PATH, JSON.stringify(data, null, 2)); + logToFile(`Benchmark data written to ${BENCHMARK_FILE_PATH}`); + } catch (error) { + logToFile('Failed to write benchmark data:', error); + } +}