diff --git a/bin.ts b/bin.ts
index 7c82ae0..e8e1341 100644
--- a/bin.ts
+++ b/bin.ts
@@ -115,6 +115,12 @@ yargs(hideBin(process.argv))
         'Show menu for manual integration selection instead of auto-detecting\nenv: POSTHOG_WIZARD_MENU',
       type: 'boolean',
     },
+    benchmark: {
+      default: false,
+      describe:
+        'Run in benchmark mode with per-phase token tracking\nenv: POSTHOG_WIZARD_BENCHMARK',
+      type: 'boolean',
+    },
   });
 },
 (argv) => {
diff --git a/src/lib/__tests__/agent-interface.test.ts b/src/lib/__tests__/agent-interface.test.ts
index beed455..d11a257 100644
--- a/src/lib/__tests__/agent-interface.test.ts
+++ b/src/lib/__tests__/agent-interface.test.ts
@@ -32,6 +32,7 @@ describe('runAgent', () => {
     localMcp: false,
     ci: false,
     menu: false,
+    benchmark: false,
   };
 
   const defaultAgentConfig = {
diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts
index 2c315fe..a06acfd 100644
--- a/src/lib/agent-interface.ts
+++ b/src/lib/agent-interface.ts
@@ -3,7 +3,9 @@
  * Uses Claude Agent SDK directly with PostHog LLM gateway
  */
 
+import fs from 'fs';
 import path from 'path';
+import chalk from 'chalk';
 import clack from '../utils/clack';
 import { debug, logToFile, initLogFile, LOG_FILE_PATH } from '../utils/debug';
 import type { WizardOptions } from '../utils/types';
@@ -84,6 +86,43 @@ type AgentRunConfig = {
   model: string;
 };
 
+export const BENCHMARK_FILE_PATH = '/tmp/posthog-wizard-benchmark.json';
+
+export interface StepUsage {
+  name: string;
+  usage: {
+    input_tokens: number;
+    output_tokens: number;
+    cache_creation_input_tokens: number;
+    cache_read_input_tokens: number;
+  };
+  modelUsage: Record<string, Record<string, number>>;
+  totalCostUsd: number;
+  durationMs: number;
+  durationApiMs: number;
+  numTurns: number;
+  /** Conversation context size (tokens) entering this step */
+  contextTokensIn: number;
+  /** Conversation context size (tokens) exiting this step */
+  contextTokensOut: number;
+  /** Number of auto-compactions that occurred during this step */
+  compactions?: number;
+  /** Token count before each compaction (from SDK compact_boundary messages) */
+  compactionPreTokens?: number[];
+}
+
+export interface BenchmarkData {
+  timestamp: string;
+  steps: StepUsage[];
+  totals: {
+    totalCostUsd: number;
+    durationMs: number;
+    inputTokens: number;
+    outputTokens: number;
+    numTurns: number;
+  };
+}
+
 /**
  * Package managers that can be used to run commands.
  */
@@ -380,7 +419,11 @@ export async function runAgent(
     successMessage?: string;
     errorMessage?: string;
   },
-): Promise<{ error?: AgentErrorType; message?: string }> {
+): Promise<{
+  error?: AgentErrorType;
+  message?: string;
+  benchmark?: BenchmarkData;
+}> {
   const {
     estimatedDurationMinutes = 8,
     spinnerMessage = 'Customizing your PostHog setup...',
@@ -405,6 +448,8 @@
   const collectedText: string[] = [];
   // Track if we received a successful result (before any cleanup errors)
   let receivedSuccessResult = false;
+  // Track the result message for benchmark data extraction
+  let resultMessage: SDKMessage | null = null;
 
   // Workaround for SDK bug: stdin closes before canUseTool responses can be sent.
   // The fix is to use an async generator for the prompt that stays open until
@@ -429,7 +474,11 @@
   // Helper to handle successful completion (used in normal path and race condition recovery)
   const completeWithSuccess = (
     suppressedError?: Error,
-  ): { error?: AgentErrorType; message?: string } => {
+  ): {
+    error?: AgentErrorType;
+    message?: string;
+    benchmark?: BenchmarkData;
+  } => {
     const durationMs = Date.now() - startTime;
     const durationSeconds = Math.round(durationMs / 1000);
 
@@ -465,7 +514,19 @@
       duration_seconds: durationSeconds,
     });
     spinner.stop(successMessage);
-    return {};
+
+    // Write benchmark data from the single-query result if available
+    let benchmark: BenchmarkData | undefined;
+    if (resultMessage && options.benchmark) {
+      benchmark = extractBenchmarkFromResult(
+        'single-run',
+        resultMessage,
+        durationMs,
+      );
+      writeBenchmarkData(benchmark);
+    }
+
+    return { benchmark };
   };
 
   try {
@@ -568,6 +629,7 @@
         // The SDK may emit a second error result during cleanup due to a race condition
         if (message.subtype === 'success' && !message.is_error) {
           receivedSuccessResult = true;
+          resultMessage = message;
         }
         signalDone!();
       }
@@ -644,6 +706,611 @@
   }
 }
 
+/**
+ * Format milliseconds into a human-readable duration string (e.g., "2m 34s", "45s").
+ */
+function formatDuration(ms: number): string {
+  const totalSeconds = Math.round(ms / 1000);
+  const minutes = Math.floor(totalSeconds / 60);
+  const seconds = totalSeconds % 60;
+  if (minutes > 0) {
+    return `${minutes}m ${seconds}s`;
+  }
+  return `${seconds}s`;
+}
+
+/**
+ * Format a token count into a human-readable string (e.g., "1.2M", "345K", "1,234").
+ */
+function formatTokenCount(tokens: number): string {
+  if (tokens >= 1_000_000) {
+    return `${(tokens / 1_000_000).toFixed(1)}M`;
+  }
+  if (tokens >= 10_000) {
+    return `${Math.round(tokens / 1000)}K`;
+  }
+  return tokens.toLocaleString();
+}
+
+/**
+ * Sum token usage across all models from the SDK's modelUsage field.
+ * The top-level `usage` field only has the last API call's tokens;
+ * `modelUsage` has the accurate per-model aggregates (camelCase fields).
+ */
+function sumModelUsage(modelUsage: Record<string, Record<string, number>>): {
+  input_tokens: number;
+  output_tokens: number;
+  cache_creation_input_tokens: number;
+  cache_read_input_tokens: number;
+} {
+  let input_tokens = 0;
+  let output_tokens = 0;
+  let cache_creation_input_tokens = 0;
+  let cache_read_input_tokens = 0;
+
+  for (const model of Object.values(modelUsage)) {
+    input_tokens += model.inputTokens ?? 0;
+    output_tokens += model.outputTokens ?? 0;
+    cache_creation_input_tokens += model.cacheCreationInputTokens ?? 0;
+    cache_read_input_tokens += model.cacheReadInputTokens ?? 0;
+  }
+
+  return {
+    input_tokens,
+    output_tokens,
+    cache_creation_input_tokens,
+    cache_read_input_tokens,
+  };
+}
+
+/**
+ * Extract benchmark data from a single SDK result message.
+ */
+function extractBenchmarkFromResult(
+  stepName: string,
+  message: SDKMessage,
+  wallDurationMs: number,
+): BenchmarkData {
+  const modelUsage = message.modelUsage ?? {};
+  const usage = sumModelUsage(modelUsage);
+  const lastCallUsage = message.usage ?? {};
+  const contextTokensOut =
+    Number(lastCallUsage.input_tokens ?? 0) +
+    Number(lastCallUsage.cache_read_input_tokens ?? 0) +
+    Number(lastCallUsage.cache_creation_input_tokens ?? 0);
+  const step: StepUsage = {
+    name: stepName,
+    usage,
+    modelUsage,
+    totalCostUsd: message.total_cost_usd ?? 0,
+    durationMs: message.duration_ms ?? wallDurationMs,
+    durationApiMs: message.duration_api_ms ?? 0,
+    numTurns: message.num_turns ?? 0,
+    contextTokensIn: 0,
+    contextTokensOut,
+  };
+
+  return {
+    timestamp: new Date().toISOString(),
+    steps: [step],
+    totals: {
+      totalCostUsd: step.totalCostUsd,
+      durationMs: step.durationMs,
+      inputTokens: step.usage.input_tokens,
+      outputTokens: step.usage.output_tokens,
+      numTurns: step.numTurns,
+    },
+  };
+}
+
+/**
+ * Write benchmark data to the benchmark file.
+ */
+function writeBenchmarkData(data: BenchmarkData): void {
+  try {
+    fs.writeFileSync(BENCHMARK_FILE_PATH, JSON.stringify(data, null, 2));
+    logToFile(`Benchmark data written to ${BENCHMARK_FILE_PATH}`);
+  } catch (error) {
+    logToFile('Failed to write benchmark data:', error);
+  }
+}
+
+/**
+ * Execute multiple agent steps in a single conversation with per-step usage tracking.
+ * Uses one query() call with multiple user messages, so conversation context is preserved
+ * across steps (identical behavior to normal non-benchmark mode).
+ *
+ * Steps can be discovered dynamically via the onAfterStep callback — e.g., after the
+ * setup step installs a skill, onAfterStep discovers the workflow files and returns
+ * them as additional steps to run in the same conversation.
+ *
+ * Per-step usage is computed as deltas between consecutive SDK result messages.
+ *
+ * Writes benchmark data to BENCHMARK_FILE_PATH when all steps complete.
+ */
+export async function runAgentSteps(
+  agentConfig: AgentRunConfig,
+  initialSteps: Array<{ name: string; prompt: string }>,
+  options: WizardOptions,
+  spinner: ReturnType<typeof clack.spinner>,
+  config?: {
+    estimatedDurationMinutes?: number;
+    spinnerMessage?: string;
+    successMessage?: string;
+    errorMessage?: string;
+    /** Called after each step completes. Return additional steps to append to the queue. */
+    onAfterStep?: (
+      stepIndex: number,
+      stepName: string,
+    ) => Array<{ name: string; prompt: string }>;
+  },
+): Promise<{
+  error?: AgentErrorType;
+  message?: string;
+  benchmark?: BenchmarkData;
+}> {
+  const {
+    estimatedDurationMinutes = 8,
+    spinnerMessage = 'Customizing your PostHog setup...',
+    successMessage = 'PostHog integration complete',
+    errorMessage = 'Integration failed',
+    onAfterStep,
+  } = config ?? {};
+
+  const { query } = await getSDKModule();
+
+  clack.log.step(
+    `This whole process should take about ${estimatedDurationMinutes} minutes including error checking and fixes.\n\nGrab some coffee!`,
+  );
+  clack.log.info(`${chalk.cyan('[BENCHMARK]')} Verbose logs: ${LOG_FILE_PATH}`);
+  clack.log.info(
+    `${chalk.cyan(
+      '[BENCHMARK]',
+    )} Benchmark data will be written to: ${BENCHMARK_FILE_PATH}`,
+  );
+
+  spinner.start(spinnerMessage);
+
+  const overallStartTime = Date.now();
+  const stepUsages: StepUsage[] = [];
+  const collectedText: string[] = [];
+  let receivedSuccessResult = false;
+
+  // Dynamic steps list — grows as onAfterStep discovers more
+  const allSteps = [...initialSteps];
+  const stepStartTimes: number[] = [];
+  let completedStepCount = 0;
+
+  // Per-step compaction tracking (reset after each step)
+  let stepCompactions = 0;
+  let stepCompactionPreTokens: number[] = [];
+
+  // Previous cumulative values for delta computation
+  let prevCumulative = {
+    usage: {
+      input_tokens: 0,
+      output_tokens: 0,
+      cache_creation_input_tokens: 0,
+      cache_read_input_tokens: 0,
+    },
+    modelUsage: {} as Record<string, Record<string, number>>,
+    costUsd: 0,
+    durationMs: 0,
+    durationApiMs: 0,
+    numTurns: 0,
+  };
+
+  // Step completion synchronization: resolves with `true` on success, `false` on error
+  // eslint-disable-next-line @typescript-eslint/no-empty-function
+  let resolveStepDone: (success: boolean) => void = () => {};
+  function waitForStepDone(): Promise<boolean> {
+    return new Promise((resolve) => {
+      resolveStepDone = resolve;
+    });
+  }
+
+  // Final cleanup signal for SDK stdin workaround
+  let signalAllDone: () => void;
+  const allDone = new Promise<void>((resolve) => {
+    signalAllDone = resolve;
+  });
+
+  // Prompt stream generator — yields user messages for each step in order,
+  // pausing between steps to wait for the result and discover more steps.
+  const promptStream = async function* () {
+    let i = 0;
+    while (i < allSteps.length) {
+      const step = allSteps[i];
+      stepStartTimes[i] = Date.now();
+
+      logToFile(`Yielding benchmark step ${i + 1}: ${step.name}`);
+      spinner.stop(
+        `${chalk.cyan('[BENCHMARK]')} Starting step ${i + 1}/${
+          allSteps.length
+        }: ${chalk.bold(step.name)}`,
+      );
+      spinner.start(
+        `Running step ${i + 1}/${allSteps.length}: ${step.name}...`,
+      );
+
+      yield {
+        type: 'user',
+        session_id: '',
+        message: { role: 'user', content: step.prompt },
+        parent_tool_use_id: null,
+      };
+
+      // Wait for this step's result before yielding the next prompt
+      const success = await waitForStepDone();
+      if (!success) {
+        // Step failed — stop yielding, let the generator end
+        break;
+      }
+
+      // Discover more steps after this one completes
+      if (onAfterStep) {
+        const moreSteps = onAfterStep(i, step.name);
+        if (moreSteps.length > 0) {
+          allSteps.push(...moreSteps);
+          clack.log.info(
+            `${chalk.cyan('[BENCHMARK]')} Discovered ${
+              moreSteps.length
+            } more phases: ${moreSteps.map((s) => s.name).join(', ')}`,
+          );
+        }
+      }
+
+      i++;
+    }
+
+    // Keep generator alive for SDK cleanup (stdin workaround)
+    await allDone;
+  };
+
+  const allowedTools = [
+    'Read',
+    'Write',
+    'Edit',
+    'Glob',
+    'Grep',
+    'Bash',
+    'ListMcpResourcesTool',
+    'Skill',
+  ];
+
+  try {
+    const response = query({
+      prompt: promptStream(),
+      options: {
+        model: agentConfig.model,
+        cwd: agentConfig.workingDirectory,
+        permissionMode: 'acceptEdits',
+        mcpServers: agentConfig.mcpServers,
+        settingSources: ['project'],
+        allowedTools,
+        env: {
+          ...process.env,
+          ANTHROPIC_API_KEY: undefined,
+        },
+        canUseTool: (toolName: string, input: unknown) => {
+          logToFile('canUseTool called:', { toolName, input });
+          const result = wizardCanUseTool(
+            toolName,
+            input as Record<string, unknown>,
+          );
+          logToFile('canUseTool result:', result);
+          return Promise.resolve(result);
+        },
+        tools: { type: 'preset', preset: 'claude_code' },
+        stderr: (data: string) => {
+          logToFile('CLI stderr:', data);
+          if (options.debug) {
+            debug('CLI stderr:', data);
+          }
+        },
+      },
+    });
+
+    for await (const message of response) {
+      handleSDKMessage(
+        message,
+        options,
+        spinner,
+        collectedText,
+        receivedSuccessResult,
+      );
+
+      // Track compaction events from the SDK
+      if (message.type === 'system' && message.subtype === 'compact_boundary') {
+        const preTokens = message.compact_metadata?.pre_tokens ?? 0;
+        const trigger = message.compact_metadata?.trigger ?? 'unknown';
+        stepCompactions++;
+        stepCompactionPreTokens.push(preTokens);
+        logToFile(
+          `[COMPACTION] Context compacted (trigger: ${trigger}, pre_tokens: ${formatTokenCount(
+            preTokens,
+          )})`,
+        );
+        clack.log.info(
+          `${chalk.yellow('[COMPACTION]')} Context compacted during step "${
+            allSteps[completedStepCount]?.name
+          }" (trigger: ${trigger}, pre_tokens: ${formatTokenCount(preTokens)})`,
+        );
+      }
+
+      if (message.type === 'result') {
+        if (message.subtype === 'success' && !message.is_error) {
+          receivedSuccessResult = true;
+
+          const stepIndex = completedStepCount;
+          const stepDurationMs = Date.now() - stepStartTimes[stepIndex];
+
+          // Compute delta usage from cumulative SDK values
+          const modelUsageData = message.modelUsage ?? {};
+          const cumulativeUsage = sumModelUsage(modelUsageData);
+          const cumulativeCost = message.total_cost_usd ?? 0;
+          const cumulativeDuration = message.duration_ms ?? 0;
+          const cumulativeDurationApi = message.duration_api_ms ?? 0;
+          const cumulativeTurns = message.num_turns ?? 0;
+
+          const deltaUsage = {
+            input_tokens:
+              cumulativeUsage.input_tokens - prevCumulative.usage.input_tokens,
+            output_tokens:
+              cumulativeUsage.output_tokens -
+              prevCumulative.usage.output_tokens,
+            cache_creation_input_tokens:
+              cumulativeUsage.cache_creation_input_tokens -
+              prevCumulative.usage.cache_creation_input_tokens,
+            cache_read_input_tokens:
+              cumulativeUsage.cache_read_input_tokens -
+              prevCumulative.usage.cache_read_input_tokens,
+          };
+          const deltaCost = cumulativeCost - prevCumulative.costUsd;
+          // num_turns is per-response (not cumulative), so use directly
+          const stepTurns = cumulativeTurns;
+          const deltaDurationApi =
+            cumulativeDurationApi - prevCumulative.durationApiMs;
+          const deltaModelUsage = computeModelUsageDelta(
+            modelUsageData,
+            prevCumulative.modelUsage,
+          );
+
+          // Context size from the last API call's usage (not cumulative modelUsage).
+          // The last call's input represents the actual conversation window at that point.
+          const lastCallUsage = message.usage ?? {};
+          const contextTokensOut =
+            Number(lastCallUsage.input_tokens ?? 0) +
+            Number(lastCallUsage.cache_read_input_tokens ?? 0) +
+            Number(lastCallUsage.cache_creation_input_tokens ?? 0);
+          const contextTokensIn =
+            stepUsages.length > 0
+              ? stepUsages[stepUsages.length - 1].contextTokensOut
+              : 0;
+
+          stepUsages.push({
+            name: allSteps[stepIndex].name,
+            usage: deltaUsage,
+            modelUsage: deltaModelUsage,
+            totalCostUsd: deltaCost,
+            durationMs: stepDurationMs,
+            durationApiMs: deltaDurationApi,
+            numTurns: stepTurns,
+            contextTokensIn,
+            contextTokensOut,
+            ...(stepCompactions > 0 && {
+              compactions: stepCompactions,
+              compactionPreTokens: stepCompactionPreTokens,
+            }),
+          });
+
+          // Reset per-step compaction tracking
+          stepCompactions = 0;
+          stepCompactionPreTokens = [];
+
+          // Update cumulative tracking
+          prevCumulative = {
+            usage: cumulativeUsage,
+            modelUsage: modelUsageData,
+            costUsd: cumulativeCost,
+            durationMs: cumulativeDuration,
+            durationApiMs: cumulativeDurationApi,
+            numTurns: cumulativeTurns,
+          };
+
+          spinner.stop(
+            `${chalk.cyan('[BENCHMARK]')} Completed step ${stepIndex + 1}/${
+              allSteps.length
+            }: ${chalk.bold(allSteps[stepIndex].name)} ${chalk.dim(
+              `(${formatDuration(stepDurationMs)}, $${deltaCost.toFixed(
+                4,
+              )}, ${stepTurns} turns, ctx: ${formatTokenCount(
+                contextTokensIn,
+              )} → ${formatTokenCount(contextTokensOut)})`,
+            )}`,
+          );
+          logToFile(
+            `Step "${allSteps[stepIndex].name}" completed in ${Math.round(
+              stepDurationMs / 1000,
+            )}s`,
+          );
+
+          completedStepCount++;
+          resolveStepDone(true);
+        } else {
+          // Error result — signal generator to stop yielding
+          resolveStepDone(false);
+        }
+
+        // Signal generator cleanup when all done
+        if (completedStepCount >= allSteps.length) {
+          signalAllDone!();
+        }
+      }
+    }
+
+    // Check for error signals in collected output
+    const outputText = collectedText.join('\n');
+    if (outputText.includes(AgentSignals.ERROR_MCP_MISSING)) {
+      spinner.stop('Agent could not access PostHog MCP');
+      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+      writeBenchmarkData(benchmark);
+      return { error: AgentErrorType.MCP_MISSING, benchmark };
+    }
+    if (outputText.includes(AgentSignals.ERROR_RESOURCE_MISSING)) {
+      spinner.stop('Agent could not access setup resource');
+      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+      writeBenchmarkData(benchmark);
+      return { error: AgentErrorType.RESOURCE_MISSING, benchmark };
+    }
+    if (outputText.includes('API Error: 429')) {
+      spinner.stop('Rate limit exceeded');
+      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+      writeBenchmarkData(benchmark);
+      return {
+        error: AgentErrorType.RATE_LIMIT,
+        message: outputText,
+        benchmark,
+      };
+    }
+    if (outputText.includes('API Error:')) {
+      spinner.stop('API error occurred');
+      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+      writeBenchmarkData(benchmark);
+      return {
+        error: AgentErrorType.API_ERROR,
+        message: outputText,
+        benchmark,
+      };
+    }
+
+    const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+    writeBenchmarkData(benchmark);
+
+    const totalDurationSeconds = Math.round(
+      (Date.now() - overallStartTime) / 1000,
+    );
+    const totalCost = stepUsages.reduce((sum, s) => sum + s.totalCostUsd, 0);
+    clack.log.success(
+      `${chalk.cyan(
+        '[BENCHMARK]',
+      )} All ${completedStepCount} steps completed in ${formatDuration(
+        totalDurationSeconds * 1000,
+      )}, total cost: $${totalCost.toFixed(4)}`,
+    );
+    clack.log.info(
+      `${chalk.cyan('[BENCHMARK]')} Results written to ${BENCHMARK_FILE_PATH}`,
+    );
+    logToFile(
+      `All ${completedStepCount} benchmark steps completed in ${totalDurationSeconds}s`,
+    );
+
+    analytics.capture(WIZARD_INTERACTION_EVENT_NAME, {
+      action: 'agent integration completed',
+      duration_ms: Date.now() - overallStartTime,
+      duration_seconds: totalDurationSeconds,
+      benchmark_steps: completedStepCount,
+    });
+
+    spinner.stop(successMessage);
+    return { benchmark };
+  } catch (error) {
+    signalAllDone!();
+
+    if (receivedSuccessResult) {
+      logToFile('Ignoring post-completion error, agent completed successfully');
+      const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+      writeBenchmarkData(benchmark);
+      spinner.stop(successMessage);
+      return { benchmark };
+    }
+
+    spinner.stop(errorMessage);
+    const benchmark = buildBenchmarkData(stepUsages, overallStartTime);
+    writeBenchmarkData(benchmark);
+
+    const outputText = collectedText.join('\n');
+    const apiErrorMatch = outputText.match(/API Error: [^\n]+/g);
+    const apiErrorMessage = apiErrorMatch
+      ? apiErrorMatch.join('\n')
+      : undefined;
+
+    if (outputText.includes('API Error: 429')) {
+      return {
+        error: AgentErrorType.RATE_LIMIT,
+        message: apiErrorMessage,
+        benchmark,
+      };
+    }
+    if (outputText.includes('API Error:')) {
+      return {
+        error: AgentErrorType.API_ERROR,
+        message: apiErrorMessage,
+        benchmark,
+      };
+    }
+
+    throw error;
+  }
+}
+
+/**
+ * Compute per-model usage deltas between current and previous cumulative modelUsage.
+ */
+function computeModelUsageDelta(
+  current: Record<string, Record<string, number>>,
+  previous: Record<string, Record<string, number>>,
+): Record<string, Record<string, number>> {
+  const result: Record<string, Record<string, number>> = {};
+  for (const [model, data] of Object.entries(current)) {
+    const prev = previous[model] ?? {};
+    result[model] = {
+      inputTokens: (data.inputTokens ?? 0) - (prev.inputTokens ?? 0),
+      outputTokens: (data.outputTokens ?? 0) - (prev.outputTokens ?? 0),
+      cacheReadInputTokens:
+        (data.cacheReadInputTokens ?? 0) - (prev.cacheReadInputTokens ?? 0),
+      cacheCreationInputTokens:
+        (data.cacheCreationInputTokens ?? 0) -
+        (prev.cacheCreationInputTokens ?? 0),
+      webSearchRequests:
+        (data.webSearchRequests ?? 0) - (prev.webSearchRequests ?? 0),
+      costUSD: (data.costUSD ?? 0) - (prev.costUSD ?? 0),
+      contextWindow: data.contextWindow ?? 0,
+    };
+  }
+  return result;
+}
+
+/**
+ * Build BenchmarkData from collected step usages.
+ */
+function buildBenchmarkData(
+  stepUsages: StepUsage[],
+  overallStartTime: number,
+): BenchmarkData {
+  return {
+    timestamp: new Date().toISOString(),
+    steps: stepUsages,
+    totals: {
+      totalCostUsd: stepUsages.reduce((sum, s) => sum + s.totalCostUsd, 0),
+      durationMs: Date.now() - overallStartTime,
+      inputTokens: stepUsages.reduce(
+        (sum, s) =>
+          sum +
+          s.usage.input_tokens +
+          s.usage.cache_read_input_tokens +
+          s.usage.cache_creation_input_tokens,
+        0,
+      ),
+      outputTokens: stepUsages.reduce(
+        (sum, s) => sum + s.usage.output_tokens,
+        0,
+      ),
+      numTurns: stepUsages.reduce((sum, s) => sum + s.numTurns, 0),
+    },
+  };
+}
+
 /**
  * Handle SDK messages and provide user feedback
  *
@@ -739,7 +1406,6 @@ function handleSDKMessage(
     }
 
     default:
-      // Log other message types for debugging
       if (options.debug) {
        debug(`Unhandled message type: ${message.type}`);
       }
diff --git a/src/lib/agent-runner.ts b/src/lib/agent-runner.ts
index d319df2..bd75599 100644
--- a/src/lib/agent-runner.ts
+++ b/src/lib/agent-runner.ts
@@ -22,9 +22,13 @@ import clack from '../utils/clack';
 import {
   initializeAgent,
   runAgent,
+  runAgentSteps,
   AgentSignals,
   AgentErrorType,
 } from './agent-interface';
+import { logToFile } from '../utils/debug';
+import fs from 'fs';
+import path from 'path';
 import { getCloudUrlFromRegion } from '../utils/urls';
 import chalk from 'chalk';
 import * as semver from 'semver';
@@ -193,18 +197,90 @@
     options,
   );
 
-  const agentResult = await runAgent(
-    agent,
-    integrationPrompt,
-    options,
-    spinner,
-    {
-      estimatedDurationMinutes: config.ui.estimatedDurationMinutes,
-      spinnerMessage: SPINNER_MESSAGE,
-      successMessage: config.ui.successMessage,
-      errorMessage: 'Integration failed',
-    },
-  );
+  const agentRunConfig = {
+    estimatedDurationMinutes: config.ui.estimatedDurationMinutes,
+    spinnerMessage: SPINNER_MESSAGE,
+    successMessage: config.ui.successMessage,
+    errorMessage: 'Integration failed',
+  };
+
+  let agentResult;
+
+  if (options.benchmark) {
+    clack.log.info(
+      `${chalk.cyan(
+        '[BENCHMARK]',
+      )} Running in benchmark mode — each workflow phase will be tracked separately`,
+    );
+
+    // Benchmark mode: run setup + workflow phases in a single conversation,
+    // with per-step tracking. Context is preserved across steps (identical to normal mode).
+    const additionalLines = config.prompts.getAdditionalContextLines
+      ? config.prompts.getAdditionalContextLines(frameworkContext)
+      : [];
+    const additionalContext =
+      additionalLines.length > 0
+        ? '\n' + additionalLines.map((line) => `- ${line}`).join('\n')
+        : '';
+
+    const projectContext = `Project context:
+- Framework: ${config.metadata.name} ${frameworkVersion || 'latest'}
+- TypeScript: ${typeScriptDetected ? 'Yes' : 'No'}
+- PostHog API Key: ${projectApiKey}
+- PostHog Host: ${host}${additionalContext}`;
+
+    const setupPrompt = `You have access to the PostHog MCP server which provides skills to integrate PostHog into this ${config.metadata.name} project.
+
+${projectContext}
+
+Instructions (follow these steps IN ORDER - do not skip or reorder):
+
+STEP 1: List available skills from the PostHog MCP server using ListMcpResourcesTool. If this tool is not available or you cannot access the MCP server, you must emit: ${AgentSignals.ERROR_MCP_MISSING} Could not access the PostHog MCP server and halt.
+   Review the skill descriptions and choose the one that best matches this project's framework and configuration.
+   If no suitable skill is found, or you cannot access the MCP server, you must emit: ${AgentSignals.ERROR_RESOURCE_MISSING} Could not find a suitable skill for this project.
+
+STEP 2: Fetch the chosen skill resource (e.g., posthog://skills/{skill-id}).
+   The resource returns a shell command to install the skill.
+
+STEP 3: Run the installation command using Bash:
+   - Execute the EXACT command returned by the resource (do not modify it)
+   - This will download and extract the skill to .claude/skills/{skill-id}/
+
+STEP 4: Load the installed skill's SKILL.md file to understand what references are available.
+
+STEP 5: Set up environment variables for PostHog in a .env file with the API key and host provided above, using the appropriate naming convention for ${config.metadata.name}. Make sure to use these environment variables in the code files you create instead of hardcoding the API key and host.
+
+Important: Look for lockfiles (pnpm-lock.yaml, package-lock.json, yarn.lock, bun.lockb) to determine the package manager (excluding the contents of node_modules). Do not manually edit package.json. Always install packages as a background task. Don't await completion; proceed with other work immediately after starting the installation. You must read a file immediately before attempting to write it, even if you have previously read it; failure to do so will cause a tool failure.
+
+`;
+
+    const setupStep = { name: 'setup', prompt: setupPrompt };
+
+    // Run all steps in a single conversation. After the setup step installs the skill,
+    // onAfterStep discovers workflow files on disk and adds them as additional steps.
+    agentResult = await runAgentSteps(agent, [setupStep], options, spinner, {
+      ...agentRunConfig,
+      onAfterStep: (stepIndex, stepName) => {
+        if (stepName === 'setup') {
+          return discoverBenchmarkSteps(
+            options.installDir,
+            config,
+            projectContext,
+          );
+        }
+        return [];
+      },
+    });
+  } else {
+    agentResult = await runAgent(
+      agent,
+      integrationPrompt,
+      options,
+      spinner,
+      agentRunConfig,
+    );
+  }
 
   // Handle error cases detected in agent output
   if (agentResult.error === AgentErrorType.MCP_MISSING) {
@@ -412,3 +488,93 @@ Important: Look for lockfiles (pnpm-lock.yaml, package-lock.json, yarn.lock, bun
 `;
 }
+
+/**
+ * Discover installed skill workflow files and build benchmark step prompts.
+ * Scans .claude/skills/ in the install directory for workflow files (1.0-*, 1.1-*, etc.).
+ */
+function discoverBenchmarkSteps(
+  installDir: string,
+  config: FrameworkConfig,
+  projectContext: string,
+): Array<{ name: string; prompt: string }> {
+  const skillsDir = path.join(installDir, '.claude', 'skills');
+
+  if (!fs.existsSync(skillsDir)) {
+    logToFile('No .claude/skills/ directory found for benchmark discovery');
+    return [];
+  }
+
+  // Find installed skill directory
+  const skillDirs = fs.readdirSync(skillsDir).filter((entry) => {
+    const fullPath = path.join(skillsDir, entry);
+    return fs.statSync(fullPath).isDirectory();
+  });
+
+  if (skillDirs.length === 0) {
+    logToFile('No skill directories found in .claude/skills/');
+    return [];
+  }
+
+  // Use the first skill directory found
+  const skillId = skillDirs[0];
+  const skillPath = path.join(skillsDir, skillId);
+  logToFile(`Discovered skill for benchmark: ${skillId}`);
+
+  // Workflow files live in the references/ subdirectory
+  const referencesDir = path.join(skillPath, 'references');
+  if (!fs.existsSync(referencesDir)) {
+    logToFile('No references/ directory found in skill directory');
+    return [];
+  }
+
+  // Find workflow files matching a pattern like "basic-integration-1.0-begin.md"
+  // The naming convention is {category}-{number}.{step}-{name}.md
+  const allFiles = fs.readdirSync(referencesDir);
+  const workflowFiles = allFiles
+    .filter((f) => /\d+\.\d+-\w+\.md$/.test(f))
+    .sort();
+
+  if (workflowFiles.length === 0) {
+    logToFile(
+      `No workflow files found in references/ directory. Files present: ${allFiles.join(
+        ', ',
+      )}`,
+    );
+    return [];
+  }
+
+  logToFile(
+    `Found ${workflowFiles.length} workflow files: ${workflowFiles.join(', ')}`,
+  );
+
+  const steps: Array<{ name: string; prompt: string }> = [];
+
+  for (const workflowFile of workflowFiles) {
+    // Extract the phase name from the filename,
+    // e.g., "basic-integration-1.0-begin.md" -> "1.0-begin"
+    const phaseMatch = workflowFile.match(/(\d+\.\d+-.+)\.md$/);
+    const phaseName = phaseMatch
+      ? phaseMatch[1]
+      : workflowFile.replace(/\.md$/, '');
+
+    const prompt = `You are performing phase "${phaseName}" of a PostHog integration for a ${config.metadata.name} project.
+
+${projectContext}
+
+The PostHog skill is installed at .claude/skills/${skillId}/.
+Read SKILL.md in that directory for available reference files.
+
+Follow the instructions in the workflow file: .claude/skills/${skillId}/references/${workflowFile}
+
+Important: Read files before writing. Use environment variables, not hardcoded keys.
+Do not manually edit package.json. Use lockfiles to determine the package manager.
+You must read a file immediately before attempting to write it, even if you have previously read it; failure to do so will cause a tool failure.
+Always install packages as a background task. Don't await completion; proceed with other work immediately after starting the installation.
+`;
+
+    steps.push({ name: phaseName, prompt });
+  }
+
+  return steps;
+}
diff --git a/src/run.ts b/src/run.ts
index e5e5311..42cc336 100644
--- a/src/run.ts
+++ b/src/run.ts
@@ -27,6 +27,7 @@ type Args = {
   ci?: boolean;
   apiKey?: string;
   menu?: boolean;
+  benchmark?: boolean;
 };
 
 export async function runWizard(argv: Args) {
@@ -57,6 +58,7 @@ export async function runWizard(argv: Args) {
     ci: finalArgs.ci ?? false,
     apiKey: finalArgs.apiKey,
     menu: finalArgs.menu ?? false,
+    benchmark: finalArgs.benchmark ?? false,
   };
 
   clack.intro(`Welcome to the PostHog setup wizard ✨`);
diff --git a/src/utils/types.ts b/src/utils/types.ts
index 7379156..5c0b8d6 100644
--- a/src/utils/types.ts
+++ b/src/utils/types.ts
@@ -60,6 +60,13 @@ export type WizardOptions = {
    * Whether to show the menu for manual integration selection instead of auto-detecting.
    */
   menu: boolean;
+
+  /**
+   * Whether to run in benchmark mode with per-phase token tracking.
+   * When enabled, the wizard tracks each workflow phase as a separate step
+   * and writes detailed usage data to /tmp/posthog-wizard-benchmark.json.
+   */
+  benchmark: boolean;
 };
 
 export interface Feature {
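Note for reviewers: below is a minimal sketch of how the benchmark output could be consumed after a `--benchmark` run. The file path constant and the `BenchmarkData` / `StepUsage` shapes are taken from this diff; the helper script itself and its import path are hypothetical and not part of the change.

// summarize-benchmark.ts (hypothetical helper, not included in this PR)
import fs from 'fs';
import { BENCHMARK_FILE_PATH } from './src/lib/agent-interface';
import type { BenchmarkData } from './src/lib/agent-interface';

const data = JSON.parse(
  fs.readFileSync(BENCHMARK_FILE_PATH, 'utf8'),
) as BenchmarkData;

// Per-step cost, turns, and context growth as recorded by runAgentSteps
for (const step of data.steps) {
  console.log(
    `${step.name}: ${step.numTurns} turns, $${step.totalCostUsd.toFixed(4)}, ` +
      `ctx ${step.contextTokensIn} -> ${step.contextTokensOut}` +
      (step.compactions ? `, ${step.compactions} compaction(s)` : ''),
  );
}
console.log(
  `total: $${data.totals.totalCostUsd.toFixed(4)}, ${data.totals.numTurns} turns, ` +
    `${Math.round(data.totals.durationMs / 1000)}s`,
);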