diff --git a/bin.ts b/bin.ts index 7c82ae0..e8e1341 100644 --- a/bin.ts +++ b/bin.ts @@ -115,6 +115,12 @@ yargs(hideBin(process.argv)) 'Show menu for manual integration selection instead of auto-detecting\nenv: POSTHOG_WIZARD_MENU', type: 'boolean', }, + benchmark: { + default: false, + describe: + 'Run in benchmark mode with per-phase token tracking\nenv: POSTHOG_WIZARD_BENCHMARK', + type: 'boolean', + }, }); }, (argv) => { diff --git a/src/lib/__tests__/agent-interface.test.ts b/src/lib/__tests__/agent-interface.test.ts index beed455..d11a257 100644 --- a/src/lib/__tests__/agent-interface.test.ts +++ b/src/lib/__tests__/agent-interface.test.ts @@ -32,6 +32,7 @@ describe('runAgent', () => { localMcp: false, ci: false, menu: false, + benchmark: false, }; const defaultAgentConfig = { diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts index 8c36715..6a74e0f 100644 --- a/src/lib/agent-interface.ts +++ b/src/lib/agent-interface.ts @@ -14,6 +14,8 @@ import { } from './constants'; import { getLlmGatewayUrlFromHost } from '../utils/urls'; import { LINTING_TOOLS } from './safe-tools'; +import type { BenchmarkData } from './benchmark'; +import { BenchmarkTracker } from './benchmark'; import { createEnvFileServer, ENV_FILE_TOOL_NAMES } from './env-file-tools'; // Dynamic import cache for ESM module @@ -416,7 +418,11 @@ export async function runAgent( successMessage?: string; errorMessage?: string; }, -): Promise<{ error?: AgentErrorType; message?: string }> { +): Promise<{ + error?: AgentErrorType; + message?: string; + benchmark?: BenchmarkData; +}> { const { estimatedDurationMinutes = 8, spinnerMessage = 'Customizing your PostHog setup...', @@ -430,6 +436,9 @@ export async function runAgent( `This whole process should take about ${estimatedDurationMinutes} minutes including error checking and fixes.\n\nGrab some coffee!`, ); + // Create benchmark tracker before spinner starts so its log output is visible + const tracker = options.benchmark ? new BenchmarkTracker(spinner) : null; + spinner.start(spinnerMessage); const cliPath = getClaudeCodeExecutablePath(); @@ -441,6 +450,8 @@ export async function runAgent( const collectedText: string[] = []; // Track if we received a successful result (before any cleanup errors) let receivedSuccessResult = false; + // Track the result message for benchmark data extraction + let resultMessage: SDKMessage = null; // Workaround for SDK bug: stdin closes before canUseTool responses can be sent. // The fix is to use an async generator for the prompt that stays open until @@ -465,7 +476,11 @@ export async function runAgent( // Helper to handle successful completion (used in normal path and race condition recovery) const completeWithSuccess = ( suppressedError?: Error, - ): { error?: AgentErrorType; message?: string } => { + ): { + error?: AgentErrorType; + message?: string; + benchmark?: BenchmarkData; + } => { const durationMs = Date.now() - startTime; const durationSeconds = Math.round(durationMs / 1000); @@ -501,7 +516,9 @@ export async function runAgent( duration_seconds: durationSeconds, }); spinner.stop(successMessage); - return {}; + + const benchmark = tracker?.finalize(resultMessage, durationMs); + return { benchmark }; }; try { @@ -598,6 +615,7 @@ export async function runAgent( collectedText, receivedSuccessResult, ); + tracker?.onMessage(message); // Signal completion when result received if (message.type === 'result') { @@ -605,6 +623,7 @@ export async function runAgent( // The SDK may emit a second error result during cleanup due to a race condition if (message.subtype === 'success' && !message.is_error) { receivedSuccessResult = true; + resultMessage = message; } signalDone!(); } @@ -776,7 +795,6 @@ function handleSDKMessage( } default: - // Log other message types for debugging if (options.debug) { debug(`Unhandled message type: ${message.type}`); } diff --git a/src/lib/agent-runner.ts b/src/lib/agent-runner.ts index 46ed98b..eeb3f6f 100644 --- a/src/lib/agent-runner.ts +++ b/src/lib/agent-runner.ts @@ -193,17 +193,19 @@ export async function runAgentWizard( options, ); + const agentRunConfig = { + estimatedDurationMinutes: config.ui.estimatedDurationMinutes, + spinnerMessage: SPINNER_MESSAGE, + successMessage: config.ui.successMessage, + errorMessage: 'Integration failed', + }; + const agentResult = await runAgent( agent, integrationPrompt, options, spinner, - { - estimatedDurationMinutes: config.ui.estimatedDurationMinutes, - spinnerMessage: SPINNER_MESSAGE, - successMessage: config.ui.successMessage, - errorMessage: 'Integration failed', - }, + agentRunConfig, ); // Handle error cases detected in agent output diff --git a/src/lib/benchmark.ts b/src/lib/benchmark.ts new file mode 100644 index 0000000..278ed22 --- /dev/null +++ b/src/lib/benchmark.ts @@ -0,0 +1,371 @@ +/** + * Benchmark tracking for wizard runs. + * + * Detects workflow phase transitions by watching for Read tool calls on + * workflow files (e.g., basic-integration-1.0-begin.md). Tracks per-phase + * timing, turns, and token usage. Writes results to a JSON file for CI. + * + * Usage in runAgent(): + * const tracker = options.benchmark ? new BenchmarkTracker() : null; + * // in message loop: + * tracker?.onMessage(message); + * // on success: + * const benchmark = tracker?.finalize(resultMessage, durationMs); + */ + +import fs from 'fs'; +import chalk from 'chalk'; +import clack from '../utils/clack'; +import { logToFile, LOG_FILE_PATH } from '../utils/debug'; + +export const BENCHMARK_FILE_PATH = '/tmp/posthog-wizard-benchmark.json'; + +// ── Types ────────────────────────────────────────────────────────────── + +export interface StepUsage { + name: string; + usage: { + input_tokens: number; + output_tokens: number; + cache_creation_input_tokens: number; + cache_read_input_tokens: number; + }; + modelUsage: Record; + totalCostUsd: number; + durationMs: number; + durationApiMs: number; + numTurns: number; + contextTokensIn?: number; + contextTokensOut?: number; + compactions?: number; + compactionPreTokens?: number[]; +} + +export interface BenchmarkData { + timestamp: string; + steps: StepUsage[]; + totals: { + totalCostUsd: number; + durationMs: number; + inputTokens: number; + outputTokens: number; + numTurns: number; + }; +} + +// ── Formatting helpers ───────────────────────────────────────────────── + +function formatDuration(ms: number): string { + const totalSeconds = Math.round(ms / 1000); + const minutes = Math.floor(totalSeconds / 60); + const seconds = totalSeconds % 60; + if (minutes > 0) return `${minutes}m ${seconds}s`; + return `${seconds}s`; +} + +function formatTokenCount(tokens: number): string { + if (tokens >= 1_000_000) return `${(tokens / 1_000_000).toFixed(1)}M`; + if (tokens >= 10_000) return `${Math.round(tokens / 1000)}K`; + return tokens.toLocaleString(); +} + +// ── Internal helpers ─────────────────────────────────────────────────── + +/** + * Sum token usage across all models from the SDK's modelUsage field. + * modelUsage has per-model aggregates with camelCase field names. + */ +function sumModelUsage(modelUsage: Record): { + input_tokens: number; + output_tokens: number; + cache_creation_input_tokens: number; + cache_read_input_tokens: number; +} { + let input_tokens = 0; + let output_tokens = 0; + let cache_creation_input_tokens = 0; + let cache_read_input_tokens = 0; + + for (const model of Object.values(modelUsage)) { + input_tokens += model.inputTokens ?? 0; + output_tokens += model.outputTokens ?? 0; + cache_creation_input_tokens += model.cacheCreationInputTokens ?? 0; + cache_read_input_tokens += model.cacheReadInputTokens ?? 0; + } + + return { + input_tokens, + output_tokens, + cache_creation_input_tokens, + cache_read_input_tokens, + }; +} + +/** Regex to detect workflow file references: matches "1.0-begin" from file paths/text */ +const WORKFLOW_FILE_RE = /(\d+\.\d+-[a-z]+)(?:\.md)?/; + +// ── BenchmarkTracker ─────────────────────────────────────────────────── + +interface PhaseRecord { + name: string; + startTime: number; + endTime: number; + turns: number; + inputTokens: number; + outputTokens: number; + compactions: number; + compactionPreTokens: number[]; +} + +/** + * Observes the SDK message stream and tracks per-phase metrics. + * + * Phase transitions are detected by watching for Read tool calls on + * workflow files (matching the pattern `*-1.0-begin.md`). Everything + * before the first workflow file is tracked as the "setup" phase. + */ +export class BenchmarkTracker { + private spinner: ReturnType; + private phases: PhaseRecord[] = []; + private currentPhase = 'setup'; + private phaseStartTime: number; + private phaseTurns = 0; + private phaseInputTokens = 0; + private phaseOutputTokens = 0; + private phaseCompactions = 0; + private phaseCompactionPreTokens: number[] = []; + private seenPhases = new Set(); + + constructor(spinner: ReturnType) { + this.spinner = spinner; + this.phaseStartTime = Date.now(); + clack.log.info( + `${chalk.cyan('[BENCHMARK]')} Verbose logs: ${LOG_FILE_PATH}`, + ); + clack.log.info( + `${chalk.cyan( + '[BENCHMARK]', + )} Benchmark data will be written to: ${BENCHMARK_FILE_PATH}`, + ); + clack.log.info( + `${chalk.cyan('[BENCHMARK]')} Starting phase: ${chalk.bold('setup')}`, + ); + logToFile('[BENCHMARK] Starting phase: setup'); + } + + /** + * Feed every SDK message into the tracker. + */ + onMessage(message: any): void { + if (message.type === 'assistant') { + this.phaseTurns++; + + // Accumulate per-turn token usage if the API response includes it + const usage = message.message?.usage; + if (usage) { + this.phaseInputTokens += usage.input_tokens ?? 0; + this.phaseOutputTokens += usage.output_tokens ?? 0; + } + + // Scan all content blocks for workflow phase references + const content = message.message?.content; + if (Array.isArray(content)) { + for (const block of content) { + // Detect from text content (agent mentioning/quoting workflow files) + if (block.type === 'text' && typeof block.text === 'string') { + this.detectPhaseFromText(block.text); + } + // Detect from tool_use blocks (Read tool on workflow files) + if (block.type === 'tool_use') { + const filePath = block.input?.file_path ?? block.input?.path ?? ''; + if (typeof filePath === 'string') { + this.detectPhaseFromText(filePath); + } + } + } + } + } + + // Track compaction events (SDK compact_boundary messages) + if (message.type === 'system' && message.subtype === 'compact_boundary') { + const preTokens = message.compact_metadata?.pre_tokens ?? 0; + const trigger = message.compact_metadata?.trigger ?? 'unknown'; + this.phaseCompactions++; + this.phaseCompactionPreTokens.push(preTokens); + logToFile( + `[BENCHMARK] [COMPACTION] Context compacted during "${ + this.currentPhase + }" (trigger: ${trigger}, pre_tokens: ${formatTokenCount(preTokens)})`, + ); + clack.log.info( + `${chalk.yellow('[COMPACTION]')} Context compacted during "${ + this.currentPhase + }" (trigger: ${trigger}, pre_tokens: ${formatTokenCount(preTokens)})`, + ); + } + } + + private detectPhaseFromText(text: string): void { + const match = text.match(WORKFLOW_FILE_RE); + if (match && !this.seenPhases.has(match[1])) { + this.transitionTo(match[1]); + } + } + + /** + * Close tracking and build the final BenchmarkData. + * Call this when the agent result message is received. + */ + finalize(resultMessage: any, totalDurationMs: number): BenchmarkData { + // Close the current (last) phase + this.closeCurrentPhase(); + + // Build per-phase StepUsage from tracked phases + aggregate from result + const modelUsage = resultMessage?.modelUsage ?? {}; + const aggregateUsage = sumModelUsage(modelUsage); + const lastCallUsage = resultMessage?.usage ?? {}; + const contextTokensOut = + Number(lastCallUsage.input_tokens ?? 0) + + Number(lastCallUsage.cache_read_input_tokens ?? 0) + + Number(lastCallUsage.cache_creation_input_tokens ?? 0); + + const totalTurns = this.phases.reduce((s, p) => s + p.turns, 0); + const totalCost = resultMessage?.total_cost_usd ?? 0; + + const steps: StepUsage[] = this.phases.map((phase) => ({ + name: phase.name, + // Per-phase token usage from assistant message usage fields + usage: { + input_tokens: phase.inputTokens, + output_tokens: phase.outputTokens, + cache_creation_input_tokens: 0, + cache_read_input_tokens: 0, + }, + modelUsage: {}, + // Proportional cost estimate based on turns + totalCostUsd: totalTurns > 0 ? totalCost * (phase.turns / totalTurns) : 0, + durationMs: phase.endTime - phase.startTime, + durationApiMs: 0, + numTurns: phase.turns, + ...(phase.compactions > 0 && { + compactions: phase.compactions, + compactionPreTokens: phase.compactionPreTokens, + }), + })); + + // Stamp context size on the last step + if (steps.length > 0) { + steps[steps.length - 1].contextTokensOut = contextTokensOut; + } + + const benchmark: BenchmarkData = { + timestamp: new Date().toISOString(), + steps, + totals: { + totalCostUsd: totalCost, + durationMs: totalDurationMs, + inputTokens: + aggregateUsage.input_tokens + + aggregateUsage.cache_read_input_tokens + + aggregateUsage.cache_creation_input_tokens, + outputTokens: aggregateUsage.output_tokens, + numTurns: resultMessage?.num_turns ?? totalTurns, + }, + }; + + // Log summary + const totalDurationStr = formatDuration(totalDurationMs); + clack.log.success( + `${chalk.cyan('[BENCHMARK]')} ${ + this.phases.length + } phases completed in ${totalDurationStr}, cost: $${totalCost.toFixed( + 4, + )}`, + ); + clack.log.info( + `${chalk.cyan('[BENCHMARK]')} Results written to ${BENCHMARK_FILE_PATH}`, + ); + + writeBenchmarkData(benchmark); + return benchmark; + } + + // ── Private ──────────────────────────────────────────────────────── + + private transitionTo(newPhase: string): void { + // Stop spinner so log output is visible + this.spinner.stop( + `${chalk.cyan('[BENCHMARK]')} Completed phase: ${chalk.bold( + this.currentPhase, + )} ${chalk.dim(`(${this.formatPhaseStats()})`)}`, + ); + this.closeCurrentPhase(); + + this.seenPhases.add(newPhase); + this.currentPhase = newPhase; + this.phaseStartTime = Date.now(); + this.phaseTurns = 0; + this.phaseInputTokens = 0; + this.phaseOutputTokens = 0; + this.phaseCompactions = 0; + this.phaseCompactionPreTokens = []; + + clack.log.info( + `${chalk.cyan('[BENCHMARK]')} Starting phase: ${chalk.bold(newPhase)}`, + ); + logToFile(`[BENCHMARK] Starting phase: ${newPhase}`); + + // Restart spinner + this.spinner.start(`Integrating PostHog (${newPhase})...`); + } + + private closeCurrentPhase(): void { + const now = Date.now(); + + this.phases.push({ + name: this.currentPhase, + startTime: this.phaseStartTime, + endTime: now, + turns: this.phaseTurns, + inputTokens: this.phaseInputTokens, + outputTokens: this.phaseOutputTokens, + compactions: this.phaseCompactions, + compactionPreTokens: [...this.phaseCompactionPreTokens], + }); + + logToFile( + `[BENCHMARK] Completed phase: ${ + this.currentPhase + } (${this.formatPhaseStats()})`, + ); + } + + private formatPhaseStats(): string { + const duration = Date.now() - this.phaseStartTime; + const parts = [formatDuration(duration), `${this.phaseTurns} turns`]; + if (this.phaseInputTokens > 0 || this.phaseOutputTokens > 0) { + parts.push( + `in: ${formatTokenCount(this.phaseInputTokens)}`, + `out: ${formatTokenCount(this.phaseOutputTokens)}`, + ); + } + if (this.phaseCompactions > 0) { + parts.push(`${this.phaseCompactions} compaction(s)`); + } + return parts.join(', '); + } +} + +// ── File I/O ─────────────────────────────────────────────────────────── + +/** + * Write benchmark data to the benchmark file. + */ +export function writeBenchmarkData(data: BenchmarkData): void { + try { + fs.writeFileSync(BENCHMARK_FILE_PATH, JSON.stringify(data, null, 2)); + logToFile(`Benchmark data written to ${BENCHMARK_FILE_PATH}`); + } catch (error) { + logToFile('Failed to write benchmark data:', error); + } +} diff --git a/src/run.ts b/src/run.ts index e5e5311..42cc336 100644 --- a/src/run.ts +++ b/src/run.ts @@ -27,6 +27,7 @@ type Args = { ci?: boolean; apiKey?: string; menu?: boolean; + benchmark?: boolean; }; export async function runWizard(argv: Args) { @@ -57,6 +58,7 @@ export async function runWizard(argv: Args) { ci: finalArgs.ci ?? false, apiKey: finalArgs.apiKey, menu: finalArgs.menu ?? false, + benchmark: finalArgs.benchmark ?? false, }; clack.intro(`Welcome to the PostHog setup wizard ✨`); diff --git a/src/utils/types.ts b/src/utils/types.ts index 7379156..5c0b8d6 100644 --- a/src/utils/types.ts +++ b/src/utils/types.ts @@ -60,6 +60,13 @@ export type WizardOptions = { * Whether to show the menu for manual integration selection instead of auto-detecting. */ menu: boolean; + + /** + * Whether to run in benchmark mode with per-phase token tracking. + * When enabled, the wizard runs each workflow phase as a separate agent call + * and writes detailed usage data to /tmp/posthog-wizard-benchmark.json. + */ + benchmark: boolean; }; export interface Feature {