Show per-test timing in the eval progress output.

Completed and failed progress lines gain a ` | <agent>/<total>ms` suffix (or a single
` | <value>ms` when only one duration is known). The durations are threaded from the
orchestrator's ProgressEvent through run-eval into ProgressDisplay.

NOTE(review): this patch was re-expanded from a whitespace-collapsed copy. Line breaks
are checked against the hunk line counts, but the indentation of context lines is
assumed (2 spaces per level) -- apply with `git apply --ignore-whitespace`, or
regenerate the patch from the branch. The post-image blob ids on the `index` lines
predate the edits made here to formatDurations and to the test teardown.

diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts
index 90b70e6c..f5db6a1b 100644
--- a/apps/cli/src/commands/eval/progress-display.ts
+++ b/apps/cli/src/commands/eval/progress-display.ts
@@ -10,6 +10,8 @@ export interface WorkerProgress {
   targetLabel?: string;
   score?: number;
   verdict?: Verdict;
+  durationMs?: number;
+  totalDurationMs?: number;
 }
 
 const ANSI_BOLD = '\x1b[1m';
@@ -37,6 +39,28 @@ function formatVerdict(score: number | undefined, verdict: Verdict | undefined):
   return ` | ${color}${ANSI_BOLD}${verdictLabel}${ANSI_RESET}`;
 }
 
+/**
+ * Formats the optional timing suffix for a completed or failed progress line.
+ *
+ * @param durationMs - Candidate/agent execution time in milliseconds, if reported.
+ * @param totalDurationMs - Full eval run time in milliseconds, if reported.
+ * @returns An empty string when neither value is known, ` | <agent>/<total>ms` when
+ *   both are known, or ` | <value>ms` when only one is known.
+ */
+function formatDurations(
+  durationMs: number | undefined,
+  totalDurationMs: number | undefined,
+): string {
+  if (durationMs !== undefined && totalDurationMs !== undefined) {
+    // Clamp the total up to the agent time so the pair never reads as agent > total.
+    const normalizedTotalMs = Math.max(durationMs, totalDurationMs);
+    return ` | ${durationMs}/${normalizedTotalMs}ms`;
+  }
+
+  const singleDurationMs = durationMs ?? totalDurationMs;
+  return singleDurationMs === undefined ? '' : ` | ${singleDurationMs}ms`;
+}
+
 /**
  * Simple line-based progress display.
  * Prints each status update as a new line - no ANSI cursor manipulation.
@@ -99,14 +123,14 @@ export class ProgressDisplay {
         // Pick icon based on verdict: ✅ PASS, ⚠️ FAIL, ❌ ERROR
         const icon = progress.verdict === 'FAIL' ? '⚠️' : progress.verdict === 'ERROR' ? '❌' : '✅';
         console.log(
-          `${countPrefix} ${icon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`,
+          `${countPrefix} ${icon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${formatDurations(progress.durationMs, progress.totalDurationMs)}`,
         );
         break;
       }
       case 'failed': {
         const failIcon = progress.verdict === 'ERROR' ? '❌' : '⚠️';
         console.log(
-          `${countPrefix} ${failIcon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${progress.error ? `: ${progress.error}` : ''}`,
+          `${countPrefix} ${failIcon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${formatDurations(progress.durationMs, progress.totalDurationMs)}${progress.error ? `: ${progress.error}` : ''}`,
         );
         break;
       }
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 9d3cafe8..552db330 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -932,6 +932,8 @@ async function runSingleEvalFile(params: {
           targetLabel: inlineTargetLabel,
           score: event.score,
           verdict,
+          durationMs: event.durationMs,
+          totalDurationMs: event.evalRunDurationMs,
         });
       },
     });
diff --git a/apps/cli/test/commands/eval/progress-display.test.ts b/apps/cli/test/commands/eval/progress-display.test.ts
new file mode 100644
index 00000000..5b505791
--- /dev/null
+++ b/apps/cli/test/commands/eval/progress-display.test.ts
@@ -0,0 +1,109 @@
+import { afterEach, beforeEach, describe, expect, it, mock } from 'bun:test';
+
+import { ProgressDisplay } from '../../../src/commands/eval/progress-display.js';
+
+describe('ProgressDisplay', () => {
+  const originalNoColor = process.env.NO_COLOR;
+
+  beforeEach(() => {
+    process.env.NO_COLOR = '1';
+  });
+
+  afterEach(() => {
+    if (originalNoColor === undefined) {
+      // Assigning undefined would store the string 'undefined' (Node coerces env values).
+      delete process.env.NO_COLOR;
+    } else {
+      process.env.NO_COLOR = originalNoColor;
+    }
+  });
+
+  it('prints compact agent/total durations after the verdict', () => {
+    const display = new ProgressDisplay(1);
+    const logs: string[] = [];
+    const logSpy = mock((message?: unknown) => {
+      logs.push(String(message ?? ''));
+    });
+    const originalLog = console.log;
+    console.log = logSpy as typeof console.log;
+
+    try {
+      display.start();
+      display.setTotalTests(1);
+      display.updateWorker({
+        workerId: 1,
+        testId: 'test-42-billing-negative-margin',
+        status: 'completed',
+        targetLabel: 'wtalms-stg',
+        score: 0.94,
+        verdict: 'PASS',
+        durationMs: 18342,
+        totalDurationMs: 22109,
+      });
+    } finally {
+      console.log = originalLog;
+    }
+
+    expect(logs).toEqual([
+      '1/1 ✅ test-42-billing-negative-margin | wtalms-stg | 94% PASS | 18342/22109ms',
+    ]);
+  });
+
+  it('normalizes total duration when reported agent time is higher', () => {
+    const display = new ProgressDisplay(1);
+    const logs: string[] = [];
+    const logSpy = mock((message?: unknown) => {
+      logs.push(String(message ?? ''));
+    });
+    const originalLog = console.log;
+    console.log = logSpy as typeof console.log;
+
+    try {
+      display.start();
+      display.setTotalTests(1);
+      display.updateWorker({
+        workerId: 1,
+        testId: 'simple-thresholds-pass',
+        status: 'completed',
+        targetLabel: 'mock_metrics_agent',
+        score: 1,
+        verdict: 'PASS',
+        durationMs: 245,
+        totalDurationMs: 78,
+      });
+    } finally {
+      console.log = originalLog;
+    }
+
+    expect(logs).toEqual([
+      '1/1 ✅ simple-thresholds-pass | mock_metrics_agent | 100% PASS | 245/245ms',
+    ]);
+  });
+
+  it('omits duration segments when metrics are unavailable', () => {
+    const display = new ProgressDisplay(1);
+    const logs: string[] = [];
+    const logSpy = mock((message?: unknown) => {
+      logs.push(String(message ?? ''));
+    });
+    const originalLog = console.log;
+    console.log = logSpy as typeof console.log;
+
+    try {
+      display.start();
+      display.setTotalTests(1);
+      display.updateWorker({
+        workerId: 1,
+        testId: 'test-01-biosecurity',
+        status: 'completed',
+        targetLabel: 'wtalms-stg',
+        score: 0.98,
+        verdict: 'PASS',
+      });
+    } finally {
+      console.log = originalLog;
+    }
+
+    expect(logs).toEqual(['1/1 ✅ test-01-biosecurity | wtalms-stg | 98% PASS']);
+  });
+});
diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts
index 0a72197c..8d37c8b9 100644
--- a/packages/core/src/evaluation/orchestrator.ts
+++ b/packages/core/src/evaluation/orchestrator.ts
@@ -384,6 +384,10 @@ export interface ProgressEvent {
   readonly score?: number;
   /** Execution status classification for completed/failed tests */
   readonly executionStatus?: ExecutionStatus;
+  /** Candidate/agent execution duration in milliseconds */
+  readonly durationMs?: number;
+  /** Full eval duration in milliseconds, including grading/orchestration */
+  readonly evalRunDurationMs?: number;
 }
 
 export interface RunEvaluationOptions {
@@ -1386,6 +1390,8 @@ export async function runEvaluation(
           error: result.error,
           score: result.score,
           executionStatus: result.executionStatus,
+          durationMs: result.durationMs,
+          evalRunDurationMs: result.evalRun?.durationMs,
         });
       }
 
@@ -1768,6 +1774,7 @@ async function runBatchEvaluation(options: {
           error: error instanceof Error ? error.message : String(error),
           score: errorResult.score,
           executionStatus: errorResult.executionStatus,
+          evalRunDurationMs: errorResult.evalRun?.durationMs,
         });
       }
       continue;
@@ -1788,6 +1795,8 @@ async function runBatchEvaluation(options: {
         error: result.error,
         score: result.score,
         executionStatus: result.executionStatus,
+        durationMs: result.durationMs,
+        evalRunDurationMs: result.evalRun?.durationMs,
       });
     }
   }