From 2d734686beeaa2f01661de27c7dce256d7ee6cb9 Mon Sep 17 00:00:00 2001 From: Christopher Date: Tue, 28 Apr 2026 12:52:05 +1000 Subject: [PATCH 1/4] feat(cli): show eval progress durations in ms --- .../cli/src/commands/eval/progress-display.ts | 20 ++++- apps/cli/src/commands/eval/run-eval.ts | 2 + .../commands/eval/progress-display.test.ts | 77 +++++++++++++++++++ packages/core/src/evaluation/orchestrator.ts | 9 +++ 4 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 apps/cli/test/commands/eval/progress-display.test.ts diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts index 90b70e6c..11ce0c85 100644 --- a/apps/cli/src/commands/eval/progress-display.ts +++ b/apps/cli/src/commands/eval/progress-display.ts @@ -10,6 +10,8 @@ export interface WorkerProgress { targetLabel?: string; score?: number; verdict?: Verdict; + durationMs?: number; + totalDurationMs?: number; } const ANSI_BOLD = '\x1b[1m'; @@ -37,6 +39,20 @@ function formatVerdict(score: number | undefined, verdict: Verdict | undefined): return ` | ${color}${ANSI_BOLD}${verdictLabel}${ANSI_RESET}`; } +function formatDurations( + durationMs: number | undefined, + totalDurationMs: number | undefined, +): string { + const segments: string[] = []; + if (durationMs !== undefined) { + segments.push(`agent ${durationMs}ms`); + } + if (totalDurationMs !== undefined) { + segments.push(`total ${totalDurationMs}ms`); + } + return segments.length > 0 ? ` | ${segments.join(' | ')}` : ''; +} + /** * Simple line-based progress display. * Prints each status update as a new line - no ANSI cursor manipulation. @@ -99,14 +115,14 @@ export class ProgressDisplay { // Pick icon based on verdict: ✅ PASS, ⚠️ FAIL, ❌ ERROR const icon = progress.verdict === 'FAIL' ? '⚠️' : progress.verdict === 'ERROR' ? '❌' : '✅'; console.log( - `${countPrefix} ${icon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`, + `${countPrefix} ${icon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${formatDurations(progress.durationMs, progress.totalDurationMs)}`, ); break; } case 'failed': { const failIcon = progress.verdict === 'ERROR' ? '❌' : '⚠️'; console.log( - `${countPrefix} ${failIcon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${progress.error ? `: ${progress.error}` : ''}`, + `${countPrefix} ${failIcon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${formatDurations(progress.durationMs, progress.totalDurationMs)}${progress.error ? `: ${progress.error}` : ''}`, ); break; } diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 9d3cafe8..552db330 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -932,6 +932,8 @@ async function runSingleEvalFile(params: { targetLabel: inlineTargetLabel, score: event.score, verdict, + durationMs: event.durationMs, + totalDurationMs: event.evalRunDurationMs, }); }, }); diff --git a/apps/cli/test/commands/eval/progress-display.test.ts b/apps/cli/test/commands/eval/progress-display.test.ts new file mode 100644 index 00000000..db6ee5b4 --- /dev/null +++ b/apps/cli/test/commands/eval/progress-display.test.ts @@ -0,0 +1,77 @@ +import { afterEach, beforeEach, describe, expect, it, mock } from 'bun:test'; + +import { ProgressDisplay } from '../../../src/commands/eval/progress-display.js'; + +describe('ProgressDisplay', () => { + const originalNoColor = process.env.NO_COLOR; + + beforeEach(() => { + process.env.NO_COLOR = '1'; + }); + + afterEach(() => { + if (originalNoColor === undefined) { + delete process.env.NO_COLOR; + } else { + process.env.NO_COLOR = originalNoColor; + } + }); + + it('prints agent and total durations after the verdict', () => { + const display = new ProgressDisplay(1); + const logs: string[] = []; + const logSpy = mock((message?: unknown) => { + logs.push(String(message ?? '')); + }); + const originalLog = console.log; + console.log = logSpy as typeof console.log; + + try { + display.start(); + display.setTotalTests(1); + display.updateWorker({ + workerId: 1, + testId: 'test-42-billing-negative-margin', + status: 'completed', + targetLabel: 'wtalms-stg', + score: 0.94, + verdict: 'PASS', + durationMs: 18342, + totalDurationMs: 22109, + }); + } finally { + console.log = originalLog; + } + + expect(logs).toEqual([ + '1/1 ✅ test-42-billing-negative-margin | wtalms-stg | 94% PASS | agent 18342ms | total 22109ms', + ]); + }); + + it('omits duration segments when metrics are unavailable', () => { + const display = new ProgressDisplay(1); + const logs: string[] = []; + const logSpy = mock((message?: unknown) => { + logs.push(String(message ?? '')); + }); + const originalLog = console.log; + console.log = logSpy as typeof console.log; + + try { + display.start(); + display.setTotalTests(1); + display.updateWorker({ + workerId: 1, + testId: 'test-01-biosecurity', + status: 'completed', + targetLabel: 'wtalms-stg', + score: 0.98, + verdict: 'PASS', + }); + } finally { + console.log = originalLog; + } + + expect(logs).toEqual(['1/1 ✅ test-01-biosecurity | wtalms-stg | 98% PASS']); + }); +}); diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 0a72197c..8d37c8b9 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -384,6 +384,10 @@ export interface ProgressEvent { readonly score?: number; /** Execution status classification for completed/failed tests */ readonly executionStatus?: ExecutionStatus; + /** Candidate/agent execution duration in milliseconds */ + readonly durationMs?: number; + /** Full eval duration in milliseconds, including grading/orchestration */ + readonly evalRunDurationMs?: number; } export interface RunEvaluationOptions { @@ -1386,6 +1390,8 @@ export async function runEvaluation( error: result.error, score: result.score, executionStatus: result.executionStatus, + durationMs: result.durationMs, + evalRunDurationMs: result.evalRun?.durationMs, }); } @@ -1768,6 +1774,7 @@ async function runBatchEvaluation(options: { error: error instanceof Error ? error.message : String(error), score: errorResult.score, executionStatus: errorResult.executionStatus, + evalRunDurationMs: errorResult.evalRun?.durationMs, }); } continue; @@ -1788,6 +1795,8 @@ async function runBatchEvaluation(options: { error: result.error, score: result.score, executionStatus: result.executionStatus, + durationMs: result.durationMs, + evalRunDurationMs: result.evalRun?.durationMs, }); } } From 8ea21a35c323af6a2887c07d26aaf609b4bd985c Mon Sep 17 00:00:00 2001 From: Christopher Date: Tue, 28 Apr 2026 12:53:10 +1000 Subject: [PATCH 2/4] test(cli): satisfy progress display lint --- apps/cli/test/commands/eval/progress-display.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/cli/test/commands/eval/progress-display.test.ts b/apps/cli/test/commands/eval/progress-display.test.ts index db6ee5b4..bc454cfd 100644 --- a/apps/cli/test/commands/eval/progress-display.test.ts +++ b/apps/cli/test/commands/eval/progress-display.test.ts @@ -11,7 +11,7 @@ describe('ProgressDisplay', () => { afterEach(() => { if (originalNoColor === undefined) { - delete process.env.NO_COLOR; + process.env.NO_COLOR = undefined; } else { process.env.NO_COLOR = originalNoColor; } From 5301632de2ef691f01b787da970c096ed146e5aa Mon Sep 17 00:00:00 2001 From: Christopher Date: Tue, 28 Apr 2026 13:00:02 +1000 Subject: [PATCH 3/4] fix(cli): normalize compact progress durations --- .../cli/src/commands/eval/progress-display.ts | 15 ++++---- .../commands/eval/progress-display.test.ts | 35 +++++++++++++++++-- 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts index 11ce0c85..f5ccf104 100644 --- a/apps/cli/src/commands/eval/progress-display.ts +++ b/apps/cli/src/commands/eval/progress-display.ts @@ -43,14 +43,17 @@ function formatDurations( durationMs: number | undefined, totalDurationMs: number | undefined, ): string { - const segments: string[] = []; - if (durationMs !== undefined) { - segments.push(`agent ${durationMs}ms`); + if (durationMs === undefined && totalDurationMs === undefined) { + return ''; } - if (totalDurationMs !== undefined) { - segments.push(`total ${totalDurationMs}ms`); + + if (durationMs !== undefined && totalDurationMs !== undefined) { + const normalizedTotalMs = Math.max(durationMs, totalDurationMs); + return ` | ⏱ ${durationMs}/${normalizedTotalMs}ms`; } - return segments.length > 0 ? ` | ${segments.join(' | ')}` : ''; + + const singleDurationMs = durationMs ?? totalDurationMs; + return singleDurationMs !== undefined ? ` | ⏱ ${singleDurationMs}ms` : ''; } /** diff --git a/apps/cli/test/commands/eval/progress-display.test.ts b/apps/cli/test/commands/eval/progress-display.test.ts index bc454cfd..33f25996 100644 --- a/apps/cli/test/commands/eval/progress-display.test.ts +++ b/apps/cli/test/commands/eval/progress-display.test.ts @@ -17,7 +17,7 @@ describe('ProgressDisplay', () => { } }); - it('prints agent and total durations after the verdict', () => { + it('prints compact agent/total durations after the verdict', () => { const display = new ProgressDisplay(1); const logs: string[] = []; const logSpy = mock((message?: unknown) => { @@ -44,7 +44,38 @@ describe('ProgressDisplay', () => { } expect(logs).toEqual([ - '1/1 ✅ test-42-billing-negative-margin | wtalms-stg | 94% PASS | agent 18342ms | total 22109ms', + '1/1 ✅ test-42-billing-negative-margin | wtalms-stg | 94% PASS | ⏱ 18342/22109ms', + ]); + }); + + it('normalizes total duration when reported agent time is higher', () => { + const display = new ProgressDisplay(1); + const logs: string[] = []; + const logSpy = mock((message?: unknown) => { + logs.push(String(message ?? '')); + }); + const originalLog = console.log; + console.log = logSpy as typeof console.log; + + try { + display.start(); + display.setTotalTests(1); + display.updateWorker({ + workerId: 1, + testId: 'simple-thresholds-pass', + status: 'completed', + targetLabel: 'mock_metrics_agent', + score: 1, + verdict: 'PASS', + durationMs: 245, + totalDurationMs: 78, + }); + } finally { + console.log = originalLog; + } + + expect(logs).toEqual([ + '1/1 ✅ simple-thresholds-pass | mock_metrics_agent | 100% PASS | ⏱ 245/245ms', ]); }); From 020c42040e4ee127e1fdf6c69bb5cf0d220cb23f Mon Sep 17 00:00:00 2001 From: Christopher Date: Tue, 28 Apr 2026 14:28:18 +1000 Subject: [PATCH 4/4] style(cli): soften progress timing marker --- apps/cli/src/commands/eval/progress-display.ts | 4 ++-- apps/cli/test/commands/eval/progress-display.test.ts | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts index f5ccf104..7e09a660 100644 --- a/apps/cli/src/commands/eval/progress-display.ts +++ b/apps/cli/src/commands/eval/progress-display.ts @@ -49,11 +49,11 @@ function formatDurations( if (durationMs !== undefined && totalDurationMs !== undefined) { const normalizedTotalMs = Math.max(durationMs, totalDurationMs); - return ` | ⏱ ${durationMs}/${normalizedTotalMs}ms`; + return ` | τ ${durationMs}/${normalizedTotalMs}ms`; } const singleDurationMs = durationMs ?? totalDurationMs; - return singleDurationMs !== undefined ? ` | ⏱ ${singleDurationMs}ms` : ''; + return singleDurationMs !== undefined ? ` | τ ${singleDurationMs}ms` : ''; } /** diff --git a/apps/cli/test/commands/eval/progress-display.test.ts b/apps/cli/test/commands/eval/progress-display.test.ts index 33f25996..ee56737b 100644 --- a/apps/cli/test/commands/eval/progress-display.test.ts +++ b/apps/cli/test/commands/eval/progress-display.test.ts @@ -44,7 +44,7 @@ describe('ProgressDisplay', () => { } expect(logs).toEqual([ - '1/1 ✅ test-42-billing-negative-margin | wtalms-stg | 94% PASS | ⏱ 18342/22109ms', + '1/1 ✅ test-42-billing-negative-margin | wtalms-stg | 94% PASS | τ 18342/22109ms', ]); }); @@ -75,7 +75,7 @@ describe('ProgressDisplay', () => { } expect(logs).toEqual([ - '1/1 ✅ simple-thresholds-pass | mock_metrics_agent | 100% PASS | ⏱ 245/245ms', + '1/1 ✅ simple-thresholds-pass | mock_metrics_agent | 100% PASS | τ 245/245ms', ]); });