Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions apps/cli/src/commands/eval/progress-display.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ export interface WorkerProgress {
targetLabel?: string;
score?: number;
verdict?: Verdict;
durationMs?: number;
totalDurationMs?: number;
}

const ANSI_BOLD = '\x1b[1m';
Expand Down Expand Up @@ -37,6 +39,23 @@ function formatVerdict(score: number | undefined, verdict: Verdict | undefined):
return ` | ${color}${ANSI_BOLD}${verdictLabel}${ANSI_RESET}`;
}

/**
 * Renders the trailing duration segment of a progress line.
 *
 * - Both values present: ` | τ <duration>/<total>ms`, where the total shown is
 *   never smaller than the first value (the larger of the two is used).
 * - Exactly one value present: ` | τ <value>ms`.
 * - Neither present: an empty string, so the segment is omitted entirely.
 *
 * @param durationMs - First duration in milliseconds, if reported.
 * @param totalDurationMs - Overall duration in milliseconds, if reported.
 * @returns The formatted segment (with its leading ` | ` separator) or `''`.
 */
function formatDurations(
  durationMs: number | undefined,
  totalDurationMs: number | undefined,
): string {
  const bothReported = durationMs !== undefined && totalDurationMs !== undefined;
  if (bothReported) {
    // Clamp the displayed total so it can never read lower than the first value.
    const shownTotalMs = Math.max(durationMs, totalDurationMs);
    return ` | τ ${durationMs}/${shownTotalMs}ms`;
  }

  // At most one value is available from here on; pick whichever exists.
  const onlyMs = durationMs ?? totalDurationMs;
  if (onlyMs === undefined) {
    return '';
  }
  return ` | τ ${onlyMs}ms`;
}

/**
* Simple line-based progress display.
* Prints each status update as a new line - no ANSI cursor manipulation.
Expand Down Expand Up @@ -99,14 +118,14 @@ export class ProgressDisplay {
// Pick icon based on verdict: ✅ PASS, ⚠️ FAIL, ❌ ERROR
const icon = progress.verdict === 'FAIL' ? '⚠️' : progress.verdict === 'ERROR' ? '❌' : '✅';
console.log(
`${countPrefix} ${icon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`,
`${countPrefix} ${icon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${formatDurations(progress.durationMs, progress.totalDurationMs)}`,
);
break;
}
case 'failed': {
const failIcon = progress.verdict === 'ERROR' ? '❌' : '⚠️';
console.log(
`${countPrefix} ${failIcon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${progress.error ? `: ${progress.error}` : ''}`,
`${countPrefix} ${failIcon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${formatDurations(progress.durationMs, progress.totalDurationMs)}${progress.error ? `: ${progress.error}` : ''}`,
);
break;
}
Expand Down
2 changes: 2 additions & 0 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -932,6 +932,8 @@ async function runSingleEvalFile(params: {
targetLabel: inlineTargetLabel,
score: event.score,
verdict,
durationMs: event.durationMs,
totalDurationMs: event.evalRunDurationMs,
});
},
});
Expand Down
108 changes: 108 additions & 0 deletions apps/cli/test/commands/eval/progress-display.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import { afterEach, beforeEach, describe, expect, it, mock } from 'bun:test';

import { ProgressDisplay } from '../../../src/commands/eval/progress-display.js';

/**
 * Output-format tests for the status lines printed by ProgressDisplay.
 *
 * NO_COLOR is forced on for every test so the expected strings can be written
 * without ANSI escape sequences, and is restored afterwards.
 */
describe('ProgressDisplay', () => {
  const originalNoColor = process.env.NO_COLOR;

  beforeEach(() => {
    process.env.NO_COLOR = '1';
  });

  afterEach(() => {
    if (originalNoColor === undefined) {
      // Remove the key instead of assigning `undefined`: an assignment leaves
      // NO_COLOR defined, and Node-style `process.env` coerces the value to the
      // string 'undefined', which would keep colour disabled for later suites.
      Reflect.deleteProperty(process.env, 'NO_COLOR');
    } else {
      process.env.NO_COLOR = originalNoColor;
    }
  });

  /**
   * Runs a single-test display through start → setTotalTests(1) → updateWorker
   * and returns every line written to console.log during that sequence.
   *
   * console.log is replaced only for the duration of the calls and is always
   * restored, even when the display throws.
   */
  function captureWorkerUpdate(
    progress: Parameters<ProgressDisplay['updateWorker']>[0],
  ): string[] {
    // Constructed before console.log is swapped so constructor output (if any)
    // is not captured.
    const display = new ProgressDisplay(1);
    const logs: string[] = [];
    const logSpy = mock((message?: unknown) => {
      logs.push(String(message ?? ''));
    });
    const originalLog = console.log;
    console.log = logSpy as typeof console.log;

    try {
      display.start();
      display.setTotalTests(1);
      display.updateWorker(progress);
    } finally {
      console.log = originalLog;
    }

    return logs;
  }

  it('prints compact agent/total durations after the verdict', () => {
    const logs = captureWorkerUpdate({
      workerId: 1,
      testId: 'test-42-billing-negative-margin',
      status: 'completed',
      targetLabel: 'wtalms-stg',
      score: 0.94,
      verdict: 'PASS',
      durationMs: 18342,
      totalDurationMs: 22109,
    });

    expect(logs).toEqual([
      '1/1 ✅ test-42-billing-negative-margin | wtalms-stg | 94% PASS | τ 18342/22109ms',
    ]);
  });

  it('normalizes total duration when reported agent time is higher', () => {
    const logs = captureWorkerUpdate({
      workerId: 1,
      testId: 'simple-thresholds-pass',
      status: 'completed',
      targetLabel: 'mock_metrics_agent',
      score: 1,
      verdict: 'PASS',
      durationMs: 245,
      totalDurationMs: 78,
    });

    expect(logs).toEqual([
      '1/1 ✅ simple-thresholds-pass | mock_metrics_agent | 100% PASS | τ 245/245ms',
    ]);
  });

  it('omits duration segments when metrics are unavailable', () => {
    const logs = captureWorkerUpdate({
      workerId: 1,
      testId: 'test-01-biosecurity',
      status: 'completed',
      targetLabel: 'wtalms-stg',
      score: 0.98,
      verdict: 'PASS',
    });

    expect(logs).toEqual(['1/1 ✅ test-01-biosecurity | wtalms-stg | 98% PASS']);
  });
});
9 changes: 9 additions & 0 deletions packages/core/src/evaluation/orchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,10 @@ export interface ProgressEvent {
readonly score?: number;
/** Execution status classification for completed/failed tests */
readonly executionStatus?: ExecutionStatus;
/** Candidate/agent execution duration in milliseconds */
readonly durationMs?: number;
/** Full eval duration in milliseconds, including grading/orchestration */
readonly evalRunDurationMs?: number;
}

export interface RunEvaluationOptions {
Expand Down Expand Up @@ -1386,6 +1390,8 @@ export async function runEvaluation(
error: result.error,
score: result.score,
executionStatus: result.executionStatus,
durationMs: result.durationMs,
evalRunDurationMs: result.evalRun?.durationMs,
});
}

Expand Down Expand Up @@ -1768,6 +1774,7 @@ async function runBatchEvaluation(options: {
error: error instanceof Error ? error.message : String(error),
score: errorResult.score,
executionStatus: errorResult.executionStatus,
evalRunDurationMs: errorResult.evalRun?.durationMs,
});
}
continue;
Expand All @@ -1788,6 +1795,8 @@ async function runBatchEvaluation(options: {
error: result.error,
score: result.score,
executionStatus: result.executionStatus,
durationMs: result.durationMs,
evalRunDurationMs: result.evalRun?.durationMs,
});
}
}
Expand Down
Loading