EntityProcess · christso · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026
diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts
@@ -10,6 +10,8 @@ export interface WorkerProgress {
   targetLabel?: string;
   score?: number;
   verdict?: Verdict;
+  durationMs?: number;
+  totalDurationMs?: number;
 }
 
 const ANSI_BOLD = '\x1b[1m';
@@ -37,6 +39,23 @@ function formatVerdict(score: number | undefined, verdict: Verdict | undefined):
   return ` | ${color}${ANSI_BOLD}${verdictLabel}${ANSI_RESET}`;
 }
 
+/**
+ * Builds the ` | τ …ms` timing suffix: `agent/total` when both values are known,
+ * the lone value when only one is, and an empty string when neither is.
+ */
+function formatDurations(
+  durationMs: number | undefined,
+  totalDurationMs: number | undefined,
+): string {
+  if (durationMs === undefined || totalDurationMs === undefined) {
+    const onlyMs = durationMs ?? totalDurationMs;
+    return onlyMs === undefined ? '' : ` | τ ${onlyMs}ms`;
+  }
+  // The total is displayed as at least the agent time, even if reported lower.
+  const shownTotalMs = Math.max(durationMs, totalDurationMs);
+  return ` | τ ${durationMs}/${shownTotalMs}ms`;
+}
+
 /**
  * Simple line-based progress display.
  * Prints each status update as a new line - no ANSI cursor manipulation.
@@ -99,14 +118,14 @@ export class ProgressDisplay {
         // Pick icon based on verdict: ✅ PASS, ⚠️ FAIL, ❌ ERROR
         const icon = progress.verdict === 'FAIL' ? '⚠️' : progress.verdict === 'ERROR' ? '❌' : '✅';
         console.log(
-          `${countPrefix}   ${icon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}`,
+          `${countPrefix}   ${icon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${formatDurations(progress.durationMs, progress.totalDurationMs)}`,
         );
         break;
       }
       case 'failed': {
         const failIcon = progress.verdict === 'ERROR' ? '❌' : '⚠️';
         console.log(
-          `${countPrefix}   ${failIcon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${progress.error ? `: ${progress.error}` : ''}`,
+          `${countPrefix}   ${failIcon} ${progress.testId}${targetSuffix}${formatVerdict(progress.score, progress.verdict)}${formatDurations(progress.durationMs, progress.totalDurationMs)}${progress.error ? `: ${progress.error}` : ''}`,
         );
         break;
       }

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
@@ -932,6 +932,8 @@ async function runSingleEvalFile(params: {
         targetLabel: inlineTargetLabel,
         score: event.score,
         verdict,
+        durationMs: event.durationMs,
+        totalDurationMs: event.evalRunDurationMs,
       });
     },
   });

diff --git a/apps/cli/test/commands/eval/progress-display.test.ts b/apps/cli/test/commands/eval/progress-display.test.ts
@@ -0,0 +1,109 @@
+import { afterEach, beforeEach, describe, expect, it, mock } from 'bun:test';
+
+import { ProgressDisplay } from '../../../src/commands/eval/progress-display.js';
+
+type WorkerUpdate = Parameters<ProgressDisplay['updateWorker']>[0];
+
+/**
+ * Sends one worker update through a fresh single-test ProgressDisplay and
+ * returns every line it printed via console.log.
+ */
+function renderSingleUpdate(progress: WorkerUpdate): string[] {
+  const display = new ProgressDisplay(1);
+  const logs: string[] = [];
+  const originalLog = console.log;
+  console.log = mock((message?: unknown) => {
+    logs.push(String(message ?? ''));
+  }) as typeof console.log;
+
+  try {
+    display.start();
+    display.setTotalTests(1);
+    display.updateWorker(progress);
+  } finally {
+    // Put the real console.log back even when the display throws.
+    console.log = originalLog;
+  }
+
+  return logs;
+}
+
+describe('ProgressDisplay', () => {
+  const originalNoColor = process.env.NO_COLOR;
+
+  beforeEach(() => {
+    // The expected lines below are plain text, so every test runs with NO_COLOR set.
+    process.env.NO_COLOR = '1';
+  });
+
+  afterEach(() => {
+    if (originalNoColor === undefined) {
+      // Assigning undefined does not unset an env var (Node's process.env stores
+      // the string "undefined"), so NO_COLOR would stay set; delete the key instead.
+      delete process.env.NO_COLOR;
+    } else {
+      process.env.NO_COLOR = originalNoColor;
+    }
+  });
+
+  it('prints compact agent/total durations after the verdict', () => {
+    const logs = renderSingleUpdate({
+      workerId: 1,
+      testId: 'test-42-billing-negative-margin',
+      status: 'completed',
+      targetLabel: 'wtalms-stg',
+      score: 0.94,
+      verdict: 'PASS',
+      durationMs: 18342,
+      totalDurationMs: 22109,
+    });
+
+    expect(logs).toEqual([
+      '1/1   ✅ test-42-billing-negative-margin | wtalms-stg | 94% PASS | τ 18342/22109ms',
+    ]);
+  });
+
+  it('normalizes total duration when reported agent time is higher', () => {
+    const logs = renderSingleUpdate({
+      workerId: 1,
+      testId: 'simple-thresholds-pass',
+      status: 'completed',
+      targetLabel: 'mock_metrics_agent',
+      score: 1,
+      verdict: 'PASS',
+      durationMs: 245,
+      totalDurationMs: 78,
+    });
+
+    expect(logs).toEqual([
+      '1/1   ✅ simple-thresholds-pass | mock_metrics_agent | 100% PASS | τ 245/245ms',
+    ]);
+  });
+
+  it('prints a single duration when only one metric is reported', () => {
+    const logs = renderSingleUpdate({
+      workerId: 1,
+      testId: 'test-01-biosecurity',
+      status: 'completed',
+      targetLabel: 'wtalms-stg',
+      score: 0.98,
+      verdict: 'PASS',
+      totalDurationMs: 1530,
+    });
+
+    expect(logs).toEqual(['1/1   ✅ test-01-biosecurity | wtalms-stg | 98% PASS | τ 1530ms']);
+  });
+
+  it('omits duration segments when metrics are unavailable', () => {
+    const logs = renderSingleUpdate({
+      workerId: 1,
+      testId: 'test-01-biosecurity',
+      status: 'completed',
+      targetLabel: 'wtalms-stg',
+      score: 0.98,
+      verdict: 'PASS',
+    });
+
+    expect(logs).toEqual(['1/1   ✅ test-01-biosecurity | wtalms-stg | 98% PASS']);
+  });
+});
diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts
@@ -384,6 +384,10 @@ export interface ProgressEvent {
   readonly score?: number;
   /** Execution status classification for completed/failed tests */
   readonly executionStatus?: ExecutionStatus;
+  /** Candidate/agent execution duration in milliseconds */
+  readonly durationMs?: number;
+  /** Full eval duration in milliseconds, including grading/orchestration */
+  readonly evalRunDurationMs?: number;
 }
 
 export interface RunEvaluationOptions {
@@ -1386,6 +1390,8 @@ export async function runEvaluation(
             error: result.error,
             score: result.score,
             executionStatus: result.executionStatus,
+            durationMs: result.durationMs,
+            evalRunDurationMs: result.evalRun?.durationMs,
           });
         }
 
@@ -1768,6 +1774,7 @@ async function runBatchEvaluation(options: {
           error: error instanceof Error ? error.message : String(error),
           score: errorResult.score,
           executionStatus: errorResult.executionStatus,
+          evalRunDurationMs: errorResult.evalRun?.durationMs,
         });
       }
       continue;
@@ -1788,6 +1795,8 @@ async function runBatchEvaluation(options: {
         error: result.error,
         score: result.score,
         executionStatus: result.executionStatus,
+        durationMs: result.durationMs,
+        evalRunDurationMs: result.evalRun?.durationMs,
       });
     }
   }