Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions apps/cli/test/commands/eval/assert.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ describe('agentv eval assert', () => {
} finally {
await rm(baseDir, { recursive: true, force: true });
}
});
}, 30_000);

it('exits 1 when grader returns score 0', async () => {
const { baseDir } = await createGraderFixture();
Expand All @@ -87,7 +87,7 @@ describe('agentv eval assert', () => {
} finally {
await rm(baseDir, { recursive: true, force: true });
}
});
}, 30_000);

it('exits 0 when grader returns passing score', async () => {
const { baseDir } = await createGraderFixture();
Expand All @@ -113,7 +113,7 @@ describe('agentv eval assert', () => {
} finally {
await rm(baseDir, { recursive: true, force: true });
}
});
}, 30_000);

it('errors when grader name not found', async () => {
const { baseDir } = await createGraderFixture();
Expand All @@ -137,5 +137,5 @@ describe('agentv eval assert', () => {
} finally {
await rm(baseDir, { recursive: true, force: true });
}
});
}, 30_000);
});
10 changes: 5 additions & 5 deletions apps/cli/test/commands/eval/pipeline/bench.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ describe('pipeline bench', () => {
expect(grading.summary.pass_rate).toBeGreaterThan(0);
expect(grading.assertions.length).toBeGreaterThan(0);
expect(grading.graders).toHaveLength(2);
});
}, 30_000);

it('writes index.jsonl with one entry per test', async () => {
await writeFile(
Expand All @@ -98,7 +98,7 @@ describe('pipeline bench', () => {
expect(lines).toHaveLength(1);
expect(lines[0].test_id).toBe('test-01');
expect(lines[0].score).toBeGreaterThan(0);
});
}, 30_000);

it('writes benchmark.json with run_summary', async () => {
await writeFile(
Expand All @@ -115,7 +115,7 @@ describe('pipeline bench', () => {
const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8'));
expect(benchmark.metadata.targets).toContain('test-target');
expect(benchmark.run_summary['test-target']).toBeDefined();
});
}, 30_000);

it('propagates experiment from manifest to index.jsonl and benchmark.json', async () => {
// Overwrite manifest with experiment field
Expand All @@ -139,7 +139,7 @@ describe('pipeline bench', () => {

const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8'));
expect(benchmark.metadata.experiment).toBe('without_skills');
});
}, 30_000);

it('omits experiment from output when manifest has no experiment', async () => {
const { execa } = await import('execa');
Expand All @@ -151,5 +151,5 @@ describe('pipeline bench', () => {

const benchmark = JSON.parse(await readFile(join(OUT_DIR, 'benchmark.json'), 'utf8'));
expect(benchmark.metadata.experiment).toBeUndefined();
});
}, 30_000);
});
12 changes: 6 additions & 6 deletions apps/cli/test/commands/eval/pipeline/grade.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ describe('pipeline grade', () => {
);
expect(result.score).toBe(1);
expect(result.name).toBe('always_pass');
});
}, 30_000);

it('includes assertions from code grader output', async () => {
const { execa } = await import('execa');
Expand All @@ -66,7 +66,7 @@ describe('pipeline grade', () => {
);
expect(result.assertions).toHaveLength(1);
expect(result.assertions[0].passed).toBe(true);
});
}, 30_000);
});

describe('pipeline grade — builtin assertions', () => {
Expand Down Expand Up @@ -144,7 +144,7 @@ describe('pipeline grade — builtin assertions', () => {
expect(result.score).toBe(1);
expect(result.type).toBe('contains');
expect(result.assertions[0].passed).toBe(true);
});
}, 30_000);

it('evaluates regex assertion and writes result', async () => {
const { execa } = await import('execa');
Expand All @@ -158,7 +158,7 @@ describe('pipeline grade — builtin assertions', () => {
);
expect(result.score).toBe(1);
expect(result.type).toBe('regex');
});
}, 30_000);

it('scores 0 when contains assertion does not match', async () => {
const { execa } = await import('execa');
Expand All @@ -172,7 +172,7 @@ describe('pipeline grade — builtin assertions', () => {
);
expect(result.score).toBe(0);
expect(result.assertions[0].passed).toBe(false);
});
}, 30_000);

it('applies negate to invert score', async () => {
// Overwrite has_goodbye with negate: true — "not contains goodbye" should pass
Expand All @@ -198,5 +198,5 @@ describe('pipeline grade — builtin assertions', () => {
);
expect(result.score).toBe(1);
expect(result.assertions[0].passed).toBe(true);
});
}, 30_000);
});
6 changes: 3 additions & 3 deletions apps/cli/test/commands/trend/trend.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ describe('trend command', () => {
});
expect((parsed.summary as Record<string, unknown>).direction).toBe('degrading');
expect((parsed.summary as Record<string, unknown>).matched_test_count).toBe(2);
});
}, 30_000);

it('normalizes explicit run inputs to chronological order before analysis', async () => {
const cwd = await createTempDir();
Expand Down Expand Up @@ -499,7 +499,7 @@ describe('trend command', () => {
expect(result.exitCode).toBe(1);
expect(result.stdout).toContain('Trend Analysis');
expect(result.stdout).toContain('degrading');
});
}, 30_000);

it('errors when target filtering leaves a selected run empty in CLI mode', async () => {
const cwd = await createTempDir();
Expand Down Expand Up @@ -532,5 +532,5 @@ describe('trend command', () => {

expect(result.exitCode).toBe(1);
expect(result.stderr).toContain('Run has no matching records');
});
}, 30_000);
});
8 changes: 4 additions & 4 deletions apps/cli/test/eval.integration.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ describe('agentv eval CLI', () => {
} finally {
await rm(fixture.baseDir, { recursive: true, force: true });
}
});
}, 30_000);

it('loads the nearest .env first and uses parent .env only for missing keys', async () => {
const fixture = await createNestedEnvFixture();
Expand All @@ -249,7 +249,7 @@ describe('agentv eval CLI', () => {
} finally {
await rm(fixture.baseDir, { recursive: true, force: true });
}
});
}, 30_000);

it('supports repeatable --test-id flags with OR matching', async () => {
const fixture = await createFixture();
Expand All @@ -269,7 +269,7 @@ describe('agentv eval CLI', () => {
} finally {
await rm(fixture.baseDir, { recursive: true, force: true });
}
});
}, 30_000);

it('passes run-level budget tracking through to the evaluator', async () => {
const fixture = await createFixture();
Expand All @@ -285,5 +285,5 @@ describe('agentv eval CLI', () => {
} finally {
await rm(fixture.baseDir, { recursive: true, force: true });
}
});
}, 30_000);
});
2 changes: 1 addition & 1 deletion packages/core/src/evaluation/loaders/config-loader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ import { readFile } from 'node:fs/promises';
import path from 'node:path';

import { interpolateEnv } from '../interpolation.js';
import { parseYamlValue } from '../yaml-loader.js';
import type {
EvalTargetRef,
FailOnError,
Expand All @@ -13,6 +12,7 @@ import type {
WorkspaceHookConfig,
} from '../types.js';
import { isJsonObject } from '../types.js';
import { parseYamlValue } from '../yaml-loader.js';
import { buildDirectoryChain, fileExists } from './file-resolver.js';

const ANSI_YELLOW = '\u001b[33m';
Expand Down
2 changes: 1 addition & 1 deletion packages/core/src/evaluation/loaders/grader-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ import path from 'node:path';

import { normalizePreprocessorType } from '../content-preprocessor.js';
import { interpolateEnv } from '../interpolation.js';
import { parseYamlValue } from '../yaml-loader.js';
import type { ToolTrajectoryExpectedItem, ToolTrajectoryGraderConfig } from '../trace.js';
import type {
ContentPreprocessorConfig,
Expand All @@ -14,6 +13,7 @@ import type {
} from '../types.js';
import { isGraderKind } from '../types.js';
import { validateCustomPromptContent } from '../validation/prompt-validator.js';
import { parseYamlValue } from '../yaml-loader.js';
import { resolveFileReference } from './file-resolver.js';

const ANSI_YELLOW = '\u001b[33m';
Expand Down
Loading
Loading