diff --git a/apps/studio/src/components/TargetsTab.tsx b/apps/studio/src/components/TargetsTab.tsx index 18c24ca1..ab43fbd1 100644 --- a/apps/studio/src/components/TargetsTab.tsx +++ b/apps/studio/src/components/TargetsTab.tsx @@ -1,24 +1,97 @@ /** - * Targets table showing targets grouped across all runs. + * Targets tab with drill-down from target -> experiment-grouped runs. * - * Displays target name, number of runs, experiments, pass rate, and - * eval counts (passed/total). Links are not needed since targets are - * informational groupings. + * The summary table opens a target detail view. That detail view groups runs + * by experiment and reuses the existing run-detail routes for the final click, + * so category breakdowns and individual test cases stay consistent everywhere. */ -import { useTargets } from '~/lib/api'; -import type { TargetSummary } from '~/lib/types'; +import { useQuery } from '@tanstack/react-query'; +import { useEffect, useMemo, useState } from 'react'; + +import { + benchmarkRunListOptions, + benchmarkTargetsOptions, + runListOptions, + targetsOptions, +} from '~/lib/api'; +import type { RunMeta, TargetsResponse } from '~/lib/types'; import { PassRatePill } from './PassRatePill'; +import { RunList } from './RunList'; + +interface TargetsTabProps { + benchmarkId?: string; +} + +interface ExperimentRunGroup { + name: string; + runs: RunMeta[]; + latestTimestamp: string | null; + evalCount: number; + passedCount: number; + passRate: number; +} + +export function TargetsTab({ benchmarkId }: TargetsTabProps = {}) { + const [selectedTargetName, setSelectedTargetName] = useState(null); + const targetsQuery = useQuery( + benchmarkId ? benchmarkTargetsOptions(benchmarkId) : targetsOptions, + ); + const runsQuery = useQuery(benchmarkId ? benchmarkRunListOptions(benchmarkId) : runListOptions); + const targets = (targetsQuery.data as TargetsResponse | undefined)?.targets ?? []; + const runs = runsQuery.data?.runs ?? []; + const error = targetsQuery.error ?? runsQuery.error; + const isLoading = targetsQuery.isLoading || runsQuery.isLoading; + + const selectedTarget = useMemo( + () => targets.find((target) => target.name === selectedTargetName) ?? null, + [selectedTargetName, targets], + ); + + useEffect(() => { + if (selectedTargetName && !targets.some((target) => target.name === selectedTargetName)) { + setSelectedTargetName(null); + } + }, [selectedTargetName, targets]); + + const experimentGroups = useMemo(() => { + if (!selectedTarget) return []; + + const groups = new Map(); + for (const run of runs) { + const targetName = run.target ?? 'default'; + if (targetName !== selectedTarget.name) continue; -export function TargetsTab() { - const { data, isLoading } = useTargets(); + const experimentName = run.experiment ?? 'default'; + const existing = groups.get(experimentName) ?? []; + existing.push(run); + groups.set(experimentName, existing); + } + + return [...groups.entries()] + .map(([name, experimentRuns]) => buildExperimentGroup(name, experimentRuns)) + .sort((a, b) => { + if (a.latestTimestamp && b.latestTimestamp && a.latestTimestamp !== b.latestTimestamp) { + return b.latestTimestamp.localeCompare(a.latestTimestamp); + } + if (a.latestTimestamp) return -1; + if (b.latestTimestamp) return 1; + return a.name.localeCompare(b.name); + }); + }, [runs, selectedTarget]); if (isLoading) { return ; } - const targets = data?.targets ?? []; + if (error) { + return ( +
+ Failed to load targets: {error.message} +
+ ); + } if (targets.length === 0) { return ( @@ -31,60 +104,191 @@ export function TargetsTab() { ); } - return ( -
- - - - - - - - - - - - {targets.map((target: TargetSummary) => ( - - - - - - + if (!selectedTarget) { + return ( +
+
TargetRunsExperimentsPass RateEvals
{target.name} - {target.run_count} - - {target.experiment_count} - - - - {target.passed_count} - / - {target.eval_count} -
+ + + + + + + + + + {targets.map((target) => ( + + + + + + + + ))} + +
TargetRunsExperimentsPass RateEvals
+ + + {target.run_count} + + {target.experiment_count} + + + + {target.passed_count} + / + {target.eval_count} +
+
+ ); + } + + return ( +
+
+ +
+
+
+

{selectedTarget.name}

+

+ {selectedTarget.run_count} run{selectedTarget.run_count === 1 ? '' : 's'} ·{' '} + {selectedTarget.experiment_count} experiment + {selectedTarget.experiment_count === 1 ? '' : 's'} ·{' '} + {selectedTarget.passed_count} + / + {selectedTarget.eval_count} evals passed +

+
+
+ +
+
+
+
+ + {experimentGroups.length === 0 ? ( +
+

No runs found for this target

+

+ This target summary exists, but there are no matching runs to group by experiment. +

+
+ ) : ( +
+ {experimentGroups.map((group) => ( +
+
+
+

+ {formatExperimentName(group.name)} +

+

+ {group.runs.length} run{group.runs.length === 1 ? '' : 's'} ·{' '} + {group.passedCount} + / + {group.evalCount} evals passed + {group.latestTimestamp && ( + + · Last run {formatTimestamp(group.latestTimestamp)} + + )} +

+
+
+ +
+
+ +
))} - - +
+ )}
); } +function buildExperimentGroup(name: string, runs: RunMeta[]): ExperimentRunGroup { + let evalCount = 0; + let passedCount = 0; + let latestTimestamp: string | null = null; + + for (const run of runs) { + evalCount += run.test_count; + passedCount += Math.round(run.pass_rate * run.test_count); + if (run.timestamp && (!latestTimestamp || run.timestamp > latestTimestamp)) { + latestTimestamp = run.timestamp; + } + } + + return { + name, + runs, + latestTimestamp, + evalCount, + passedCount, + passRate: evalCount > 0 ? passedCount / evalCount : 0, + }; +} + +function formatExperimentName(name: string): string { + return name === 'default' ? 'Default experiment' : name; +} + +function formatTimestamp(ts: string): string { + const date = new Date(ts); + if (Number.isNaN(date.getTime())) return ts; + + const diffMs = Date.now() - date.getTime(); + const diffMin = Math.floor(diffMs / 60_000); + const diffHour = Math.floor(diffMs / 3_600_000); + + if (diffMin < 1) return 'just now'; + if (diffMin < 60) return `${diffMin} min ago`; + if (diffHour < 24) return `${diffHour} hour${diffHour === 1 ? '' : 's'} ago`; + return date.toLocaleDateString(); +} + function LoadingSkeleton() { return ( -
-
-
-
-
- {['sk-1', 'sk-2', 'sk-3', 'sk-4', 'sk-5'].map((id) => ( -
-
-
-
+
+
+
+
+
+
+
+
-
- ))} + {['sk-1', 'sk-2', 'sk-3', 'sk-4', 'sk-5'].map((id) => ( +
+
+
+
+
+
+
+ ))} +
+
+
+
+
); diff --git a/apps/studio/src/routes/benchmarks/$benchmarkId.tsx b/apps/studio/src/routes/benchmarks/$benchmarkId.tsx index f8d69819..76301c45 100644 --- a/apps/studio/src/routes/benchmarks/$benchmarkId.tsx +++ b/apps/studio/src/routes/benchmarks/$benchmarkId.tsx @@ -12,16 +12,16 @@ import { AnalyticsTab } from '~/components/AnalyticsTab'; import { RunEvalModal } from '~/components/RunEvalModal'; import { RunList } from '~/components/RunList'; import { type RunSourceFilter, RunSourceToolbar } from '~/components/RunSourceToolbar'; +import { TargetsTab } from '~/components/TargetsTab'; import { benchmarkCompareOptions, benchmarkExperimentsOptions, - benchmarkTargetsOptions, syncRemoteResultsApi, useBenchmarkRunList, useRemoteStatus, useStudioConfig, } from '~/lib/api'; -import type { ExperimentsResponse, TargetsResponse } from '~/lib/types'; +import type { ExperimentsResponse } from '~/lib/types'; type TabId = 'runs' | 'experiments' | 'analytics' | 'targets'; @@ -94,7 +94,7 @@ function BenchmarkHomePage() { {activeTab === 'analytics' && ( )} - {activeTab === 'targets' && } + {activeTab === 'targets' && } {!isReadOnly && ( ); } - -function BenchmarkTargetsTab({ benchmarkId }: { benchmarkId: string }) { - const { data, isLoading } = useQuery(benchmarkTargetsOptions(benchmarkId)); - const targets = (data as TargetsResponse | undefined)?.targets ?? []; - - if (isLoading) { - return ( -
- {['s1', 's2', 's3'].map((id) => ( -
- ))} -
- ); - } - - if (targets.length === 0) { - return ( -
-

No targets found

-
- ); - } - - return ( -
- {targets.map((t) => ( -
-
-

{t.name}

-

- {t.run_count} run{t.run_count !== 1 ? 's' : ''} · {t.experiment_count}{' '} - experiment{t.experiment_count !== 1 ? 's' : ''} -

-
- - {Math.round(t.pass_rate * 100)}% - -
- ))} -
- ); -}