diff --git a/artifacts/venue_correction_validation_latest.md b/artifacts/venue_correction_validation_latest.md index b8ac8c2..890f581 100644 --- a/artifacts/venue_correction_validation_latest.md +++ b/artifacts/venue_correction_validation_latest.md @@ -1,6 +1,6 @@ # Venue Correction Validation Scorecard -Generated: 2026-05-03T17:06:52+00:00 +Generated: 2026-05-05T00:24:36+00:00 Correction method: `distance_mean_shrinkage_v1 (latest prior-season only)` @@ -10,10 +10,10 @@ Training snapshot: `schema=v5; seasons=20092010-20252026; rows=1,854,812; adjust | Gate | Result | Metric | |------|--------|--------| -| Held-out log loss non-worse | PASS | delta = -0.000017 | +| Held-out log loss non-worse | PASS | delta = -0.000015 | | Home-ice over-correction guardrail | PASS | removed = -0.013, max = 0.500 | -| Distance/location residual z-scores | FAIL | blocking regimes = 24, supported regimes = 4, max abs(z) = 4.067, limit < 2.000 | -| Event-frequency residual z-scores | FAIL | blocking regimes = 4, supported regimes = 23, max abs(z) = 3.572, limit < 2.000 | +| Distance/location residual z-scores | FAIL | blocking regimes = 10, supported regimes = 18, max abs(z) = 4.067, limit < 2.000 | +| Event-frequency residual z-scores | FAIL | blocking regimes = 5, supported regimes = 22, max abs(z) = 3.572, limit < 2.000 | ## Summary Metrics @@ -21,16 +21,16 @@ Training snapshot: `schema=v5; seasons=20092010-20252026; rows=1,854,812; adjust - Holdout rows: 1,525,907 - Distance residual venue-seasons evaluated: 532 - Distance residual gate mode: `regime_aware` -- Distance blocking regimes: 24 -- Distance supported regimes: 4 +- Distance blocking regimes: 10 +- Distance supported regimes: 18 - Event-frequency residual venue-seasons evaluated: 525 - Event-frequency residual gate mode: `regime_aware` -- Event-frequency blocking regimes: 4 -- Event-frequency supported regimes: 23 -- Baseline log loss: 0.229272 -- Corrected log loss: 0.229255 +- Event-frequency blocking regimes: 5 +- 
Event-frequency supported regimes: 22 +- Baseline log loss: 0.229270 +- Corrected log loss: 0.229254 - Baseline home advantage: 0.001848 -- Corrected home advantage: 0.001872 +- Corrected home advantage: 0.001873 - Worst distance/location residual: `20092010:Madison Square Garden` - Worst event-frequency residual: `20112012:Prudential Center` @@ -39,27 +39,47 @@ Training snapshot: `schema=v5; seasons=20092010-20252026; rows=1,854,812; adjust | Metric | Venue-season | z | Classification | Prior roll | Centered roll | Population anomaly share | Evidence | Known prior | |--------|--------------|---|----------------|------------|---------------|--------------------------|----------|-------------| -| `distance_location` | `20092010:Madison Square Garden` | -4.067 | `persistent_bias` | n/a | -3.114 | 0.032 | NO | YES | -| `distance_location` | `20172018:Bell MTS Place` | 3.123 | `unexplained_or_confounded` | n/a | 1.456 | 0.091 | NO | NO | -| `distance_location` | `20192020:United Center` | -3.121 | `unexplained_or_confounded` | -1.237 | -1.840 | 0.062 | NO | NO | -| `distance_location` | `20222023:SAP Center at San Jose` | -2.885 | `unexplained_or_confounded` | 0.700 | 0.009 | 0.062 | NO | NO | +| `distance_location` | `20092010:Madison Square Garden` | -4.067 | `temporary_supported_regime` | n/a | -3.114 | 0.032 | YES | YES | +| `distance_location` | `20172018:Bell MTS Place` | 3.123 | `temporary_supported_regime` | n/a | 1.456 | 0.091 | YES | NO | +| `distance_location` | `20192020:United Center` | -3.121 | `temporary_supported_regime` | -1.237 | -1.840 | 0.062 | YES | NO | +| `distance_location` | `20222023:SAP Center at San Jose` | -2.885 | `temporary_supported_regime` | 0.700 | 0.009 | 0.062 | YES | NO | | `distance_location` | `20182019:NYCB Live/Nassau Coliseum` | -2.838 | `unexplained_or_confounded` | n/a | -1.383 | 0.031 | NO | NO | | `distance_location` | `20202021:Amalie Arena` | -2.801 | `unexplained_or_confounded` | 0.476 | -0.896 | 0.065 | NO | NO | -| 
`distance_location` | `20122013:Wells Fargo Center` | 2.690 | `unexplained_or_confounded` | -0.618 | 0.947 | 0.067 | NO | NO | -| `distance_location` | `20112012:American Airlines Center` | 2.640 | `unexplained_or_confounded` | -0.110 | -0.144 | 0.031 | NO | NO | -| `distance_location` | `20222023:Little Caesars Arena` | 2.635 | `unexplained_or_confounded` | -0.547 | 0.596 | 0.062 | NO | NO | -| `distance_location` | `20212022:Enterprise Center` | -2.628 | `unexplained_or_confounded` | -0.049 | -0.280 | 0.061 | NO | NO | +| `distance_location` | `20122013:Wells Fargo Center` | 2.690 | `temporary_supported_regime` | -0.618 | 0.947 | 0.067 | YES | NO | +| `distance_location` | `20112012:American Airlines Center` | 2.640 | `temporary_supported_regime` | -0.110 | -0.144 | 0.031 | YES | NO | +| `distance_location` | `20222023:Little Caesars Arena` | 2.635 | `temporary_supported_regime` | -0.547 | 0.596 | 0.062 | YES | NO | +| `distance_location` | `20212022:Enterprise Center` | -2.628 | `temporary_supported_regime` | -0.049 | -0.280 | 0.061 | YES | NO | | `event_frequency` | `20112012:Prudential Center` | -3.572 | `persistent_bias` | -3.033 | -3.103 | 0.033 | YES | NO | | `event_frequency` | `20152016:Prudential Center` | -3.485 | `persistent_bias` | -2.771 | -2.592 | 0.067 | YES | NO | | `event_frequency` | `20102011:Prudential Center` | -3.155 | `persistent_bias` | -2.910 | -3.212 | 0.033 | YES | NO | | `event_frequency` | `20182019:Scotiabank Arena` | 2.982 | `temporary_supported_regime` | n/a | 2.445 | 0.031 | YES | NO | | `event_frequency` | `20132014:Prudential Center` | -2.967 | `persistent_bias` | -3.103 | -2.771 | 0.033 | YES | NO | -| `event_frequency` | `20092010:Prudential Center` | -2.910 | `persistent_bias` | n/a | -3.033 | 0.033 | YES | NO | +| `event_frequency` | `20092010:Prudential Center` | -2.910 | `temporary_supported_regime` | n/a | -3.033 | 0.033 | YES | NO | | `event_frequency` | `20202021:Amalie Arena` | -2.845 | `unexplained_or_confounded` | 
-0.292 | -1.640 | 0.032 | NO | NO | | `event_frequency` | `20252026:American Airlines Center` | -2.785 | `temporary_supported_regime` | 0.085 | -1.350 | 0.062 | YES | NO | | `event_frequency` | `20142015:Prudential Center` | -2.765 | `persistent_bias` | -3.040 | -3.073 | 0.067 | YES | NO | | `event_frequency` | `20232024:Nationwide Arena` | 2.607 | `temporary_supported_regime` | 0.538 | 1.465 | 0.062 | YES | NO | +## Distance-Location Paired Diagnostics + +- Primary distance gate: venue-season corrected-distance residuals with visiting-team paired evidence stratified by shot type and manpower state. + +- Candidate distance residuals: 28 +- Supported paired distance regimes: 17 + +| Venue-season | z | Paired diff | 95% CI | d | Pairs | Evidence | Evidence classification | Regime classification | +|--------------|---|-------------|--------|---|-------|----------|-------------------------|-----------------------| +| `20092010:Madison Square Garden` | -4.067 | -8.167 | [-9.935, -5.944] | -1.647 | 23 | YES | `real_scorekeeper_regime_supported` | `temporary_supported_regime` | +| `20172018:Bell MTS Place` | 3.123 | 1.529 | [0.249, 2.838] | 0.421 | 30 | YES | `real_scorekeeper_regime_supported` | `temporary_supported_regime` | +| `20192020:United Center` | -3.121 | -3.010 | [-4.433, -1.513] | -0.760 | 27 | YES | `real_scorekeeper_regime_supported` | `temporary_supported_regime` | +| `20222023:SAP Center at San Jose` | -2.885 | -2.810 | [-3.981, -1.618] | -0.821 | 31 | YES | `real_scorekeeper_regime_supported` | `temporary_supported_regime` | +| `20182019:NYCB Live/Nassau Coliseum` | -2.838 | -0.598 | [-2.050, 0.838] | -0.185 | 18 | NO | `hockey_context_confounded` | `unexplained_or_confounded` | +| `20202021:Amalie Arena` | -2.801 | -3.881 | [-5.373, -2.479] | -1.649 | 9 | NO | `insufficient_evidence` | `unexplained_or_confounded` | +| `20122013:Wells Fargo Center` | 2.690 | 2.319 | [0.910, 3.648] | 0.849 | 14 | YES | `real_scorekeeper_regime_supported` | 
`temporary_supported_regime` | +| `20112012:American Airlines Center` | 2.640 | 1.273 | [0.396, 2.218] | 0.561 | 23 | YES | `real_scorekeeper_regime_supported` | `temporary_supported_regime` | +| `20222023:Little Caesars Arena` | 2.635 | 2.150 | [0.885, 3.461] | 0.584 | 31 | YES | `real_scorekeeper_regime_supported` | `temporary_supported_regime` | +| `20212022:Enterprise Center` | -2.628 | -3.291 | [-4.394, -2.089] | -0.986 | 31 | YES | `real_scorekeeper_regime_supported` | `temporary_supported_regime` | + ## Event-Frequency Diagnostics Primary frequency gate: sample-adequate `regular_season:training_attempts` @@ -82,4 +102,4 @@ Primary frequency gate: sample-adequate `regular_season:training_attempts` ## Notes -Generated from live SQLite data with forward-chaining temporal CV. Each shot uses the latest venue distance adjustment from a season before the shot's season; same-season venue corrections are not used for holdout rows. Distance residual z-scores are venue-season corrected-distance mean z-scores. Rolling venue-regime diagnostics use prior-only rolling estimates for production-safe context and centered rolling estimates only for exploratory historical-spike labeling. Event-frequency residual z-scores use sample-adequate regular-season training attempts as the primary gate; blocked-shot and all-attempt frequencies are reported as diagnostics and remain outside the current shot-level xG training contract. +Generated from live SQLite data with forward-chaining temporal CV. Each shot uses the latest venue distance adjustment from a season before the shot's season; same-season venue corrections are not used for holdout rows. Distance residual z-scores are venue-season corrected-distance mean z-scores. Distance/location candidates are annotated with paired visiting-team evidence stratified by shot type and manpower state; this diagnostic uses the in-memory prior-corrected distances and does not mutate shot_events or venue_bias_corrections. 
Rolling venue-regime diagnostics use prior-only rolling estimates for production-safe context and centered rolling estimates only for exploratory historical-spike labeling. Event-frequency residual z-scores use sample-adequate regular-season training attempts as the primary gate; blocked-shot and all-attempt frequencies are reported as diagnostics and remain outside the current shot-level xG training contract. diff --git a/docs/xg_model_components/04_scorekeeper_bias.md b/docs/xg_model_components/04_scorekeeper_bias.md index 80db7ae..8358530 100644 --- a/docs/xg_model_components/04_scorekeeper_bias.md +++ b/docs/xg_model_components/04_scorekeeper_bias.md @@ -15,26 +15,34 @@ Estimate and correct rink/venue scorer effects that distort event recording and - `scripts/export_venue_correction_validation.py` exports the Phase 2.5.4 scorecard once a metrics JSON has been generated from a current database. The scorecard gates are held-out log-loss non-worsening, home-ice - over-correction, max distance/location residual venue z-score, and max - sample-adequate event-frequency residual venue z-score. + over-correction, distance/location residuals, and sample-adequate + event-frequency residuals. Residual z-scores mark candidate venue-seasons + for regime-aware review rather than acting as automatic vetoes. - `scripts/export_venue_correction_validation_from_db.py` generates that metrics payload directly from SQLite with forward-chaining temporal CV and prior-season-only venue distance corrections under the shared model-training contract. It also computes normalized event-frequency diagnostics by - venue-season, event group, and game-type scope. The primary frequency gate - uses sample-adequate regular-season training attempts; blocked-shot and - all-attempt frequencies are diagnostic only. 
The 2026-05-01 live v5 refresh - passes held-out log-loss and home-ice guardrails but fails the residual - corrected-distance z-score gate (`max |z| = 4.067`) and event-frequency - residual gate (`max |z| = 3.572`), so the current correction remains - exploratory rather than a production xG training feature. -- The 2026-05-03 rolling venue-regime extension adds a less brittle + venue-season, event group, and game-type scope plus paired distance-location + diagnostics from in-memory prior-corrected distances. The distance diagnostic + compares each visiting team's corrected shot distance at a venue against that + same team's away shots elsewhere in the same season, stratified by shot type + and manpower state. The primary frequency gate uses sample-adequate + regular-season training attempts; blocked-shot and all-attempt frequencies + are diagnostic only. The 2026-05-05 live v5 refresh uses the regime-aware + residual gate. It passes held-out log-loss and home-ice guardrails but still + fails the residual corrected-distance gate (`max |z| = 4.067`, 10 blocking + regimes) and event-frequency residual gate (`max |z| = 3.572`, 5 blocking + regimes), so the current correction remains exploratory rather than a + production xG training feature. +- The 2026-05-03 rolling venue-regime extension, expanded with paired + distance evidence on 2026-05-05, adds a less brittle acceptance path for historically real scorer spikes. `src/venue_bias.py` now computes prior-only rolling residual estimates for production-safe context, centered rolling estimates for exploratory historical diagnosis, and regime labels: `persistent_bias`, `temporary_supported_regime`, and `unexplained_or_confounded`. `evaluate_venue_correction_scorecard()` can - use those labels so supported temporary or persistent regimes are reported + use those labels so `|z| >= 2` is a candidate residual rather than an + automatic veto. 
Supported temporary or persistent regimes are reported without automatically failing the correction layer. Unexplained/confounded residuals, population-wide shifts, insufficient evidence, held-out log-loss harm, and home-ice over-correction remain blocking. diff --git a/docs/xg_model_roadmap.md b/docs/xg_model_roadmap.md index e0e9e46..ed70e5e 100644 --- a/docs/xg_model_roadmap.md +++ b/docs/xg_model_roadmap.md @@ -92,7 +92,7 @@ The target architecture must be modular enough to support incremental delivery a --- -## Current Implementation Status (as of 2026-05-01) +## Current Implementation Status (as of 2026-05-05) Status here is verified against the live database, not just self-reported from prior commits. @@ -101,7 +101,7 @@ Status here is verified against the live database, not just self-reported from p | Phase 0 — contracts, schema, reproducibility | **Complete** | `_XG_EVENT_SCHEMA_VERSION = "v5"` (`src/database.py`); live validation run found all 2,122,963 `shot_events` rows at v5 and zero stale training-eligible rows; version-aware backfill (`game_has_current_shot_events`, `delete_game_shot_events`) present; `validate_shot_events_quality` covers enums/ranges/duplicates. | | Phase 1 — event/state foundation | **Complete** | `normalize_coordinates`, distance/angle, 11-type shot taxonomy, score/manpower/time classifiers in `src/xg_features.py`. Pre-2020 negative-x rate is now 0.0 (was ~50% at v2). Pre-event score tracker `_track_score` prevents post-goal leakage. | | Phase 2 — context feature engineering | **Complete (with two criteria formally deferred)** | `game_context` populated (26,372 rows at v1, verified 2026-05-01) with rest/travel/timezone features; `validate_game_context_quality` added. Faceoff decay bins implemented. `populate_venue_diagnostics` is wired into the scraper pipeline via `finalize_season_diagnostics` and runs per season. VIF review done on live data (see below). 
The two remaining acceptance criteria are now split by dependency: held-out faceoff-decay validation can proceed using the promoted `src/validation.py` helpers, while zone-start change-on-the-fly inference remains blocked on populated shift/on-ice data. | -| Phase 2.5 — rigor foundation (new, gates feature inclusion) | **In progress (selected Phase 3 baseline unblocked)** | 2.5.1 and 2.5.2 are implemented in `src/` with tests; live player readiness now passes with 4,694 `players`, 831,573 `player_game_stats` rows, and 831,573 current-version `player_game_features` rows. 2.5.3 now has a live v5 validation scorecard artifact that passes all 8 gates; 2.5.4 now has populated diagnostics/corrections (653 `venue_bias_diagnostics` rows and 532 `venue_bias_corrections` rows), shrinkage-based distance adjustment parameters wired into `finalize_season_diagnostics`, event-frequency scorekeeper diagnostics, rolling venue-regime classification, a JSON scorecard exporter, and a DB-backed live runner. The latest live venue-correction scorecard passes held-out log-loss and home-ice guardrail gates but fails both distance/location and event-frequency residual z-score gates under the prior max-z policy; the next live run can evaluate the new regime-aware gates. 2.5.5 has a recorded decision and an enforced loader guard (`load_training_shot_events`) excluding pre-2009 seasons and non-training shot rows. | +| Phase 2.5 — rigor foundation (new, gates feature inclusion) | **In progress (selected Phase 3 baseline unblocked)** | 2.5.1 and 2.5.2 are implemented in `src/` with tests; live player readiness now passes with 4,694 `players`, 831,573 `player_game_stats` rows, and 831,573 current-version `player_game_features` rows. 
2.5.3 now has a live v5 validation scorecard artifact that passes all 8 gates; 2.5.4 now has populated diagnostics/corrections (653 `venue_bias_diagnostics` rows and 532 `venue_bias_corrections` rows), shrinkage-based distance adjustment parameters wired into `finalize_season_diagnostics`, event-frequency scorekeeper diagnostics, paired/stratified distance-location evidence, rolling venue-regime classification, a JSON scorecard exporter, and a DB-backed live runner. The latest live venue-correction scorecard uses the regime-aware residual gate and passes held-out log-loss and home-ice guardrail gates, but remains unaccepted because blocking unexplained/confounded residuals remain (10 distance/location, 5 event-frequency). 2.5.5 has a recorded decision and an enforced loader guard (`load_training_shot_events`) excluding pre-2009 seasons and non-training shot rows. | | Phase 3 — baseline xG model | **Ready to start** | No training code in `src/`. The live validation scorecard now passes 8/8 gates with a selected calibrated logistic model (`artifacts/validation_scorecard_latest.md`), so Phase 3 model implementation can proceed while keeping unresolved features excluded. | | Phase 4 — enhanced xG model | **Not started** | Depends on Phase 3. | | Phase 5 — RAPM on xG | **Blocked on downstream prerequisites** | Player identity and player-game foundations are populated and validated. Shift/on-ice/RAPM schemas and Phase 1 shift utilities exist, but the live DB still has 0 `shifts`, 0 `on_ice_intervals`, 0 `shift_quality_features`, and 0 `rapm_player_ratings` rows. Remaining blockers are validated xG predictions with uncertainty plus populated shift/TOI/on-ice exposure data for a true RAPM design matrix. | @@ -111,7 +111,7 @@ Status here is verified against the live database, not just self-reported from p 1. 
**Player database blocker is closed.** Live validation on 2026-05-01 found `ids_missing_and_not_unavailable = 0`, 2,301/2,301 players with ≥ 50 career shots have `shoots_catches`, `player_game_stats` covers all 831,573 event-derived player-game pairs, and `player_game_features` now has 831,573 current-version rows with zero missing, duplicate, stale-version, or unsupported-value rows. The remaining RAPM data gap is shift/TOI/on-ice exposure and xG prediction/residual inputs, not player identity metadata. 2. **2007–2008 shot-distance anomaly.** Average `distance_to_goal` for 2007–08 is ~19–20 units vs ~34 for 2009+. `wrap-around` and `deflected` shots have `NULL` distances in 2007–08 (coordinates absent in that era). **Phase 2.5.5 decision:** exclude pre-2009 seasons from model-training inputs; enforced by `load_training_shot_events` and test coverage. -3. **Venue bias correction is implemented but not accepted.** `scripts/export_venue_correction_validation_from_db.py` runs the Phase 2.5.4 gates against live v5 data using only prior-season venue adjustments for each held-out shot and the same tightened training contract as the validation scorecard. The 2026-05-01 scorecard passes held-out log-loss (`delta = -0.000017`) and home-ice over-correction (`removed = -0.013`, limit 0.500) but fails distance/location residuals (`max |z| = 4.067`, limit < 2.000; worst venue-season `20092010:Madison Square Garden`) and sample-adequate event-frequency residuals (`max |z| = 3.572`, limit < 2.000; worst venue-season `20112012:Prudential Center`) under the original max-z residual policy. As of 2026-05-03, the scorecard also supports a rolling venue-regime policy: prior-only rolling estimates provide production-safe context, centered rolling estimates are diagnostic only, and candidate spikes are labeled `persistent_bias`, `temporary_supported_regime`, or `unexplained_or_confounded`. 
Keep the correction out of production xG training until a live regime-aware run shows no blocking unexplained/confounded residuals, does not worsen held-out log-loss, and does not over-remove home-ice advantage. +3. **Venue bias correction is implemented but not accepted.** `scripts/export_venue_correction_validation_from_db.py` runs the Phase 2.5.4 gates against live v5 data using only prior-season venue adjustments for each held-out shot and the same tightened training contract as the validation scorecard. The 2026-05-05 scorecard uses the regime-aware residual policy: `|z| >= 2` marks a candidate residual, supported `persistent_bias` and `temporary_supported_regime` rows are reported as non-blocking, and `unexplained_or_confounded`, `population_shift_detected`, and `insufficient_evidence` rows remain blocking. Distance/location candidates now also get paired visiting-team evidence stratified by `shot_type` and `manpower_state`. The live run passes held-out log-loss (`delta = -0.000015`) and home-ice over-correction (`removed = -0.013`, limit 0.500), but still fails distance/location residuals (10 blocking regimes, 18 supported regimes, `max |z| = 4.067`) and sample-adequate event-frequency residuals (5 blocking regimes, 22 supported regimes, `max |z| = 3.572`). Keep the correction out of production xG training until both residual gates have zero blocking regimes, held-out log-loss remains non-worse, and the home-ice guardrail still passes. 4. **Validation scorecard blocker is resolved for selected features.** The 2026-04-30 live v5 validation framework run passes 8/8 gates: mean AUC 0.7551, calibration slope 0.9870, max decile error 0.407 pp, ECE 0.193 pp, subgroup max error 1.24 pp, and AUC drift +0.0001/season. Faceoff, rest/travel, raw venue features, and other unresolved candidates remain listed as excluded pending and do not feed the selected model. 
### Phase 2 multicollinearity review (VIF, live data, 2026-04-19) @@ -202,7 +202,7 @@ Acceptance criteria: - ⏸ Zone-start features carry a documented inference accuracy estimate. **Deferred to shift persistence/backfill** — proper change-on-the-fly inference needs populated `shifts` and `on_ice_intervals` tables. The raw `faceoff_zone_code` + `seconds_since_faceoff` captured today is a usable-but-weak proxy; the richer feature follows shift data. - ✅ Multicollinearity review across rest/travel/score-state features; VIF < 5 for each. **Met**, with the finding that `rest_advantage` is a perfect linear combination of `home_rest_days − away_rest_days` and must be excluded from the Phase 3 design matrix. Remaining six features: max VIF = 2.76. See "Phase 2 multicollinearity review" block above. - ✅ Venue bias diagnostics populated per season via `finalize_season_diagnostics` (`src/main.py`), running after the scraper/backfill loop. -- ❌ Venue correction layer has live DB-backed acceptance results but is not accepted for production xG training yet: held-out log-loss and home-ice guardrail pass, while distance/location and event-frequency residual z-score gates fail under the 2026-05-01 max-z Phase 2.5.4 scorecard. A rolling venue-regime scorecard path is now implemented and needs a fresh live artifact before acceptance can be reconsidered. +- ❌ Venue correction layer has live DB-backed acceptance results but is not accepted for production xG training yet: held-out log-loss and home-ice guardrail pass, while the 2026-05-05 regime-aware paired-distance Phase 2.5.4 scorecard still fails both residual gates because blocking unexplained/confounded venue-season residuals remain. ## Phase 2.5: Rigor Foundation (gates feature inclusion) @@ -251,13 +251,14 @@ Acceptance: - **Hierarchical venue intercepts:** add a partially-pooled venue-season intercept to the xG model, shrinking toward a league prior. - Implement one approach; decision recorded. 
- **Guardrail evaluator added (2026-04-24):** `src/validation.py::evaluate_venue_correction_holdout` now computes held-out log-loss delta and the share of baseline home-ice advantage removed by correction, with a pre-registered threshold `VENUE_CORRECTION_MAX_HOME_ICE_ADVANTAGE_REMOVAL = 0.5` and regression tests in `tests/test_validation.py`. -- **Scorecard harness added (2026-04-28, expanded 2026-05-01 and 2026-05-03):** `src/validation.py::evaluate_venue_correction_scorecard` combines held-out log-loss, home-ice over-correction, distance/location residual z-score, and event-frequency residual z-score gates; it now optionally evaluates residual gates using rolling venue-regime diagnostics instead of a blunt max-z veto. `scripts/export_venue_correction_validation.py` exports a Markdown artifact from a metrics JSON payload. -- **Live DB runner added and executed (refreshed 2026-05-01; regime-aware path added 2026-05-03):** `scripts/export_venue_correction_validation_from_db.py` builds leakage-safe temporal CV metrics from SQLite by applying only the latest prior-season venue distance adjustment to each held-out shot, using the same model-training contract as `load_training_shot_events`. It now also computes normalized event-frequency diagnostics by venue-season, event group, and game-type scope. The primary frequency gate uses sample-adequate regular-season training attempts; blocked-shot and all-attempt frequencies are diagnostic only. The runner now attaches rolling venue-regime diagnostics: prior-only rolling estimates are production-safe, centered rolling estimates are explicitly diagnostic, and supported temporary or persistent regimes are reportable rather than automatically blocking. It writes `artifacts/venue_correction_validation_latest.md`. 
Current committed result remains the 2026-05-01 max-z artifact: log-loss and home-ice gates pass, corrected-distance residuals fail (`max |z| = 4.067`), and event-frequency residuals fail (`max |z| = 3.572`), so the current correction policy remains exploratory until the regime-aware scorecard is rerun. +- **Scorecard harness added (2026-04-28, expanded 2026-05-01, 2026-05-03, and 2026-05-05):** `src/validation.py::evaluate_venue_correction_scorecard` combines held-out log-loss, home-ice over-correction, distance/location residual z-score, and event-frequency residual z-score gates; when rolling venue-regime diagnostics are supplied, it now evaluates the residual gates with them instead of applying a blunt max-z veto. `scripts/export_venue_correction_validation.py` exports a Markdown artifact from a metrics JSON payload and reports both event-frequency and distance-location paired diagnostics. +- **Live DB runner added and executed (refreshed 2026-05-01; regime-aware run executed 2026-05-03; paired distance run executed 2026-05-05):** `scripts/export_venue_correction_validation_from_db.py` builds leakage-safe temporal CV metrics from SQLite by applying only the latest prior-season venue distance adjustment to each held-out shot, using the same model-training contract as `load_training_shot_events`. It now computes normalized event-frequency diagnostics by venue-season, event group, and game-type scope plus paired distance-location diagnostics from in-memory prior-corrected distances. The distance diagnostic compares visiting-team shots at each venue against those same teams' away shots elsewhere in the same season, stratified by `shot_type` and `manpower_state`. The primary frequency gate uses sample-adequate regular-season training attempts; blocked-shot and all-attempt frequencies are diagnostic only.
The runner attaches rolling venue-regime diagnostics: prior-only rolling estimates are production-safe, centered rolling estimates are explicitly diagnostic, and supported temporary or persistent regimes are reportable rather than automatically blocking. It writes `artifacts/venue_correction_validation_latest.md`. Current committed result is the 2026-05-05 regime-aware paired-distance artifact: log-loss and home-ice gates pass, but corrected-distance residuals still fail with 10 blocking regimes and event-frequency residuals still fail with 5 blocking regimes, so the current correction policy remains exploratory. +- **Next blocker triage:** start with the largest distance/location `unexplained_or_confounded` residuals and the event-frequency blockers. Each blocker must either gain leakage-safe supporting evidence, motivate a correction-policy improvement, or remain a true rejection; labels should not be relaxed merely to pass the scorecard. Acceptance: -- ✅ Held-out log-loss does not worsen after applying correction (live DB scorecard `delta = -0.000017`). -- ❌ Distance/location residual gate has not yet passed. Under the original max-z policy, every sample-adequate corrected-distance venue-season must satisfy `|z| < 2` (live DB scorecard `max |z| = 4.067`). Under the new regime-aware policy, residuals above threshold are accepted only if classified as supported `persistent_bias` or `temporary_supported_regime`; `unexplained_or_confounded`, `population_shift_detected`, and `insufficient_evidence` rows remain blocking. -- ❌ Event-frequency residual gate has not yet passed. Under the original max-z policy, every sample-adequate regular-season training-attempt venue-season must satisfy `|z| < 2` (live DB scorecard `max |z| = 3.572`). Under the new regime-aware policy, supported scorer regimes can be reported as non-blocking, while unexplained/confounded frequency residuals remain blocking. 
Frequency diagnostics also report blocked-shot and all-attempt event groups as non-blocking diagnostics. +- ✅ Held-out log-loss does not worsen after applying correction (live DB scorecard `delta = -0.000015`). +- ❌ Distance/location residual gate has not yet passed. Under the regime-aware policy, `|z| >= 2` marks a candidate residual rather than an automatic veto. Candidate residuals are accepted only if classified as supported `persistent_bias` or `temporary_supported_regime`; `unexplained_or_confounded`, `population_shift_detected`, and `insufficient_evidence` rows remain blocking. Same-season paired/stratified visiting-team distance evidence can support temporary regimes when sample adequacy, same-direction bootstrap CI, and effect-size gates pass. Current live result: 10 blocking regimes, 18 supported regimes, `max |z| = 4.067`. +- ❌ Event-frequency residual gate has not yet passed. Under the regime-aware policy, supported scorer regimes can be reported as non-blocking, while unexplained/confounded frequency residuals remain blocking. Frequency diagnostics also report blocked-shot and all-attempt event groups as non-blocking diagnostics. Current live result: 5 blocking regimes, 22 supported regimes, `max |z| = 3.572`. - ✅ Guardrail test (pre-registered): correction must not eliminate > 50% of the home-ice goal-rate advantage (live DB scorecard `removed = -0.013`, limit 0.500). 
### 2.5.5 Pre-2009 data-quality triage diff --git a/knowledge_base/index.md b/knowledge_base/index.md index 30608c5..08e904f 100644 --- a/knowledge_base/index.md +++ b/knowledge_base/index.md @@ -1,6 +1,6 @@ # Knowledge Base Index -> Last updated: 2026-05-03 (Added rolling venue-regime diagnostics for venue scorekeeper bias) +> Last updated: 2026-05-05 (Added paired/stratified distance-location venue-regime evidence) ## Concepts diff --git a/knowledge_base/log.md b/knowledge_base/log.md index 002e112..28c2b7e 100644 --- a/knowledge_base/log.md +++ b/knowledge_base/log.md @@ -278,3 +278,21 @@ - Updated `wiki/concepts/venue-scorekeeper-bias.md` - documented prior-only rolling estimates, centered exploratory diagnostics, regime classifications, and regime-aware scorecard acceptance semantics. - Updated `index.md` - refreshed Last updated summary. **Notes:** The venue-correction scorecard can now distinguish blocking unexplained/confounded residuals from supported persistent or temporary scorekeeper regimes. The committed live scorecard artifact is still the 2026-05-01 max-z result; venue correction remains exploratory until the DB-backed scorecard is rerun with the new regime-aware diagnostics and passes all hard gates. + +### 2026-05-04 - UPDATE + +**Action:** Recorded the executed live regime-aware venue-correction scorecard result +**Source:** `artifacts/venue_correction_validation_latest.md`, `docs/xg_model_roadmap.md`, `docs/xg_model_components/04_scorekeeper_bias.md`, `knowledge_base/wiki/concepts/venue-scorekeeper-bias.md` +**Pages touched:** +- Updated `wiki/concepts/venue-scorekeeper-bias.md` - documented that the live scorecard now runs in `regime_aware` mode and still fails because blocking unexplained/confounded residuals remain. +- Updated `index.md` - refreshed Last updated summary. +**Notes:** This supersedes the 2026-05-03 note that the committed artifact still needed a regime-aware rerun. 
The current live artifact treats `|z| >= 2` as a candidate residual rather than an automatic veto; supported `persistent_bias` and `temporary_supported_regime` rows are non-blocking, while `unexplained_or_confounded`, `population_shift_detected`, and `insufficient_evidence` remain blocking. The latest result passes held-out log-loss and home-ice guardrails but fails distance/location residuals (24 blocking regimes, 4 supported regimes) and event-frequency residuals (4 blocking regimes, 23 supported regimes), so venue correction remains exploratory. + +### 2026-05-05 - UPDATE + +**Action:** Added paired/stratified distance-location venue-regime evidence +**Source:** `src/venue_bias.py`, `scripts/export_venue_correction_validation_from_db.py`, `scripts/export_venue_correction_validation.py`, `tests/test_venue_bias.py`, `tests/test_venue_correction_validation_export.py`, `tests/test_venue_correction_validation_from_db.py`, `artifacts/venue_correction_validation_latest.md`, `docs/xg_model_roadmap.md`, `docs/xg_model_components/04_scorekeeper_bias.md` +**Pages touched:** +- Updated `wiki/concepts/venue-scorekeeper-bias.md` - documented paired visiting-team distance comparisons, shot-type/manpower stratification, evidence thresholds, and the refreshed live scorecard result. +- Updated `index.md` - refreshed Last updated summary. +**Notes:** The DB-backed runner now computes distance-location paired evidence from in-memory prior-corrected shot distances without mutating `shot_events` or `venue_bias_corrections`. The 2026-05-05 artifact still fails overall but reduces distance/location blockers to 10 with 18 supported regimes; event-frequency residuals show 5 blockers and 22 supported regimes. Venue correction remains exploratory until both residual gates have zero blocking regimes while log-loss and home-ice guardrails still pass. 
diff --git a/knowledge_base/wiki/concepts/venue-scorekeeper-bias.md b/knowledge_base/wiki/concepts/venue-scorekeeper-bias.md index 0defe08..defde96 100644 --- a/knowledge_base/wiki/concepts/venue-scorekeeper-bias.md +++ b/knowledge_base/wiki/concepts/venue-scorekeeper-bias.md @@ -28,6 +28,7 @@ The project's venue bias analysis (`notebooks/venue_bias_analysis.ipynb`) comput - Total shot event count per game per venue - Normalized event-frequency residuals by venue-season, event group, and game-type scope - Paired away-team-season comparisons for frequency anomalies +- Paired visiting-team distance comparisons for distance/location anomalies, stratified by shot type and manpower state - Distance-to-goal distribution per venue - Season-over-season stability of venue effects @@ -53,7 +54,7 @@ This approach corrects distance distortion without assuming the bias is a simple The project now distinguishes a raw residual spike from an accepted venue regime by adding rolling diagnostics around each venue-season residual [6]. The production-safe view is prior-only: the estimate for season `t` uses only seasons before `t`, so it can provide context for a correction without same-season or future leakage [6]. A centered rolling estimate is also computed for exploratory diagnosis of historical spikes, but it is explicitly not a production correction input because it can use future seasons [6]. -Rolling diagnostics classify candidate residual spikes as `persistent_bias`, `temporary_supported_regime`, or `unexplained_or_confounded` [6]. Persistent bias means the venue is repeatedly biased in the same direction across sample-adequate seasons. Temporary supported regimes are short spikes with adequate evidence, such as paired away-team support for event-frequency anomalies. Unexplained/confounded spikes remain blocking evidence against accepting the correction layer [6]. 
+Rolling diagnostics classify candidate residual spikes as `persistent_bias`, `temporary_supported_regime`, or `unexplained_or_confounded` [6]. Persistent bias means the venue is repeatedly biased in the same direction across sample-adequate seasons. Temporary supported regimes are short spikes with adequate evidence, such as paired away-team support for event-frequency anomalies or paired/stratified visiting-team support for distance/location anomalies. Unexplained/confounded spikes remain blocking evidence against accepting the correction layer [6]. ### Relationship to Coordinate Normalization @@ -67,15 +68,17 @@ Phase 2 also wired the per-season diagnostic populator into the scraper pipeline The initial correction layer is now implemented in `src/database.py` [5]. `populate_venue_bias_corrections(conn, season)` computes a per-venue distance adjustment toward the season league mean and shrinks it by sample size (`sample_shots / (sample_shots + prior)`), storing parameters in `venue_bias_corrections`. `finalize_season_diagnostics()` now runs both diagnostic and correction population each season [4]. At consumption time, `load_game_shots_with_venue_correction()` adds `distance_to_goal_corrected` using the persisted adjustment while preserving raw distance values [5]. -This is an implementation baseline, not final model policy. The Phase 2.5.4 scorecard harness is implemented: `evaluate_venue_correction_scorecard()` combines held-out log-loss, home-ice over-correction, distance/location residual z-score, and event-frequency residual z-score gates, while `scripts/export_venue_correction_validation.py` formats the artifact [6]. As of 2026-05-03, the scorecard can evaluate residual gates in a regime-aware mode: supported `persistent_bias` and `temporary_supported_regime` rows are non-blocking but reported, while `unexplained_or_confounded`, `population_shift_detected`, and `insufficient_evidence` rows remain blocking [6]. 
+This is an implementation baseline, not final model policy. The Phase 2.5.4 scorecard harness is implemented: `evaluate_venue_correction_scorecard()` combines held-out log-loss, home-ice over-correction, distance/location residual z-score, and event-frequency residual z-score gates, while `scripts/export_venue_correction_validation.py` formats the artifact [6]. As of 2026-05-05, the scorecard can evaluate residual gates in a regime-aware mode: supported `persistent_bias` and `temporary_supported_regime` rows are non-blocking but reported, while `unexplained_or_confounded`, `population_shift_detected`, and `insufficient_evidence` rows remain blocking [6]. The event-frequency refresh adds `src/venue_bias.py` helpers for venue-season event rates, frequency z-scores, paired away-team-season comparisons, bootstrap CIs, paired Cohen's d, known-regime priors, and anomaly classification [6]. The primary frequency gate uses sample-adequate regular-season training attempts, and sample-inadequate venue-seasons are excluded from the league mean/std baseline used to compute frequency z-scores. Blocked-shot and all-attempt frequencies are reported as diagnostics because they are important scorekeeper-bias evidence but remain outside the current shot-level xG training contract [6]. -After the v5 backfill, `scripts/export_venue_correction_validation_from_db.py` ran the live validation from SQLite using forward-chaining temporal CV and only prior-season venue distance corrections for each held-out shot [6]. The 2026-05-01 refresh uses the same tightened model-training contract as `load_training_shot_events`: schema v5, season >= 20092010, regular/playoff in-game shots, no regular-season shootouts, non-blocked target-consistent shot rows, and non-null core model features. 
The live scorecard passes held-out log-loss (`delta = -0.000017`) and the home-ice over-correction guardrail (`removed = -0.013`, limit 0.500), but fails the residual corrected-distance venue-season z-score gate (`max |z| = 4.067`, limit < 2.000; worst venue-season `20092010:Madison Square Garden`) and the sample-adequate event-frequency residual gate (`max |z| = 3.572`, limit < 2.000; worst venue-season `20112012:Prudential Center`) under the original max-z policy [6]. The current shrinkage distance correction therefore remains exploratory and should not feed production xG training until a fresh regime-aware run shows no blocking unexplained/confounded residuals, held-out log-loss remains non-worse, and the home-ice guardrail still passes [2][6]. +The distance-location refresh adds paired/stratified evidence for corrected-distance residuals [6]. For each venue-season, the diagnostic uses visiting-team shots only, then compares each visiting team's corrected shot distance at that venue with that same team's away shots at other venues in the same season. Comparisons are stratified by `shot_type` and `manpower_state`, and only strata present on both sides contribute. A distance candidate is supported only when at least 10 paired visiting team-seasons are available, the bootstrap CI excludes zero in the same direction as the residual z-score, and `|Cohen's d| >= 0.2` [6]. + +After the v5 backfill, `scripts/export_venue_correction_validation_from_db.py` ran the live validation from SQLite using forward-chaining temporal CV and only prior-season venue distance corrections for each held-out shot [6]. The 2026-05-05 refresh uses the same tightened model-training contract as `load_training_shot_events`: schema v5, season >= 20092010, regular/playoff in-game shots, no regular-season shootouts, non-blocked target-consistent shot rows, and non-null core model features [6]. 
The live scorecard now uses the regime-aware residual policy: `|z| >= 2` marks a candidate residual, supported `persistent_bias` and `temporary_supported_regime` rows are non-blocking, and `unexplained_or_confounded`, `population_shift_detected`, and `insufficient_evidence` rows remain blocking [6]. The scorecard passes held-out log-loss (`delta = -0.000015`) and the home-ice over-correction guardrail (`removed = -0.013`, limit 0.500), but fails distance/location residuals (10 blocking regimes, 18 supported regimes, `max |z| = 4.067`) and sample-adequate event-frequency residuals (5 blocking regimes, 22 supported regimes, `max |z| = 3.572`) [6]. The current shrinkage distance correction therefore remains exploratory and should not feed production xG training until both residual gates have zero blocking regimes, held-out log-loss remains non-worse, and the home-ice guardrail still passes [2][6]. The venue bias analysis was particularly sensitive to the v2 coordinate normalization bug. Pre-2020 data with ~50% unnormalized coordinates would have produced spurious venue effects that were actually normalization failures. The current v5 refresh resolves the stale-schema blocker, but venue-level coordinate analyses should still be re-derived after any future coordinate-normalization or correction-policy change. -Last verified: 2026-05-03 +Last verified: 2026-05-05 ## Sources @@ -97,7 +100,9 @@ Last verified: 2026-05-03 ## Revision History -- 2026-05-03 - Added rolling venue-regime diagnostics, regime-aware scorecard acceptance semantics, and the requirement for a fresh live regime-aware scorecard before venue correction can feed production xG training. +- 2026-05-05 - Added paired/stratified distance-location residual evidence and refreshed the live regime-aware scorecard result. +- 2026-05-04 - Recorded the live regime-aware scorecard result and clarified that supported residuals are non-blocking while unexplained/confounded residuals keep venue correction exploratory. 
+- 2026-05-03 - Added rolling venue-regime diagnostics and regime-aware scorecard acceptance semantics. - 2026-05-01 - Added event-frequency scorekeeper diagnostics, anomaly classification, sample-adequate z-score baselines, and the refreshed live scorecard result with separate distance/location and event-frequency residual gates. - 2026-04-28 - Recorded the live v5 DB-backed venue-correction validation result: log-loss and home-ice guardrails pass, residual corrected-distance z-score fails. - 2026-04-28 - Added Phase 2.5.4 scorecard harness status and source references for held-out/log-loss, home-ice guardrail, and residual z-score validation gates. diff --git a/scripts/export_venue_correction_validation.py b/scripts/export_venue_correction_validation.py index 3fb66d8..d8cc38d 100644 --- a/scripts/export_venue_correction_validation.py +++ b/scripts/export_venue_correction_validation.py @@ -108,6 +108,9 @@ def evaluate_payload(payload: dict[str, Any]) -> dict[str, Any]: "event_frequency_primary_scope", "event_frequency_primary_group", "distance_top_regime_diagnostics", + "distance_top_paired_diagnostics", + "distance_location_candidate_count", + "distance_location_supported_count", "event_frequency_top_regime_diagnostics", ) for metadata_field in metadata_fields: @@ -172,6 +175,7 @@ def format_scorecard(metrics: dict[str, Any]) -> str: f"- Worst event-frequency residual: " f"`{metrics['worst_event_frequency_residual_venue']}`\n" f"{_format_venue_regime_diagnostics(metrics)}" + f"{_format_distance_location_paired_diagnostics(metrics)}" f"{_format_event_frequency_anomalies(metrics)}" f"{_format_notes(notes)}" ) @@ -284,6 +288,46 @@ def _format_event_frequency_anomalies(metrics: dict[str, Any]) -> str: return text + "\n".join(lines) + "\n" +def _format_distance_location_paired_diagnostics(metrics: dict[str, Any]) -> str: + top_rows = metrics.get("distance_top_paired_diagnostics") or [] + candidate_count = metrics.get("distance_location_candidate_count", 0) + 
supported_count = metrics.get("distance_location_supported_count", 0) + text = ( + "\n## Distance-Location Paired Diagnostics\n\n" + "- Primary distance gate: venue-season corrected-distance residuals " + "with visiting-team paired evidence stratified by shot type and manpower state.\n\n" + f"- Candidate distance residuals: {candidate_count:,}\n" + f"- Supported paired distance regimes: {supported_count:,}\n" + ) + if not top_rows: + return text + "\nNo distance-location paired diagnostics exceeded the reporting threshold.\n" + + lines = [ + "", + "| Venue-season | z | Paired diff | 95% CI | d | Pairs | Evidence | " + "Evidence classification | Regime classification |", + "|--------------|---|-------------|--------|---|-------|----------|" + "-------------------------|-----------------------|", + ] + for row in top_rows: + ci_text = ( + f"[{_format_optional_float(row.get('paired_bootstrap_ci_low'))}, " + f"{_format_optional_float(row.get('paired_bootstrap_ci_high'))}]" + ) + evidence = "YES" if row.get("evidence_supports_regime") else "NO" + lines.append( + f"| `{row['season']}:{row['venue_name']}` | " + f"{float(row['residual_z_score']):.3f} | " + f"{_format_optional_float(row.get('paired_mean_diff_distance'))} | " + f"{ci_text} | {_format_optional_float(row.get('paired_cohens_d'))} | " + f"{int(row.get('paired_away_team_seasons', 0)):,} | " + f"{evidence} | " + f"`{row.get('distance_location_evidence_classification', 'n/a')}` | " + f"`{row.get('regime_classification', 'n/a')}` |" + ) + return text + "\n".join(lines) + "\n" + + def _format_optional_float(value: Any) -> str: if value is None: return "n/a" diff --git a/scripts/export_venue_correction_validation_from_db.py b/scripts/export_venue_correction_validation_from_db.py index 06cca5c..6ea9845 100644 --- a/scripts/export_venue_correction_validation_from_db.py +++ b/scripts/export_venue_correction_validation_from_db.py @@ -14,7 +14,7 @@ import time from collections import defaultdict from pathlib import Path 
-from typing import Any +from typing import Any, Mapping, Sequence import numpy as np from sklearn.linear_model import LogisticRegression @@ -46,7 +46,10 @@ DEFAULT_OUTPUT_PATH, format_scorecard, ) -from validation import MIN_TRAIN_SEASONS, evaluate_venue_correction_scorecard # noqa: E402 +from validation import ( # noqa: E402 + MIN_TRAIN_SEASONS, + evaluate_venue_correction_scorecard, +) from venue_bias import ( # noqa: E402 ANOMALY_REAL_SCOREKEEPER_REGIME_SUPPORTED, EVENT_FREQUENCY_GROUP_ALL_ATTEMPTS, @@ -58,13 +61,16 @@ EVENT_FREQUENCY_SCOPES, PRIMARY_EVENT_FREQUENCY_GROUP, PRIMARY_EVENT_FREQUENCY_SCOPE, + annotate_distance_location_regime_evidence, annotate_event_frequency_anomalies, classify_rolling_venue_regimes, compute_event_frequency_diagnostics, + compute_paired_away_distance_location_comparisons, compute_paired_away_frequency_comparisons, primary_event_frequency_residual_z_scores, primary_event_frequency_regime_diagnostics, residual_z_score_rows, + top_distance_location_paired_diagnostics, top_event_frequency_anomalies, top_venue_regime_diagnostics, VENUE_REGIME_METRIC_DISTANCE, @@ -105,8 +111,8 @@ def _load_training_rows(conn: sqlite3.Connection) -> list[sqlite3.Row]: cursor = conn.cursor() cursor.execute( """SELECT se.is_goal, se.distance_to_goal, se.angle_to_goal, - se.shot_type, se.shooting_team_id, - g.season, g.venue_name, g.home_team_id + se.shot_type, se.manpower_state, se.shooting_team_id, + g.season, g.venue_name, g.home_team_id, g.away_team_id FROM shot_events se JOIN games g ON se.game_id = g.game_id WHERE se.event_schema_version = ? @@ -118,11 +124,13 @@ def _load_training_rows(conn: sqlite3.Connection) -> list[sqlite3.Row]: AND se.period >= ? ) AND g.venue_name IS NOT NULL + AND g.away_team_id IS NOT NULL AND se.distance_to_goal IS NOT NULL AND se.angle_to_goal IS NOT NULL AND se.shot_type IS NOT NULL AND se.manpower_state IS NOT NULL AND se.score_state IS NOT NULL + AND se.shooting_team_id IS NOT NULL AND ( (se.shot_event_type = ? 
AND se.is_goal = 1) OR (se.shot_event_type IN (?, ?) AND se.is_goal = 0) @@ -416,6 +424,29 @@ def _compute_residual_distance_z_scores( return result +def _build_distance_location_shot_rows( + rows: Sequence[Mapping[str, Any]], + corrected_distances: Sequence[float], +) -> list[dict[str, Any]]: + if len(rows) != len(corrected_distances): + raise ValueError("rows and corrected_distances must have equal length.") + + shot_rows: list[dict[str, Any]] = [] + for row, corrected_distance in zip(rows, corrected_distances): + shot_rows.append( + { + "season": str(row["season"]), + "venue_name": str(row["venue_name"]), + "shooting_team_id": int(row["shooting_team_id"]), + "away_team_id": int(row["away_team_id"]), + "shot_type": str(row["shot_type"]), + "manpower_state": str(row["manpower_state"]), + "corrected_distance_to_goal": float(corrected_distance), + } + ) + return shot_rows + + def build_metrics(conn: sqlite3.Connection, correction_method: str) -> dict[str, Any]: run_started_at = time.monotonic() _progress("Loading training rows.", run_started_at) @@ -486,6 +517,17 @@ def build_metrics(conn: sqlite3.Connection, correction_method: str) -> dict[str, run_started_at, ) + _progress("Computing paired distance-location diagnostics.", run_started_at) + distance_shot_rows = _build_distance_location_shot_rows(rows, corrected_distances) + paired_distance = compute_paired_away_distance_location_comparisons( + distance_shot_rows + ) + _progress( + f"Computed paired distance-location diagnostics for " + f"{len(paired_distance):,} venue-seasons.", + run_started_at, + ) + _progress("Building baseline and corrected feature matrices.", run_started_at) X_baseline = _build_feature_matrix(distances, angles, shot_types) X_corrected = _build_feature_matrix(corrected_distances, angles, shot_types) @@ -521,11 +563,16 @@ def build_metrics(conn: sqlite3.Connection, correction_method: str) -> dict[str, f"{len(residual_z_scores):,} residual venue-season z-scores.", run_started_at, ) + 
distance_residual_rows = residual_z_score_rows( + residual_z_scores, + VENUE_REGIME_METRIC_DISTANCE, + ) + annotated_distance = annotate_distance_location_regime_evidence( + distance_residual_rows, + paired_distance, + ) distance_regime_diagnostics = classify_rolling_venue_regimes( - residual_z_score_rows( - residual_z_scores, - VENUE_REGIME_METRIC_DISTANCE, - ) + annotated_distance ) frequency_regime_diagnostics = primary_event_frequency_regime_diagnostics( annotated_frequency @@ -550,14 +597,27 @@ def build_metrics(conn: sqlite3.Connection, correction_method: str) -> dict[str, "Each shot uses the latest venue distance adjustment from a season before " "the shot's season; same-season venue corrections are not used for holdout " "rows. Distance residual z-scores are venue-season corrected-distance mean " - "z-scores. Rolling venue-regime diagnostics use prior-only rolling " - "estimates for production-safe context and centered rolling estimates only " - "for exploratory historical-spike labeling. Event-frequency residual " - "z-scores use sample-adequate regular-season training attempts as the " - "primary gate; blocked-shot and all-attempt frequencies are reported as " - "diagnostics and remain outside the current shot-level xG training " - "contract." + "z-scores. Distance/location candidates are annotated with paired " + "visiting-team evidence stratified by shot type and manpower state; " + "this diagnostic uses the in-memory prior-corrected distances and does " + "not mutate shot_events or venue_bias_corrections. Rolling venue-regime " + "diagnostics use prior-only rolling estimates for production-safe " + "context and centered rolling estimates only for exploratory " + "historical-spike labeling. Event-frequency residual z-scores use " + "sample-adequate regular-season training attempts as the primary gate; " + "blocked-shot and all-attempt frequencies are reported as diagnostics " + "and remain outside the current shot-level xG training contract." 
) + distance_candidates = [ + row for row in distance_regime_diagnostics + if row.get("candidate_regime") + ] + supported_distance_regimes = [ + row for row in distance_candidates + if row.get("evidence_supports_regime") + ] + metrics["distance_location_candidate_count"] = len(distance_candidates) + metrics["distance_location_supported_count"] = len(supported_distance_regimes) metrics["event_frequency_primary_scope"] = PRIMARY_EVENT_FREQUENCY_SCOPE metrics["event_frequency_primary_group"] = PRIMARY_EVENT_FREQUENCY_GROUP metrics["event_frequency_candidate_count"] = len(frequency_candidates) @@ -570,6 +630,12 @@ def build_metrics(conn: sqlite3.Connection, correction_method: str) -> dict[str, distance_regime_diagnostics, limit=EVENT_FREQUENCY_REPORT_LIMIT, ) + metrics["distance_top_paired_diagnostics"] = ( + top_distance_location_paired_diagnostics( + distance_regime_diagnostics, + limit=EVENT_FREQUENCY_REPORT_LIMIT, + ) + ) metrics["event_frequency_top_regime_diagnostics"] = ( top_venue_regime_diagnostics( frequency_regime_diagnostics, diff --git a/src/venue_bias.py b/src/venue_bias.py index 5e01cf7..04f7cb8 100644 --- a/src/venue_bias.py +++ b/src/venue_bias.py @@ -35,6 +35,12 @@ EVENT_FREQUENCY_BOOTSTRAP_SAMPLES = 10_000 EVENT_FREQUENCY_BOOTSTRAP_ALPHA = 0.05 EVENT_FREQUENCY_BOOTSTRAP_SEED = 42 +DISTANCE_LOCATION_VALUE_FIELD = "corrected_distance_to_goal" +DISTANCE_LOCATION_STRATIFY_FIELDS = ("shot_type", "manpower_state") +DISTANCE_LOCATION_MIN_PAIRED_TEAM_SEASONS = ( + EVENT_FREQUENCY_MIN_PAIRED_TEAM_SEASONS +) +DISTANCE_LOCATION_MIN_ABS_COHENS_D = EVENT_FREQUENCY_MIN_ABS_COHENS_D VENUE_REGIME_METRIC_DISTANCE = "distance_location" VENUE_REGIME_METRIC_EVENT_FREQUENCY = "event_frequency" @@ -245,6 +251,127 @@ def annotate_event_frequency_anomalies( return annotated +def compute_paired_away_distance_location_comparisons( + shot_rows: Sequence[Mapping[str, Any]], +) -> list[dict[str, Any]]: + """Compare visiting-team corrected distances at a venue against elsewhere. 
+ + The comparison controls for visitor team-season and shot mix by matching on + ``shot_type`` and ``manpower_state``. Each paired diff is one visiting + team's weighted mean at-venue-minus-elsewhere corrected distance for the + same season. + """ + rows_by_season: dict[str, list[dict[str, Any]]] = defaultdict(list) + for row in shot_rows: + if not _is_visiting_team_shot(row): + continue + distance = _finite_float_or_none(row.get(DISTANCE_LOCATION_VALUE_FIELD)) + if distance is None: + continue + stratum = _distance_location_stratum(row) + if stratum is None: + continue + rows_by_season[str(row["season"])].append( + { + "venue_name": str(row["venue_name"]), + "away_team_id": int(row["away_team_id"]), + "stratum": stratum, + "distance": distance, + } + ) + + comparisons: list[dict[str, Any]] = [] + for season, rows in rows_by_season.items(): + venues = sorted({row["venue_name"] for row in rows}) + all_stats: dict[tuple[int, tuple[str, ...]], list[float]] = defaultdict( + lambda: [0.0, 0.0] + ) + venue_stats: dict[tuple[str, int, tuple[str, ...]], list[float]] = defaultdict( + lambda: [0.0, 0.0] + ) + for row in rows: + team_id = int(row["away_team_id"]) + stratum = tuple(row["stratum"]) + distance = float(row["distance"]) + all_values = all_stats[(team_id, stratum)] + all_values[0] += distance + all_values[1] += 1.0 + venue_values = venue_stats[(row["venue_name"], team_id, stratum)] + venue_values[0] += distance + venue_values[1] += 1.0 + + for venue_name in venues: + team_weighted_diffs: dict[int, list[float]] = defaultdict( + lambda: [0.0, 0.0] + ) + for ( + row_venue, + team_id, + stratum, + ), at_values in venue_stats.items(): + if row_venue != venue_name: + continue + total_values = all_stats[(team_id, stratum)] + elsewhere_count = total_values[1] - at_values[1] + if elsewhere_count <= 0: + continue + at_mean = at_values[0] / at_values[1] + elsewhere_mean = ( + total_values[0] - at_values[0] + ) / elsewhere_count + at_count = at_values[1] + diff_values = 
team_weighted_diffs[team_id] + diff_values[0] += (at_mean - elsewhere_mean) * at_count + diff_values[1] += at_count + + diffs = [ + weighted_sum / weight + for weighted_sum, weight in team_weighted_diffs.values() + if weight > 0 + ] + comparisons.append( + { + "season": season, + "venue_name": venue_name, + **_summarize_paired_distance_diffs(diffs), + } + ) + + return sorted( + comparisons, + key=lambda item: ( + item["season"], + item["venue_name"], + ), + ) + + +def annotate_distance_location_regime_evidence( + residual_rows: Sequence[Mapping[str, Any]], + paired_comparisons: Sequence[Mapping[str, Any]], +) -> list[dict[str, Any]]: + """Attach paired distance-location support to residual regime rows.""" + comparison_lookup = { + _distance_location_comparison_key(row): row + for row in paired_comparisons + } + annotated: list[dict[str, Any]] = [] + for row in residual_rows: + item = dict(row) + comparison = comparison_lookup.get(_distance_location_comparison_key(row), {}) + item.update(_distance_location_comparison_fields(comparison)) + paired_supports_regime = _paired_distance_evidence_supports_residual(item) + item["evidence_supports_regime"] = bool( + item.get("evidence_supports_regime", False) + or paired_supports_regime + ) + item["distance_location_evidence_classification"] = ( + _classify_distance_location_evidence(item) + ) + annotated.append(item) + return _sort_regime_rows(annotated) + + def primary_event_frequency_residual_z_scores( annotated_diagnostics: Sequence[Mapping[str, Any]], ) -> dict[str, float]: @@ -558,7 +685,56 @@ def top_event_frequency_anomalies( ] -def _summarize_paired_diffs(diffs: Iterable[float]) -> dict[str, Any]: +def top_distance_location_paired_diagnostics( + regime_diagnostics: Sequence[Mapping[str, Any]], + limit: int = 10, + candidates_only: bool = True, +) -> list[dict[str, Any]]: + """Return largest distance residuals with paired evidence details.""" + rows = [ + dict(row) + for row in regime_diagnostics + if 
row.get("metric_name") == VENUE_REGIME_METRIC_DISTANCE + and _finite_float_or_none(row.get(VENUE_REGIME_RESIDUAL_FIELD)) is not None + and (not candidates_only or row.get("candidate_regime")) + ] + rows.sort( + key=lambda row: abs(float(row[VENUE_REGIME_RESIDUAL_FIELD])), + reverse=True, + ) + return [ + { + "season": row["season"], + "venue_name": row["venue_name"], + VENUE_REGIME_RESIDUAL_FIELD: row[VENUE_REGIME_RESIDUAL_FIELD], + "regime_classification": row.get( + "regime_classification", + VENUE_REGIME_NOT_FLAGGED, + ), + "evidence_supports_regime": row.get( + "evidence_supports_regime", + False, + ), + "paired_away_team_seasons": row.get("paired_away_team_seasons", 0), + "paired_mean_diff_distance": row.get("paired_mean_diff_distance"), + "paired_bootstrap_ci_low": row.get("paired_bootstrap_ci_low"), + "paired_bootstrap_ci_high": row.get("paired_bootstrap_ci_high"), + "paired_wilcoxon_p_value": row.get("paired_wilcoxon_p_value"), + "paired_cohens_d": row.get("paired_cohens_d"), + "paired_sample_adequate": row.get("paired_sample_adequate", False), + "distance_location_evidence_classification": row.get( + "distance_location_evidence_classification", + ANOMALY_NOT_FLAGGED, + ), + } + for row in rows[:limit] + ] + + +def _summarize_paired_diffs( + diffs: Iterable[float], + min_paired_team_seasons: int = EVENT_FREQUENCY_MIN_PAIRED_TEAM_SEASONS, +) -> dict[str, Any]: diff_array = np.asarray(list(diffs), dtype=float) diff_array = diff_array[np.isfinite(diff_array)] n_pairs = int(len(diff_array)) @@ -593,11 +769,27 @@ def _summarize_paired_diffs(diffs: Iterable[float]) -> dict[str, Any]: "paired_wilcoxon_p_value": p_value, "paired_cohens_d": float(cohens_d), "paired_sample_adequate": bool( - n_pairs >= EVENT_FREQUENCY_MIN_PAIRED_TEAM_SEASONS + n_pairs >= min_paired_team_seasons ), } +def _summarize_paired_distance_diffs(diffs: Iterable[float]) -> dict[str, Any]: + summary = _summarize_paired_diffs( + diffs, + 
min_paired_team_seasons=DISTANCE_LOCATION_MIN_PAIRED_TEAM_SEASONS, + ) + return { + "paired_away_team_seasons": summary["paired_away_team_seasons"], + "paired_mean_diff_distance": summary["paired_mean_diff_per_game"], + "paired_bootstrap_ci_low": summary["paired_bootstrap_ci_low"], + "paired_bootstrap_ci_high": summary["paired_bootstrap_ci_high"], + "paired_wilcoxon_p_value": summary["paired_wilcoxon_p_value"], + "paired_cohens_d": summary["paired_cohens_d"], + "paired_sample_adequate": summary["paired_sample_adequate"], + } + + def _bootstrap_mean_ci(values: np.ndarray) -> tuple[float, float]: if len(values) == 1: only_value = float(values[0]) @@ -646,6 +838,30 @@ def _prefixed_comparison_fields(comparison: Mapping[str, Any]) -> dict[str, Any] } +def _distance_location_comparison_fields( + comparison: Mapping[str, Any], +) -> dict[str, Any]: + if not comparison: + return { + "paired_away_team_seasons": 0, + "paired_mean_diff_distance": None, + "paired_bootstrap_ci_low": None, + "paired_bootstrap_ci_high": None, + "paired_wilcoxon_p_value": None, + "paired_cohens_d": None, + "paired_sample_adequate": False, + } + return { + "paired_away_team_seasons": comparison["paired_away_team_seasons"], + "paired_mean_diff_distance": comparison["paired_mean_diff_distance"], + "paired_bootstrap_ci_low": comparison["paired_bootstrap_ci_low"], + "paired_bootstrap_ci_high": comparison["paired_bootstrap_ci_high"], + "paired_wilcoxon_p_value": comparison["paired_wilcoxon_p_value"], + "paired_cohens_d": comparison["paired_cohens_d"], + "paired_sample_adequate": comparison["paired_sample_adequate"], + } + + def _is_candidate_frequency_anomaly( z_score: Any, z_score_threshold: float, @@ -672,19 +888,81 @@ def _classify_event_frequency_anomaly(row: Mapping[str, Any]) -> str: def _paired_evidence_supports_z_score(row: Mapping[str, Any]) -> bool: - ci_low = row.get("paired_bootstrap_ci_low") - ci_high = row.get("paired_bootstrap_ci_high") - cohens_d = row.get("paired_cohens_d") - z_score = 
float(row["frequency_z_score"]) + return _paired_evidence_supports_direction( + row["frequency_z_score"], + row.get("paired_bootstrap_ci_low"), + row.get("paired_bootstrap_ci_high"), + row.get("paired_cohens_d"), + min_abs_cohens_d=EVENT_FREQUENCY_MIN_ABS_COHENS_D, + ) + + +def _paired_distance_evidence_supports_residual(row: Mapping[str, Any]) -> bool: + if not row.get("paired_sample_adequate", False): + return False + return _paired_evidence_supports_direction( + row[VENUE_REGIME_RESIDUAL_FIELD], + row.get("paired_bootstrap_ci_low"), + row.get("paired_bootstrap_ci_high"), + row.get("paired_cohens_d"), + min_abs_cohens_d=DISTANCE_LOCATION_MIN_ABS_COHENS_D, + ) + + +def _paired_evidence_supports_direction( + z_score_value: Any, + ci_low: Any, + ci_high: Any, + cohens_d: Any, + min_abs_cohens_d: float, +) -> bool: + z_score = float(z_score_value) if ci_low is None or ci_high is None or cohens_d is None: return False - if abs(float(cohens_d)) < EVENT_FREQUENCY_MIN_ABS_COHENS_D: + if abs(float(cohens_d)) < min_abs_cohens_d: return False if z_score > 0: return bool(float(ci_low) > 0) return bool(float(ci_high) < 0) +def _classify_distance_location_evidence(row: Mapping[str, Any]) -> str: + z_score = row.get(VENUE_REGIME_RESIDUAL_FIELD) + if z_score is None or not np.isfinite(float(z_score)): + return ANOMALY_CALCULATION_ERROR_SUSPECTED + if abs(float(z_score)) < EVENT_FREQUENCY_Z_SCORE_THRESHOLD: + return ANOMALY_NOT_FLAGGED + if not row.get("paired_sample_adequate", False): + return ANOMALY_INSUFFICIENT_EVIDENCE + if _paired_distance_evidence_supports_residual(row): + return ANOMALY_REAL_SCOREKEEPER_REGIME_SUPPORTED + return ANOMALY_HOCKEY_CONTEXT_CONFOUNDED + + +def _distance_location_comparison_key(row: Mapping[str, Any]) -> tuple[str, str]: + return ( + str(row["season"]), + str(row["venue_name"]), + ) + + +def _is_visiting_team_shot(row: Mapping[str, Any]) -> bool: + try: + return bool(int(row["shooting_team_id"]) == int(row["away_team_id"])) + except (TypeError, 
ValueError): + return False + + +def _distance_location_stratum(row: Mapping[str, Any]) -> tuple[str, ...] | None: + values = [] + for field in DISTANCE_LOCATION_STRATIFY_FIELDS: + value = row.get(field) + if value is None: + return None + values.append(str(value)) + return tuple(values) + + def _split_venue_season_label(label: str) -> tuple[str, str]: if ":" not in label: raise ValueError("Residual labels must use the 'season:venue' format.") diff --git a/tests/test_venue_bias.py b/tests/test_venue_bias.py index baa41f9..f2aa6da 100644 --- a/tests/test_venue_bias.py +++ b/tests/test_venue_bias.py @@ -17,14 +17,17 @@ VENUE_REGIME_POPULATION_SHIFT, VENUE_REGIME_TEMPORARY_SUPPORTED, VENUE_REGIME_UNEXPLAINED_OR_CONFOUNDED, + annotate_distance_location_regime_evidence, annotate_event_frequency_anomalies, classify_rolling_venue_regimes, compute_centered_rolling_bias_estimates, compute_event_frequency_diagnostics, + compute_paired_away_distance_location_comparisons, compute_paired_away_frequency_comparisons, compute_prior_rolling_bias_estimates, primary_event_frequency_residual_z_scores, primary_event_frequency_regime_diagnostics, + residual_z_score_rows, ) @@ -52,6 +55,28 @@ def _game_row( } +def _distance_shot_row( + venue_name, + away_team_id, + corrected_distance_to_goal, + shot_type="wrist", + manpower_state="5v5", + season="20202021", + shooting_team_id=None, +): + if shooting_team_id is None: + shooting_team_id = away_team_id + return { + "season": season, + "venue_name": venue_name, + "shooting_team_id": shooting_team_id, + "away_team_id": away_team_id, + "shot_type": shot_type, + "manpower_state": manpower_state, + "corrected_distance_to_goal": corrected_distance_to_goal, + } + + def test_event_frequency_diagnostics_compute_rates_and_z_scores(): rows = [] for idx in range(20): @@ -153,6 +178,230 @@ def test_paired_away_frequency_comparison_controls_for_visitor_team_season(): assert msg["paired_sample_adequate"] is True +def 
test_distance_location_paired_evidence_supports_positive_residual(): + rows = [] + for offset, team_id in enumerate(range(10, 20)): + diff = 3.0 + (offset * 0.2) + rows.append(_distance_shot_row("Arena A", team_id, 16.0 + diff)) + rows.append(_distance_shot_row("Arena B", team_id, 16.0)) + rows.append(_distance_shot_row("Arena C", team_id, 16.0)) + + comparisons = compute_paired_away_distance_location_comparisons(rows) + residual_rows = residual_z_score_rows( + { + "20202021:Arena A": 2.5, + "20202021:Arena B": 0.1, + "20202021:Arena C": -0.1, + "20202021:Arena D": 0.2, + "20202021:Arena E": -0.2, + "20202021:Arena F": 0.0, + }, + VENUE_REGIME_METRIC_DISTANCE, + ) + + annotated = annotate_distance_location_regime_evidence( + residual_rows, + comparisons, + ) + classified = classify_rolling_venue_regimes(annotated) + arena = [ + row for row in classified + if row["venue_name"] == "Arena A" + ][0] + + assert arena["paired_away_team_seasons"] == 10 + assert arena["paired_mean_diff_distance"] == pytest.approx(3.9) + assert arena["paired_bootstrap_ci_low"] > 0 + assert arena["paired_cohens_d"] >= 0.2 + assert arena["evidence_supports_regime"] is True + assert arena["distance_location_evidence_classification"] == ( + ANOMALY_REAL_SCOREKEEPER_REGIME_SUPPORTED + ) + assert arena["regime_classification"] == VENUE_REGIME_TEMPORARY_SUPPORTED + + +def test_distance_location_paired_evidence_supports_negative_residual(): + rows = [] + for offset, team_id in enumerate(range(10, 20)): + diff = -5.0 + (offset * 0.2) + rows.append(_distance_shot_row("Arena A", team_id, 16.0 + diff)) + rows.append(_distance_shot_row("Arena B", team_id, 16.0)) + rows.append(_distance_shot_row("Arena C", team_id, 16.0)) + + comparisons = compute_paired_away_distance_location_comparisons(rows) + residual_rows = residual_z_score_rows( + { + "20202021:Arena A": -2.6, + "20202021:Arena B": 0.1, + "20202021:Arena C": -0.1, + "20202021:Arena D": 0.2, + "20202021:Arena E": -0.2, + "20202021:Arena F": 0.0, + 
}, + VENUE_REGIME_METRIC_DISTANCE, + ) + + annotated = annotate_distance_location_regime_evidence( + residual_rows, + comparisons, + ) + classified = classify_rolling_venue_regimes(annotated) + arena = [ + row for row in classified + if row["venue_name"] == "Arena A" + ][0] + + assert arena["paired_bootstrap_ci_high"] < 0 + assert arena["paired_cohens_d"] <= -0.2 + assert arena["evidence_supports_regime"] is True + assert arena["regime_classification"] == VENUE_REGIME_TEMPORARY_SUPPORTED + + +def test_distance_location_pairing_removes_shot_mix_confounding(): + rows = [] + for team_id in range(10, 20): + rows.append( + _distance_shot_row( + "Arena A", + team_id, + 30.0, + shot_type="slap", + ) + ) + rows.append( + _distance_shot_row( + "Arena B", + team_id, + 30.0, + shot_type="slap", + ) + ) + rows.append(_distance_shot_row("Arena B", team_id, 10.0)) + + comparisons = compute_paired_away_distance_location_comparisons(rows) + residual_rows = residual_z_score_rows( + {"20202021:Arena A": 2.4}, + VENUE_REGIME_METRIC_DISTANCE, + ) + + annotated = annotate_distance_location_regime_evidence( + residual_rows, + comparisons, + ) + + assert annotated[0]["paired_away_team_seasons"] == 10 + assert annotated[0]["paired_mean_diff_distance"] == pytest.approx(0.0) + assert annotated[0]["paired_sample_adequate"] is True + assert annotated[0]["evidence_supports_regime"] is False + assert annotated[0]["distance_location_evidence_classification"] == ( + ANOMALY_HOCKEY_CONTEXT_CONFOUNDED + ) + + +def test_distance_location_pairing_requires_adequate_matched_visitor_teams(): + rows = [] + for team_id in range(10, 19): + rows.append(_distance_shot_row("Arena A", team_id, 22.0)) + rows.append(_distance_shot_row("Arena B", team_id, 16.0)) + + comparisons = compute_paired_away_distance_location_comparisons(rows) + residual_rows = residual_z_score_rows( + {"20202021:Arena A": 2.4}, + VENUE_REGIME_METRIC_DISTANCE, + ) + + annotated = annotate_distance_location_regime_evidence( + 
residual_rows, + comparisons, + ) + + assert annotated[0]["paired_away_team_seasons"] == 9 + assert annotated[0]["paired_sample_adequate"] is False + assert annotated[0]["evidence_supports_regime"] is False + assert annotated[0]["distance_location_evidence_classification"] == ( + ANOMALY_INSUFFICIENT_EVIDENCE + ) + + +def test_distance_location_pairing_requires_matched_strata(): + rows = [] + for team_id in range(10, 20): + rows.append(_distance_shot_row("Arena A", team_id, 22.0)) + rows.append( + _distance_shot_row( + "Arena B", + team_id, + 16.0, + shot_type="slap", + ) + ) + + comparisons = compute_paired_away_distance_location_comparisons(rows) + residual_rows = residual_z_score_rows( + {"20202021:Arena A": 2.4}, + VENUE_REGIME_METRIC_DISTANCE, + ) + + annotated = annotate_distance_location_regime_evidence( + residual_rows, + comparisons, + ) + + assert annotated[0]["paired_away_team_seasons"] == 0 + assert annotated[0]["evidence_supports_regime"] is False + assert annotated[0]["distance_location_evidence_classification"] == ( + ANOMALY_INSUFFICIENT_EVIDENCE + ) + + +@pytest.mark.parametrize( + "comparison", + [ + { + "paired_bootstrap_ci_low": -0.2, + "paired_bootstrap_ci_high": 1.5, + "paired_cohens_d": 0.5, + }, + { + "paired_bootstrap_ci_low": 1.0, + "paired_bootstrap_ci_high": 2.0, + "paired_cohens_d": 0.1, + }, + { + "paired_bootstrap_ci_low": -2.0, + "paired_bootstrap_ci_high": -1.0, + "paired_cohens_d": -0.5, + }, + ], +) +def test_distance_location_pairing_blocks_weak_crossing_or_mismatched_evidence( + comparison, +): + comparison = { + "season": "20202021", + "venue_name": "Arena A", + "paired_away_team_seasons": 10, + "paired_mean_diff_distance": 1.0, + "paired_wilcoxon_p_value": 0.05, + "paired_sample_adequate": True, + **comparison, + } + residual_rows = residual_z_score_rows( + {"20202021:Arena A": 2.4}, + VENUE_REGIME_METRIC_DISTANCE, + ) + + annotated = annotate_distance_location_regime_evidence( + residual_rows, + [comparison], + ) + + assert 
annotated[0]["evidence_supports_regime"] is False + assert annotated[0]["distance_location_evidence_classification"] == ( + ANOMALY_HOCKEY_CONTEXT_CONFOUNDED + ) + + def test_anomaly_classifier_marks_supported_real_scorekeeper_regime(): diagnostics = [ { diff --git a/tests/test_venue_correction_validation_export.py b/tests/test_venue_correction_validation_export.py index 15b003c..1da867c 100644 --- a/tests/test_venue_correction_validation_export.py +++ b/tests/test_venue_correction_validation_export.py @@ -36,6 +36,25 @@ def _passing_payload(): }, "event_frequency_primary_scope": "regular_season", "event_frequency_primary_group": "training_attempts", + "distance_location_candidate_count": 1, + "distance_location_supported_count": 1, + "distance_top_paired_diagnostics": [ + { + "season": "20202021", + "venue_name": "Arena A", + "residual_z_score": 2.4, + "paired_away_team_seasons": 10, + "paired_mean_diff_distance": 1.2, + "paired_bootstrap_ci_low": 0.4, + "paired_bootstrap_ci_high": 2.0, + "paired_cohens_d": 0.35, + "evidence_supports_regime": True, + "distance_location_evidence_classification": ( + "real_scorekeeper_regime_supported" + ), + "regime_classification": "temporary_supported_regime", + } + ], "event_frequency_candidate_count": 1, "event_frequency_supported_count": 0, "event_frequency_top_anomalies": [ @@ -83,6 +102,8 @@ def test_format_scorecard_includes_gate_summary(): assert "max abs(z) = 1.800" in text assert "Overall pass: PASS" in text assert "Arena B" in text + assert "Distance-Location Paired Diagnostics" in text + assert "real_scorekeeper_regime_supported" in text assert "Event-Frequency Diagnostics" in text assert "hockey_context_confounded" in text diff --git a/tests/test_venue_correction_validation_from_db.py b/tests/test_venue_correction_validation_from_db.py index 7ae391a..38c3480 100644 --- a/tests/test_venue_correction_validation_from_db.py +++ b/tests/test_venue_correction_validation_from_db.py @@ -53,6 +53,60 @@ def 
test_compute_residual_distance_z_scores_by_season_and_venue(): assert result["20202021:C"] == pytest.approx(1.224744871) +def test_build_distance_location_shot_rows_uses_in_memory_corrected_distances(): + rows = [ + { + "season": "20202021", + "venue_name": "Arena A", + "shooting_team_id": 12, + "away_team_id": 12, + "shot_type": "wrist", + "manpower_state": "5v5", + "distance_to_goal": 20.0, + }, + { + "season": "20202021", + "venue_name": "Arena B", + "shooting_team_id": 13, + "away_team_id": 13, + "shot_type": "slap", + "manpower_state": "5v4", + "distance_to_goal": 18.0, + }, + ] + + shot_rows = exporter._build_distance_location_shot_rows( + rows, + [17.5, 19.25], + ) + + assert shot_rows == [ + { + "season": "20202021", + "venue_name": "Arena A", + "shooting_team_id": 12, + "away_team_id": 12, + "shot_type": "wrist", + "manpower_state": "5v5", + "corrected_distance_to_goal": 17.5, + }, + { + "season": "20202021", + "venue_name": "Arena B", + "shooting_team_id": 13, + "away_team_id": 13, + "shot_type": "slap", + "manpower_state": "5v4", + "corrected_distance_to_goal": 19.25, + }, + ] + + +def test_build_distance_location_shot_rows_requires_aligned_lengths(): + with pytest.raises(ValueError, match="equal length"): + exporter._build_distance_location_shot_rows([], [12.0]) + + def test_build_feature_matrix_uses_fixed_shot_type_contract(): matrix = exporter._build_feature_matrix( np.array([12.0, 20.0]),