STET

summary

reports/summary.json

280264 bytes

Back to adjudication

Preview truncated to keep static evidence pages bounded.

{
  "generated_at": "2026-03-07T17:34:22Z",
  "dataset": "/Users/ben/dev/flux/.tmp/sqlparser-rs-dataset",
  "output_root": "/Users/ben/dev/flux/.tmp/h2h-sqlparser-w2",
  "statistics": {
    "pass_definition": {
      "denominator_statuses": [
        "fail_guardrail",
        "fail_high_conf",
        "fail_infra",
        "fail_likely_equiv",
        "fail_no_patch",
        "fail_with_diag",
        "pass",
        "pass_with_warn"
      ],
      "positive_statuses": [
        "pass",
        "pass_with_warn"
      ]
    },
    "bootstrap": {
      "base_seed": 1337,
      "confidence_level": 0.95,
      "method": "nonparametric_task_bootstrap",
      "resamples": 5000
    },
    "tiering": {
      "rule": "A is strictly superior to B iff passRate(A) \u003e ciHigh(B)",
      "strategy": "conservative_non_superiority_grouping"
    }
  },
  "models": [
    {
      "name": "gpt-5.1-codex-mini",
      "key": "gpt-5-1-codex-mini",
      "run_id": "2026-02-28__18-27-22__gpt-5-1-codex-mini"
    },
    {
      "name": "gpt-5.3-codex",
      "key": "gpt-5-3-codex",
      "run_id": "2026-02-28__18-27-22__gpt-5-3-codex"
    },
    {
      "name": "gpt-5.4",
      "key": "gpt-5-4",
      "run_id": "2026-02-28__18-27-22__gpt-5-4"
    }
  ],
  "runs": {
    "gpt-5-1-codex-mini": {
      "model": "gpt-5.1-codex-mini",
      "requested_model": "gpt-5.1-codex-mini",
      "run_id": "2026-02-28__18-27-22__gpt-5-1-codex-mini",
      "run_dir": "/Users/ben/dev/flux/.tmp/h2h-sqlparser-w2/runs/2026-02-28__18-27-22__gpt-5-1-codex-mini",
      "results_path": "/Users/ben/dev/flux/.tmp/h2h-sqlparser-w2/runs/2026-02-28__18-27-22__gpt-5-1-codex-mini/results.json",
      "validation_metrics": {
        "validated": 30,
        "leaderboard_eligible": 30,
        "leaderboard_excluded": 0,
        "binary_pass_count": 28,
        "binary_pass_rate": 0.9333333333333333,
        "tests_only_pass_count": 28,
        "tests_only_pass_rate": 0.9333333333333333,
        "rescue_aware_pass_count": 28,
        "rescue_aware_pass_rate": 0.9333333333333333,
        "rescue_delta_rate": 0,
        "equiv_rate": 0.36666666666666664,
        "equiv_equivalent_count": 11,
        "equiv_non_equivalent_count": 19,
        "equiv_unknown_count": 0,
        "code_review_pass_count": 5,
        "code_review_fail_count": 23,
        "code_review_unsure_count": 2,
        "code_review_fail_rate": 0.7666666666666667,
        "behavioral_robustness_used_count": 29,
        "behavioral_robustness_skipped_count": 0,
        "behavioral_robustness_unavailable_count": 1,
        "probe_accepted_commands_count": 0,
        "probe_gold_pass_candidate_pass_count": 28,
        "probe_gold_pass_candidate_fail_count": 0,
        "probe_review_required_count": 0,
        "probe_agreement_rate": 1,
        "tb_unresolved_but_tests_pass_count": 21,
        "tb_resolved_but_tests_not_pass_count": 1,
        "tests_unknown_count": 2,
        "tests_unknown_cause_counts": {
          "all_commands_ignored_gold_failure_mode_unset": 1,
          "no_gold_pass_commands": 1
        },
        "tests_unknown_rate": 0.06666666666666667,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false,
        "cache_evaluated_count": 30,
        "cache_hit_count": 0,
        "cache_miss_count": 30,
        "cache_hit_rate": 0,
        "cache_miss_reason_counts": {
          "missing_pinned_dependencies": 30
        },
        "total_setup_ms_saved": null,
        "total_pinned_bytes": null,
        "footprint_risk_mean_score": 0.32358187049861825,
        "footprint_risk_median_score": 0.31172590751928275,
        "footprint_risk_scored_count": 30,
        "footprint_risk_used_count": 30,
        "footprint_risk_unavailable_count": 0,
        "footprint_risk_missing_count": 0,
        "footprint_risk_flagged_count": 0,
        "footprint_risk_flagged_rate": 0,
        "footprint_risk_severe_count": 0,
        "footprint_risk_severe_rate": 0,
        "footprint_risk_level_low_count": 19,
        "footprint_risk_level_medium_count": 11,
        "footprint_risk_level_high_count": 0,
        "footprint_risk_level_unknown_count": 0,
        "total_cost": 53.2629618,
        "cost_per_task": 1.77543206,
        "tests_only_quality_per_dollar": 0.5256936350092345,
        "equiv_quality_per_dollar": 0.20652249946791354,
        "total_input_tokens": 139367546,
        "total_output_tokens": 1220573,
        "total_tokens": 140588119,
        "total_uncached_input_tokens": 18543994,
        "total_cache_creation_input_tokens": null,
        "total_cache_read_input_tokens": 120823552,
        "total_cached_input_tokens": 120823552,
        "cost_tasks_total": 30,
        "cost_tasks_with_tokens": 30,
        "cost_tasks_with_cache_tokens": 28,
        "cost_tasks_with_cache_aware_pricing": 28,
        "cost_tasks_with_legacy_pricing": 2,
        "cost_tasks_with_pricing": 30,
        "cost_tasks_with_cost": 30,
        "cost_tasks_missing_tokens": 0,
        "cost_tasks_missing_pricing": 0,
        "cost_tasks_missing_cost": 0,
        "cost_tasks_with_cost_rate": 1,
        "pricing_version": "local-placeholder-2026-02-19",
        "pricing_source": "local_static_table",
        "pricing_model_key": "gpt-5.1-codex-mini"
      },
      "publish_exclusions": {},
      "publish_guard": {
        "publishable": true,
        "blocked": false,
        "reasons": [],
        "tests_unknown_rate": 0.06666666666666667,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false
      },
      "partial_metrics": {
        "failed_task_count": 2,
        "failed_task_with_partial_score": 0,
        "failed_task_partial_threshold": 0.8,
        "failed_task_partial_threshold_hits": 0,
        "failed_task_partial_ppr": 0,
        "failed_task_partial_mean_score": null,
        "failed_task_partial_coverage": 0
      },
      "footprint_risk_metrics": {
        "used_count": 30,
        "unavailable_count": 0,
        "missing_count": 0,
        "scored_count": 30,
        "mean_score": 0.32358187049861825,
        "median_score": 0.31172590751928275,
        "flagged_count": 0,
        "flagged_rate": 0,
        "severe_count": 0,
        "severe_rate": 0,
        "level_low_count": 19,
        "level_medium_count": 11,
        "level_high_count": 0,
        "level_unknown_count": 0
      },
      "passRate": 0.9333333333333333,
      "ciLow": 0.8333333333333334,
      "ciHigh": 1,
      "effectiveN": 30,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 0,
        "fail_high_conf": 1,
        "fail_infra": 0,
        "fail_likely_equiv": 1,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 3,
        "pass_with_warn": 25
      }
    },
    "gpt-5-3-codex": {
      "model": "gpt-5.3-codex",
      "requested_model": "gpt-5.3-codex",
      "run_id": "2026-02-28__18-27-22__gpt-5-3-codex",
      "run_dir": "/Users/ben/dev/flux/.tmp/h2h-sqlparser-w2/runs/2026-02-28__18-27-22__gpt-5-3-codex",
      "results_path": "/Users/ben/dev/flux/.tmp/h2h-sqlparser-w2/runs/2026-02-28__18-27-22__gpt-5-3-codex/results.json",
      "validation_metrics": {
        "validated": 30,
        "leaderboard_eligible": 30,
        "leaderboard_excluded": 0,
        "binary_pass_count": 30,
        "binary_pass_rate": 1,
        "tests_only_pass_count": 30,
        "tests_only_pass_rate": 1,
        "rescue_aware_pass_count": 30,
        "rescue_aware_pass_rate": 1,
        "rescue_delta_rate": 0,
        "equiv_rate": 0.5,
        "equiv_equivalent_count": 15,
        "equiv_non_equivalent_count": 15,
        "equiv_unknown_count": 0,
        "code_review_pass_count": 3,
        "code_review_fail_count": 23,
        "code_review_unsure_count": 4,
        "code_review_fail_rate": 0.7666666666666667,
        "behavioral_robustness_used_count": 30,
        "behavioral_robustness_skipped_count": 0,
        "behavioral_robustness_unavailable_count": 0,
        "probe_accepted_commands_count": 0,
        "probe_gold_pass_candidate_pass_count": 30,
        "probe_gold_pass_candidate_fail_count": 0,
        "probe_review_required_count": 0,
        "probe_agreement_rate": 1,
        "tb_unresolved_but_tests_pass_count": 24,
        "tb_resolved_but_tests_not_pass_count": 0,
        "tests_unknown_count": 0,
        "tests_unknown_cause_counts": {},
        "tests_unknown_rate": 0,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false,
        "cache_evaluated_count": 30,
        "cache_hit_count": 0,
        "cache_miss_count": 30,
        "cache_hit_rate": 0,
        "cache_miss_reason_counts": {
          "missing_pinned_dependencies": 30
        },
        "total_setup_ms_saved": null,
        "total_pinned_bytes": null,
        "footprint_risk_mean_score": 0.32729286669422947,
        "footprint_risk_median_score": 0.3037878161599828,
        "footprint_risk_scored_count": 30,
        "footprint_risk_used_count": 30,
        "footprint_risk_unavailable_count": 0,
        "footprint_risk_missing_count": 0,
        "footprint_risk_flagged_count": 0,
        "footprint_risk_flagged_rate": 0,
        "footprint_risk_severe_count": 0,
        "footprint_risk_severe_rate": 0,
        "footprint_risk_level_low_count": 18,
        "footprint_risk_level_medium_count": 12,
        "footprint_risk_level_high_count": 0,
        "footprint_risk_level_unknown_count": 0,
        "total_cost": 152.37069,
        "cost_per_task": 5.079023,
        "tests_only_quality_per_dollar": 0.1968882598090223,
        "equiv_quality_per_dollar": 0.09844412990451115,
        "total_input_tokens": 66038210,
        "total_output_tokens": 350183,
        "total_tokens": 66388393,
        "total_uncached_input_tokens": 2392770,
        "total_cache_creation_input_tokens": null,
        "total_cache_read_input_tokens": 63645440,
        "total_cached_input_tokens": 63645440,
        "cost_tasks_total": 30,
        "cost_tasks_with_tokens": 30,
        "cost_tasks_with_cache_tokens": 30,
        "cost_tasks_with_cache_aware_pricing": 30,
        "cost_tasks_with_legacy_pricing": 0,
        "cost_tasks_with_pricing": 30,
        "cost_tasks_with_cost": 30,
        "cost_tasks_missing_tokens": 0,
        "cost_tasks_missing_pricing": 0,
        "cost_tasks_missing_cost": 0,
        "cost_tasks_with_cost_rate": 1,
        "pricing_version": "local-placeholder-2026-02-19",
        "pricing_source": "local_static_table",
        "pricing_model_key": "gpt-5.3-codex"
      },
      "publish_exclusions": {},
      "publish_guard": {
        "publishable": true,
        "blocked": false,
        "reasons": [],
        "tests_unknown_rate": 0,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false
      },
      "partial_metrics": {
        "failed_task_count": 0,
        "failed_task_with_partial_score": 0,
        "failed_task_partial_threshold": 0.8,
        "failed_task_partial_threshold_hits": 0,
        "failed_task_partial_ppr": null,
        "failed_task_partial_mean_score": null,
        "failed_task_partial_coverage": null
      },
      "footprint_risk_metrics": {
        "used_count": 30,
        "unavailable_count": 0,
        "missing_count": 0,
        "scored_count": 30,
        "mean_score": 0.32729286669422947,
        "median_score": 0.3037878161599828,
        "flagged_count": 0,
        "flagged_rate": 0,
        "severe_count": 0,
        "severe_rate": 0,
        "level_low_count": 18,
        "level_medium_count": 12,
        "level_high_count": 0,
        "level_unknown_count": 0
      },
      "passRate": 1,
      "ciLow": 1,
      "ciHigh": 1,
      "effectiveN": 30,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 0,
        "fail_high_conf": 0,
        "fail_infra": 0,
        "fail_likely_equiv": 0,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 7,
        "pass_with_warn": 23
      }
    },
    "gpt-5-4": {
      "model": "gpt-5.4",
      "requested_model": "gpt-5.4",
      "run_id": "2026-02-28__18-27-22__gpt-5-4",
      "run_dir": "/Users/ben/dev/flux/.tmp/h2h-sqlparser-w2/runs/2026-02-28__18-27-22__gpt-5-4",
      "results_path": "/Users/ben/dev/flux/.tmp/h2h-sqlparser-w2/runs/2026-02-28__18-27-22__gpt-5-4/results.json",
      "validation_metrics": {
        "validated": 30,
        "leaderboard_eligible": 30,
        "leaderboard_excluded": 0,
        "binary_pass_count": 30,
        "binary_pass_rate": 1,
        "tests_only_pass_count": 30,
        "tests_only_pass_rate": 1,
        "rescue_aware_pass_count": 30,
        "rescue_aware_pass_rate": 1,
        "rescue_delta_rate": 0,
        "equiv_rate": 0.5333333333333333,
        "equiv_equivalent_count": 16,
        "equiv_non_equivalent_count": 14,
        "equiv_unknown_count": 0,
        "code_review_pass_count": 3,
        "code_review_fail_count": 20,
        "code_review_unsure_count": 7,
        "code_review_fail_rate": 0.6666666666666666,
        "behavioral_robustness_used_count": 30,
        "behavioral_robustness_skipped_count": 0,
        "behavioral_robustness_unavailable_count": 0,
        "probe_accepted_commands_count": 1,
        "probe_gold_pass_candidate_pass_count": 29,
        "probe_gold_pass_candidate_fail_count": 1,
        "probe_review_required_count": 0,
        "probe_agreement_rate": 0.9666666666666667,
        "tb_unresolved_but_tests_pass_count": 24,
        "tb_resolved_but_tests_not_pass_count": 0,
        "tests_unknown_count": 0,
        "tests_unknown_cause_counts": {},
        "tests_unknown_rate": 0,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false,
        "cache_evaluated_count": 30,
        "cache_hit_count": 0,
        "cache_miss_count": 30,
        "cache_hit_rate": 0,
        "cache_miss_reason_counts": {
          "missing_pinned_dependencies": 30
        },
        "total_setup_ms_saved": null,
        "total_pinned_bytes": null,
        "footprint_risk_mean_score": 0.33059440635761644,
        "footprint_risk_median_score": 0.3111657194331642,
        "footprint_risk_scored_count": 30,
        "footprint_risk_used_count": 30,
        "footprint_risk_unavailable_count": 0,
        "footprint_risk_missing_count": 0,
        "footprint_risk_flagged_count": 0,
        "footprint_risk_flagged_rate": 0,
        "footprint_risk_severe_count": 0,
        "footprint_risk_severe_rate": 0,
        "footprint_risk_level_low_count": 17,
        "footprint_risk_level_medium_count": 13,
        "footprint_risk_level_high_count": 0,
        "footprint_risk_level_unknown_count": 0,
        "total_cost": 37.697810000000004,
        "cost_per_task": 1.2565936666666668,
        "tests_only_quality_per_dollar": 0.7958021964671156,
        "equiv_quality_per_dollar": 0.424427838115795,
        "total_input_tokens": 61661657,
        "total_output_tokens": 377564,
        "total_tokens": 62039221,
        "total_uncached_input_tokens": 2564313,
        "total_cache_creation_input_tokens": null,
        "total_cache_read_input_tokens": 59097344,
        "total_cached_input_tokens": 59097344,
        "cost_tasks_total": 30,
        "cost_tasks_with_tokens": 30,
        "cost_tasks_with_cache_tokens": 30,
        "cost_tasks_with_cache_aware_pricing": 30,
        "cost_tasks_with_legacy_pricing": 0,
        "cost_tasks_with_pricing": 30,
        "cost_tasks_with_cost": 30,
        "cost_tasks_missing_tokens": 0,
        "cost_tasks_missing_pricing": 0,
        "cost_tasks_missing_cost": 0,
        "cost_tasks_with_cost_rate": 1,
        "pricing_version": "local-placeholder-2026-02-19",
        "pricing_source": "local_static_table",
        "pricing_model_key": "gpt-5.4"
      },
      "publish_exclusions": {},
      "publish_guard": {
        "publishable": true,
        "blocked": false,
        "reasons": [],
        "tests_unknown_rate": 0,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false
      },
      "partial_metrics": {
        "failed_task_count": 0,
        "failed_task_with_partial_score": 0,
        "failed_task_partial_threshold": 0.8,
        "failed_task_partial_threshold_hits": 0,
        "failed_task_partial_ppr": null,
        "failed_task_partial_mean_score": null,
        "failed_task_partial_coverage": null
      },
      "footprint_risk_metrics": {
        "used_count": 30,
        "unavailable_count": 0,
        "missing_count": 0,
        "scored_count": 30,
        "mean_score": 0.33059440635761644,
        "median_score": 0.3111657194331642,
        "flagged_count": 0,
        "flagged_rate": 0,
        "severe_count": 0,
        "severe_rate": 0,
        "level_low_count": 17,
        "level_medium_count": 13,
        "level_high_count": 0,
        "level_unknown_count": 0
      },
      "passRate": 1,
      "ciLow": 1,
      "ciHigh": 1,
      "effectiveN": 30,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 0,
        "fail_high_conf": 0,
        "fail_infra": 0,
        "fail_likely_equiv": 0,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 5,
        "pass_with_warn": 25
      }
    }
  },
  "tasks": {
    "flux-pr-1414": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 6617353,
          "tb_total_output_tokens": 63499,
          "tb_total_tokens": 6680852,
          "tb_uncached_input_tokens": 900873,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 5716480,
          "tb_cached_input_tokens": 5716480,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.5897755,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3259359058066356,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2011769,
          "tb_total_output_tokens": 11058,
          "tb_total_tokens": 2022827,
          "tb_uncached_input_tokens": 68473,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1943296,
          "tb_cached_input_tokens": 1943296,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 4.605519,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.33672650602743776,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1460628,
          "tb_total_output_tokens": 11444,
          "tb_total_tokens": 1472072,
          "tb_uncached_input_tokens": 106260,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1354368,
          "tb_cached_input_tokens": 1354368,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.981256,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.27434023954433173,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1435": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1465254,
          "tb_total_output_tokens": 18505,
          "tb_total_tokens": 1483759,
          "tb_uncached_input_tokens": 268966,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1196288,
          "tb_cached_input_tokens": 1196288,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.6939221999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.5325472606515167,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 8945965,
          "tb_total_output_tokens": 29922,
          "tb_total_tokens": 8975887,
          "tb_uncached_input_tokens": 167725,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 8778240,
          "tb_cached_input_tokens": 8778240,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 17.478555,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2901955617070667,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 13399949,
          "tb_total_output_tokens": 34203,
          "tb_total_tokens": 13434152,
          "tb_uncached_input_tokens": 163853,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 13236096,
          "tb_cached_input_tokens": 13236096,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 7.219378,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2927309734881166,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1441": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2502027,
          "tb_total_output_tokens": 33257,
          "tb_total_tokens": 2535284,
          "tb_uncached_input_tokens": 325003,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2177024,
          "tb_cached_input_tokens": 2177024,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.0136001000000001,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.49707909347478624,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 725328,
          "tb_total_output_tokens": 7798,
          "tb_total_tokens": 733126,
          "tb_uncached_input_tokens": 37200,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 688128,
          "tb_cached_input_tokens": 688128,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.0580719999999997,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4667108568188249,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 755020,
          "tb_total_output_tokens": 7040,
          "tb_total_tokens": 762060,
          "tb_uncached_input_tokens": 24908,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 730112,
          "tb_cached_input_tokens": 730112,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.471192,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4689258285389157,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1495": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1912153,
          "tb_total_output_tokens": 23722,
          "tb_total_tokens": 1935875,
          "tb_uncached_input_tokens": 214105,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1698048,
          "tb_cached_input_tokens": 1698048,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.7181967,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.29588769280747484,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1411812,
          "tb_total_output_tokens": 7724,
          "tb_total_tokens": 1419536,
          "tb_uncached_input_tokens": 47716,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1364096,
          "tb_cached_input_tokens": 1364096,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.225324,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.5219206168310221,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 825428,
          "tb_total_output_tokens": 6348,
          "tb_total_tokens": 831776,
          "tb_uncached_input_tokens": 31316,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 794112,
          "tb_cached_input_tokens": 794112,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.510472,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.3470915042991237,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1500": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 7581204,
          "tb_total_output_tokens": 40448,
          "tb_total_tokens": 7621652,
          "tb_uncached_input_tokens": 940436,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 6640768,
          "tb_cached_input_tokens": 6640768,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.6494571999999996,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.21593548197062992,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2204347,
          "tb_total_output_tokens": 9760,
          "tb_total_tokens": 2214107,
          "tb_uncached_input_tokens": 88891,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2115456,
          "tb_cached_input_tokens": 2115456,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 5.092149,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.21575173197062994,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": 1,
          "probe_accepted_commands": 1,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1384558,
          "tb_total_output_tokens": 8689,
          "tb_total_tokens": 1393247,
          "tb_uncached_input_tokens": 58222,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1326336,
          "tb_cached_input_tokens": 1326336,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.849124,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3006081902851141,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1501": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 6406986,
          "tb_total_output_tokens": 84912,
          "tb_total_tokens": 6491898,
          "tb_uncached_input_tokens": 802122,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 5604864,
          "tb_cached_input_tokens": 5604864,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.5533846000000002,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2995330140417937,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 669974,
          "tb_total_output_tokens": 4013,
          "tb_total_tokens": 673987,
          "tb_uncached_input_tokens": 63382,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 606592,
          "tb_cached_input_tokens": 606592,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.101398,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4210678499500824,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2166850,
          "tb_total_output_tokens": 26245,
          "tb_total_tokens": 2193095,
          "tb_uncached_input_tokens": 67266,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2099584,
          "tb_cached_input_tokens": 2099584,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.394284,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.34456698283904336,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1526": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4797885,
          "tb_total_output_tokens": 40342,
          "tb_total_tokens": 4838227,
          "tb_uncached_input_tokens": 669373,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4128512,
          "tb_cached_input_tokens": 4128512,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.8653882999999998,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4039946358934862,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1477012,
          "tb_total_output_tokens": 9512,
          "tb_total_tokens": 1486524,
          "tb_uncached_input_tokens": 61204,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1415808,
          "tb_cached_input_tokens": 1415808,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.612492,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.3516982712990421,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1337166,
          "tb_total_output_tokens": 10745,
          "tb_total_tokens": 1347911,
          "tb_uncached_input_tokens": 35278,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1301888,
          "tb_cached_input_tokens": 1301888,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.80746,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.40424898177998664,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1534": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 5102763,
          "tb_total_output_tokens": 33798,
          "tb_total_tokens": 5136561,
          "tb_uncached_input_tokens": 614827,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4487936,
          "tb_cached_input_tokens": 4487936,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.7982189,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.635535828108232,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3736946,
          "tb_total_output_tokens": 13616,
          "tb_total_tokens": 3750562,
          "tb_uncached_input_tokens": 103026,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3633920,
          "tb_cached_input_tokens": 3633920,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 7.81323,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.6373442549036268,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2368490,
          "tb_total_output_tokens": 12118,
          "tb_total_tokens": 2380608,
          "tb_uncached_input_tokens": 63850,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2304640,
          "tb_cached_input_tokens": 2304640,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.3769639999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.5901892618607406,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1576": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2355842,
          "tb_total_output_tokens": 40826,
          "tb_total_tokens": 2396668,
          "tb_uncached_input_tokens": 360194,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1995648,
          "tb_cached_input_tokens": 1995648,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.0845942000000002,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2132648676674215,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 755722,
          "tb_total_output_tokens": 10638,
          "tb_total_tokens": 766360,
          "tb_uncached_input_tokens": 41610,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 714112,
          "tb_cached_input_tokens": 714112,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.333598,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.22483665670130842,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1308802,
          "tb_total_output_tokens": 8293,
          "tb_total_tokens": 1317095,
          "tb_uncached_input_tokens": 61442,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1247360,
          "tb_cached_input_tokens": 1247360,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.812908,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.23520306499383095,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1604": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3709661,
          "tb_total_output_tokens": 43500,
          "tb_total_tokens": 3753161,
          "tb_uncached_input_tokens": 437341,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3272320,
          "tb_cached_input_tokens": 3272320,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.4078594999999998,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.33496645847592105,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 692920,
          "tb_total_output_tokens": 6349,
          "tb_total_tokens": 699269,
          "tb_uncached_input_tokens": 36664,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 656256,
          "tb_cached_input_tokens": 656256,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.915284,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2456958564588239,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 390699,
          "tb_total_output_tokens": 5645,
          "tb_total_tokens": 396344,
          "tb_uncached_input_tokens": 24107,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 366592,
          "tb_cached_input_tokens": 366592,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.27666999999999997,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3297443366823985,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1628": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 5380476,
          "tb_total_output_tokens": 36633,
          "tb_total_tokens": 5417109,
          "tb_uncached_input_tokens": 514556,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4865920,
          "tb_cached_input_tokens": 4865920,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.72152,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3121816076329319,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1332049,
          "tb_total_output_tokens": 11224,
          "tb_total_tokens": 1343273,
          "tb_uncached_input_tokens": 53329,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1278720,
          "tb_cached_input_tokens": 1278720,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.391455,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.13528945755660507,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2819412,
          "tb_total_output_tokens": 13530,
          "tb_total_tokens": 2832942,
          "tb_uncached_input_tokens": 73940,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2745472,
          "tb_cached_input_tokens": 2745472,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.6288559999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.17328196579524682,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1649": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4122401,
          "tb_total_output_tokens": 44097,
          "tb_total_tokens": 4166498,
          "tb_uncached_input_tokens": 780193,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3342208,
          "tb_cached_input_tokens": 3342208,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.9362027,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.299924666533249,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 750948,
          "tb_total_output_tokens": 8758,
          "tb_total_tokens": 759706,
          "tb_uncached_input_tokens": 34020,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 716928,
          "tb_cached_input_tokens": 716928,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.111172,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2873361965258539,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 765114,
          "tb_total_output_tokens": 9921,
          "tb_total_tokens": 775035,
          "tb_uncached_input_tokens": 32186,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 732928,
          "tb_cached_input_tokens": 732928,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.510204,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.24508186812282998,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1747": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "agent_timeout",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.18104751786566406,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 7283696,
          "tb_total_output_tokens": 26538,
          "tb_total_tokens": 7310234,
          "tb_uncached_input_tokens": 190960,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 7092736,
          "tb_cached_input_tokens": 7092736,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 15.095784,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2880666409978172,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4898783,
          "tb_total_output_tokens": 26938,
          "tb_total_tokens": 4925721,
          "tb_uncached_input_tokens": 233951,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4664832,
          "tb_cached_input_tokens": 4664832,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.015822,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.3547431506239699,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1759": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "agent_timeout",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.38937200370478836,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4677412,
          "tb_total_output_tokens": 18666,
          "tb_total_tokens": 4696078,
          "tb_uncached_input_tokens": 143524,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4533888,
          "tb_cached_input_tokens": 4533888,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 10.073652000000001,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.32979756365254076,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2653859,
          "tb_total_output_tokens": 14296,
          "tb_total_tokens": 2668155,
          "tb_uncached_input_tokens": 119587,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2534272,
          "tb_cached_input_tokens": 2534272,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.620678,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2471565417622457,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1765": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 6080644,
          "tb_total_output_tokens": 64048,
          "tb_total_tokens": 6144692,
          "tb_uncached_input_tokens": 748676,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 5331968,
          "tb_cached_input_tokens": 5331968,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.3070972,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.19595683852417284,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3096528,
          "tb_total_output_tokens": 13771,
          "tb_total_tokens": 3110299,
          "tb_uncached_input_tokens": 76496,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3020032,
          "tb_cached_input_tokens": 3020032,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 6.503748000000001,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.3450914553920046,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2928144,
          "tb_total_output_tokens": 16227,
          "tb_total_tokens": 2944371,
          "tb_uncached_input_tokens": 189200,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2738944,
          "tb_cached_input_tokens": 2738944,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.877688,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4842505878873178,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1791": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 10199339,
          "tb_total_output_tokens": 63945,
          "tb_total_tokens": 10263284,
          "tb_uncached_input_tokens": 1535147,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 8664192,
          "tb_cached_input_tokens": 8664192,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.9860192999999997,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.36810079089124204,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2631283,
          "tb_total_output_tokens": 12408,
          "tb_total_tokens": 2643691,
          "tb_uncached_input_tokens": 83443,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2547840,
          "tb_cached_input_tokens": 2547840,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 5.817885,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2529124182973325,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 961495,
          "tb_total_output_tokens": 10429,
          "tb_total_tokens": 971924,
          "tb_uncached_input_tokens": 44887,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 916608,
          "tb_cached_input_tokens": 916608,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.63151,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2110163197186077,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1839": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 5412175,
          "tb_total_output_tokens": 43720,
          "tb_total_tokens": 5455895,
          "tb_uncached_input_tokens": 593871,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4818304,
          "tb_cached_input_tokens": 4818304,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.8758721000000003,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3029025395346498,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2031600,
          "tb_total_output_tokens": 10482,
          "tb_total_tokens": 2042082,
          "tb_uncached_input_tokens": 111472,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1920128,
          "tb_cached_input_tokens": 1920128,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 5.181192,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2765123993060045,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2191928,
          "tb_total_output_tokens": 10193,
          "tb_total_tokens": 2202121,
          "tb_uncached_input_tokens": 67896,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2124032,
          "tb_cached_input_tokens": 2124032,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.279352,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.29789606134928953,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1891": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 6201734,
          "tb_total_output_tokens": 44260,
          "tb_total_tokens": 6245994,
          "tb_uncached_input_tokens": 1146502,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 5055232,
          "tb_cached_input_tokens": 5055232,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.7435978,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.27826224072845596,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1943594,
          "tb_total_output_tokens": 10105,
          "tb_total_tokens": 1953699,
          "tb_uncached_input_tokens": 140970,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1802624,
          "tb_cached_input_tokens": 1802624,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 5.424786,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3029241730218071,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1444239,
          "tb_total_output_tokens": 11241,
          "tb_total_tokens": 1455480,
          "tb_uncached_input_tokens": 56207,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1388032,
          "tb_cached_input_tokens": 1388032,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.896358,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3108164467334927,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1900": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3893861,
          "tb_total_output_tokens": 36820,
          "tb_total_tokens": 3930681,
          "tb_uncached_input_tokens": 596453,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3297408,
          "tb_cached_input_tokens": 3297408,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.6102107,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.17069042855761843,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1337787,
          "tb_total_output_tokens": 10417,
          "tb_total_tokens": 1348204,
          "tb_uncached_input_tokens": 87611,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1250176,
          "tb_cached_input_tokens": 1250176,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.814449,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.24035555138980993,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2131645,
          "tb_total_output_tokens": 11555,
          "tb_total_tokens": 2143200,
          "tb_uncached_input_tokens": 144189,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1987456,
          "tb_cached_input_tokens": 1987456,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.374546,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.3410864963861544,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1908": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "all_commands_ignored_gold_failure_mode_unset",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2302211,
          "tb_total_output_tokens": 26985,
          "tb_total_tokens": 2329196,
          "tb_uncached_input_tokens": 274179,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2028032,
          "tb_cached_input_tokens": 2028032,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.8773833,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3024402890316348,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1268258,
          "tb_total_output_tokens": 7130,
          "tb_total_tokens": 1275388,
          "tb_uncached_input_tokens": 53538,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1214720,
          "tb_cached_input_tokens": 1214720,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.05295,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.37353253561694505,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 321971,
          "tb_total_output_tokens": 4685,
          "tb_total_tokens": 326656,
          "tb_uncached_input_tokens": 28467,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 293504,
          "tb_cached_input_tokens": 293504,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.241166,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.363532440745108,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1918": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2563954,
          "tb_total_output_tokens": 25733,
          "tb_total_tokens": 2589687,
          "tb_uncached_input_tokens": 216818,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2347136,
          "tb_cached_input_tokens": 2347136,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.8316954000000001,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.32817527165617505,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1193588,
          "tb_total_output_tokens": 5945,
          "tb_total_tokens": 1199533,
          "tb_uncached_input_tokens": 116340,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1077248,
          "tb_cached_input_tokens": 1077248,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.717672,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.35230981800982236,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 415175,
          "tb_total_output_tokens": 4054,
          "tb_total_tokens": 419229,
          "tb_uncached_input_tokens": 41671,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 373504,
          "tb_cached_input_tokens": 373504,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.302526,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.41479892755204806,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1965": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_likely_equiv",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "unavailable",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2326950,
          "tb_total_output_tokens": 24125,
          "tb_total_tokens": 2351075,
          "tb_uncached_input_tokens": 360230,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1966720,
          "tb_cached_input_tokens": 1966720,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.9801030000000001,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.343103881649892,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 536486,
          "tb_total_output_tokens": 5299,
          "tb_total_tokens": 541785,
          "tb_uncached_input_tokens": 32550,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 503936,
          "tb_cached_input_tokens": 503936,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.5620939999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4097916085520558,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 464408,
          "tb_total_output_tokens": 4017,
          "tb_total_tokens": 468425,
          "tb_uncached_input_tokens": 24984,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 439424,
          "tb_cached_input_tokens": 439424,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.301816,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.3536446820943295,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1984": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4162823,
          "tb_total_output_tokens": 46165,
          "tb_total_tokens": 4208988,
          "tb_uncached_input_tokens": 613895,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3548928,
          "tb_cached_input_tokens": 3548928,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.7301717,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.32278161549925155,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1885888,
          "tb_total_output_tokens": 11548,
          "tb_total_tokens": 1897436,
          "tb_uncached_input_tokens": 65728,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1820160,
          "tb_cached_input_tokens": 1820160,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 4.40904,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.25952235337836205,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1826989,
          "tb_total_output_tokens": 10606,
          "tb_total_tokens": 1837595,
          "tb_uncached_input_tokens": 89261,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1737728,
          "tb_cached_input_tokens": 1737728,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.132234,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.259644853378362,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-2011": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3852708,
          "tb_total_output_tokens": 38620,
          "tb_total_tokens": 3891328,
          "tb_uncached_input_tokens": 388388,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3464320,
          "tb_cached_input_tokens": 3464320,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.33395,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.34089648829036606,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1017447,
          "tb_total_output_tokens": 7517,
          "tb_total_tokens": 1024964,
          "tb_uncached_input_tokens": 51559,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 965888,
          "tb_cached_input_tokens": 965888,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.6732370000000003,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3046514592981585,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1332874,
          "tb_total_output_tokens": 8663,
          "tb_total_tokens": 1341537,
          "tb_uncached_input_tokens": 108298,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1224576,
          "tb_cached_input_tokens": 1224576,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.8981880000000001,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.3785031747780122,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-2096": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 7305106,
          "tb_total_output_tokens": 68335,
          "tb_total_tokens": 7373441,
          "tb_uncached_input_tokens": 1138578,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 6166528,
          "tb_cached_input_tokens": 6166528,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.0428562000000006,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.18179016965245917,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2986215,
          "tb_total_output_tokens": 20496,
          "tb_total_tokens": 3006711,
          "tb_uncached_input_tokens": 106471,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2879744,
          "tb_cached_input_tokens": 2879744,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 7.146441,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.209786621818165,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2302639,
          "tb_total_output_tokens": 18593,
          "tb_total_tokens": 2321232,
          "tb_uncached_input_tokens": 153263,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2149376,
          "tb_cached_input_tokens": 2149376,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.5299580000000002,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.18393391965245917,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-2148": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 6680632,
          "tb_total_output_tokens": 57665,
          "tb_total_tokens": 6738297,
          "tb_uncached_input_tokens": 610744,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 6069888,
          "tb_cached_input_tokens": 6069888,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.1725891999999996,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.19190162181816497,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1169343,
          "tb_total_output_tokens": 10531,
          "tb_total_tokens": 1179874,
          "tb_uncached_input_tokens": 65727,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1103616,
          "tb_cached_input_tokens": 1103616,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.273189,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.19796537181816498,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1414741,
          "tb_total_output_tokens": 14608,
          "tb_total_tokens": 1429349,
          "tb_uncached_input_tokens": 135381,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1279360,
          "tb_cached_input_tokens": 1279360,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.027306,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.22812364200666493,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-2151": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2000849,
          "tb_total_output_tokens": 19063,
          "tb_total_tokens": 2019912,
          "tb_uncached_input_tokens": 315473,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1685376,
          "tb_cached_input_tokens": 1685376,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.8403939,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.31127020740563366,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1949623,
          "tb_total_output_tokens": 10007,
          "tb_total_tokens": 1959630,
          "tb_uncached_input_tokens": 67127,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1882496,
          "tb_cached_input_tokens": 1882496,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 4.431069,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.6190642863498039,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1424542,
          "tb_total_output_tokens": 12308,
          "tb_total_tokens": 1436850,
          "tb_uncached_input_tokens": 35614,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1388928,
          "tb_cached_input_tokens": 1388928,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.8641559999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.6078773507971134,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-2170": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 7002010,
          "tb_total_output_tokens": 60264,
          "tb_total_tokens": 7062274,
          "tb_uncached_input_tokens": 946970,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 6055040,
          "tb_cached_input_tokens": 6055040,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.6902950000000003,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.5007238717339156,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2093357,
          "tb_total_output_tokens": 13034,
          "tb_total_tokens": 2106391,
          "tb_uncached_input_tokens": 65453,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2027904,
          "tb_cached_input_tokens": 2027904,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 4.805691,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.3362709118495355,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1142749,
          "tb_total_output_tokens": 11557,
          "tb_total_tokens": 1154306,
          "tb_uncached_input_tokens": 74717,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1068032,
          "tb_cached_input_tokens": 1068032,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.7759060000000001,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3115149921328357,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-2172": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 12088080,
          "tb_total_output_tokens": 60654,
          "tb_total_tokens": 12148734,
          "tb_uncached_input_tokens": 1661072,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 10427008,
          "tb_cached_input_tokens": 10427008,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 4.4195832,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score"