STET

summary

reports/summary.json

269968 bytes

Back to adjudication

Preview truncated to keep static evidence pages bounded.

{
  "generated_at": "2026-03-07T17:34:10Z",
  "dataset": "/Users/ben/dev/flux/.tmp/graphql-go-tools-dataset",
  "output_root": "/Users/ben/dev/flux/.tmp/h2h-graphql-w2",
  "statistics": {
    "pass_definition": {
      "denominator_statuses": [
        "fail_guardrail",
        "fail_high_conf",
        "fail_infra",
        "fail_likely_equiv",
        "fail_no_patch",
        "fail_with_diag",
        "pass",
        "pass_with_warn"
      ],
      "positive_statuses": [
        "pass",
        "pass_with_warn"
      ]
    },
    "bootstrap": {
      "base_seed": 1337,
      "confidence_level": 0.95,
      "method": "nonparametric_task_bootstrap",
      "resamples": 5000
    },
    "tiering": {
      "rule": "A is strictly superior to B iff passRate(A) \u003e ciHigh(B)",
      "strategy": "conservative_non_superiority_grouping"
    }
  },
  "models": [
    {
      "name": "gpt-5.1-codex-mini",
      "key": "gpt-5-1-codex-mini",
      "run_id": "2026-02-28__13-38-10__gpt-5-1-codex-mini"
    },
    {
      "name": "gpt-5.3-codex",
      "key": "gpt-5-3-codex",
      "run_id": "2026-02-28__13-38-10__gpt-5-3-codex"
    },
    {
      "name": "gpt-5.4",
      "key": "gpt-5-4",
      "run_id": "2026-02-28__13-38-10__gpt-5-4"
    }
  ],
  "runs": {
    "gpt-5-1-codex-mini": {
      "model": "gpt-5.1-codex-mini",
      "requested_model": "gpt-5.1-codex-mini",
      "run_id": "2026-02-28__13-38-10__gpt-5-1-codex-mini",
      "run_dir": "/Users/ben/dev/flux/.tmp/h2h-graphql-w2/runs/2026-02-28__13-38-10__gpt-5-1-codex-mini",
      "results_path": "/Users/ben/dev/flux/.tmp/h2h-graphql-w2/runs/2026-02-28__13-38-10__gpt-5-1-codex-mini/results.json",
      "validation_metrics": {
        "validated": 29,
        "leaderboard_eligible": 29,
        "leaderboard_excluded": 0,
        "binary_pass_count": 28,
        "binary_pass_rate": 0.9655172413793104,
        "tests_only_pass_count": 28,
        "tests_only_pass_rate": 0.9655172413793104,
        "rescue_aware_pass_count": 28,
        "rescue_aware_pass_rate": 0.9655172413793104,
        "rescue_delta_rate": 0,
        "equiv_rate": 0.1724137931034483,
        "equiv_equivalent_count": 5,
        "equiv_non_equivalent_count": 24,
        "equiv_unknown_count": 0,
        "code_review_pass_count": 1,
        "code_review_fail_count": 23,
        "code_review_unsure_count": 5,
        "code_review_fail_rate": 0.7931034482758621,
        "behavioral_robustness_used_count": 29,
        "behavioral_robustness_skipped_count": 0,
        "behavioral_robustness_unavailable_count": 0,
        "probe_accepted_commands_count": 0,
        "probe_gold_pass_candidate_pass_count": 28,
        "probe_gold_pass_candidate_fail_count": 1,
        "probe_review_required_count": 0,
        "probe_agreement_rate": 0.9655172413793104,
        "tb_unresolved_but_tests_pass_count": 8,
        "tb_resolved_but_tests_not_pass_count": 0,
        "tests_unknown_count": 0,
        "tests_unknown_cause_counts": {},
        "tests_unknown_rate": 0,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false,
        "cache_evaluated_count": 29,
        "cache_hit_count": 0,
        "cache_miss_count": 29,
        "cache_hit_rate": 0,
        "cache_miss_reason_counts": {
          "missing_pinned_dependencies": 29
        },
        "total_setup_ms_saved": null,
        "total_pinned_bytes": null,
        "footprint_risk_mean_score": 0.3022031029297725,
        "footprint_risk_median_score": 0.3171248834535135,
        "footprint_risk_scored_count": 29,
        "footprint_risk_used_count": 29,
        "footprint_risk_unavailable_count": 0,
        "footprint_risk_missing_count": 0,
        "footprint_risk_flagged_count": 0,
        "footprint_risk_flagged_rate": 0,
        "footprint_risk_severe_count": 0,
        "footprint_risk_severe_rate": 0,
        "footprint_risk_level_low_count": 16,
        "footprint_risk_level_medium_count": 13,
        "footprint_risk_level_high_count": 0,
        "footprint_risk_level_unknown_count": 0,
        "total_cost": 70.8667071,
        "cost_per_task": 2.4436795551724138,
        "tests_only_quality_per_dollar": 0.3951079589529849,
        "equiv_quality_per_dollar": 0.07055499267017587,
        "total_input_tokens": 226809497,
        "total_output_tokens": 1229870,
        "total_tokens": 228039367,
        "total_uncached_input_tokens": 21826713,
        "total_cache_creation_input_tokens": null,
        "total_cache_read_input_tokens": 204982784,
        "total_cached_input_tokens": 204982784,
        "cost_tasks_total": 29,
        "cost_tasks_with_tokens": 29,
        "cost_tasks_with_cache_tokens": 26,
        "cost_tasks_with_cache_aware_pricing": 26,
        "cost_tasks_with_legacy_pricing": 3,
        "cost_tasks_with_pricing": 29,
        "cost_tasks_with_cost": 29,
        "cost_tasks_missing_tokens": 0,
        "cost_tasks_missing_pricing": 0,
        "cost_tasks_missing_cost": 0,
        "cost_tasks_with_cost_rate": 1,
        "pricing_version": "local-placeholder-2026-02-19",
        "pricing_source": "local_static_table",
        "pricing_model_key": "gpt-5.1-codex-mini"
      },
      "publish_exclusions": {},
      "publish_guard": {
        "publishable": true,
        "blocked": false,
        "reasons": [],
        "tests_unknown_rate": 0,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false
      },
      "partial_metrics": {
        "failed_task_count": 1,
        "failed_task_with_partial_score": 1,
        "failed_task_partial_threshold": 0.8,
        "failed_task_partial_threshold_hits": 0,
        "failed_task_partial_ppr": 0,
        "failed_task_partial_mean_score": 0,
        "failed_task_partial_coverage": 1
      },
      "footprint_risk_metrics": {
        "used_count": 29,
        "unavailable_count": 0,
        "missing_count": 0,
        "scored_count": 29,
        "mean_score": 0.3022031029297725,
        "median_score": 0.3171248834535135,
        "flagged_count": 0,
        "flagged_rate": 0,
        "severe_count": 0,
        "severe_rate": 0,
        "level_low_count": 16,
        "level_medium_count": 13,
        "level_high_count": 0,
        "level_unknown_count": 0
      },
      "passRate": 0.9655172413793104,
      "ciLow": 0.896551724137931,
      "ciHigh": 1,
      "effectiveN": 29,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 0,
        "fail_high_conf": 0,
        "fail_infra": 1,
        "fail_likely_equiv": 0,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 2,
        "pass_with_warn": 26
      }
    },
    "gpt-5-3-codex": {
      "model": "gpt-5.3-codex",
      "requested_model": "gpt-5.3-codex",
      "run_id": "2026-02-28__13-38-10__gpt-5-3-codex",
      "run_dir": "/Users/ben/dev/flux/.tmp/h2h-graphql-w2/runs/2026-02-28__13-38-10__gpt-5-3-codex",
      "results_path": "/Users/ben/dev/flux/.tmp/h2h-graphql-w2/runs/2026-02-28__13-38-10__gpt-5-3-codex/results.json",
      "validation_metrics": {
        "validated": 29,
        "leaderboard_eligible": 29,
        "leaderboard_excluded": 0,
        "binary_pass_count": 27,
        "binary_pass_rate": 0.9310344827586207,
        "tests_only_pass_count": 27,
        "tests_only_pass_rate": 0.9310344827586207,
        "rescue_aware_pass_count": 27,
        "rescue_aware_pass_rate": 0.9310344827586207,
        "rescue_delta_rate": 0,
        "equiv_rate": 0.3103448275862069,
        "equiv_equivalent_count": 9,
        "equiv_non_equivalent_count": 18,
        "equiv_unknown_count": 2,
        "code_review_pass_count": 1,
        "code_review_fail_count": 22,
        "code_review_unsure_count": 6,
        "code_review_fail_rate": 0.7586206896551724,
        "behavioral_robustness_used_count": 29,
        "behavioral_robustness_skipped_count": 0,
        "behavioral_robustness_unavailable_count": 0,
        "probe_accepted_commands_count": 0,
        "probe_gold_pass_candidate_pass_count": 27,
        "probe_gold_pass_candidate_fail_count": 1,
        "probe_review_required_count": 0,
        "probe_agreement_rate": 0.9642857142857143,
        "tb_unresolved_but_tests_pass_count": 2,
        "tb_resolved_but_tests_not_pass_count": 1,
        "tests_unknown_count": 1,
        "tests_unknown_cause_counts": {
          "all_commands_ignored_gold_failure_mode_unset": 1
        },
        "tests_unknown_rate": 0.034482758620689655,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false,
        "cache_evaluated_count": 29,
        "cache_hit_count": 0,
        "cache_miss_count": 29,
        "cache_hit_rate": 0,
        "cache_miss_reason_counts": {
          "missing_pinned_dependencies": 29
        },
        "total_setup_ms_saved": null,
        "total_pinned_bytes": null,
        "footprint_risk_mean_score": 0.32032261077393087,
        "footprint_risk_median_score": 0.29588453910101414,
        "footprint_risk_scored_count": 29,
        "footprint_risk_used_count": 29,
        "footprint_risk_unavailable_count": 0,
        "footprint_risk_missing_count": 0,
        "footprint_risk_flagged_count": 1,
        "footprint_risk_flagged_rate": 0.034482758620689655,
        "footprint_risk_severe_count": 0,
        "footprint_risk_severe_rate": 0,
        "footprint_risk_level_low_count": 19,
        "footprint_risk_level_medium_count": 9,
        "footprint_risk_level_high_count": 1,
        "footprint_risk_level_unknown_count": 0,
        "total_cost": 207.58233899999996,
        "cost_per_task": 7.158011689655171,
        "tests_only_quality_per_dollar": 0.13006886872008896,
        "equiv_quality_per_dollar": 0.04335628957336299,
        "total_input_tokens": 95269285,
        "total_output_tokens": 409978,
        "total_tokens": 95679263,
        "total_uncached_input_tokens": 2968869,
        "total_cache_creation_input_tokens": null,
        "total_cache_read_input_tokens": 92300416,
        "total_cached_input_tokens": 92300416,
        "cost_tasks_total": 29,
        "cost_tasks_with_tokens": 29,
        "cost_tasks_with_cache_tokens": 29,
        "cost_tasks_with_cache_aware_pricing": 29,
        "cost_tasks_with_legacy_pricing": 0,
        "cost_tasks_with_pricing": 29,
        "cost_tasks_with_cost": 29,
        "cost_tasks_missing_tokens": 0,
        "cost_tasks_missing_pricing": 0,
        "cost_tasks_missing_cost": 0,
        "cost_tasks_with_cost_rate": 1,
        "pricing_version": "local-placeholder-2026-02-19",
        "pricing_source": "local_static_table",
        "pricing_model_key": "gpt-5.3-codex"
      },
      "publish_exclusions": {},
      "publish_guard": {
        "publishable": true,
        "blocked": false,
        "reasons": [],
        "tests_unknown_rate": 0.034482758620689655,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false
      },
      "partial_metrics": {
        "failed_task_count": 2,
        "failed_task_with_partial_score": 1,
        "failed_task_partial_threshold": 0.8,
        "failed_task_partial_threshold_hits": 0,
        "failed_task_partial_ppr": 0,
        "failed_task_partial_mean_score": 0,
        "failed_task_partial_coverage": 0.5
      },
      "footprint_risk_metrics": {
        "used_count": 29,
        "unavailable_count": 0,
        "missing_count": 0,
        "scored_count": 29,
        "mean_score": 0.32032261077393087,
        "median_score": 0.29588453910101414,
        "flagged_count": 1,
        "flagged_rate": 0.034482758620689655,
        "severe_count": 0,
        "severe_rate": 0,
        "level_low_count": 19,
        "level_medium_count": 9,
        "level_high_count": 1,
        "level_unknown_count": 0
      },
      "passRate": 0.9310344827586207,
      "ciLow": 0.8275862068965517,
      "ciHigh": 1,
      "effectiveN": 29,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 0,
        "fail_high_conf": 1,
        "fail_infra": 0,
        "fail_likely_equiv": 1,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 6,
        "pass_with_warn": 21
      }
    },
    "gpt-5-4": {
      "model": "gpt-5.4",
      "requested_model": "gpt-5.4",
      "run_id": "2026-02-28__13-38-10__gpt-5-4",
      "run_dir": "/Users/ben/dev/flux/.tmp/h2h-graphql-w2/runs/2026-02-28__13-38-10__gpt-5-4",
      "results_path": "/Users/ben/dev/flux/.tmp/h2h-graphql-w2/runs/2026-02-28__13-38-10__gpt-5-4/results.json",
      "validation_metrics": {
        "validated": 29,
        "leaderboard_eligible": 29,
        "leaderboard_excluded": 0,
        "binary_pass_count": 26,
        "binary_pass_rate": 0.896551724137931,
        "tests_only_pass_count": 26,
        "tests_only_pass_rate": 0.896551724137931,
        "rescue_aware_pass_count": 25,
        "rescue_aware_pass_rate": 0.8620689655172413,
        "rescue_delta_rate": -0.034482758620689724,
        "equiv_rate": 0.5172413793103449,
        "equiv_equivalent_count": 15,
        "equiv_non_equivalent_count": 13,
        "equiv_unknown_count": 1,
        "code_review_pass_count": 4,
        "code_review_fail_count": 18,
        "code_review_unsure_count": 7,
        "code_review_fail_rate": 0.6206896551724138,
        "behavioral_robustness_used_count": 29,
        "behavioral_robustness_skipped_count": 0,
        "behavioral_robustness_unavailable_count": 0,
        "probe_accepted_commands_count": 1,
        "probe_gold_pass_candidate_pass_count": 25,
        "probe_gold_pass_candidate_fail_count": 4,
        "probe_review_required_count": 0,
        "probe_agreement_rate": 0.8620689655172413,
        "tb_unresolved_but_tests_pass_count": 4,
        "tb_resolved_but_tests_not_pass_count": 0,
        "tests_unknown_count": 0,
        "tests_unknown_cause_counts": {},
        "tests_unknown_rate": 0,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false,
        "cache_evaluated_count": 29,
        "cache_hit_count": 0,
        "cache_miss_count": 29,
        "cache_hit_rate": 0,
        "cache_miss_reason_counts": {
          "missing_pinned_dependencies": 29
        },
        "total_setup_ms_saved": null,
        "total_pinned_bytes": null,
        "footprint_risk_mean_score": 0.3306436529135121,
        "footprint_risk_median_score": 0.3187045054012213,
        "footprint_risk_scored_count": 29,
        "footprint_risk_used_count": 29,
        "footprint_risk_unavailable_count": 0,
        "footprint_risk_missing_count": 0,
        "footprint_risk_flagged_count": 1,
        "footprint_risk_flagged_rate": 0.034482758620689655,
        "footprint_risk_severe_count": 0,
        "footprint_risk_severe_rate": 0,
        "footprint_risk_level_low_count": 15,
        "footprint_risk_level_medium_count": 13,
        "footprint_risk_level_high_count": 1,
        "footprint_risk_level_unknown_count": 0,
        "total_cost": 55.45011,
        "cost_per_task": 1.9120727586206898,
        "tests_only_quality_per_dollar": 0.46888996252667486,
        "equiv_quality_per_dollar": 0.2705134399192355,
        "total_input_tokens": 93164727,
        "total_output_tokens": 414810,
        "total_tokens": 93579537,
        "total_uncached_input_tokens": 3699511,
        "total_cache_creation_input_tokens": null,
        "total_cache_read_input_tokens": 89465216,
        "total_cached_input_tokens": 89465216,
        "cost_tasks_total": 29,
        "cost_tasks_with_tokens": 29,
        "cost_tasks_with_cache_tokens": 29,
        "cost_tasks_with_cache_aware_pricing": 29,
        "cost_tasks_with_legacy_pricing": 0,
        "cost_tasks_with_pricing": 29,
        "cost_tasks_with_cost": 29,
        "cost_tasks_missing_tokens": 0,
        "cost_tasks_missing_pricing": 0,
        "cost_tasks_missing_cost": 0,
        "cost_tasks_with_cost_rate": 1,
        "pricing_version": "local-placeholder-2026-02-19",
        "pricing_source": "local_static_table",
        "pricing_model_key": "gpt-5.4"
      },
      "publish_exclusions": {},
      "publish_guard": {
        "publishable": true,
        "blocked": false,
        "reasons": [],
        "tests_unknown_rate": 0,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false
      },
      "partial_metrics": {
        "failed_task_count": 4,
        "failed_task_with_partial_score": 4,
        "failed_task_partial_threshold": 0.8,
        "failed_task_partial_threshold_hits": 1,
        "failed_task_partial_ppr": 0.25,
        "failed_task_partial_mean_score": 0.25,
        "failed_task_partial_coverage": 1
      },
      "footprint_risk_metrics": {
        "used_count": 29,
        "unavailable_count": 0,
        "missing_count": 0,
        "scored_count": 29,
        "mean_score": 0.3306436529135121,
        "median_score": 0.3187045054012213,
        "flagged_count": 1,
        "flagged_rate": 0.034482758620689655,
        "severe_count": 0,
        "severe_rate": 0,
        "level_low_count": 15,
        "level_medium_count": 13,
        "level_high_count": 1,
        "level_unknown_count": 0
      },
      "passRate": 0.896551724137931,
      "ciLow": 0.7586206896551724,
      "ciHigh": 1,
      "effectiveN": 29,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 1,
        "fail_high_conf": 2,
        "fail_infra": 0,
        "fail_likely_equiv": 1,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 5,
        "pass_with_warn": 20
      }
    }
  },
  "tasks": {
    "flux-pr-1001": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3169476,
          "tb_total_output_tokens": 26591,
          "tb_total_tokens": 3196067,
          "tb_uncached_input_tokens": 560324,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2609152,
          "tb_cached_input_tokens": 2609152,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.3914048,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2551513182660356,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1050248,
          "tb_total_output_tokens": 7013,
          "tb_total_tokens": 1057261,
          "tb_uncached_input_tokens": 64648,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 985600,
          "tb_cached_input_tokens": 985600,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.8689,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2692425139668345,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1249871,
          "tb_total_output_tokens": 6724,
          "tb_total_tokens": 1256595,
          "tb_uncached_input_tokens": 83023,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1166848,
          "tb_cached_input_tokens": 1166848,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.803262,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3118850085432988,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1034": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "agent_timeout",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.3836326632175216,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 6923096,
          "tb_total_output_tokens": 23474,
          "tb_total_tokens": 6946570,
          "tb_uncached_input_tokens": 135384,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 6787712,
          "tb_cached_input_tokens": 6787712,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 13.620768,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.22682349504576949,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 7590363,
          "tb_total_output_tokens": 16438,
          "tb_total_tokens": 7606801,
          "tb_uncached_input_tokens": 447579,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 7142784,
          "tb_cached_input_tokens": 7142784,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 4.598053999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.26356485337836205,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1076": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 5812390,
          "tb_total_output_tokens": 41282,
          "tb_total_tokens": 5853672,
          "tb_uncached_input_tokens": 659494,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 5152896,
          "tb_cached_input_tokens": 5152896,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.0098674,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.1062551504413978,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 8125751,
          "tb_total_output_tokens": 17545,
          "tb_total_tokens": 8143296,
          "tb_uncached_input_tokens": 170295,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 7955456,
          "tb_cached_input_tokens": 7955456,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 15.540308999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.19399968967529543,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3285250,
          "tb_total_output_tokens": 15763,
          "tb_total_tokens": 3301013,
          "tb_uncached_input_tokens": 209666,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3075584,
          "tb_cached_input_tokens": 3075584,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.083228,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.29383820530535576,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1087": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3660541,
          "tb_total_output_tokens": 22177,
          "tb_total_tokens": 3682718,
          "tb_uncached_input_tokens": 360573,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3299968,
          "tb_cached_input_tokens": 3299968,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.1689166999999998,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.19900566064728448,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1655871,
          "tb_total_output_tokens": 9182,
          "tb_total_tokens": 1665053,
          "tb_uncached_input_tokens": 70975,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1584896,
          "tb_cached_input_tokens": 1584896,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.9928890000000004,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.41749001055306256,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1087351,
          "tb_total_output_tokens": 10051,
          "tb_total_tokens": 1097402,
          "tb_uncached_input_tokens": 57975,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1029376,
          "tb_cached_input_tokens": 1029376,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.7110460000000001,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.44521170303628677,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1099": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 8929497,
          "tb_total_output_tokens": 50845,
          "tb_total_tokens": 8980342,
          "tb_uncached_input_tokens": 705113,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 8224384,
          "tb_cached_input_tokens": 8224384,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.5963971000000003,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.6021949250304236,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2903162,
          "tb_total_output_tokens": 17333,
          "tb_total_tokens": 2920495,
          "tb_uncached_input_tokens": 77178,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2825984,
          "tb_cached_input_tokens": 2825984,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 6.436626,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 0.6937119052450148,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1092750,
          "tb_total_output_tokens": 14049,
          "tb_total_tokens": 1106799,
          "tb_uncached_input_tokens": 60558,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1032192,
          "tb_cached_input_tokens": 1032192,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.749604,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 0.6622988531040126,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1128": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 12379587,
          "tb_total_output_tokens": 71699,
          "tb_total_tokens": 12451286,
          "tb_uncached_input_tokens": 776131,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 11603456,
          "tb_cached_input_tokens": 11603456,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.3349089000000003,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.401487223152394,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2699155,
          "tb_total_output_tokens": 10985,
          "tb_total_tokens": 2710140,
          "tb_uncached_input_tokens": 94099,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2605056,
          "tb_cached_input_tokens": 2605056,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 5.978168999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4039983014622203,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1805288,
          "tb_total_output_tokens": 7586,
          "tb_total_tokens": 1812874,
          "tb_uncached_input_tokens": 101096,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1704192,
          "tb_cached_input_tokens": 1704192,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.114976,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3187045054012213,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1155": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 8146242,
          "tb_total_output_tokens": 63128,
          "tb_total_tokens": 8209370,
          "tb_uncached_input_tokens": 578882,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 7567360,
          "tb_cached_input_tokens": 7567360,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.382195,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.16595248593469225,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2640248,
          "tb_total_output_tokens": 13086,
          "tb_total_tokens": 2653334,
          "tb_uncached_input_tokens": 85240,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2555008,
          "tb_cached_input_tokens": 2555008,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 5.896272000000001,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.1923514373528737,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3875776,
          "tb_total_output_tokens": 18250,
          "tb_total_tokens": 3894026,
          "tb_uncached_input_tokens": 173120,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3702656,
          "tb_cached_input_tokens": 3702656,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.343568,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.1967001873528737,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1169": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 10582144,
          "tb_total_output_tokens": 40641,
          "tb_total_tokens": 10622785,
          "tb_uncached_input_tokens": 823296,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 9758848,
          "tb_cached_input_tokens": 9758848,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.9426172,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4013426929746283,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2907171,
          "tb_total_output_tokens": 11460,
          "tb_total_tokens": 2918631,
          "tb_uncached_input_tokens": 80803,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2826368,
          "tb_cached_input_tokens": 2826368,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 6.139196999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.29953545357263966,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3470052,
          "tb_total_output_tokens": 11966,
          "tb_total_tokens": 3482018,
          "tb_uncached_input_tokens": 188644,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3281408,
          "tb_cached_input_tokens": 3281408,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.11372,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.43204305751870115,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1184": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3807458,
          "tb_total_output_tokens": 53920,
          "tb_total_tokens": 3861378,
          "tb_uncached_input_tokens": 390626,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3416832,
          "tb_cached_input_tokens": 3416832,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.4219838,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.3762493019070734,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1267725,
          "tb_total_output_tokens": 12325,
          "tb_total_tokens": 1280050,
          "tb_uncached_input_tokens": 57485,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1210240,
          "tb_cached_input_tokens": 1210240,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.417135,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.41279805759516364,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 480777,
          "tb_total_output_tokens": 11030,
          "tb_total_tokens": 491807,
          "tb_uncached_input_tokens": 51977,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 428800,
          "tb_cached_input_tokens": 428800,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.406594,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.40211387838901175,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1209": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4595521,
          "tb_total_output_tokens": 37182,
          "tb_total_tokens": 4632703,
          "tb_uncached_input_tokens": 860609,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3734912,
          "tb_cached_input_tokens": 3734912,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.0742423,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.17937162166678647,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3677707,
          "tb_total_output_tokens": 14735,
          "tb_total_tokens": 3692442,
          "tb_uncached_input_tokens": 102795,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3574912,
          "tb_cached_input_tokens": 3574912,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 7.788393,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4779725640137824,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3477467,
          "tb_total_output_tokens": 10694,
          "tb_total_tokens": 3488161,
          "tb_uncached_input_tokens": 93659,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3383808,
          "tb_cached_input_tokens": 3383808,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.9647740000000002,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4265959597383993,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1230": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 8890508,
          "tb_total_output_tokens": 46273,
          "tb_total_tokens": 8936781,
          "tb_uncached_input_tokens": 712332,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 8178176,
          "tb_cached_input_tokens": 8178176,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.5728624,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.1606808917486659,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2474990,
          "tb_total_output_tokens": 8409,
          "tb_total_tokens": 2483399,
          "tb_uncached_input_tokens": 108398,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2366592,
          "tb_cached_input_tokens": 2366592,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 5.680398,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.1584758917486659,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3246043,
          "tb_total_output_tokens": 19217,
          "tb_total_tokens": 3265260,
          "tb_uncached_input_tokens": 84955,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3161088,
          "tb_cached_input_tokens": 3161088,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.90419,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.33348545366466276,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1232": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1099167,
          "tb_total_output_tokens": 18569,
          "tb_total_tokens": 1117736,
          "tb_uncached_input_tokens": 187807,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 911360,
          "tb_cached_input_tokens": 911360,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.5298284999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.29331329089134045,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 450721,
          "tb_total_output_tokens": 6112,
          "tb_total_tokens": 456833,
          "tb_uncached_input_tokens": 40353,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 410368,
          "tb_cached_input_tokens": 410368,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.587567,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.29588453910101414,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 558818,
          "tb_total_output_tokens": 8119,
          "tb_total_tokens": 566937,
          "tb_uncached_input_tokens": 46306,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 512512,
          "tb_cached_input_tokens": 512512,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.41381999999999997,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.3744081835817403,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1240": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 5464910,
          "tb_total_output_tokens": 42796,
          "tb_total_tokens": 5507706,
          "tb_uncached_input_tokens": 655694,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4809216,
          "tb_cached_input_tokens": 4809216,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.9616994,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.3515059319612382,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 656537,
          "tb_total_output_tokens": 5162,
          "tb_total_tokens": 661699,
          "tb_uncached_input_tokens": 39193,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 617344,
          "tb_cached_input_tokens": 617344,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.823631,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.1602187678380773,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 615766,
          "tb_total_output_tokens": 6575,
          "tb_total_tokens": 622341,
          "tb_uncached_input_tokens": 46806,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 568960,
          "tb_cached_input_tokens": 568960,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.43069199999999996,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2532067576643908,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1241": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 6210284,
          "tb_total_output_tokens": 51277,
          "tb_total_tokens": 6261561,
          "tb_uncached_input_tokens": 788460,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 5421824,
          "tb_cached_input_tokens": 5421824,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.3036256,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2984741074241833,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1544316,
          "tb_total_output_tokens": 13775,
          "tb_total_tokens": 1558091,
          "tb_uncached_input_tokens": 68732,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1475584,
          "tb_cached_input_tokens": 1475584,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 4.070856,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.6211208909896463,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2236398,
          "tb_total_output_tokens": 13676,
          "tb_total_tokens": 2250074,
          "tb_uncached_input_tokens": 72686,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2163712,
          "tb_cached_input_tokens": 2163712,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.336636,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.40854626203235583,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1260": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 15699575,
          "tb_total_output_tokens": 49399,
          "tb_total_tokens": 15748974,
          "tb_uncached_input_tokens": 1373559,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 14326016,
          "tb_cached_input_tokens": 14326016,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 4.5056348999999996,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.42521487101909444,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1535458,
          "tb_total_output_tokens": 7764,
          "tb_total_tokens": 1543222,
          "tb_uncached_input_tokens": 87266,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1448192,
          "tb_cached_input_tokens": 1448192,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.9471179999999997,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.12920079680973096,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2708056,
          "tb_total_output_tokens": 8442,
          "tb_total_tokens": 2716498,
          "tb_uncached_input_tokens": 70360,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2637696,
          "tb_cached_input_tokens": 2637696,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.527104,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3073128731098082,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1262": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4597442,
          "tb_total_output_tokens": 40246,
          "tb_total_tokens": 4637688,
          "tb_uncached_input_tokens": 646978,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3950464,
          "tb_cached_input_tokens": 3950464,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.8045126,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.13448825336816045,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "fail_likely_equiv",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "all_commands_ignored_gold_failure_mode_unset",
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 7820072,
          "tb_total_output_tokens": 33624,
          "tb_total_tokens": 7853696,
          "tb_uncached_input_tokens": 255400,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 7564672,
          "tb_cached_input_tokens": 7564672,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 17.195448,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.25433447663269804,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 5445377,
          "tb_total_output_tokens": 29177,
          "tb_total_tokens": 5474554,
          "tb_uncached_input_tokens": 109313,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 5336064,
          "tb_cached_input_tokens": 5336064,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.1200740000000002,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.23320626519550683,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1268": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 16446072,
          "tb_total_output_tokens": 56578,
          "tb_total_tokens": 16502650,
          "tb_uncached_input_tokens": 1281784,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 15164288,
          "tb_cached_input_tokens": 15164288,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 4.5367872,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.38161288344439065,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4975992,
          "tb_total_output_tokens": 13441,
          "tb_total_tokens": 4989433,
          "tb_uncached_input_tokens": 106104,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4869888,
          "tb_cached_input_tokens": 4869888,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 9.702852,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.32586530156543875,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 5950812,
          "tb_total_output_tokens": 14876,
          "tb_total_tokens": 5965688,
          "tb_uncached_input_tokens": 302684,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 5648128,
          "tb_cached_input_tokens": 5648128,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.54844,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4195650895870833,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1293": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_infra",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "infra_failure",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "agent_timeout",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0,
          "partial_score_numerator": 0,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3201119213800818,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4303725,
          "tb_total_output_tokens": 18867,
          "tb_total_tokens": 4322592,
          "tb_uncached_input_tokens": 101485,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4202240,
          "tb_cached_input_tokens": 4202240,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 8.957655,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0,
          "partial_score_numerator": 0,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.30310009516125536,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "fail_likely_equiv",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "likely_equivalent_despite_test_fail",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3921159,
          "tb_total_output_tokens": 20484,
          "tb_total_tokens": 3941643,
          "tb_uncached_input_tokens": 155271,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3765888,
          "tb_cached_input_tokens": 3765888,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.357358,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": true,
          "rescue_eligible": false,
          "rescue_decision": "rejected_not_high_confidence_stylistic",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0,
          "partial_score_numerator": 0,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.3694447094908994,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1297": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 13392580,
          "tb_total_output_tokens": 47979,
          "tb_total_tokens": 13440559,
          "tb_uncached_input_tokens": 1389892,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 12002688,
          "tb_cached_input_tokens": 12002688,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 4.1731152,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.11589991203175924,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": null,
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4546351,
          "tb_total_output_tokens": 16854,
          "tb_total_tokens": 4563205,
          "tb_uncached_input_tokens": 149935,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4396416,
          "tb_cached_input_tokens": 4396416,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 9.854889000000002,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2385217788602799,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 5982366,
          "tb_total_output_tokens": 24247,
          "tb_total_tokens": 6006613,
          "tb_uncached_input_tokens": 160030,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 5822336,
          "tb_cached_input_tokens": 5822336,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.425204,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.26238358101880926,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1308": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 10881552,
          "tb_total_output_tokens": 62689,
          "tb_total_tokens": 10944241,
          "tb_uncached_input_tokens": 937360,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 9944192,
          "tb_cached_input_tokens": 9944192,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.2738028,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4272754775114472,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 5994908,
          "tb_total_output_tokens": 18102,
          "tb_total_tokens": 6013010,
          "tb_uncached_input_tokens": 123292,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 5871616,
          "tb_cached_input_tokens": 5871616,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 11.742924,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.6463775438172967,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 8073032,
          "tb_total_output_tokens": 23551,
          "tb_total_tokens": 8096583,
          "tb_uncached_input_tokens": 141896,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 7931136,
          "tb_cached_input_tokens": 7931136,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 4.437768,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0,
          "partial_score_numerator": 0,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.33159231072556666,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1309": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 10739431,
          "tb_total_output_tokens": 64067,
          "tb_total_tokens": 10803498,
          "tb_uncached_input_tokens": 1316199,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 9423232,
          "tb_cached_input_tokens": 9423232,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.7721853,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.45680361670345676,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266",
          "equivalence_status": "used",
          "equivalence_outcome": null,
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3452599,
          "tb_total_output_tokens": 17591,
          "tb_total_tokens": 3470190,
          "tb_uncached_input_tokens": 95927,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3356672,
          "tb_cached_input_tokens": 3356672,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 7.529373,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.5065884355335363,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266",
          "equivalence_status": "used",
          "equivalence_outcome": null,
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 5582094,
          "tb_total_output_tokens": 20296,
          "tb_total_tokens": 5602390,
          "tb_uncached_input_tokens": 130702,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 5451392,
          "tb_cached_input_tokens": 5451392,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.149468,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.5218545949309796,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1338": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 5157856,
          "tb_total_output_tokens": 23781,
          "tb_total_tokens": 5181637,
          "tb_uncached_input_tokens": 658400,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4499456,
          "tb_cached_input_tokens": 4499456,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.8052044,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.554566763640818,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1743361,
          "tb_total_output_tokens": 11923,
          "tb_total_tokens": 1755284,
          "tb_uncached_input_tokens": 95105,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1648256,
          "tb_cached_input_tokens": 1648256,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 4.614338999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.11107493716175802,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2336310,
          "tb_total_output_tokens": 9994,
          "tb_total_tokens": 2346304,
          "tb_uncached_input_tokens": 84150,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2252160,
          "tb_cached_input_tokens": 2252160,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.3743319999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0,
          "partial_score_numerator": 0,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.11938894184581253,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1351": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 5407381,
          "tb_total_output_tokens": 40745,
          "tb_total_tokens": 5448126,
          "tb_uncached_input_tokens": 491541,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4915840,
          "tb_cached_input_tokens": 4915840,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.7191575,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4119675384553558,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1427587,
          "tb_total_output_tokens": 8619,
          "tb_total_tokens": 1436206,
          "tb_uncached_input_tokens": 62595,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1364992,
          "tb_cached_input_tokens": 1364992,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.5035529999999997,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.45114231648401637,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1638145,
          "tb_total_output_tokens": 8594,
          "tb_total_tokens": 1646739,
          "tb_uncached_input_tokens": 65153,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1572992,
          "tb_cached_input_tokens": 1572992,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.985554,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.5295491958472022,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1380": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 6698650,
          "tb_total_output_tokens": 47325,
          "tb_total_tokens": 6745975,
          "tb_uncached_input_tokens": 724890,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 5973760,
          "tb_cached_input_tokens": 5973760,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.267349,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.09908243987526603,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3616202,
          "tb_total_output_tokens": 14030,
          "tb_total_tokens": 3630232,
          "tb_uncached_input_tokens": 104906,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3511296,
          "tb_cached_input_tokens": 3511296,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 7.682334,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.08800265044139781,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2325458,
          "tb_total_output_tokens": 15195,
          "tb_total_tokens": 2340653,
          "tb_uncached_input_tokens": 74834,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2250624,
          "tb_cached_input_tokens": 2250624,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.39654,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.13439327041694063,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-817": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "agent_timeout",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.150064235084352,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2665058,
          "tb_total_output_tokens": 16672,
          "tb_total_tokens": 2681730,
          "tb_uncached_input_tokens": 66530,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2598528,
          "tb_cached_input_tokens": 2598528,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 5.896062000000001,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.20423378169225104,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": 1,
          "probe_accepted_commands": 1,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2251876,
          "tb_total_output_tokens": 18533,
          "tb_total_tokens": 2270409,
          "tb_uncached_input_tokens": 66788,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2185088,
          "tb_cached_input_tokens": 2185088,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.3743839999999998,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.26220693219867675,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-828": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 14120627,
          "tb_total_output_tokens": 68877,
          "tb_total_tokens": 14189504,
          "tb_uncached_input_tokens": 1481395,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 12639232,
          "tb_cached_input_tokens": 12639232,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 4.531239299999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.09971022772754222,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4222768,
          "tb_total_output_tokens": 18062,
          "tb_total_tokens": 4240830,
          "tb_uncached_input_tokens": 183088,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4039680,
          "tb_cached_input_tokens": 4039680,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 9.88956,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.14080882787202723,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 5157251,
          "tb_total_output_tokens": 15845,
          "tb_total_tokens": 5173096,
          "tb_uncached_input_tokens": 289155,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4868096,
          "tb_cached_input_tokens": 4868096,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.1391180000000003,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 0,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.11499086711582324,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-859": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 10365685,
          "tb_total_output_tokens": 61516,
          "tb_total_tokens": 10427201,
          "tb_uncached_input_tokens": 849781,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 9515904,
          "tb_cached_input_tokens": 9515904,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.0711531,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4548638896386543,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1607836,
          "tb_total_output_tokens": 12877,
          "tb_total_tokens": 1620713,
          "tb_uncached_input_tokens": 71836,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1536000,
          "tb_cached_input_tokens": 1536000,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 4.15416,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.5306650282982427,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1256714,
          "tb_total_output_tokens": 7877,
          "tb_total_tokens": 1264591,
          "tb_uncached_input_tokens": 87946,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1168768,
          "tb_cached_input_tokens": 1168768,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.8232919999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.29033936587366094,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-870": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 10290242,
          "tb_total_output_tokens": 45480,
          "tb_total_tokens": 10335722,
          "tb_uncached_input_tokens": 927682,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 9362560,
          "tb_cached_input_tokens": 9362560,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.068787,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.24048580636584216,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4803363,
          "tb_total_output_tokens": 13843,
          "tb_total_tokens": 4817206,
          "tb_uncached_input_tokens": 142371,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4660992,
          "tb_cached_input_tokens": 4660992,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 9.957633,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.20868466301552432,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1911085,
          "tb_total_output_tokens": 11048,
          "tb_total_tokens": 1922133,
          "tb_uncached_input_tokens": 89261,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1821824,
          "tb_cached_input_tokens": 1821824,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.177818,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.22247256993607625,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-891": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 20264679,
          "tb_total_output_tokens": 54808,
          "tb_total_tokens": 20319487,
          "tb_uncached_input_tokens": 1687911,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 18576768,
          "tb_cached_input_tokens": 18576768,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 5.6472297,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3171248834535135,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candida