STET

summary

reports/summary.json

263553 bytes

Back to adjudication

Preview truncated to keep static evidence pages bounded.

{
  "generated_at": "2026-03-06T23:48:13Z",
  "dataset": "/Users/ben/dev/flux/.tmp/validation-zod-cleaned50-r7-p1-20260208-170124",
  "output_root": "/Users/ben/dev/flux/.tmp/h2h-zod-w2",
  "statistics": {
    "pass_definition": {
      "denominator_statuses": [
        "fail_guardrail",
        "fail_high_conf",
        "fail_infra",
        "fail_likely_equiv",
        "fail_no_patch",
        "fail_with_diag",
        "pass",
        "pass_with_warn"
      ],
      "positive_statuses": [
        "pass",
        "pass_with_warn"
      ]
    },
    "bootstrap": {
      "base_seed": 1337,
      "confidence_level": 0.95,
      "method": "nonparametric_task_bootstrap",
      "resamples": 5000
    },
    "tiering": {
      "rule": "A is strictly superior to B iff passRate(A) \u003e ciHigh(B)",
      "strategy": "conservative_non_superiority_grouping"
    }
  },
  "models": [
    {
      "name": "gpt-5.1-codex-mini",
      "key": "gpt-5-1-codex-mini",
      "run_id": "2026-02-27__21-30-28__gpt-5-1-codex-mini"
    },
    {
      "name": "gpt-5.3-codex",
      "key": "gpt-5-3-codex",
      "run_id": "2026-02-27__21-30-28__gpt-5-3-codex"
    },
    {
      "name": "gpt-5.4",
      "key": "gpt-5-4",
      "run_id": "2026-02-27__21-30-28__gpt-5-4"
    }
  ],
  "runs": {
    "gpt-5-1-codex-mini": {
      "model": "gpt-5.1-codex-mini",
      "requested_model": "gpt-5.1-codex-mini",
      "run_id": "2026-02-27__21-30-28__gpt-5-1-codex-mini",
      "run_dir": "/Users/ben/dev/flux/.tmp/h2h-zod-w2/runs/2026-02-27__21-30-28__gpt-5-1-codex-mini",
      "results_path": "/Users/ben/dev/flux/.tmp/h2h-zod-w2/runs/2026-02-27__21-30-28__gpt-5-1-codex-mini/results.json",
      "validation_metrics": {
        "validated": 28,
        "leaderboard_eligible": 13,
        "leaderboard_excluded": 15,
        "binary_pass_count": 8,
        "binary_pass_rate": 0.6153846153846154,
        "tests_only_pass_count": 8,
        "tests_only_pass_rate": 0.6153846153846154,
        "rescue_aware_pass_count": 8,
        "rescue_aware_pass_rate": 0.6153846153846154,
        "rescue_delta_rate": 0,
        "equiv_rate": 0.15384615384615385,
        "equiv_equivalent_count": 2,
        "equiv_non_equivalent_count": 11,
        "equiv_unknown_count": 0,
        "code_review_pass_count": 1,
        "code_review_fail_count": 12,
        "code_review_unsure_count": 0,
        "code_review_fail_rate": 0.9230769230769231,
        "behavioral_robustness_used_count": 13,
        "behavioral_robustness_skipped_count": 0,
        "behavioral_robustness_unavailable_count": 0,
        "probe_accepted_commands_count": 0,
        "probe_gold_pass_candidate_pass_count": 27,
        "probe_gold_pass_candidate_fail_count": 16,
        "probe_review_required_count": 0,
        "probe_agreement_rate": 0.627906976744186,
        "tb_unresolved_but_tests_pass_count": 6,
        "tb_resolved_but_tests_not_pass_count": 1,
        "tests_unknown_count": 0,
        "tests_unknown_cause_counts": {},
        "tests_unknown_rate": 0,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false,
        "cache_evaluated_count": 28,
        "cache_hit_count": 0,
        "cache_miss_count": 28,
        "cache_hit_rate": 0,
        "cache_miss_reason_counts": {
          "missing_pinned_dependencies": 28
        },
        "total_setup_ms_saved": null,
        "total_pinned_bytes": null,
        "footprint_risk_mean_score": 0.6341535743815215,
        "footprint_risk_median_score": 0.7852904228436781,
        "footprint_risk_scored_count": 28,
        "footprint_risk_used_count": 28,
        "footprint_risk_unavailable_count": 0,
        "footprint_risk_missing_count": 0,
        "footprint_risk_flagged_count": 14,
        "footprint_risk_flagged_rate": 0.5,
        "footprint_risk_severe_count": 14,
        "footprint_risk_severe_rate": 0.5,
        "footprint_risk_level_low_count": 11,
        "footprint_risk_level_medium_count": 3,
        "footprint_risk_level_high_count": 14,
        "footprint_risk_level_unknown_count": 0,
        "total_cost": 37.10659440000001,
        "cost_per_task": 1.3252355142857148,
        "tests_only_quality_per_dollar": 0.464358680967209,
        "equiv_quality_per_dollar": 0.11608967024180225,
        "total_input_tokens": 99402452,
        "total_output_tokens": 1043733,
        "total_tokens": 100446185,
        "total_uncached_input_tokens": 11802836,
        "total_cache_creation_input_tokens": null,
        "total_cache_read_input_tokens": 87599616,
        "total_cached_input_tokens": 87599616,
        "cost_tasks_total": 28,
        "cost_tasks_with_tokens": 28,
        "cost_tasks_with_cache_tokens": 28,
        "cost_tasks_with_cache_aware_pricing": 28,
        "cost_tasks_with_legacy_pricing": 0,
        "cost_tasks_with_pricing": 28,
        "cost_tasks_with_cost": 28,
        "cost_tasks_missing_tokens": 0,
        "cost_tasks_missing_pricing": 0,
        "cost_tasks_missing_cost": 0,
        "cost_tasks_with_cost_rate": 1,
        "pricing_version": "local-placeholder-2026-02-19",
        "pricing_source": "local_static_table",
        "pricing_model_key": "gpt-5.1-codex-mini"
      },
      "publish_exclusions": {
        "base_tests_pass_without_patch": 15
      },
      "publish_guard": {
        "publishable": true,
        "blocked": false,
        "reasons": [],
        "tests_unknown_rate": 0,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false
      },
      "partial_metrics": {
        "failed_task_count": 7,
        "failed_task_with_partial_score": 7,
        "failed_task_partial_threshold": 0.8,
        "failed_task_partial_threshold_hits": 1,
        "failed_task_partial_ppr": 0.14285714285714285,
        "failed_task_partial_mean_score": 0.30357142857142855,
        "failed_task_partial_coverage": 1
      },
      "footprint_risk_metrics": {
        "used_count": 28,
        "unavailable_count": 0,
        "missing_count": 0,
        "scored_count": 28,
        "mean_score": 0.6341535743815215,
        "median_score": 0.7852904228436781,
        "flagged_count": 14,
        "flagged_rate": 0.5,
        "severe_count": 14,
        "severe_rate": 0.5,
        "level_low_count": 11,
        "level_medium_count": 3,
        "level_high_count": 14,
        "level_unknown_count": 0
      },
      "passRate": 0.6153846153846154,
      "ciLow": 0.3076923076923077,
      "ciHigh": 0.8461538461538461,
      "effectiveN": 13,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 0,
        "fail_high_conf": 7,
        "fail_infra": 0,
        "fail_likely_equiv": 0,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 2,
        "pass_with_warn": 19
      }
    },
    "gpt-5-3-codex": {
      "model": "gpt-5.3-codex",
      "requested_model": "gpt-5.3-codex",
      "run_id": "2026-02-27__21-30-28__gpt-5-3-codex",
      "run_dir": "/Users/ben/dev/flux/.tmp/h2h-zod-w2/runs/2026-02-27__21-30-28__gpt-5-3-codex",
      "results_path": "/Users/ben/dev/flux/.tmp/h2h-zod-w2/runs/2026-02-27__21-30-28__gpt-5-3-codex/results.json",
      "validation_metrics": {
        "validated": 28,
        "leaderboard_eligible": 13,
        "leaderboard_excluded": 15,
        "binary_pass_count": 9,
        "binary_pass_rate": 0.6923076923076923,
        "tests_only_pass_count": 9,
        "tests_only_pass_rate": 0.6923076923076923,
        "rescue_aware_pass_count": 9,
        "rescue_aware_pass_rate": 0.6923076923076923,
        "rescue_delta_rate": 0,
        "equiv_rate": 0.38461538461538464,
        "equiv_equivalent_count": 5,
        "equiv_non_equivalent_count": 7,
        "equiv_unknown_count": 1,
        "code_review_pass_count": 1,
        "code_review_fail_count": 10,
        "code_review_unsure_count": 2,
        "code_review_fail_rate": 0.7692307692307693,
        "behavioral_robustness_used_count": 13,
        "behavioral_robustness_skipped_count": 0,
        "behavioral_robustness_unavailable_count": 0,
        "probe_accepted_commands_count": 2,
        "probe_gold_pass_candidate_pass_count": 27,
        "probe_gold_pass_candidate_fail_count": 8,
        "probe_review_required_count": 0,
        "probe_agreement_rate": 0.7714285714285715,
        "tb_unresolved_but_tests_pass_count": 6,
        "tb_resolved_but_tests_not_pass_count": 2,
        "tests_unknown_count": 1,
        "tests_unknown_cause_counts": {
          "no_gold_pass_commands": 1
        },
        "tests_unknown_rate": 0.03571428571428571,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false,
        "cache_evaluated_count": 28,
        "cache_hit_count": 0,
        "cache_miss_count": 28,
        "cache_hit_rate": 0,
        "cache_miss_reason_counts": {
          "missing_pinned_dependencies": 28
        },
        "total_setup_ms_saved": null,
        "total_pinned_bytes": null,
        "footprint_risk_mean_score": 0.5267283617210462,
        "footprint_risk_median_score": 0.4187119400088103,
        "footprint_risk_scored_count": 28,
        "footprint_risk_used_count": 28,
        "footprint_risk_unavailable_count": 0,
        "footprint_risk_missing_count": 0,
        "footprint_risk_flagged_count": 9,
        "footprint_risk_flagged_rate": 0.32142857142857145,
        "footprint_risk_severe_count": 9,
        "footprint_risk_severe_rate": 0.32142857142857145,
        "footprint_risk_level_low_count": 12,
        "footprint_risk_level_medium_count": 7,
        "footprint_risk_level_high_count": 9,
        "footprint_risk_level_unknown_count": 0,
        "total_cost": 85.750158,
        "cost_per_task": 3.062505642857143,
        "tests_only_quality_per_dollar": 0.22605923810210804,
        "equiv_quality_per_dollar": 0.12558846561228226,
        "total_input_tokens": 30705182,
        "total_output_tokens": 210957,
        "total_tokens": 30916139,
        "total_uncached_input_tokens": 2002590,
        "total_cache_creation_input_tokens": null,
        "total_cache_read_input_tokens": 28702592,
        "total_cached_input_tokens": 28702592,
        "cost_tasks_total": 28,
        "cost_tasks_with_tokens": 28,
        "cost_tasks_with_cache_tokens": 28,
        "cost_tasks_with_cache_aware_pricing": 28,
        "cost_tasks_with_legacy_pricing": 0,
        "cost_tasks_with_pricing": 28,
        "cost_tasks_with_cost": 28,
        "cost_tasks_missing_tokens": 0,
        "cost_tasks_missing_pricing": 0,
        "cost_tasks_missing_cost": 0,
        "cost_tasks_with_cost_rate": 1,
        "pricing_version": "local-placeholder-2026-02-19",
        "pricing_source": "local_static_table",
        "pricing_model_key": "gpt-5.3-codex"
      },
      "publish_exclusions": {
        "base_tests_pass_without_patch": 15
      },
      "publish_guard": {
        "publishable": true,
        "blocked": false,
        "reasons": [],
        "tests_unknown_rate": 0.03571428571428571,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false
      },
      "partial_metrics": {
        "failed_task_count": 7,
        "failed_task_with_partial_score": 6,
        "failed_task_partial_threshold": 0.8,
        "failed_task_partial_threshold_hits": 0,
        "failed_task_partial_ppr": 0,
        "failed_task_partial_mean_score": 0.22777777777777777,
        "failed_task_partial_coverage": 0.8571428571428571
      },
      "footprint_risk_metrics": {
        "used_count": 28,
        "unavailable_count": 0,
        "missing_count": 0,
        "scored_count": 28,
        "mean_score": 0.5267283617210462,
        "median_score": 0.4187119400088103,
        "flagged_count": 9,
        "flagged_rate": 0.32142857142857145,
        "severe_count": 9,
        "severe_rate": 0.32142857142857145,
        "level_low_count": 12,
        "level_medium_count": 7,
        "level_high_count": 9,
        "level_unknown_count": 0
      },
      "passRate": 0.6923076923076923,
      "ciLow": 0.46153846153846156,
      "ciHigh": 0.9230769230769231,
      "effectiveN": 13,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 0,
        "fail_high_conf": 6,
        "fail_infra": 0,
        "fail_likely_equiv": 1,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 6,
        "pass_with_warn": 15
      }
    },
    "gpt-5-4": {
      "model": "gpt-5.4",
      "requested_model": "gpt-5.4",
      "run_id": "2026-02-27__21-30-28__gpt-5-4",
      "run_dir": "/Users/ben/dev/flux/.tmp/h2h-zod-w2/runs/2026-02-27__21-30-28__gpt-5-4",
      "results_path": "/Users/ben/dev/flux/.tmp/h2h-zod-w2/runs/2026-02-27__21-30-28__gpt-5-4/results.json",
      "validation_metrics": {
        "validated": 28,
        "leaderboard_eligible": 13,
        "leaderboard_excluded": 15,
        "binary_pass_count": 9,
        "binary_pass_rate": 0.6923076923076923,
        "tests_only_pass_count": 9,
        "tests_only_pass_rate": 0.6923076923076923,
        "rescue_aware_pass_count": 9,
        "rescue_aware_pass_rate": 0.6923076923076923,
        "rescue_delta_rate": 0,
        "equiv_rate": 0.3076923076923077,
        "equiv_equivalent_count": 4,
        "equiv_non_equivalent_count": 9,
        "equiv_unknown_count": 0,
        "code_review_pass_count": 3,
        "code_review_fail_count": 9,
        "code_review_unsure_count": 1,
        "code_review_fail_rate": 0.6923076923076923,
        "behavioral_robustness_used_count": 13,
        "behavioral_robustness_skipped_count": 0,
        "behavioral_robustness_unavailable_count": 0,
        "probe_accepted_commands_count": 1,
        "probe_gold_pass_candidate_pass_count": 26,
        "probe_gold_pass_candidate_fail_count": 13,
        "probe_review_required_count": 0,
        "probe_agreement_rate": 0.6666666666666666,
        "tb_unresolved_but_tests_pass_count": 9,
        "tb_resolved_but_tests_not_pass_count": 4,
        "tests_unknown_count": 1,
        "tests_unknown_cause_counts": {
          "no_commands_selected": 1
        },
        "tests_unknown_rate": 0.03571428571428571,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false,
        "cache_evaluated_count": 27,
        "cache_hit_count": 0,
        "cache_miss_count": 27,
        "cache_hit_rate": 0,
        "cache_miss_reason_counts": {
          "missing_pinned_dependencies": 27
        },
        "total_setup_ms_saved": null,
        "total_pinned_bytes": null,
        "footprint_risk_mean_score": 0.5314101950462605,
        "footprint_risk_median_score": 0.4287227140723908,
        "footprint_risk_scored_count": 28,
        "footprint_risk_used_count": 28,
        "footprint_risk_unavailable_count": 0,
        "footprint_risk_missing_count": 0,
        "footprint_risk_flagged_count": 8,
        "footprint_risk_flagged_rate": 0.2857142857142857,
        "footprint_risk_severe_count": 8,
        "footprint_risk_severe_rate": 0.2857142857142857,
        "footprint_risk_level_low_count": 10,
        "footprint_risk_level_medium_count": 10,
        "footprint_risk_level_high_count": 8,
        "footprint_risk_level_unknown_count": 0,
        "total_cost": 18.658483999999998,
        "cost_per_task": 0.6663744285714285,
        "tests_only_quality_per_dollar": 1.038916955129655,
        "equiv_quality_per_dollar": 0.4617408689465134,
        "total_input_tokens": 26956746,
        "total_output_tokens": 240436,
        "total_tokens": 27197182,
        "total_uncached_input_tokens": 2171082,
        "total_cache_creation_input_tokens": null,
        "total_cache_read_input_tokens": 24785664,
        "total_cached_input_tokens": 24785664,
        "cost_tasks_total": 28,
        "cost_tasks_with_tokens": 28,
        "cost_tasks_with_cache_tokens": 28,
        "cost_tasks_with_cache_aware_pricing": 28,
        "cost_tasks_with_legacy_pricing": 0,
        "cost_tasks_with_pricing": 28,
        "cost_tasks_with_cost": 28,
        "cost_tasks_missing_tokens": 0,
        "cost_tasks_missing_pricing": 0,
        "cost_tasks_missing_cost": 0,
        "cost_tasks_with_cost_rate": 1,
        "pricing_version": "local-placeholder-2026-02-19",
        "pricing_source": "local_static_table",
        "pricing_model_key": "gpt-5.4"
      },
      "publish_exclusions": {
        "base_tests_pass_without_patch": 15
      },
      "publish_guard": {
        "publishable": true,
        "blocked": false,
        "reasons": [],
        "tests_unknown_rate": 0.03571428571428571,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false
      },
      "partial_metrics": {
        "failed_task_count": 6,
        "failed_task_with_partial_score": 5,
        "failed_task_partial_threshold": 0.8,
        "failed_task_partial_threshold_hits": 0,
        "failed_task_partial_ppr": 0,
        "failed_task_partial_mean_score": 0.23888888888888887,
        "failed_task_partial_coverage": 0.8333333333333334
      },
      "footprint_risk_metrics": {
        "used_count": 28,
        "unavailable_count": 0,
        "missing_count": 0,
        "scored_count": 28,
        "mean_score": 0.5314101950462605,
        "median_score": 0.4287227140723908,
        "flagged_count": 8,
        "flagged_rate": 0.2857142857142857,
        "severe_count": 8,
        "severe_rate": 0.2857142857142857,
        "level_low_count": 10,
        "level_medium_count": 10,
        "level_high_count": 8,
        "level_unknown_count": 0
      },
      "passRate": 0.6923076923076923,
      "ciLow": 0.46153846153846156,
      "ciHigh": 0.9230769230769231,
      "effectiveN": 13,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 0,
        "fail_high_conf": 5,
        "fail_infra": 0,
        "fail_likely_equiv": 1,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 7,
        "pass_with_warn": 15
      }
    }
  },
  "tasks": {
    "flux-commit-0064304a": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0.125,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": 7,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2139457,
          "tb_total_output_tokens": 28432,
          "tb_total_tokens": 2167889,
          "tb_uncached_input_tokens": 240961,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1898496,
          "tb_cached_input_tokens": 1898496,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.8168079,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0.125,
          "partial_score_numerator": 1,
          "partial_score_denominator": 8,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-3-codex": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0.16666666666666666,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": 5,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 594267,
          "tb_total_output_tokens": 5298,
          "tb_total_tokens": 599565,
          "tb_uncached_input_tokens": 30555,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 563712,
          "tb_cached_input_tokens": 563712,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.621773,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0.16666666666666666,
          "partial_score_numerator": 1,
          "partial_score_denominator": 6,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 1,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-4": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0.1111111111111111,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": 8,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 514054,
          "tb_total_output_tokens": 6706,
          "tb_total_tokens": 520760,
          "tb_uncached_input_tokens": 23686,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 490368,
          "tb_cached_input_tokens": 490368,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.34620400000000007,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0.1111111111111111,
          "partial_score_numerator": 1,
          "partial_score_denominator": 9,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 0.9649999208143414,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        }
      }
    },
    "flux-commit-64a54b07": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 3,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3654436,
          "tb_total_output_tokens": 82597,
          "tb_total_tokens": 3737033,
          "tb_uncached_input_tokens": 417700,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3236736,
          "tb_cached_input_tokens": 3236736,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.6076424,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0,
          "partial_score_numerator": 0,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-3-codex": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": 1,
          "probe_accepted_commands": 1,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1951665,
          "tb_total_output_tokens": 16032,
          "tb_total_tokens": 1967697,
          "tb_uncached_input_tokens": 144689,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1806976,
          "tb_cached_input_tokens": 1806976,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 5.842719,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0,
          "partial_score_numerator": 0,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-4": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 3,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1777761,
          "tb_total_output_tokens": 15203,
          "tb_total_tokens": 1792964,
          "tb_uncached_input_tokens": 110433,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1667328,
          "tb_cached_input_tokens": 1667328,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.176154,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0,
          "partial_score_numerator": 0,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        }
      }
    },
    "flux-commit-7af773c0": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0.2,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": 4,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4055598,
          "tb_total_output_tokens": 60744,
          "tb_total_tokens": 4116342,
          "tb_uncached_input_tokens": 578478,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3477120,
          "tb_cached_input_tokens": 3477120,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.753749,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0.2,
          "partial_score_numerator": 1,
          "partial_score_denominator": 5,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.22269389797715866,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": 1,
          "probe_accepted_commands": 1,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1545065,
          "tb_total_output_tokens": 14502,
          "tb_total_tokens": 1559567,
          "tb_uncached_input_tokens": 116073,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1428992,
          "tb_cached_input_tokens": 1428992,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 4.754703,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0.2,
          "partial_score_numerator": 1,
          "partial_score_denominator": 5,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2260592172086254,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_commands_selected",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": 1,
          "probe_accepted_commands": 1,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 913212,
          "tb_total_output_tokens": 12529,
          "tb_total_tokens": 925741,
          "tb_uncached_input_tokens": 81724,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 831488,
          "tb_cached_input_tokens": 831488,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.679424,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_commands_selected",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.44633678895315987,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-commit-a8580f2b": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 2,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2933113,
          "tb_total_output_tokens": 42307,
          "tb_total_tokens": 2975420,
          "tb_uncached_input_tokens": 589049,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2344064,
          "tb_cached_input_tokens": 2344064,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.4890250999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 2,
          "partial_score_denominator": 2,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 2,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1014052,
          "tb_total_output_tokens": 8490,
          "tb_total_tokens": 1022542,
          "tb_uncached_input_tokens": 43556,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 970496,
          "tb_cached_input_tokens": 970496,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.618484,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 2,
          "partial_score_denominator": 2,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 2,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1099691,
          "tb_total_output_tokens": 12478,
          "tb_total_tokens": 1112169,
          "tb_uncached_input_tokens": 66475,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1033216,
          "tb_cached_input_tokens": 1033216,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.749382,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 2,
          "partial_score_denominator": 2,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        }
      }
    },
    "flux-commit-fc48a85d": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0.5,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3950333,
          "tb_total_output_tokens": 67957,
          "tb_total_tokens": 4018290,
          "tb_uncached_input_tokens": 385021,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3565312,
          "tb_cached_input_tokens": 3565312,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.5200703,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0.5,
          "partial_score_numerator": 1,
          "partial_score_denominator": 2,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-3-codex": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
          "equivalence_status": "used",
          "equivalence_outcome": null,
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0.5,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 539049,
          "tb_total_output_tokens": 10884,
          "tb_total_tokens": 549933,
          "tb_uncached_input_tokens": 56873,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 482176,
          "tb_cached_input_tokens": 482176,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.229399,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0.5,
          "partial_score_numerator": 1,
          "partial_score_denominator": 2,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 0.9650000000000001,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-4": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 491906,
          "tb_total_output_tokens": 12873,
          "tb_total_tokens": 504779,
          "tb_uncached_input_tokens": 72066,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 419840,
          "tb_cached_input_tokens": 419840,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.45703600000000005,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.515889155149982,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-3535": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 718503,
          "tb_total_output_tokens": 18500,
          "tb_total_tokens": 737003,
          "tb_uncached_input_tokens": 111143,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 607360,
          "tb_cached_input_tokens": 607360,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.3688185,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2159136192945093,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 333181,
          "tb_total_output_tokens": 3304,
          "tb_total_tokens": 336485,
          "tb_uncached_input_tokens": 26877,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 306304,
          "tb_cached_input_tokens": 306304,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.060851,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4132175498715512,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 287682,
          "tb_total_output_tokens": 3702,
          "tb_total_tokens": 291384,
          "tb_uncached_input_tokens": 56130,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 231552,
          "tb_cached_input_tokens": 231552,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.257652,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.44272926669063817,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-3712": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4809887,
          "tb_total_output_tokens": 26473,
          "tb_total_tokens": 4836360,
          "tb_uncached_input_tokens": 296351,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4513536,
          "tb_cached_input_tokens": 4513536,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.2803949,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.5705808456873562,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1434505,
          "tb_total_output_tokens": 6601,
          "tb_total_tokens": 1441106,
          "tb_uncached_input_tokens": 70537,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1363968,
          "tb_cached_input_tokens": 1363968,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.500067,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-4": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 907911,
          "tb_total_output_tokens": 6614,
          "tb_total_tokens": 914525,
          "tb_uncached_input_tokens": 84615,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 823296,
          "tb_cached_input_tokens": 823296,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.63379,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.3342444802632546,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-3820": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 6443720,
          "tb_total_output_tokens": 33139,
          "tb_total_tokens": 6476859,
          "tb_uncached_input_tokens": 831688,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 5612032,
          "tb_cached_input_tokens": 5612032,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.2881708,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.41873918495808904,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1496321,
          "tb_total_output_tokens": 10847,
          "tb_total_tokens": 1507168,
          "tb_uncached_input_tokens": 53633,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1442688,
          "tb_cached_input_tokens": 1442688,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.619347,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.560356454767305,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 2,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 541388,
          "tb_total_output_tokens": 9365,
          "tb_total_tokens": 550753,
          "tb_uncached_input_tokens": 67532,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 473856,
          "tb_cached_input_tokens": 473856,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.446912,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 2,
          "partial_score_denominator": 2,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.5765361256453612,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-3850": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_5a9cfc6bb1d00ff525dc46efc4c7f360",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3273165,
          "tb_total_output_tokens": 36796,
          "tb_total_tokens": 3309961,
          "tb_uncached_input_tokens": 531917,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2741248,
          "tb_cached_input_tokens": 2741248,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.4298387000000001,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.32751625129432205,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_5a9cfc6bb1d00ff525dc46efc4c7f360",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 582151,
          "tb_total_output_tokens": 5633,
          "tb_total_tokens": 587784,
          "tb_uncached_input_tokens": 34183,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 547968,
          "tb_cached_input_tokens": 547968,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.672677,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.25079506393494533,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_5a9cfc6bb1d00ff525dc46efc4c7f360",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0.75,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1630569,
          "tb_total_output_tokens": 10149,
          "tb_total_tokens": 1640718,
          "tb_uncached_input_tokens": 117993,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1512576,
          "tb_cached_input_tokens": 1512576,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.073466,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0.75,
          "partial_score_numerator": 3,
          "partial_score_denominator": 4,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        }
      }
    },
    "flux-pr-4539": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1288729,
          "tb_total_output_tokens": 18813,
          "tb_total_tokens": 1307542,
          "tb_uncached_input_tokens": 270745,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1017984,
          "tb_cached_input_tokens": 1017984,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.6716931,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 829328,
          "tb_total_output_tokens": 4629,
          "tb_total_tokens": 833957,
          "tb_uncached_input_tokens": 56336,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 772992,
          "tb_cached_input_tokens": 772992,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.282268,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4860969030943976,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1072339,
          "tb_total_output_tokens": 5273,
          "tb_total_tokens": 1077612,
          "tb_uncached_input_tokens": 48083,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1024256,
          "tb_cached_input_tokens": 1024256,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.650478,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        }
      }
    },
    "flux-pr-4567": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 5409937,
          "tb_total_output_tokens": 47087,
          "tb_total_tokens": 5457024,
          "tb_uncached_input_tokens": 770065,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4639872,
          "tb_cached_input_tokens": 4639872,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.1336003000000003,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1770241,
          "tb_total_output_tokens": 8646,
          "tb_total_tokens": 1778887,
          "tb_uncached_input_tokens": 69377,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1700864,
          "tb_cached_input_tokens": 1700864,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 4.110711,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2836271609204553,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1348406,
          "tb_total_output_tokens": 8605,
          "tb_total_tokens": 1357011,
          "tb_uncached_input_tokens": 89398,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1259008,
          "tb_cached_input_tokens": 1259008,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.87714,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2287567071894106,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-4568": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2777374,
          "tb_total_output_tokens": 18584,
          "tb_total_tokens": 2795958,
          "tb_uncached_input_tokens": 369054,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2408320,
          "tb_cached_input_tokens": 2408320,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.026333,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.07897112275089671,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2233911,
          "tb_total_output_tokens": 7453,
          "tb_total_tokens": 2241364,
          "tb_uncached_input_tokens": 185527,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2048384,
          "tb_cached_input_tokens": 2048384,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 6.3026610000000005,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1692436,
          "tb_total_output_tokens": 8301,
          "tb_total_tokens": 1700737,
          "tb_uncached_input_tokens": 108436,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1584000,
          "tb_cached_input_tokens": 1584000,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.07528,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.31319218233596996,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-4672": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2249940,
          "tb_total_output_tokens": 31846,
          "tb_total_tokens": 2281786,
          "tb_uncached_input_tokens": 283604,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1966336,
          "tb_cached_input_tokens": 1966336,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.9114324000000001,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.19766459751520393,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1470141,
          "tb_total_output_tokens": 9487,
          "tb_total_tokens": 1479628,
          "tb_uncached_input_tokens": 137149,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1332992,
          "tb_cached_input_tokens": 1332992,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 4.6259429999999995,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1453861,
          "tb_total_output_tokens": 9709,
          "tb_total_tokens": 1463570,
          "tb_uncached_input_tokens": 99237,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1354624,
          "tb_cached_input_tokens": 1354624,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.953458,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        }
      }
    },
    "flux-pr-4680": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_9eb87a001605e9dbad2d6bc4df1cdf1a",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 5408297,
          "tb_total_output_tokens": 54953,
          "tb_total_tokens": 5463250,
          "tb_uncached_input_tokens": 575017,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4833280,
          "tb_cached_input_tokens": 4833280,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.9172355,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_9eb87a001605e9dbad2d6bc4df1cdf1a",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1308578,
          "tb_total_output_tokens": 8403,
          "tb_total_tokens": 1316981,
          "tb_uncached_input_tokens": 82722,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1225856,
          "tb_cached_input_tokens": 1225856,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.583794,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.22162069657386407,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_9eb87a001605e9dbad2d6bc4df1cdf1a",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 969917,
          "tb_total_output_tokens": 13235,
          "tb_total_tokens": 983152,
          "tb_uncached_input_tokens": 67517,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 902400,
          "tb_cached_input_tokens": 902400,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.6921139999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.38476672560889974,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-4807": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_a474bb355191484fe37511ef6adcbbdb",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 2,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 9480750,
          "tb_total_output_tokens": 63023,
          "tb_total_tokens": 9543773,
          "tb_uncached_input_tokens": 811182,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 8669568,
          "tb_cached_input_tokens": 8669568,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.8953461999999996,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 2,
          "partial_score_denominator": 2,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.25436705359416195,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_a474bb355191484fe37511ef6adcbbdb",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 2,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2246169,
          "tb_total_output_tokens": 10241,
          "tb_total_tokens": 2256410,
          "tb_uncached_input_tokens": 148633,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2097536,
          "tb_cached_input_tokens": 2097536,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 5.990259,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 2,
          "partial_score_denominator": 2,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2743853862459483,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_a474bb355191484fe37511ef6adcbbdb",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 2,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1393356,
          "tb_total_output_tokens": 9574,
          "tb_total_tokens": 1402930,
          "tb_uncached_input_tokens": 130380,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1262976,
          "tb_cached_input_tokens": 1262976,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.96884,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 2,
          "partial_score_denominator": 2,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2460003936157359,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-4811": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3960049,
          "tb_total_output_tokens": 39603,
          "tb_total_tokens": 3999652,
          "tb_uncached_input_tokens": 597873,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3362176,
          "tb_cached_input_tokens": 3362176,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.6387539000000002,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-3-codex": {
          "matrix_status": "fail_likely_equiv",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 892256,
          "tb_total_output_tokens": 7149,
          "tb_total_tokens": 899405,
          "tb_uncached_input_tokens": 59232,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 833024,
          "tb_cached_input_tokens": 833024,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.566956,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.37634583057880866,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1509723,
          "tb_total_output_tokens": 9525,
          "tb_total_tokens": 1519248,
          "tb_uncached_input_tokens": 144347,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1365376,
          "tb_cached_input_tokens": 1365376,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.047582,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4795384148544676,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-4843": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2199488,
          "tb_total_output_tokens": 31627,
          "tb_total_tokens": 2231115,
          "tb_uncached_input_tokens": 221888,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1977600,
          "tb_cached_input_tokens": 1977600,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.819234,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.21139211406787362,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 556807,
          "tb_total_output_tokens": 5367,
          "tb_total_tokens": 562174,
          "tb_uncached_input_tokens": 42631,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 514176,
          "tb_cached_input_tokens": 514176,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.7327489999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-4": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 345613,
          "tb_total_output_tokens": 3713,
          "tb_total_tokens": 349326,
          "tb_uncached_input_tokens": 49293,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 296320,
          "tb_cached_input_tokens": 296320,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.27645,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.21307789923626405,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-4861": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1915148,
          "tb_total_output_tokens": 13300,
          "tb_total_tokens": 1928448,
          "tb_uncached_input_tokens": 308364,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1606784,
          "tb_cached_input_tokens": 1606784,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.7833635999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1185030,
          "tb_total_output_tokens": 6398,
          "tb_total_tokens": 1191428,
          "tb_uncached_input_tokens": 135174,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1049856,
          "tb_cached_input_tokens": 1049856,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.9862739999999994,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 524242,
          "tb_total_output_tokens": 6156,
          "tb_total_tokens": 530398,
          "tb_uncached_input_tokens": 40786,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 483456,
          "tb_cached_input_tokens": 483456,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.37254800000000005,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.22014048356200014,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-4970": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3133675,
          "tb_total_output_tokens": 33832,
          "tb_total_tokens": 3167507,
          "tb_uncached_input_tokens": 417899,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2715776,
          "tb_cached_input_tokens": 2715776,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.2372069,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.48039984552855913,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1478654,
          "tb_total_output_tokens": 6804,
          "tb_total_tokens": 1485458,
          "tb_uncached_input_tokens": 57982,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1420672,
          "tb_cached_input_tokens": 1420672,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.4089780000000003,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.42420633014606945,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 760317,
          "tb_total_output_tokens": 6074,
          "tb_total_tokens": 766391,
          "tb_uncached_input_tokens": 52349,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 707968,
          "tb_cached_input_tokens": 707968,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.507274,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.424164075572708,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-5156": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_9eb87a001605e9dbad2d6bc4df1cdf1a",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1328670,
          "tb_total_output_tokens": 28066,
          "tb_total_tokens": 1356736,
          "tb_uncached_input_tokens": 206622,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1122048,
          "tb_cached_input_tokens": 1122048,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.6466362,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_9eb87a001605e9dbad2d6bc4df1cdf1a",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 531364,
          "tb_total_output_tokens": 3451,
          "tb_total_tokens": 534815,
          "tb_uncached_input_tokens": 50340,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 481024,
          "tb_cached_input_tokens": 481024,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.683696,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.1286210544582699,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_9eb87a001605e9dbad2d6bc4df1cdf1a",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 3,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1155869,
          "tb_total_output_tokens": 8097,
          "tb_total_tokens": 1163966,
          "tb_uncached_input_tokens": 107421,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1048448,
          "tb_cached_input_tokens": 1048448,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.803842,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3289645941015719,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-5187": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 374880,
          "tb_total_output_tokens": 14109,
          "tb_total_tokens": 388989,
          "tb_uncached_input_tokens": 78688,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 296192,
          "tb_cached_input_tokens": 296192,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.2471148,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0,
          "partial_score_numerator": 0,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.1485020908096406,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 327742,
          "tb_total_output_tokens": 3318,
          "tb_total_tokens": 331060,
          "tb_uncached_input_tokens": 31678,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 296064,
          "tb_cached_input_tokens": 296064,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.1183459999999998,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0,
          "partial_score_numerator": 0,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.15172317398071444,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "fail_likely_equiv",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": 1,
          "probe_accepted_commands": 1,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "likely_equivalent_despite_test_fail",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 531621,
          "tb_total_output_tokens": 4641,
          "tb_total_tokens": 536262,
          "tb_uncached_input_tokens": 69669,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 461952,
          "tb_cached_input_tokens": 461952,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.40744199999999997,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": true,
          "rescue_eligible": false,
          "rescue_decision": "rejected_not_high_confidence_stylistic",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0,
          "partial_score_numerator": 0,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.3860531541355702,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-5222": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 8284142,
          "tb_total_output_tokens": 43947,
          "tb_total_tokens": 8328089,
          "tb_uncached_input_tokens": 708846,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 7575296,
          "tb_cached_input_tokens": 7575296,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.4632454000000004,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2950900600460944,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 985355,
          "tb_total_output_tokens": 6042,
          "tb_total_tokens": 991397,
          "tb_uncached_input_tokens": 52107,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 933248,
          "tb_cached_input_tokens": 933248,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.543997,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.1912181625774956,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 735292,
          "tb_total_output_tokens": 6544,
          "tb_total_tokens": 741836,
          "tb_uncached_input_tokens": 46908,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 688384,
          "tb_cached_input_tokens": 688384,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.49036,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.20982209080964062,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-5316": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0.8,
          "probe_gold_pass_candidate_pass_count": 4,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2683864,
          "tb_total_output_tokens": 21333,
          "tb_total_tokens": 2705197,
          "tb_uncached_input_tokens": 440280,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2243584,
          "tb_cached_input_tokens": 2243584,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.1249556,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0.8,
          "partial_score_numerator": 4,
          "partial_score_denominator": 5,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 4,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1254592,
          "tb_total_output_tokens": 5851,
          "tb_total_tokens": 1260443,
          "tb_uncached_input_tokens": 74688,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1179904,
          "tb_cached_input_tokens": 1179904,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.2412360000000002,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 4,
          "partial_score_denominator": 4,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.5125593326877056,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 4,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 686564,
          "tb_total_output_tokens": 5430,
          "tb_total_tokens": 691994,
          "tb_uncached_input_tokens": 78180,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 608384,
          "tb_cached_input_tokens": 608384,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.503992,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 4,
          "partial_score_denominator": 4,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4332813525720736,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-5409": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0.5,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 496116,
          "tb_total_output_tokens": 26761,
          "tb_total_tokens": 522877,
          "tb_uncached_input_tokens": 61172,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 434944,
          "tb_cached_input_tokens": 434944,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.3175656,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0.5,
          "partial_score_numerator": 1,
          "partial_score_denominator": 2,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.1521158408096406,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0.5,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 212364,
          "tb_total_output_tokens": 7282,
          "tb_total_tokens": 219646,
          "tb_uncached_input_tokens": 22028,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 190336,
          "tb_cached_input_tokens": 190336,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.0528440000000001,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0.5,
          "partial_score_numerator": 1,
          "partial_score_denominator": 2,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.15542334080964063,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0.3333333333333333,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": 2,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 946305,
          "tb_total_output_tokens": 12660,
          "tb_total_tokens": 958965,
          "tb_uncached_input_tokens": 105601,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 840704,
          "tb_cached_input_tokens": 840704,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.732834,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0.3333333333333333,
          "partial_score_numerator": 1,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        }
      }
    },
    "flux-pr-5519": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 4,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 322087,
          "tb_total_output_tokens": 10808,
          "tb_total_tokens": 332895,
          "tb_uncached_input_tokens": 36775,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 285312,
          "tb_cached_input_tokens": 285312,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.1628073,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 4,
          "partial_score_denominator": 4,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.1823535583490964,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 4,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 311608,
          "tb_total_output_tokens": 3967,
          "tb_total_tokens": 315575,
          "tb_uncached_input_tokens": 36024,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 275584,
          "tb_cached_input_tokens": 275584,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.1917559999999998,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 4,
          "partial_score_denominator": 4,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.17267834080964062,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 4,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 417752,
          "tb_total_output_tokens": 4444,
          "tb_total_tokens": 422196,
          "tb_uncached_input_tokens": 32984,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 384768,
          "tb_cached_input_tokens": 384768,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.29390400000000005,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 4,
          "partial_score_denominator": 4,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.1776559640270023,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-5574": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 2,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1230418,
          "tb_total_output_tokens": 26783,
          "tb_total_tokens": 1257201,
          "tb_uncached_input_tokens": 133842,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1096576,
          "tb_cached_input_tokens": 1096576,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.5259474,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 2,
          "partial_score_denominator": 2,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 2,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 745129,
          "tb_total_output_tokens": 5843,
          "tb_total_tokens": 750972,
          "tb_uncached_input_tokens": 43177,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 701952,
          "tb_cached_input_tokens": 701952,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.051163,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 2,
          "partial_score_denominator": 2,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4898923220273933,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 2,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 837514,
          "tb_total_output_tokens": 7903,
          "tb_total_tokens": 845417,
          "tb_uncached_input_tokens": 110474,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 727040,
          "tb_cached_input_tokens": 727040,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.6476919999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 2,
          "partial_score_denominator": 2,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        }
      }
    },
    "flux-pr-5575": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 2,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 8579750,
          "tb_total_output_tokens": 56197,
          "tb_total_tokens": 8635947,
          "tb_uncached_input_tokens": 1046182,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 7533568,
          "tb_cached_input_tokens": 7533568,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.0364902,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 2,
          "partial_score_denominator": 2,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 2,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1046681,
          "tb_total_output_tokens": 8259,
          "tb_total_tokens": 1054940,
          "tb_uncached_input_tokens": 59929,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 986752,
          "tb_cached_input_tokens": 986752,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.874603,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 2,
          "partial_score_denominator": 2,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2753473365976477,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 2,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 913164,
          "tb_total_output_tokens": 6201,
          "tb_total_tokens": 919365,
          "tb_uncached_input_tokens": 59020,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 854144,
          "tb_cached_input_tokens": 854144,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.59472,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 2,
          "partial_score_denominator": 2,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3059745956441987,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-5578": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 6300926,
          "tb_total_output_tokens": 66116,
          "tb_total_tokens": 6367042,
          "tb_uncached_input_tokens": 482430,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 5818496,
          "tb_cached_input_tokens": 5818496,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.9931154,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        },
        "gpt-5-3-codex": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 2,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1828976,
          "tb_total_output_tokens": 10776,
          "tb_total_tokens": 1839752,
          "tb_uncached_input_tokens": 80880,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1748096,
          "tb_cached_input_tokens": 1748096,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 4.481904,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.3-codex",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 2,
          "partial_score_denominator": 2,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.18922047089881425,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        },
        "gpt-5-4": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1498281,
          "tb_total_output_tokens": 14732,
          "tb_total_tokens": 1513013,
          "tb_uncached_input_tokens": 50345,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1447936,
          "tb_cached_input_tokens": 1447936,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.9425139999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.4",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2473610905130445,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    }
  },
  "comparison": {
    "partial_score_threshold": 0.8,
    "methodology": {
      "cache_pricing_mode_field": "cost_pricing_mode",
      "code_review_fail_rate_field": "code_review_fail_rate",
      "code_review_rate_denominator": "leaderboard_eligible",
      "code_review_role": "additive_non_gating",
      "cost_per_task_field": "cost_per_task",
      "cost_role": "additive_non_gating",
      "equiv_quality_per_dollar_field": "equiv_quality_per_dollar",
      "equiv_rate_denominator": "leaderboard_eligible",
      "equiv_rate_field": "equiv_rate",
      "equiv_rate_role": "additive_non_gating",
      "footprint_risk_denominator": "validated",
      "footprint_risk_role": "additive_non_gating",
      "footprint_risk_score_field": "footprint_risk_score",
      "leaderboard_rate_field": "tests_only_pass_rate",
      "pricing_source": "local_static_table",
      "pricing_version": "local-placeholder-2026-02-19",
      "probe_agreement_rate_field": "probe_agreement_rate",
      "probe_gold_pass_candidate_fail_field": "probe_gold_pass_candidate_fail_count",
      "probe_rate_denominator": "probe_gold_pass_candidate_known",
      "probe_review_required_field": "probe_review_required_count",
      "probe_role": "additive_non_gating",
      "publish_filter_default": "include",
      "publish_filter_field": "publish.include_in_leaderboard",
      "quality_per_dollar_denominator": "cost_per_task",
      "rescue_aware_rate_field": "rescue_aware_pass_rate",
      "tb_resolved_but_tests_not_pass_field": "tb_resolved_but_tests_not_pass_count",
      "tb_unresolved_but_tests_pass_field": "tb_unresolved_but_tests_pass_count",
      "tests_only_quality_per_dollar_field": "tests_only_quality_per_dollar",
      "tests_unknown_rate_threshold": 0.1
    }