STET

summary

reports/summary.json

92934 bytes

Back to adjudication
{
  "generated_at": "2026-02-21T17:36:34Z",
  "dataset": "/Users/ben/dev/flux/.tmp/sqlparser-rs-dataset",
  "output_root": "/Users/ben/dev/flux/.tmp/h2h-sqlparser-flu77-mini",
  "statistics": {
    "pass_definition": {
      "denominator_statuses": [
        "fail_guardrail",
        "fail_high_conf",
        "fail_infra",
        "fail_likely_equiv",
        "fail_no_patch",
        "fail_with_diag",
        "pass",
        "pass_with_warn"
      ],
      "positive_statuses": [
        "pass",
        "pass_with_warn"
      ]
    },
    "bootstrap": {
      "base_seed": 1337,
      "confidence_level": 0.95,
      "method": "nonparametric_task_bootstrap",
      "resamples": 5000
    },
    "tiering": {
      "rule": "A is strictly superior to B iff passRate(A) > ciHigh(B)",
      "strategy": "conservative_non_superiority_grouping"
    }
  },
  "models": [
    {
      "name": "gpt-5.1-codex-mini",
      "key": "gpt-5-1-codex-mini",
      "run_id": "2026-02-21__03-36-16__gpt-5-1-codex-mini"
    }
  ],
  "runs": {
    "gpt-5-1-codex-mini": {
      "model": "gpt-5.1-codex-mini",
      "requested_model": "gpt-5.1-codex-mini",
      "run_id": "2026-02-21__03-36-16__gpt-5-1-codex-mini",
      "run_dir": "/Users/ben/dev/flux/.tmp/h2h-sqlparser-flu77-mini/runs/2026-02-21__03-36-16__gpt-5-1-codex-mini",
      "results_path": "/Users/ben/dev/flux/.tmp/h2h-sqlparser-flu77-mini/runs/2026-02-21__03-36-16__gpt-5-1-codex-mini/results.json",
      "validation_metrics": {
        "validated": 30,
        "leaderboard_eligible": 30,
        "leaderboard_excluded": 0,
        "binary_pass_count": 30,
        "binary_pass_rate": 1,
        "tests_only_pass_count": 30,
        "tests_only_pass_rate": 1,
        "rescue_aware_pass_count": 29,
        "rescue_aware_pass_rate": 0.9666666666666667,
        "rescue_delta_rate": -0.033333333333333326,
        "equiv_rate": 0.3,
        "equiv_equivalent_count": 9,
        "equiv_non_equivalent_count": 16,
        "equiv_unknown_count": 5,
        "code_review_pass_count": 2,
        "code_review_fail_count": 22,
        "code_review_unsure_count": 6,
        "code_review_fail_rate": 0.7333333333333333,
        "behavioral_robustness_used_count": 30,
        "behavioral_robustness_skipped_count": 0,
        "behavioral_robustness_unavailable_count": 0,
        "probe_accepted_commands_count": 8,
        "probe_gold_pass_candidate_pass_count": 22,
        "probe_gold_pass_candidate_fail_count": 8,
        "probe_review_required_count": 0,
        "probe_agreement_rate": 0.7333333333333333,
        "tests_unknown_count": 0,
        "tests_unknown_cause_counts": {},
        "tests_unknown_rate": 0,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false,
        "cache_evaluated_count": 30,
        "cache_hit_count": 0,
        "cache_miss_count": 30,
        "cache_hit_rate": 0,
        "cache_miss_reason_counts": {
          "missing_pinned_dependencies": 30
        },
        "total_setup_ms_saved": null,
        "total_pinned_bytes": null,
        "footprint_risk_mean_score": 0.29715347760563326,
        "footprint_risk_median_score": 0.2743117781907549,
        "footprint_risk_scored_count": 30,
        "footprint_risk_used_count": 30,
        "footprint_risk_unavailable_count": 0,
        "footprint_risk_missing_count": 0,
        "footprint_risk_flagged_count": 0,
        "footprint_risk_flagged_rate": 0,
        "footprint_risk_severe_count": 0,
        "footprint_risk_severe_rate": 0,
        "footprint_risk_level_low_count": 20,
        "footprint_risk_level_medium_count": 10,
        "footprint_risk_level_high_count": 0,
        "footprint_risk_level_unknown_count": 0,
        "total_cost": 27.9638031,
        "cost_per_task": 0.93212677,
        "tests_only_quality_per_dollar": 1.0728154497697775,
        "equiv_quality_per_dollar": 0.32184463493093324,
        "total_input_tokens": 76954533,
        "total_output_tokens": 792359,
        "total_tokens": 77746892,
        "total_uncached_input_tokens": 8641829,
        "total_cache_creation_input_tokens": null,
        "total_cache_read_input_tokens": 68312704,
        "total_cached_input_tokens": 68312704,
        "cost_tasks_total": 30,
        "cost_tasks_with_tokens": 30,
        "cost_tasks_with_cache_tokens": 19,
        "cost_tasks_with_cache_aware_pricing": 19,
        "cost_tasks_with_legacy_pricing": 11,
        "cost_tasks_with_pricing": 30,
        "cost_tasks_with_cost": 30,
        "cost_tasks_missing_tokens": 0,
        "cost_tasks_missing_pricing": 0,
        "cost_tasks_missing_cost": 0,
        "cost_tasks_with_cost_rate": 1,
        "pricing_version": "local-placeholder-2026-02-19",
        "pricing_source": "local_static_table",
        "pricing_model_key": "gpt-5.1-codex-mini"
      },
      "publish_exclusions": {},
      "publish_guard": {
        "publishable": true,
        "blocked": false,
        "reasons": [],
        "tests_unknown_rate": 0,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false
      },
      "partial_metrics": {
        "failed_task_count": 1,
        "failed_task_with_partial_score": 1,
        "failed_task_partial_threshold": 0.8,
        "failed_task_partial_threshold_hits": 1,
        "failed_task_partial_ppr": 1,
        "failed_task_partial_mean_score": 1,
        "failed_task_partial_coverage": 1
      },
      "footprint_risk_metrics": {
        "used_count": 30,
        "unavailable_count": 0,
        "missing_count": 0,
        "scored_count": 30,
        "mean_score": 0.29715347760563326,
        "median_score": 0.2743117781907549,
        "flagged_count": 0,
        "flagged_rate": 0,
        "severe_count": 0,
        "severe_rate": 0,
        "level_low_count": 20,
        "level_medium_count": 10,
        "level_high_count": 0,
        "level_unknown_count": 0
      },
      "passRate": 1,
      "ciLow": 1,
      "ciHigh": 1,
      "effectiveN": 30,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 1,
        "fail_high_conf": 0,
        "fail_infra": 0,
        "fail_likely_equiv": 0,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 6,
        "pass_with_warn": 23
      }
    }
  },
  "tasks": {
    "flux-pr-1414": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 5904039,
          "tb_total_output_tokens": 64085,
          "tb_total_tokens": 5968124,
          "tb_uncached_input_tokens": 619943,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 5284096,
          "tb_cached_input_tokens": 5284096,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.1070389,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.34667593196008695,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1435": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": 1,
          "probe_accepted_commands": 1,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2169285,
          "tb_total_output_tokens": 17142,
          "tb_total_tokens": 2186427,
          "tb_uncached_input_tokens": 265925,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1903360,
          "tb_cached_input_tokens": 1903360,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.7872435000000001,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 0,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.5325472606515167,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1441": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2131310,
          "tb_total_output_tokens": 32773,
          "tb_total_tokens": 2164083,
          "tb_uncached_input_tokens": 228334,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1902976,
          "tb_cached_input_tokens": 1902976,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.8245854,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4721201742874394,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1495": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2026004,
          "tb_total_output_tokens": 30910,
          "tb_total_tokens": 2056914,
          "tb_uncached_input_tokens": 276372,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1749632,
          "tb_cached_input_tokens": 1749632,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.8624628,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.19803380729650558,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1500": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3837620,
          "tb_total_output_tokens": 38965,
          "tb_total_tokens": 3876585,
          "tb_uncached_input_tokens": 560820,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3276800,
          "tb_cached_input_tokens": 3276800,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.5665399999999998,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.21446548197062992,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1501": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "agent_timeout",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "zero_token_agent_timeout"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2683777264146465,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1526": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 7973427,
          "tb_total_output_tokens": 51130,
          "tb_total_tokens": 8024557,
          "tb_uncached_input_tokens": 716339,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 7257088,
          "tb_cached_input_tokens": 7257088,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.4698516999999995,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4137873857188443,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1534": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "agent_timeout",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "zero_token_agent_timeout"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4905708883279633,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1576": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3238923,
          "tb_total_output_tokens": 59184,
          "tb_total_tokens": 3298107,
          "tb_uncached_input_tokens": 396683,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2842240,
          "tb_cached_input_tokens": 2842240,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.3764645000000002,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2122236176674215,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1604": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": 1,
          "probe_accepted_commands": 1,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2446623,
          "tb_total_output_tokens": 31073,
          "tb_total_tokens": 2477696,
          "tb_uncached_input_tokens": 382751,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2063872,
          "tb_cached_input_tokens": 2063872,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.0701453,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.23340946072972635,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1628": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": 1,
          "probe_accepted_commands": 1,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3684210,
          "tb_total_output_tokens": 38092,
          "tb_total_tokens": 3722302,
          "tb_uncached_input_tokens": 396786,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3287424,
          "tb_cached_input_tokens": 3287424,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.3168446,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.13130820755660505,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1649": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 5048829,
          "tb_total_output_tokens": 45376,
          "tb_total_tokens": 5094205,
          "tb_uncached_input_tokens": 350845,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4697984,
          "tb_cached_input_tokens": 4697984,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.5032211,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3237812707640059,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1747": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": 1,
          "probe_accepted_commands": 1,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": null,
          "tb_failure_mode": "parse_error",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.17988376786566407,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1759": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "agent_timeout",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "zero_token_agent_timeout"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.21652572320668134,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1765": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "used",
          "equivalence_outcome": null,
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 6988156,
          "tb_total_output_tokens": 74257,
          "tb_total_tokens": 7062413,
          "tb_uncached_input_tokens": 593532,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 6394624,
          "tb_cached_input_tokens": 6394624,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.2950336,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.29683321107790217,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1791": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4825030,
          "tb_total_output_tokens": 29081,
          "tb_total_tokens": 4854111,
          "tb_uncached_input_tokens": 508742,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4316288,
          "tb_cached_input_tokens": 4316288,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.5850422,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.3753562195420756,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1839": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3510971,
          "tb_total_output_tokens": 55604,
          "tb_total_tokens": 3566575,
          "tb_uncached_input_tokens": 366779,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3144192,
          "tb_cached_input_tokens": 3144192,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.3554213,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.20973643622822102,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1891": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": 1,
          "probe_accepted_commands": 1,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2174573,
          "tb_total_output_tokens": 39448,
          "tb_total_tokens": 2214021,
          "tb_uncached_input_tokens": 492141,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1682432,
          "tb_cached_input_tokens": 1682432,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.2272642999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.20205197397076974,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1900": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 8173916,
          "tb_total_output_tokens": 59525,
          "tb_total_tokens": 8233441,
          "tb_uncached_input_tokens": 1027548,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 7146368,
          "tb_cached_input_tokens": 7146368,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.9704271999999996,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.27527477164746755,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1908": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2968845,
          "tb_total_output_tokens": 26135,
          "tb_total_tokens": 2994980,
          "tb_uncached_input_tokens": 301197,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2667648,
          "tb_cached_input_tokens": 2667648,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.0087526999999998,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.316755778680191,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1918": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2425671,
          "tb_total_output_tokens": 26525,
          "tb_total_tokens": 2452196,
          "tb_uncached_input_tokens": 309319,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2116352,
          "tb_cached_input_tokens": 2116352,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.9405813000000001,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.44111636768342255,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1965": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1824375,
          "tb_total_output_tokens": 23770,
          "tb_total_tokens": 1848145,
          "tb_uncached_input_tokens": 218871,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1605504,
          "tb_cached_input_tokens": 1605504,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.7117521,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.21320184001290873,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1984": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": 1,
          "probe_accepted_commands": 1,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 5898355,
          "tb_total_output_tokens": 58314,
          "tb_total_tokens": 5956669,
          "tb_uncached_input_tokens": 950387,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4947968,
          "tb_cached_input_tokens": 4947968,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.5176597,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2618665733866072,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-2011": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3393261,
          "tb_total_output_tokens": 28939,
          "tb_total_tokens": 3422200,
          "tb_uncached_input_tokens": 362093,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3031168,
          "tb_cached_input_tokens": 3031168,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.1714487,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.28882125343531045,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-2096": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "agent_timeout",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "zero_token_agent_timeout"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.22759618140151325,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-2148": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 6497608,
          "tb_total_output_tokens": 68438,
          "tb_total_tokens": 6566046,
          "tb_uncached_input_tokens": 578632,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 5918976,
          "tb_cached_input_tokens": 5918976,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.1664224,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.19502537181816498,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-2151": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "unavailable",
          "equivalence_outcome": null,
          "code_review_status": "unavailable",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 1790838,
          "tb_total_output_tokens": 16954,
          "tb_total_tokens": 1807792,
          "tb_uncached_input_tokens": 149238,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1641600,
          "tb_cached_input_tokens": 1641600,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.571821,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.37645727258338846,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-2170": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 6351257,
          "tb_total_output_tokens": 35022,
          "tb_total_tokens": 6386279,
          "tb_uncached_input_tokens": 670233,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 5681024,
          "tb_cached_input_tokens": 5681024,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.0676350999999995,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.4051653933986369,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-2172": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "unavailable",
          "equivalence_outcome": null,
          "code_review_status": "unavailable",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 8265531,
          "tb_total_output_tokens": 71730,
          "tb_total_tokens": 8337261,
          "tb_uncached_input_tokens": 451899,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 7813632,
          "tb_cached_input_tokens": 7813632,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.2802733,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.40855897498585636,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-2185": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
          "equivalence_status": "unavailable",
          "equivalence_outcome": null,
          "code_review_status": "unavailable",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4822949,
          "tb_total_output_tokens": 66901,
          "tb_total_tokens": 4889850,
          "tb_uncached_input_tokens": 786469,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4036480,
          "tb_cached_input_tokens": 4036480,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.1865815,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.25657050595776393,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    }
  },
  "comparison": {
    "partial_score_threshold": 0.8,
    "methodology": {
      "cache_pricing_mode_field": "cost_pricing_mode",
      "code_review_fail_rate_field": "code_review_fail_rate",
      "code_review_rate_denominator": "leaderboard_eligible",
      "code_review_role": "additive_non_gating",
      "cost_per_task_field": "cost_per_task",
      "cost_role": "additive_non_gating",
      "equiv_quality_per_dollar_field": "equiv_quality_per_dollar",
      "equiv_rate_denominator": "leaderboard_eligible",
      "equiv_rate_field": "equiv_rate",
      "equiv_rate_role": "additive_non_gating",
      "footprint_risk_denominator": "validated",
      "footprint_risk_role": "additive_non_gating",
      "footprint_risk_score_field": "footprint_risk_score",
      "leaderboard_rate_field": "tests_only_pass_rate",
      "pricing_source": "local_static_table",
      "pricing_version": "local-placeholder-2026-02-19",
      "probe_agreement_rate_field": "probe_agreement_rate",
      "probe_gold_pass_candidate_fail_field": "probe_gold_pass_candidate_fail_count",
      "probe_rate_denominator": "probe_gold_pass_candidate_known",
      "probe_review_required_field": "probe_review_required_count",
      "probe_role": "additive_non_gating",
      "publish_filter_default": "include",
      "publish_filter_field": "publish.include_in_leaderboard",
      "quality_per_dollar_denominator": "cost_per_task",
      "rescue_aware_rate_field": "rescue_aware_pass_rate",
      "tests_only_quality_per_dollar_field": "tests_only_quality_per_dollar",
      "tests_unknown_rate_threshold": 0.1
    },
    "ranking": [
      {
        "rank": 1,
        "model": "gpt-5.1-codex-mini",
        "model_key": "gpt-5-1-codex-mini",
        "binary_pass_rate": 1,
        "binary_pass_count": 30,
        "validated": 30,
        "failed_task_partial_ppr": 1,
        "failed_task_partial_mean_score": 1,
        "failed_task_partial_coverage": 1,
        "tie_break_basis": null
      }
    ],
    "publish_guard": {
      "all_runs_publishable": true,
      "blocked_runs": [],
      "tests_unknown_rate_threshold": 0.1
    }
  }
}