STET

summary

reports/summary.json

96753 bytes

Back to adjudication
{
  "generated_at": "2026-03-07T17:34:18Z",
  "dataset": "/Users/ben/dev/flux/.tmp/graphql-go-tools-dataset",
  "output_root": "/Users/ben/dev/flux/.tmp/h2h-graphql-flu77-mini-backfill-20260303",
  "statistics": {
    "pass_definition": {
      "denominator_statuses": [
        "fail_guardrail",
        "fail_high_conf",
        "fail_infra",
        "fail_likely_equiv",
        "fail_no_patch",
        "fail_with_diag",
        "pass",
        "pass_with_warn"
      ],
      "positive_statuses": [
        "pass",
        "pass_with_warn"
      ]
    },
    "bootstrap": {
      "base_seed": 1337,
      "confidence_level": 0.95,
      "method": "nonparametric_task_bootstrap",
      "resamples": 5000
    },
    "tiering": {
      "rule": "A is strictly superior to B iff passRate(A) > ciHigh(B)",
      "strategy": "conservative_non_superiority_grouping"
    }
  },
  "models": [
    {
      "name": "gpt-5-1-codex-mini",
      "key": "gpt-5-1-codex-mini",
      "run_id": "2026-02-21__03-36-16__gpt-5-1-codex-mini"
    }
  ],
  "runs": {
    "gpt-5-1-codex-mini": {
      "model": "gpt-5-1-codex-mini",
      "requested_model": "gpt-5-1-codex-mini",
      "run_id": "2026-02-21__03-36-16__gpt-5-1-codex-mini",
      "run_dir": "/Users/ben/dev/flux/.tmp/h2h-graphql-flu77-mini-backfill-20260303/runs/2026-02-21__03-36-16__gpt-5-1-codex-mini",
      "results_path": "/Users/ben/dev/flux/.tmp/h2h-graphql-flu77-mini-backfill-20260303/runs/2026-02-21__03-36-16__gpt-5-1-codex-mini/results.json",
      "validation_metrics": {
        "validated": 30,
        "leaderboard_eligible": 21,
        "leaderboard_excluded": 9,
        "binary_pass_count": 12,
        "binary_pass_rate": 0.5714285714285714,
        "tests_only_pass_count": 12,
        "tests_only_pass_rate": 0.5714285714285714,
        "rescue_aware_pass_count": 11,
        "rescue_aware_pass_rate": 0.5238095238095238,
        "rescue_delta_rate": -0.04761904761904756,
        "equiv_rate": 0.38095238095238093,
        "equiv_equivalent_count": 8,
        "equiv_non_equivalent_count": 13,
        "equiv_unknown_count": 0,
        "code_review_pass_count": 1,
        "code_review_fail_count": 15,
        "code_review_unsure_count": 5,
        "code_review_fail_rate": 0.7142857142857143,
        "behavioral_robustness_used_count": 15,
        "behavioral_robustness_skipped_count": 6,
        "behavioral_robustness_unavailable_count": 0,
        "probe_accepted_commands_count": 1,
        "probe_gold_pass_candidate_pass_count": 11,
        "probe_gold_pass_candidate_fail_count": 2,
        "probe_review_required_count": 0,
        "probe_agreement_rate": 0.8461538461538461,
        "tb_unresolved_but_tests_pass_count": 4,
        "tb_resolved_but_tests_not_pass_count": 10,
        "tests_unknown_count": 15,
        "tests_unknown_cause_counts": {
          "all_commands_ignored_gold_failure_mode_unset": 2,
          "no_gold_pass_commands": 12,
          "unspecified": 1
        },
        "tests_unknown_rate": 0.5,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": true,
        "cache_evaluated_count": 17,
        "cache_hit_count": 0,
        "cache_miss_count": 17,
        "cache_hit_rate": 0,
        "cache_miss_reason_counts": {
          "missing_pinned_dependencies": 17
        },
        "total_setup_ms_saved": null,
        "total_pinned_bytes": null,
        "footprint_risk_mean_score": 0.30077457434219274,
        "footprint_risk_median_score": 0.24824581326145242,
        "footprint_risk_scored_count": 29,
        "footprint_risk_used_count": 29,
        "footprint_risk_unavailable_count": 1,
        "footprint_risk_missing_count": 0,
        "footprint_risk_flagged_count": 2,
        "footprint_risk_flagged_rate": 0.06896551724137931,
        "footprint_risk_severe_count": 0,
        "footprint_risk_severe_rate": 0,
        "footprint_risk_level_low_count": 20,
        "footprint_risk_level_medium_count": 7,
        "footprint_risk_level_high_count": 2,
        "footprint_risk_level_unknown_count": 0,
        "total_cost": 42.6939756,
        "cost_per_task": 1.42313252,
        "tests_only_quality_per_dollar": 0.40152871457717193,
        "equiv_quality_per_dollar": 0.2676858097181146,
        "total_input_tokens": 123851784,
        "total_output_tokens": 907123,
        "total_tokens": 124758907,
        "total_uncached_input_tokens": 13832200,
        "total_cache_creation_input_tokens": null,
        "total_cache_read_input_tokens": 110019584,
        "total_cached_input_tokens": 110019584,
        "cost_tasks_total": 30,
        "cost_tasks_with_tokens": 30,
        "cost_tasks_with_cache_tokens": 20,
        "cost_tasks_with_cache_aware_pricing": 20,
        "cost_tasks_with_legacy_pricing": 10,
        "cost_tasks_with_pricing": 30,
        "cost_tasks_with_cost": 30,
        "cost_tasks_missing_tokens": 0,
        "cost_tasks_missing_pricing": 0,
        "cost_tasks_missing_cost": 0,
        "cost_tasks_with_cost_rate": 1,
        "pricing_version": "local-placeholder-2026-02-19",
        "pricing_source": "local_static_table",
        "pricing_model_key": "gpt-5.1-codex-mini"
      },
      "publish_exclusions": {
        "agent_patch_empty": 1,
        "zero_token_agent_timeout": 8
      },
      "publish_guard": {
        "publishable": false,
        "blocked": true,
        "reasons": [
          "tests_unknown_rate_threshold_exceeded"
        ],
        "tests_unknown_rate": 0.5,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": true
      },
      "partial_metrics": {
        "failed_task_count": 17,
        "failed_task_with_partial_score": 2,
        "failed_task_partial_threshold": 0.8,
        "failed_task_partial_threshold_hits": 1,
        "failed_task_partial_ppr": 0.058823529411764705,
        "failed_task_partial_mean_score": 0.5,
        "failed_task_partial_coverage": 0.11764705882352941
      },
      "footprint_risk_metrics": {
        "used_count": 29,
        "unavailable_count": 1,
        "missing_count": 0,
        "scored_count": 29,
        "mean_score": 0.30077457434219274,
        "median_score": 0.24824581326145242,
        "flagged_count": 2,
        "flagged_rate": 0.06896551724137931,
        "severe_count": 0,
        "severe_rate": 0,
        "level_low_count": 20,
        "level_medium_count": 7,
        "level_high_count": 2,
        "level_unknown_count": 0
      },
      "passRate": 0.5714285714285714,
      "ciLow": 0.38095238095238093,
      "ciHigh": 0.8095238095238095,
      "effectiveN": 21,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 13,
        "fail_high_conf": 1,
        "fail_infra": 2,
        "fail_likely_equiv": 0,
        "fail_no_patch": 1,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 2,
        "pass_with_warn": 11
      }
    }
  },
  "tasks": {
    "flux-pr-1001": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3314010,
          "tb_total_output_tokens": 34866,
          "tb_total_tokens": 3348876,
          "tb_uncached_input_tokens": 482650,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2831360,
          "tb_cached_input_tokens": 2831360,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.357875,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.23758751030834876,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1034": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "agent_timeout",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "zero_token_agent_timeout"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3000124593227613,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1076": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_no_patch",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "unspecified",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "skipped",
          "equivalence_outcome": null,
          "code_review_status": "skipped",
          "code_review_signal": null,
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2764702,
          "tb_total_output_tokens": 22484,
          "tb_total_tokens": 2787186,
          "tb_uncached_input_tokens": 518174,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2246528,
          "tb_cached_input_tokens": 2246528,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.2491442,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "agent_patch_empty"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": null,
          "partial_score_provenance": null,
          "partial_score_reason": null,
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "unavailable",
          "footprint_risk_reason": "agent_patch_missing",
          "footprint_risk_level": null,
          "footprint_risk_score": 0,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1087": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3178147,
          "tb_total_output_tokens": 47961,
          "tb_total_tokens": 3226108,
          "tb_uncached_input_tokens": 441123,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2737024,
          "tb_cached_input_tokens": 2737024,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.3600041,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2470346887171848,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1099": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 8929391,
          "tb_total_output_tokens": 62391,
          "tb_total_tokens": 8991782,
          "tb_uncached_input_tokens": 1215983,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 7713408,
          "tb_cached_input_tokens": 7713408,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.3553317000000003,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 0.6855711480107036,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1128": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 8149667,
          "tb_total_output_tokens": 58928,
          "tb_total_tokens": 8208595,
          "tb_uncached_input_tokens": 1206947,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 6942720,
          "tb_cached_input_tokens": 6942720,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.2053965,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2920290653629463,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1155": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "agent_timeout",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "zero_token_agent_timeout"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.1519263438073954,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1169": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 9175810,
          "tb_total_output_tokens": 31914,
          "tb_total_tokens": 9207724,
          "tb_uncached_input_tokens": 817282,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 8358528,
          "tb_cached_input_tokens": 8358528,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.6711861999999997,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.1694095619334414,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1184": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_high_conf",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "all_commands_ignored_gold_failure_mode_unset",
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3401794,
          "tb_total_output_tokens": 68463,
          "tb_total_tokens": 3470257,
          "tb_uncached_input_tokens": 370626,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3031168,
          "tb_cached_input_tokens": 3031168,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.4213922,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.15927334080964062,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1209": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 8484342,
          "tb_total_output_tokens": 60989,
          "tb_total_tokens": 8545331,
          "tb_uncached_input_tokens": 1571190,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 6913152,
          "tb_cached_input_tokens": 6913152,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.7596918000000006,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.29822963811309394,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1230": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3521042,
          "tb_total_output_tokens": 29833,
          "tb_total_tokens": 3550875,
          "tb_uncached_input_tokens": 317970,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3203072,
          "tb_cached_input_tokens": 3203072,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.1364138,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.1591496417486659,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1232": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": 1,
          "probe_accepted_commands": 1,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2034200,
          "tb_total_output_tokens": 47496,
          "tb_total_tokens": 2081696,
          "tb_uncached_input_tokens": 282776,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1751424,
          "tb_cached_input_tokens": 1751424,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.9718536000000001,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.37296211772540183,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1240": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4416035,
          "tb_total_output_tokens": 44042,
          "tb_total_tokens": 4460077,
          "tb_uncached_input_tokens": 622243,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3793792,
          "tb_cached_input_tokens": 3793792,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.7666852999999998,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.44800204579281366,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1241": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4050275,
          "tb_total_output_tokens": 49054,
          "tb_total_tokens": 4099329,
          "tb_uncached_input_tokens": 345827,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3704448,
          "tb_cached_input_tokens": 3704448,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.3687317,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.3673636082122127,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1260": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 5927854,
          "tb_total_output_tokens": 43930,
          "tb_total_tokens": 5971784,
          "tb_uncached_input_tokens": 882734,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 5045120,
          "tb_cached_input_tokens": 5045120,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.344449,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.16326361063811723,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1262": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "agent_timeout",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "zero_token_agent_timeout"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.14254262836816045,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1268": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 16056908,
          "tb_total_output_tokens": 58373,
          "tb_total_tokens": 16115281,
          "tb_uncached_input_tokens": 1143372,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 14913536,
          "tb_cached_input_tokens": 14913536,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 4.3023264,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.2964594184716357,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1293": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_infra",
          "tests_outcome": "fail",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "infra_failure",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "agent_timeout",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": 0,
          "partial_score_numerator": 0,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.24824581326145242,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1297": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "agent_timeout",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "zero_token_agent_timeout"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.05980861101766491,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1308": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "agent_timeout",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "zero_token_agent_timeout"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 0.7604586822681538,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1309": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "pass",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4042948,
          "tb_total_output_tokens": 18619,
          "tb_total_tokens": 4061567,
          "tb_uncached_input_tokens": 777028,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3265920,
          "tb_cached_input_tokens": 3265920,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.767144,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.5219061677960048,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1338": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2951109,
          "tb_total_output_tokens": 17040,
          "tb_total_tokens": 2968149,
          "tb_uncached_input_tokens": 155333,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 2795776,
          "tb_cached_input_tokens": 2795776,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.7546058999999999,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.554566763640818,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1351": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 4519980,
          "tb_total_output_tokens": 51933,
          "tb_total_tokens": 4571913,
          "tb_uncached_input_tokens": 496044,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 4023936,
          "tb_cached_input_tokens": 4023936,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.6592544,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.46109476297018426,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1359": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_infra",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "all_commands_ignored_gold_failure_mode_unset",
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "infra_failure",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "agent_timeout",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.1117593631364283,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-1380": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "agent_timeout",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "zero_token_agent_timeout"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.07863922071597225,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-817": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": 1,
          "probe_accepted_commands": 1,
          "probe_agreement_rate": 0,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": 1,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "agent_timeout",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "zero_token_agent_timeout"
          ],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.20041400727009268,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-828": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 7302669,
          "tb_total_output_tokens": 58388,
          "tb_total_tokens": 7361057,
          "tb_uncached_input_tokens": 762765,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 6539904,
          "tb_cached_input_tokens": 6539904,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.4754611000000004,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 0,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.09886623382570056,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-859": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "lane_report_lane": null,
          "lane_report_source": null,
          "lane_report_reasons": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "used",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": 1,
          "probe_gold_pass_candidate_pass_count": 1,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 7741220,
          "tb_total_output_tokens": 37263,
          "tb_total_tokens": 7778483,
          "tb_uncached_input_tokens": 636708,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 7104512,
          "tb_cached_input_tokens": 7104512,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 2.2443168,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": false,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 1,
          "partial_score_denominator": 1,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "medium",
          "footprint_risk_score": 0.6074320516516629,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-870": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 13889681,
          "tb_total_output_tokens": 63156,
          "tb_total_tokens": 13952837,
          "tb_uncached_input_tokens": 785425,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 13104256,
          "tb_cached_input_tokens": 13104256,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 3.5227119,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": true,
          "publish_exclusion_reasons": [],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.21269592200229215,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-891": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "fail_guardrail",
          "tests_outcome": "unknown",
          "tests_unknown_cause": "no_gold_pass_commands",
          "lane_report_lane": "lane_unknown",
          "lane_report_source": "lane_unknown",
          "lane_report_reasons": [
            "lane_unknown"
          ],
          "cache_hit": null,
          "cache_miss_reason": null,
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": null,
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": false,
          "tb_failure_mode": "agent_timeout",
          "tb_total_input_tokens": 0,
          "tb_total_output_tokens": 0,
          "tb_total_tokens": 0,
          "tb_uncached_input_tokens": 0,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": null,
          "tb_cached_input_tokens": null,
          "token_status": "present",
          "cache_token_status": "missing",
          "token_source": null,
          "task_cost": 0,
          "cost_status": "present",
          "cost_pricing_mode": "legacy_input_output",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "zero_token_agent_timeout"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 0,
          "rescue_aware_outcome": 0,
          "partial_score": null,
          "partial_score_numerator": 0,
          "partial_score_denominator": 0,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "no_gold_pass_commands",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.31575822902463807,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    }
  },
  "comparison": {
    "partial_score_threshold": 0.8,
    "methodology": {
      "cache_pricing_mode_field": "cost_pricing_mode",
      "code_review_fail_rate_field": "code_review_fail_rate",
      "code_review_rate_denominator": "leaderboard_eligible",
      "code_review_role": "additive_non_gating",
      "cost_per_task_field": "cost_per_task",
      "cost_role": "additive_non_gating",
      "equiv_quality_per_dollar_field": "equiv_quality_per_dollar",
      "equiv_rate_denominator": "leaderboard_eligible",
      "equiv_rate_field": "equiv_rate",
      "equiv_rate_role": "additive_non_gating",
      "footprint_risk_denominator": "validated",
      "footprint_risk_role": "additive_non_gating",
      "footprint_risk_score_field": "footprint_risk_score",
      "leaderboard_rate_field": "tests_only_pass_rate",
      "pricing_source": "local_static_table",
      "pricing_version": "local-placeholder-2026-02-19",
      "probe_agreement_rate_field": "probe_agreement_rate",
      "probe_gold_pass_candidate_fail_field": "probe_gold_pass_candidate_fail_count",
      "probe_rate_denominator": "probe_gold_pass_candidate_known",
      "probe_review_required_field": "probe_review_required_count",
      "probe_role": "additive_non_gating",
      "publish_filter_default": "include",
      "publish_filter_field": "publish.include_in_leaderboard",
      "quality_per_dollar_denominator": "cost_per_task",
      "rescue_aware_rate_field": "rescue_aware_pass_rate",
      "tb_resolved_but_tests_not_pass_field": "tb_resolved_but_tests_not_pass_count",
      "tb_unresolved_but_tests_pass_field": "tb_unresolved_but_tests_pass_count",
      "tests_only_quality_per_dollar_field": "tests_only_quality_per_dollar",
      "tests_unknown_rate_threshold": 0.1
    },
    "ranking": [
      {
        "rank": 1,
        "model": "gpt-5-1-codex-mini",
        "model_key": "gpt-5-1-codex-mini",
        "binary_pass_rate": 0.5714285714285714,
        "binary_pass_count": 12,
        "validated": 30,
        "failed_task_partial_ppr": 0.058823529411764705,
        "failed_task_partial_mean_score": 0.5,
        "failed_task_partial_coverage": 0.11764705882352941,
        "tie_break_basis": null
      }
    ],
    "publish_guard": {
      "all_runs_publishable": false,
      "blocked_runs": [
        "gpt-5-1-codex-mini"
      ],
      "tests_unknown_rate_threshold": 0.1
    }
  }
}