summary
reports/summary.json
263553 bytes
Preview truncated to keep static evidence pages bounded.
{
"generated_at": "2026-03-06T23:48:13Z",
"dataset": "/Users/ben/dev/flux/.tmp/validation-zod-cleaned50-r7-p1-20260208-170124",
"output_root": "/Users/ben/dev/flux/.tmp/h2h-zod-w2",
"statistics": {
"pass_definition": {
"denominator_statuses": [
"fail_guardrail",
"fail_high_conf",
"fail_infra",
"fail_likely_equiv",
"fail_no_patch",
"fail_with_diag",
"pass",
"pass_with_warn"
],
"positive_statuses": [
"pass",
"pass_with_warn"
]
},
"bootstrap": {
"base_seed": 1337,
"confidence_level": 0.95,
"method": "nonparametric_task_bootstrap",
"resamples": 5000
},
"tiering": {
"rule": "A is strictly superior to B iff passRate(A) \u003e ciHigh(B)",
"strategy": "conservative_non_superiority_grouping"
}
},
"models": [
{
"name": "gpt-5.1-codex-mini",
"key": "gpt-5-1-codex-mini",
"run_id": "2026-02-27__21-30-28__gpt-5-1-codex-mini"
},
{
"name": "gpt-5.3-codex",
"key": "gpt-5-3-codex",
"run_id": "2026-02-27__21-30-28__gpt-5-3-codex"
},
{
"name": "gpt-5.4",
"key": "gpt-5-4",
"run_id": "2026-02-27__21-30-28__gpt-5-4"
}
],
"runs": {
"gpt-5-1-codex-mini": {
"model": "gpt-5.1-codex-mini",
"requested_model": "gpt-5.1-codex-mini",
"run_id": "2026-02-27__21-30-28__gpt-5-1-codex-mini",
"run_dir": "/Users/ben/dev/flux/.tmp/h2h-zod-w2/runs/2026-02-27__21-30-28__gpt-5-1-codex-mini",
"results_path": "/Users/ben/dev/flux/.tmp/h2h-zod-w2/runs/2026-02-27__21-30-28__gpt-5-1-codex-mini/results.json",
"validation_metrics": {
"validated": 28,
"leaderboard_eligible": 13,
"leaderboard_excluded": 15,
"binary_pass_count": 8,
"binary_pass_rate": 0.6153846153846154,
"tests_only_pass_count": 8,
"tests_only_pass_rate": 0.6153846153846154,
"rescue_aware_pass_count": 8,
"rescue_aware_pass_rate": 0.6153846153846154,
"rescue_delta_rate": 0,
"equiv_rate": 0.15384615384615385,
"equiv_equivalent_count": 2,
"equiv_non_equivalent_count": 11,
"equiv_unknown_count": 0,
"code_review_pass_count": 1,
"code_review_fail_count": 12,
"code_review_unsure_count": 0,
"code_review_fail_rate": 0.9230769230769231,
"behavioral_robustness_used_count": 13,
"behavioral_robustness_skipped_count": 0,
"behavioral_robustness_unavailable_count": 0,
"probe_accepted_commands_count": 0,
"probe_gold_pass_candidate_pass_count": 27,
"probe_gold_pass_candidate_fail_count": 16,
"probe_review_required_count": 0,
"probe_agreement_rate": 0.627906976744186,
"tb_unresolved_but_tests_pass_count": 6,
"tb_resolved_but_tests_not_pass_count": 1,
"tests_unknown_count": 0,
"tests_unknown_cause_counts": {},
"tests_unknown_rate": 0,
"tests_unknown_rate_threshold": 0.1,
"tests_unknown_threshold_breached": false,
"cache_evaluated_count": 28,
"cache_hit_count": 0,
"cache_miss_count": 28,
"cache_hit_rate": 0,
"cache_miss_reason_counts": {
"missing_pinned_dependencies": 28
},
"total_setup_ms_saved": null,
"total_pinned_bytes": null,
"footprint_risk_mean_score": 0.6341535743815215,
"footprint_risk_median_score": 0.7852904228436781,
"footprint_risk_scored_count": 28,
"footprint_risk_used_count": 28,
"footprint_risk_unavailable_count": 0,
"footprint_risk_missing_count": 0,
"footprint_risk_flagged_count": 14,
"footprint_risk_flagged_rate": 0.5,
"footprint_risk_severe_count": 14,
"footprint_risk_severe_rate": 0.5,
"footprint_risk_level_low_count": 11,
"footprint_risk_level_medium_count": 3,
"footprint_risk_level_high_count": 14,
"footprint_risk_level_unknown_count": 0,
"total_cost": 37.10659440000001,
"cost_per_task": 1.3252355142857148,
"tests_only_quality_per_dollar": 0.464358680967209,
"equiv_quality_per_dollar": 0.11608967024180225,
"total_input_tokens": 99402452,
"total_output_tokens": 1043733,
"total_tokens": 100446185,
"total_uncached_input_tokens": 11802836,
"total_cache_creation_input_tokens": null,
"total_cache_read_input_tokens": 87599616,
"total_cached_input_tokens": 87599616,
"cost_tasks_total": 28,
"cost_tasks_with_tokens": 28,
"cost_tasks_with_cache_tokens": 28,
"cost_tasks_with_cache_aware_pricing": 28,
"cost_tasks_with_legacy_pricing": 0,
"cost_tasks_with_pricing": 28,
"cost_tasks_with_cost": 28,
"cost_tasks_missing_tokens": 0,
"cost_tasks_missing_pricing": 0,
"cost_tasks_missing_cost": 0,
"cost_tasks_with_cost_rate": 1,
"pricing_version": "local-placeholder-2026-02-19",
"pricing_source": "local_static_table",
"pricing_model_key": "gpt-5.1-codex-mini"
},
"publish_exclusions": {
"base_tests_pass_without_patch": 15
},
"publish_guard": {
"publishable": true,
"blocked": false,
"reasons": [],
"tests_unknown_rate": 0,
"tests_unknown_rate_threshold": 0.1,
"tests_unknown_threshold_breached": false
},
"partial_metrics": {
"failed_task_count": 7,
"failed_task_with_partial_score": 7,
"failed_task_partial_threshold": 0.8,
"failed_task_partial_threshold_hits": 1,
"failed_task_partial_ppr": 0.14285714285714285,
"failed_task_partial_mean_score": 0.30357142857142855,
"failed_task_partial_coverage": 1
},
"footprint_risk_metrics": {
"used_count": 28,
"unavailable_count": 0,
"missing_count": 0,
"scored_count": 28,
"mean_score": 0.6341535743815215,
"median_score": 0.7852904228436781,
"flagged_count": 14,
"flagged_rate": 0.5,
"severe_count": 14,
"severe_rate": 0.5,
"level_low_count": 11,
"level_medium_count": 3,
"level_high_count": 14,
"level_unknown_count": 0
},
"passRate": 0.6153846153846154,
"ciLow": 0.3076923076923077,
"ciHigh": 0.8461538461538461,
"effectiveN": 13,
"tier": 1,
"validation_counts": {
"fail_guardrail": 0,
"fail_high_conf": 7,
"fail_infra": 0,
"fail_likely_equiv": 0,
"fail_with_diag": 0,
"missing": 0,
"pass": 2,
"pass_with_warn": 19
}
},
"gpt-5-3-codex": {
"model": "gpt-5.3-codex",
"requested_model": "gpt-5.3-codex",
"run_id": "2026-02-27__21-30-28__gpt-5-3-codex",
"run_dir": "/Users/ben/dev/flux/.tmp/h2h-zod-w2/runs/2026-02-27__21-30-28__gpt-5-3-codex",
"results_path": "/Users/ben/dev/flux/.tmp/h2h-zod-w2/runs/2026-02-27__21-30-28__gpt-5-3-codex/results.json",
"validation_metrics": {
"validated": 28,
"leaderboard_eligible": 13,
"leaderboard_excluded": 15,
"binary_pass_count": 9,
"binary_pass_rate": 0.6923076923076923,
"tests_only_pass_count": 9,
"tests_only_pass_rate": 0.6923076923076923,
"rescue_aware_pass_count": 9,
"rescue_aware_pass_rate": 0.6923076923076923,
"rescue_delta_rate": 0,
"equiv_rate": 0.38461538461538464,
"equiv_equivalent_count": 5,
"equiv_non_equivalent_count": 7,
"equiv_unknown_count": 1,
"code_review_pass_count": 1,
"code_review_fail_count": 10,
"code_review_unsure_count": 2,
"code_review_fail_rate": 0.7692307692307693,
"behavioral_robustness_used_count": 13,
"behavioral_robustness_skipped_count": 0,
"behavioral_robustness_unavailable_count": 0,
"probe_accepted_commands_count": 2,
"probe_gold_pass_candidate_pass_count": 27,
"probe_gold_pass_candidate_fail_count": 8,
"probe_review_required_count": 0,
"probe_agreement_rate": 0.7714285714285715,
"tb_unresolved_but_tests_pass_count": 6,
"tb_resolved_but_tests_not_pass_count": 2,
"tests_unknown_count": 1,
"tests_unknown_cause_counts": {
"no_gold_pass_commands": 1
},
"tests_unknown_rate": 0.03571428571428571,
"tests_unknown_rate_threshold": 0.1,
"tests_unknown_threshold_breached": false,
"cache_evaluated_count": 28,
"cache_hit_count": 0,
"cache_miss_count": 28,
"cache_hit_rate": 0,
"cache_miss_reason_counts": {
"missing_pinned_dependencies": 28
},
"total_setup_ms_saved": null,
"total_pinned_bytes": null,
"footprint_risk_mean_score": 0.5267283617210462,
"footprint_risk_median_score": 0.4187119400088103,
"footprint_risk_scored_count": 28,
"footprint_risk_used_count": 28,
"footprint_risk_unavailable_count": 0,
"footprint_risk_missing_count": 0,
"footprint_risk_flagged_count": 9,
"footprint_risk_flagged_rate": 0.32142857142857145,
"footprint_risk_severe_count": 9,
"footprint_risk_severe_rate": 0.32142857142857145,
"footprint_risk_level_low_count": 12,
"footprint_risk_level_medium_count": 7,
"footprint_risk_level_high_count": 9,
"footprint_risk_level_unknown_count": 0,
"total_cost": 85.750158,
"cost_per_task": 3.062505642857143,
"tests_only_quality_per_dollar": 0.22605923810210804,
"equiv_quality_per_dollar": 0.12558846561228226,
"total_input_tokens": 30705182,
"total_output_tokens": 210957,
"total_tokens": 30916139,
"total_uncached_input_tokens": 2002590,
"total_cache_creation_input_tokens": null,
"total_cache_read_input_tokens": 28702592,
"total_cached_input_tokens": 28702592,
"cost_tasks_total": 28,
"cost_tasks_with_tokens": 28,
"cost_tasks_with_cache_tokens": 28,
"cost_tasks_with_cache_aware_pricing": 28,
"cost_tasks_with_legacy_pricing": 0,
"cost_tasks_with_pricing": 28,
"cost_tasks_with_cost": 28,
"cost_tasks_missing_tokens": 0,
"cost_tasks_missing_pricing": 0,
"cost_tasks_missing_cost": 0,
"cost_tasks_with_cost_rate": 1,
"pricing_version": "local-placeholder-2026-02-19",
"pricing_source": "local_static_table",
"pricing_model_key": "gpt-5.3-codex"
},
"publish_exclusions": {
"base_tests_pass_without_patch": 15
},
"publish_guard": {
"publishable": true,
"blocked": false,
"reasons": [],
"tests_unknown_rate": 0.03571428571428571,
"tests_unknown_rate_threshold": 0.1,
"tests_unknown_threshold_breached": false
},
"partial_metrics": {
"failed_task_count": 7,
"failed_task_with_partial_score": 6,
"failed_task_partial_threshold": 0.8,
"failed_task_partial_threshold_hits": 0,
"failed_task_partial_ppr": 0,
"failed_task_partial_mean_score": 0.22777777777777777,
"failed_task_partial_coverage": 0.8571428571428571
},
"footprint_risk_metrics": {
"used_count": 28,
"unavailable_count": 0,
"missing_count": 0,
"scored_count": 28,
"mean_score": 0.5267283617210462,
"median_score": 0.4187119400088103,
"flagged_count": 9,
"flagged_rate": 0.32142857142857145,
"severe_count": 9,
"severe_rate": 0.32142857142857145,
"level_low_count": 12,
"level_medium_count": 7,
"level_high_count": 9,
"level_unknown_count": 0
},
"passRate": 0.6923076923076923,
"ciLow": 0.46153846153846156,
"ciHigh": 0.9230769230769231,
"effectiveN": 13,
"tier": 1,
"validation_counts": {
"fail_guardrail": 0,
"fail_high_conf": 6,
"fail_infra": 0,
"fail_likely_equiv": 1,
"fail_with_diag": 0,
"missing": 0,
"pass": 6,
"pass_with_warn": 15
}
},
"gpt-5-4": {
"model": "gpt-5.4",
"requested_model": "gpt-5.4",
"run_id": "2026-02-27__21-30-28__gpt-5-4",
"run_dir": "/Users/ben/dev/flux/.tmp/h2h-zod-w2/runs/2026-02-27__21-30-28__gpt-5-4",
"results_path": "/Users/ben/dev/flux/.tmp/h2h-zod-w2/runs/2026-02-27__21-30-28__gpt-5-4/results.json",
"validation_metrics": {
"validated": 28,
"leaderboard_eligible": 13,
"leaderboard_excluded": 15,
"binary_pass_count": 9,
"binary_pass_rate": 0.6923076923076923,
"tests_only_pass_count": 9,
"tests_only_pass_rate": 0.6923076923076923,
"rescue_aware_pass_count": 9,
"rescue_aware_pass_rate": 0.6923076923076923,
"rescue_delta_rate": 0,
"equiv_rate": 0.3076923076923077,
"equiv_equivalent_count": 4,
"equiv_non_equivalent_count": 9,
"equiv_unknown_count": 0,
"code_review_pass_count": 3,
"code_review_fail_count": 9,
"code_review_unsure_count": 1,
"code_review_fail_rate": 0.6923076923076923,
"behavioral_robustness_used_count": 13,
"behavioral_robustness_skipped_count": 0,
"behavioral_robustness_unavailable_count": 0,
"probe_accepted_commands_count": 1,
"probe_gold_pass_candidate_pass_count": 26,
"probe_gold_pass_candidate_fail_count": 13,
"probe_review_required_count": 0,
"probe_agreement_rate": 0.6666666666666666,
"tb_unresolved_but_tests_pass_count": 9,
"tb_resolved_but_tests_not_pass_count": 4,
"tests_unknown_count": 1,
"tests_unknown_cause_counts": {
"no_commands_selected": 1
},
"tests_unknown_rate": 0.03571428571428571,
"tests_unknown_rate_threshold": 0.1,
"tests_unknown_threshold_breached": false,
"cache_evaluated_count": 27,
"cache_hit_count": 0,
"cache_miss_count": 27,
"cache_hit_rate": 0,
"cache_miss_reason_counts": {
"missing_pinned_dependencies": 27
},
"total_setup_ms_saved": null,
"total_pinned_bytes": null,
"footprint_risk_mean_score": 0.5314101950462605,
"footprint_risk_median_score": 0.4287227140723908,
"footprint_risk_scored_count": 28,
"footprint_risk_used_count": 28,
"footprint_risk_unavailable_count": 0,
"footprint_risk_missing_count": 0,
"footprint_risk_flagged_count": 8,
"footprint_risk_flagged_rate": 0.2857142857142857,
"footprint_risk_severe_count": 8,
"footprint_risk_severe_rate": 0.2857142857142857,
"footprint_risk_level_low_count": 10,
"footprint_risk_level_medium_count": 10,
"footprint_risk_level_high_count": 8,
"footprint_risk_level_unknown_count": 0,
"total_cost": 18.658483999999998,
"cost_per_task": 0.6663744285714285,
"tests_only_quality_per_dollar": 1.038916955129655,
"equiv_quality_per_dollar": 0.4617408689465134,
"total_input_tokens": 26956746,
"total_output_tokens": 240436,
"total_tokens": 27197182,
"total_uncached_input_tokens": 2171082,
"total_cache_creation_input_tokens": null,
"total_cache_read_input_tokens": 24785664,
"total_cached_input_tokens": 24785664,
"cost_tasks_total": 28,
"cost_tasks_with_tokens": 28,
"cost_tasks_with_cache_tokens": 28,
"cost_tasks_with_cache_aware_pricing": 28,
"cost_tasks_with_legacy_pricing": 0,
"cost_tasks_with_pricing": 28,
"cost_tasks_with_cost": 28,
"cost_tasks_missing_tokens": 0,
"cost_tasks_missing_pricing": 0,
"cost_tasks_missing_cost": 0,
"cost_tasks_with_cost_rate": 1,
"pricing_version": "local-placeholder-2026-02-19",
"pricing_source": "local_static_table",
"pricing_model_key": "gpt-5.4"
},
"publish_exclusions": {
"base_tests_pass_without_patch": 15
},
"publish_guard": {
"publishable": true,
"blocked": false,
"reasons": [],
"tests_unknown_rate": 0.03571428571428571,
"tests_unknown_rate_threshold": 0.1,
"tests_unknown_threshold_breached": false
},
"partial_metrics": {
"failed_task_count": 6,
"failed_task_with_partial_score": 5,
"failed_task_partial_threshold": 0.8,
"failed_task_partial_threshold_hits": 0,
"failed_task_partial_ppr": 0,
"failed_task_partial_mean_score": 0.23888888888888887,
"failed_task_partial_coverage": 0.8333333333333334
},
"footprint_risk_metrics": {
"used_count": 28,
"unavailable_count": 0,
"missing_count": 0,
"scored_count": 28,
"mean_score": 0.5314101950462605,
"median_score": 0.4287227140723908,
"flagged_count": 8,
"flagged_rate": 0.2857142857142857,
"severe_count": 8,
"severe_rate": 0.2857142857142857,
"level_low_count": 10,
"level_medium_count": 10,
"level_high_count": 8,
"level_unknown_count": 0
},
"passRate": 0.6923076923076923,
"ciLow": 0.46153846153846156,
"ciHigh": 0.9230769230769231,
"effectiveN": 13,
"tier": 1,
"validation_counts": {
"fail_guardrail": 0,
"fail_high_conf": 5,
"fail_infra": 0,
"fail_likely_equiv": 1,
"fail_with_diag": 0,
"missing": 0,
"pass": 7,
"pass_with_warn": 15
}
}
},
"tasks": {
"flux-commit-0064304a": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "fail_high_conf",
"tests_outcome": "fail",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 0.125,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": 7,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 2139457,
"tb_total_output_tokens": 28432,
"tb_total_tokens": 2167889,
"tb_uncached_input_tokens": 240961,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1898496,
"tb_cached_input_tokens": 1898496,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.8168079,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": 0.125,
"partial_score_numerator": 1,
"partial_score_denominator": 8,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-3-codex": {
"matrix_status": "fail_high_conf",
"tests_outcome": "fail",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 0.16666666666666666,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": 5,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 594267,
"tb_total_output_tokens": 5298,
"tb_total_tokens": 599565,
"tb_uncached_input_tokens": 30555,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 563712,
"tb_cached_input_tokens": 563712,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.621773,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": 0.16666666666666666,
"partial_score_numerator": 1,
"partial_score_denominator": 6,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 1,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-4": {
"matrix_status": "fail_high_conf",
"tests_outcome": "fail",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 0.1111111111111111,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": 8,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 514054,
"tb_total_output_tokens": 6706,
"tb_total_tokens": 520760,
"tb_uncached_input_tokens": 23686,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 490368,
"tb_cached_input_tokens": 490368,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.34620400000000007,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": 0.1111111111111111,
"partial_score_numerator": 1,
"partial_score_denominator": 9,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 0.9649999208143414,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
}
}
},
"flux-commit-64a54b07": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "fail_high_conf",
"tests_outcome": "fail",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 0,
"probe_gold_pass_candidate_pass_count": null,
"probe_gold_pass_candidate_fail_count": 3,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 3654436,
"tb_total_output_tokens": 82597,
"tb_total_tokens": 3737033,
"tb_uncached_input_tokens": 417700,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 3236736,
"tb_cached_input_tokens": 3236736,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.6076424,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": 0,
"partial_score_numerator": 0,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-3-codex": {
"matrix_status": "fail_high_conf",
"tests_outcome": "fail",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": 1,
"probe_accepted_commands": 1,
"probe_agreement_rate": 0,
"probe_gold_pass_candidate_pass_count": null,
"probe_gold_pass_candidate_fail_count": 1,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1951665,
"tb_total_output_tokens": 16032,
"tb_total_tokens": 1967697,
"tb_uncached_input_tokens": 144689,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1806976,
"tb_cached_input_tokens": 1806976,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 5.842719,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": 0,
"partial_score_numerator": 0,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-4": {
"matrix_status": "fail_high_conf",
"tests_outcome": "fail",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 0,
"probe_gold_pass_candidate_pass_count": null,
"probe_gold_pass_candidate_fail_count": 3,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1777761,
"tb_total_output_tokens": 15203,
"tb_total_tokens": 1792964,
"tb_uncached_input_tokens": 110433,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1667328,
"tb_cached_input_tokens": 1667328,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.176154,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": 0,
"partial_score_numerator": 0,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
}
}
},
"flux-commit-7af773c0": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "fail_high_conf",
"tests_outcome": "fail",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 0.2,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": 4,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 4055598,
"tb_total_output_tokens": 60744,
"tb_total_tokens": 4116342,
"tb_uncached_input_tokens": 578478,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 3477120,
"tb_cached_input_tokens": 3477120,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.753749,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": 0.2,
"partial_score_numerator": 1,
"partial_score_denominator": 5,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.22269389797715866,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-3-codex": {
"matrix_status": "fail_high_conf",
"tests_outcome": "fail",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": 1,
"probe_accepted_commands": 1,
"probe_agreement_rate": 0,
"probe_gold_pass_candidate_pass_count": null,
"probe_gold_pass_candidate_fail_count": 1,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1545065,
"tb_total_output_tokens": 14502,
"tb_total_tokens": 1559567,
"tb_uncached_input_tokens": 116073,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1428992,
"tb_cached_input_tokens": 1428992,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 4.754703,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": 0.2,
"partial_score_numerator": 1,
"partial_score_denominator": 5,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.2260592172086254,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-4": {
"matrix_status": "fail_high_conf",
"tests_outcome": "unknown",
"tests_unknown_cause": "no_commands_selected",
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": null,
"cache_miss_reason": null,
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": null,
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": 1,
"probe_accepted_commands": 1,
"probe_agreement_rate": 0,
"probe_gold_pass_candidate_pass_count": null,
"probe_gold_pass_candidate_fail_count": 1,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 913212,
"tb_total_output_tokens": 12529,
"tb_total_tokens": 925741,
"tb_uncached_input_tokens": 81724,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 831488,
"tb_cached_input_tokens": 831488,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.679424,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": true,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": null,
"partial_score_numerator": 0,
"partial_score_denominator": 0,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "no_commands_selected",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.44633678895315987,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-commit-a8580f2b": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 2,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 2933113,
"tb_total_output_tokens": 42307,
"tb_total_tokens": 2975420,
"tb_uncached_input_tokens": 589049,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 2344064,
"tb_cached_input_tokens": 2344064,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.4890250999999999,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 2,
"partial_score_denominator": 2,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-3-codex": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 2,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1014052,
"tb_total_output_tokens": 8490,
"tb_total_tokens": 1022542,
"tb_uncached_input_tokens": 43556,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 970496,
"tb_cached_input_tokens": 970496,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 2.618484,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 2,
"partial_score_denominator": 2,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-4": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 2,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1099691,
"tb_total_output_tokens": 12478,
"tb_total_tokens": 1112169,
"tb_uncached_input_tokens": 66475,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1033216,
"tb_cached_input_tokens": 1033216,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.749382,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 2,
"partial_score_denominator": 2,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
}
}
},
"flux-commit-fc48a85d": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "fail_high_conf",
"tests_outcome": "fail",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 0.5,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": 1,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 3950333,
"tb_total_output_tokens": 67957,
"tb_total_tokens": 4018290,
"tb_uncached_input_tokens": 385021,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 3565312,
"tb_cached_input_tokens": 3565312,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.5200703,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": 0.5,
"partial_score_numerator": 1,
"partial_score_denominator": 2,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-3-codex": {
"matrix_status": "fail_high_conf",
"tests_outcome": "fail",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
"equivalence_status": "used",
"equivalence_outcome": null,
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 0.5,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": 1,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 539049,
"tb_total_output_tokens": 10884,
"tb_total_tokens": 549933,
"tb_uncached_input_tokens": 56873,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 482176,
"tb_cached_input_tokens": 482176,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 2.229399,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": 0.5,
"partial_score_numerator": 1,
"partial_score_denominator": 2,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 0.9650000000000001,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-4": {
"matrix_status": "pass",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_d902ff08370f67ea917d128a11b002fb",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "unsure",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 491906,
"tb_total_output_tokens": 12873,
"tb_total_tokens": 504779,
"tb_uncached_input_tokens": 72066,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 419840,
"tb_cached_input_tokens": 419840,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.45703600000000005,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 1,
"partial_score_denominator": 1,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.515889155149982,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-pr-3535": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 718503,
"tb_total_output_tokens": 18500,
"tb_total_tokens": 737003,
"tb_uncached_input_tokens": 111143,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 607360,
"tb_cached_input_tokens": 607360,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.3688185,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.2159136192945093,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-3-codex": {
"matrix_status": "pass",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "pass",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 333181,
"tb_total_output_tokens": 3304,
"tb_total_tokens": 336485,
"tb_uncached_input_tokens": 26877,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 306304,
"tb_cached_input_tokens": 306304,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.060851,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.4132175498715512,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-4": {
"matrix_status": "pass",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "pass",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 287682,
"tb_total_output_tokens": 3702,
"tb_total_tokens": 291384,
"tb_uncached_input_tokens": 56130,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 231552,
"tb_cached_input_tokens": 231552,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.257652,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.44272926669063817,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-pr-3712": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 4809887,
"tb_total_output_tokens": 26473,
"tb_total_tokens": 4836360,
"tb_uncached_input_tokens": 296351,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 4513536,
"tb_cached_input_tokens": 4513536,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.2803949,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.5705808456873562,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-3-codex": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1434505,
"tb_total_output_tokens": 6601,
"tb_total_tokens": 1441106,
"tb_uncached_input_tokens": 70537,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1363968,
"tb_cached_input_tokens": 1363968,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 3.500067,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-4": {
"matrix_status": "pass",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "pass",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 907911,
"tb_total_output_tokens": 6614,
"tb_total_tokens": 914525,
"tb_uncached_input_tokens": 84615,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 823296,
"tb_cached_input_tokens": 823296,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.63379,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.3342444802632546,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-pr-3820": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "pass",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 6443720,
"tb_total_output_tokens": 33139,
"tb_total_tokens": 6476859,
"tb_uncached_input_tokens": 831688,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 5612032,
"tb_cached_input_tokens": 5612032,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 2.2881708,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.41873918495808904,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-3-codex": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1496321,
"tb_total_output_tokens": 10847,
"tb_total_tokens": 1507168,
"tb_uncached_input_tokens": 53633,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1442688,
"tb_cached_input_tokens": 1442688,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 3.619347,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.560356454767305,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-4": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_29438338557a335d9f35e663ade4f97a",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 2,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 541388,
"tb_total_output_tokens": 9365,
"tb_total_tokens": 550753,
"tb_uncached_input_tokens": 67532,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 473856,
"tb_cached_input_tokens": 473856,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.446912,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 2,
"partial_score_denominator": 2,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.5765361256453612,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-pr-3850": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_5a9cfc6bb1d00ff525dc46efc4c7f360",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 3273165,
"tb_total_output_tokens": 36796,
"tb_total_tokens": 3309961,
"tb_uncached_input_tokens": 531917,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 2741248,
"tb_cached_input_tokens": 2741248,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.4298387000000001,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.32751625129432205,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-3-codex": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_5a9cfc6bb1d00ff525dc46efc4c7f360",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 582151,
"tb_total_output_tokens": 5633,
"tb_total_tokens": 587784,
"tb_uncached_input_tokens": 34183,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 547968,
"tb_cached_input_tokens": 547968,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.672677,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.25079506393494533,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-4": {
"matrix_status": "fail_high_conf",
"tests_outcome": "fail",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_5a9cfc6bb1d00ff525dc46efc4c7f360",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 0.75,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": 1,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1630569,
"tb_total_output_tokens": 10149,
"tb_total_tokens": 1640718,
"tb_uncached_input_tokens": 117993,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1512576,
"tb_cached_input_tokens": 1512576,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.073466,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": 0.75,
"partial_score_numerator": 3,
"partial_score_denominator": 4,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
}
}
},
"flux-pr-4539": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1288729,
"tb_total_output_tokens": 18813,
"tb_total_tokens": 1307542,
"tb_uncached_input_tokens": 270745,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1017984,
"tb_cached_input_tokens": 1017984,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.6716931,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-3-codex": {
"matrix_status": "pass",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "pass",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 829328,
"tb_total_output_tokens": 4629,
"tb_total_tokens": 833957,
"tb_uncached_input_tokens": 56336,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 772992,
"tb_cached_input_tokens": 772992,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 2.282268,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.4860969030943976,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-4": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1072339,
"tb_total_output_tokens": 5273,
"tb_total_tokens": 1077612,
"tb_uncached_input_tokens": 48083,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1024256,
"tb_cached_input_tokens": 1024256,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.650478,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
}
}
},
"flux-pr-4567": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 5409937,
"tb_total_output_tokens": 47087,
"tb_total_tokens": 5457024,
"tb_uncached_input_tokens": 770065,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 4639872,
"tb_cached_input_tokens": 4639872,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 2.1336003000000003,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-3-codex": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1770241,
"tb_total_output_tokens": 8646,
"tb_total_tokens": 1778887,
"tb_uncached_input_tokens": 69377,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1700864,
"tb_cached_input_tokens": 1700864,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 4.110711,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.2836271609204553,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-4": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1348406,
"tb_total_output_tokens": 8605,
"tb_total_tokens": 1357011,
"tb_uncached_input_tokens": 89398,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1259008,
"tb_cached_input_tokens": 1259008,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.87714,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.2287567071894106,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-pr-4568": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "unsure",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 2777374,
"tb_total_output_tokens": 18584,
"tb_total_tokens": 2795958,
"tb_uncached_input_tokens": 369054,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 2408320,
"tb_cached_input_tokens": 2408320,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.026333,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.07897112275089671,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-3-codex": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 2233911,
"tb_total_output_tokens": 7453,
"tb_total_tokens": 2241364,
"tb_uncached_input_tokens": 185527,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 2048384,
"tb_cached_input_tokens": 2048384,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 6.3026610000000005,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-4": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1692436,
"tb_total_output_tokens": 8301,
"tb_total_tokens": 1700737,
"tb_uncached_input_tokens": 108436,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1584000,
"tb_cached_input_tokens": 1584000,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.07528,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.31319218233596996,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-pr-4672": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 2249940,
"tb_total_output_tokens": 31846,
"tb_total_tokens": 2281786,
"tb_uncached_input_tokens": 283604,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1966336,
"tb_cached_input_tokens": 1966336,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.9114324000000001,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 1,
"partial_score_denominator": 1,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.19766459751520393,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-3-codex": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1470141,
"tb_total_output_tokens": 9487,
"tb_total_tokens": 1479628,
"tb_uncached_input_tokens": 137149,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1332992,
"tb_cached_input_tokens": 1332992,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 4.6259429999999995,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 1,
"partial_score_denominator": 1,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-4": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1453861,
"tb_total_output_tokens": 9709,
"tb_total_tokens": 1463570,
"tb_uncached_input_tokens": 99237,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1354624,
"tb_cached_input_tokens": 1354624,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.953458,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 1,
"partial_score_denominator": 1,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
}
}
},
"flux-pr-4680": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_9eb87a001605e9dbad2d6bc4df1cdf1a",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 5408297,
"tb_total_output_tokens": 54953,
"tb_total_tokens": 5463250,
"tb_uncached_input_tokens": 575017,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 4833280,
"tb_cached_input_tokens": 4833280,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.9172355,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-3-codex": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_9eb87a001605e9dbad2d6bc4df1cdf1a",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1308578,
"tb_total_output_tokens": 8403,
"tb_total_tokens": 1316981,
"tb_uncached_input_tokens": 82722,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1225856,
"tb_cached_input_tokens": 1225856,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 3.583794,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.22162069657386407,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-4": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_9eb87a001605e9dbad2d6bc4df1cdf1a",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 969917,
"tb_total_output_tokens": 13235,
"tb_total_tokens": 983152,
"tb_uncached_input_tokens": 67517,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 902400,
"tb_cached_input_tokens": 902400,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.6921139999999999,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.38476672560889974,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-pr-4807": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_a474bb355191484fe37511ef6adcbbdb",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 2,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 9480750,
"tb_total_output_tokens": 63023,
"tb_total_tokens": 9543773,
"tb_uncached_input_tokens": 811182,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 8669568,
"tb_cached_input_tokens": 8669568,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 2.8953461999999996,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 2,
"partial_score_denominator": 2,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.25436705359416195,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-3-codex": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_a474bb355191484fe37511ef6adcbbdb",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 2,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 2246169,
"tb_total_output_tokens": 10241,
"tb_total_tokens": 2256410,
"tb_uncached_input_tokens": 148633,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 2097536,
"tb_cached_input_tokens": 2097536,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 5.990259,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 2,
"partial_score_denominator": 2,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.2743853862459483,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-4": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_a474bb355191484fe37511ef6adcbbdb",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 2,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1393356,
"tb_total_output_tokens": 9574,
"tb_total_tokens": 1402930,
"tb_uncached_input_tokens": 130380,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1262976,
"tb_cached_input_tokens": 1262976,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.96884,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 2,
"partial_score_denominator": 2,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.2460003936157359,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-pr-4811": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 3960049,
"tb_total_output_tokens": 39603,
"tb_total_tokens": 3999652,
"tb_uncached_input_tokens": 597873,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 3362176,
"tb_cached_input_tokens": 3362176,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.6387539000000002,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-3-codex": {
"matrix_status": "fail_likely_equiv",
"tests_outcome": "unknown",
"tests_unknown_cause": "no_gold_pass_commands",
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "unsure",
"behavioral_robustness_status": "skipped",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": null,
"probe_gold_pass_candidate_pass_count": null,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 892256,
"tb_total_output_tokens": 7149,
"tb_total_tokens": 899405,
"tb_uncached_input_tokens": 59232,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 833024,
"tb_cached_input_tokens": 833024,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 2.566956,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": null,
"partial_score_numerator": 0,
"partial_score_denominator": 0,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "no_gold_pass_commands",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.37634583057880866,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-4": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1509723,
"tb_total_output_tokens": 9525,
"tb_total_tokens": 1519248,
"tb_uncached_input_tokens": 144347,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1365376,
"tb_cached_input_tokens": 1365376,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.047582,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.4795384148544676,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-pr-4843": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 2199488,
"tb_total_output_tokens": 31627,
"tb_total_tokens": 2231115,
"tb_uncached_input_tokens": 221888,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1977600,
"tb_cached_input_tokens": 1977600,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.819234,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.21139211406787362,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-3-codex": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 556807,
"tb_total_output_tokens": 5367,
"tb_total_tokens": 562174,
"tb_uncached_input_tokens": 42631,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 514176,
"tb_cached_input_tokens": 514176,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.7327489999999999,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-4": {
"matrix_status": "pass",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "pass",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 345613,
"tb_total_output_tokens": 3713,
"tb_total_tokens": 349326,
"tb_uncached_input_tokens": 49293,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 296320,
"tb_cached_input_tokens": 296320,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.27645,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.21307789923626405,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-pr-4861": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1915148,
"tb_total_output_tokens": 13300,
"tb_total_tokens": 1928448,
"tb_uncached_input_tokens": 308364,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1606784,
"tb_cached_input_tokens": 1606784,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.7833635999999999,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-3-codex": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1185030,
"tb_total_output_tokens": 6398,
"tb_total_tokens": 1191428,
"tb_uncached_input_tokens": 135174,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1049856,
"tb_cached_input_tokens": 1049856,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 3.9862739999999994,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-4": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "unsure",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 524242,
"tb_total_output_tokens": 6156,
"tb_total_tokens": 530398,
"tb_uncached_input_tokens": 40786,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 483456,
"tb_cached_input_tokens": 483456,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.37254800000000005,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.22014048356200014,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-pr-4970": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 3133675,
"tb_total_output_tokens": 33832,
"tb_total_tokens": 3167507,
"tb_uncached_input_tokens": 417899,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 2715776,
"tb_cached_input_tokens": 2715776,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.2372069,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.48039984552855913,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-3-codex": {
"matrix_status": "pass",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "pass",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1478654,
"tb_total_output_tokens": 6804,
"tb_total_tokens": 1485458,
"tb_uncached_input_tokens": 57982,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1420672,
"tb_cached_input_tokens": 1420672,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 3.4089780000000003,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.42420633014606945,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-4": {
"matrix_status": "pass",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "pass",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 760317,
"tb_total_output_tokens": 6074,
"tb_total_tokens": 766391,
"tb_uncached_input_tokens": 52349,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 707968,
"tb_cached_input_tokens": 707968,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.507274,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.424164075572708,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-pr-5156": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_9eb87a001605e9dbad2d6bc4df1cdf1a",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1328670,
"tb_total_output_tokens": 28066,
"tb_total_tokens": 1356736,
"tb_uncached_input_tokens": 206622,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1122048,
"tb_cached_input_tokens": 1122048,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.6466362,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-3-codex": {
"matrix_status": "pass",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_9eb87a001605e9dbad2d6bc4df1cdf1a",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "unsure",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 531364,
"tb_total_output_tokens": 3451,
"tb_total_tokens": 534815,
"tb_uncached_input_tokens": 50340,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 481024,
"tb_cached_input_tokens": 481024,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.683696,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.1286210544582699,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-4": {
"matrix_status": "pass",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_9eb87a001605e9dbad2d6bc4df1cdf1a",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "pass",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 3,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1155869,
"tb_total_output_tokens": 8097,
"tb_total_tokens": 1163966,
"tb_uncached_input_tokens": 107421,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1048448,
"tb_cached_input_tokens": 1048448,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.803842,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.3289645941015719,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-pr-5187": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "fail_high_conf",
"tests_outcome": "fail",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "unsure",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 0,
"probe_gold_pass_candidate_pass_count": null,
"probe_gold_pass_candidate_fail_count": 1,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 374880,
"tb_total_output_tokens": 14109,
"tb_total_tokens": 388989,
"tb_uncached_input_tokens": 78688,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 296192,
"tb_cached_input_tokens": 296192,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.2471148,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": 0,
"partial_score_numerator": 0,
"partial_score_denominator": 1,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.1485020908096406,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-3-codex": {
"matrix_status": "fail_high_conf",
"tests_outcome": "fail",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 0,
"probe_gold_pass_candidate_pass_count": null,
"probe_gold_pass_candidate_fail_count": 1,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 327742,
"tb_total_output_tokens": 3318,
"tb_total_tokens": 331060,
"tb_uncached_input_tokens": 31678,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 296064,
"tb_cached_input_tokens": 296064,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.1183459999999998,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": 0,
"partial_score_numerator": 0,
"partial_score_denominator": 1,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.15172317398071444,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-4": {
"matrix_status": "fail_likely_equiv",
"tests_outcome": "fail",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "unsure",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": 1,
"probe_accepted_commands": 1,
"probe_agreement_rate": 0,
"probe_gold_pass_candidate_pass_count": null,
"probe_gold_pass_candidate_fail_count": 1,
"probe_review_required_count": null,
"flags": [
"likely_equivalent_despite_test_fail",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 531621,
"tb_total_output_tokens": 4641,
"tb_total_tokens": 536262,
"tb_uncached_input_tokens": 69669,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 461952,
"tb_cached_input_tokens": 461952,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.40744199999999997,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": true,
"rescue_eligible": false,
"rescue_decision": "rejected_not_high_confidence_stylistic",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": 0,
"partial_score_numerator": 0,
"partial_score_denominator": 1,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.3860531541355702,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-pr-5222": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 8284142,
"tb_total_output_tokens": 43947,
"tb_total_tokens": 8328089,
"tb_uncached_input_tokens": 708846,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 7575296,
"tb_cached_input_tokens": 7575296,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 2.4632454000000004,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 1,
"partial_score_denominator": 1,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.2950900600460944,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-3-codex": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 985355,
"tb_total_output_tokens": 6042,
"tb_total_tokens": 991397,
"tb_uncached_input_tokens": 52107,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 933248,
"tb_cached_input_tokens": 933248,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 2.543997,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 1,
"partial_score_denominator": 1,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.1912181625774956,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-4": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 735292,
"tb_total_output_tokens": 6544,
"tb_total_tokens": 741836,
"tb_uncached_input_tokens": 46908,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 688384,
"tb_cached_input_tokens": 688384,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.49036,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 1,
"partial_score_denominator": 1,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.20982209080964062,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-pr-5316": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "fail_high_conf",
"tests_outcome": "fail",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 0.8,
"probe_gold_pass_candidate_pass_count": 4,
"probe_gold_pass_candidate_fail_count": 1,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 2683864,
"tb_total_output_tokens": 21333,
"tb_total_tokens": 2705197,
"tb_uncached_input_tokens": 440280,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 2243584,
"tb_cached_input_tokens": 2243584,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.1249556,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": 0.8,
"partial_score_numerator": 4,
"partial_score_denominator": 5,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-3-codex": {
"matrix_status": "pass",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "pass",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 4,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1254592,
"tb_total_output_tokens": 5851,
"tb_total_tokens": 1260443,
"tb_uncached_input_tokens": 74688,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1179904,
"tb_cached_input_tokens": 1179904,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 3.2412360000000002,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 4,
"partial_score_denominator": 4,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.5125593326877056,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-4": {
"matrix_status": "pass",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "pass",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 4,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 686564,
"tb_total_output_tokens": 5430,
"tb_total_tokens": 691994,
"tb_uncached_input_tokens": 78180,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 608384,
"tb_cached_input_tokens": 608384,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.503992,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 4,
"partial_score_denominator": 4,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.4332813525720736,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-pr-5409": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "fail_high_conf",
"tests_outcome": "fail",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 0.5,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": 1,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 496116,
"tb_total_output_tokens": 26761,
"tb_total_tokens": 522877,
"tb_uncached_input_tokens": 61172,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 434944,
"tb_cached_input_tokens": 434944,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.3175656,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": 0.5,
"partial_score_numerator": 1,
"partial_score_denominator": 2,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.1521158408096406,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-3-codex": {
"matrix_status": "fail_high_conf",
"tests_outcome": "fail",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 0.5,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": 1,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 212364,
"tb_total_output_tokens": 7282,
"tb_total_tokens": 219646,
"tb_uncached_input_tokens": 22028,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 190336,
"tb_cached_input_tokens": 190336,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.0528440000000001,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": 0.5,
"partial_score_numerator": 1,
"partial_score_denominator": 2,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.15542334080964063,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-4": {
"matrix_status": "fail_high_conf",
"tests_outcome": "fail",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 0.3333333333333333,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": 2,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 946305,
"tb_total_output_tokens": 12660,
"tb_total_tokens": 958965,
"tb_uncached_input_tokens": 105601,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 840704,
"tb_cached_input_tokens": 840704,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.732834,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": false,
"tests_only_outcome": 0,
"rescue_aware_outcome": 0,
"partial_score": 0.3333333333333333,
"partial_score_numerator": 1,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
}
}
},
"flux-pr-5519": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "pass",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 4,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 322087,
"tb_total_output_tokens": 10808,
"tb_total_tokens": 332895,
"tb_uncached_input_tokens": 36775,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 285312,
"tb_cached_input_tokens": 285312,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.1628073,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 4,
"partial_score_denominator": 4,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.1823535583490964,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-3-codex": {
"matrix_status": "pass",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "unsure",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 4,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 311608,
"tb_total_output_tokens": 3967,
"tb_total_tokens": 315575,
"tb_uncached_input_tokens": 36024,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 275584,
"tb_cached_input_tokens": 275584,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.1917559999999998,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 4,
"partial_score_denominator": 4,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.17267834080964062,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-4": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "pass",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 4,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 417752,
"tb_total_output_tokens": 4444,
"tb_total_tokens": 422196,
"tb_uncached_input_tokens": 32984,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 384768,
"tb_cached_input_tokens": 384768,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.29390400000000005,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 4,
"partial_score_denominator": 4,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.1776559640270023,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-pr-5574": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 2,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1230418,
"tb_total_output_tokens": 26783,
"tb_total_tokens": 1257201,
"tb_uncached_input_tokens": 133842,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1096576,
"tb_cached_input_tokens": 1096576,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.5259474,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 2,
"partial_score_denominator": 2,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-3-codex": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 2,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 745129,
"tb_total_output_tokens": 5843,
"tb_total_tokens": 750972,
"tb_uncached_input_tokens": 43177,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 701952,
"tb_cached_input_tokens": 701952,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 2.051163,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 2,
"partial_score_denominator": 2,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.4898923220273933,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-4": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 2,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 837514,
"tb_total_output_tokens": 7903,
"tb_total_tokens": 845417,
"tb_uncached_input_tokens": 110474,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 727040,
"tb_cached_input_tokens": 727040,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.6476919999999999,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 2,
"partial_score_denominator": 2,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
}
}
},
"flux-pr-5575": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 2,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 8579750,
"tb_total_output_tokens": 56197,
"tb_total_tokens": 8635947,
"tb_uncached_input_tokens": 1046182,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 7533568,
"tb_cached_input_tokens": 7533568,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 3.0364902,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 2,
"partial_score_denominator": 2,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-3-codex": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 2,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1046681,
"tb_total_output_tokens": 8259,
"tb_total_tokens": 1054940,
"tb_uncached_input_tokens": 59929,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 986752,
"tb_cached_input_tokens": 986752,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 2.874603,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 2,
"partial_score_denominator": 2,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.2753473365976477,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-4": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 2,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 913164,
"tb_total_output_tokens": 6201,
"tb_total_tokens": 919365,
"tb_uncached_input_tokens": 59020,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 854144,
"tb_cached_input_tokens": 854144,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.59472,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 2,
"partial_score_denominator": 2,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.3059745956441987,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-pr-5578": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 6300926,
"tb_total_output_tokens": 66116,
"tb_total_tokens": 6367042,
"tb_uncached_input_tokens": 482430,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 5818496,
"tb_cached_input_tokens": 5818496,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.9931154,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 1,
"partial_score_denominator": 1,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
},
"gpt-5-3-codex": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 2,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1828976,
"tb_total_output_tokens": 10776,
"tb_total_tokens": 1839752,
"tb_uncached_input_tokens": 80880,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1748096,
"tb_cached_input_tokens": 1748096,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 4.481904,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.3-codex",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 2,
"partial_score_denominator": 2,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.18922047089881425,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"gpt-5-4": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1498281,
"tb_total_output_tokens": 14732,
"tb_total_tokens": 1513013,
"tb_uncached_input_tokens": 50345,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1447936,
"tb_cached_input_tokens": 1447936,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.9425139999999999,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 1,
"partial_score_denominator": 1,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.2473610905130445,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
}
},
"comparison": {
"partial_score_threshold": 0.8,
"methodology": {
"cache_pricing_mode_field": "cost_pricing_mode",
"code_review_fail_rate_field": "code_review_fail_rate",
"code_review_rate_denominator": "leaderboard_eligible",
"code_review_role": "additive_non_gating",
"cost_per_task_field": "cost_per_task",
"cost_role": "additive_non_gating",
"equiv_quality_per_dollar_field": "equiv_quality_per_dollar",
"equiv_rate_denominator": "leaderboard_eligible",
"equiv_rate_field": "equiv_rate",
"equiv_rate_role": "additive_non_gating",
"footprint_risk_denominator": "validated",
"footprint_risk_role": "additive_non_gating",
"footprint_risk_score_field": "footprint_risk_score",
"leaderboard_rate_field": "tests_only_pass_rate",
"pricing_source": "local_static_table",
"pricing_version": "local-placeholder-2026-02-19",
"probe_agreement_rate_field": "probe_agreement_rate",
"probe_gold_pass_candidate_fail_field": "probe_gold_pass_candidate_fail_count",
"probe_rate_denominator": "probe_gold_pass_candidate_known",
"probe_review_required_field": "probe_review_required_count",
"probe_role": "additive_non_gating",
"publish_filter_default": "include",
"publish_filter_field": "publish.include_in_leaderboard",
"quality_per_dollar_denominator": "cost_per_task",
"rescue_aware_rate_field": "rescue_aware_pass_rate",
"tb_resolved_but_tests_not_pass_field": "tb_resolved_but_tests_not_pass_count",
"tb_unresolved_but_tests_pass_field": "tb_unresolved_but_tests_pass_count",
"tests_only_quality_per_dollar_field": "tests_only_quality_per_dollar",
"tests_unknown_rate_threshold": 0.1
}