summary
reports/summary.json
16931 bytes
{
"generated_at": "2026-02-20T16:17:49Z",
"dataset": "/Users/ben/dev/flux/.tmp/validation-zod-cleaned50-r7-p1-20260208-170124",
"output_root": "/Users/ben/dev/flux/.tmp/h2h-zod-w66",
"statistics": {
"pass_definition": {
"denominator_statuses": [
"fail_guardrail",
"fail_high_conf",
"fail_infra",
"fail_likely_equiv",
"fail_no_patch",
"fail_with_diag",
"pass",
"pass_with_warn"
],
"positive_statuses": [
"pass",
"pass_with_warn"
]
},
"bootstrap": {
"base_seed": 1337,
"confidence_level": 0.95,
"method": "nonparametric_task_bootstrap",
"resamples": 5000
},
"tiering": {
"rule": "A is strictly superior to B iff passRate(A) > ciHigh(B)",
"strategy": "conservative_non_superiority_grouping"
}
},
"models": [
{
"name": "gpt-5.1-codex-mini",
"key": "gpt-5-1-codex-mini",
"run_id": "2026-02-20__00-10-38__gpt-5-1-codex-mini"
}
],
"runs": {
"gpt-5-1-codex-mini": {
"model": "gpt-5.1-codex-mini",
"requested_model": "gpt-5.1-codex-mini",
"run_id": "2026-02-20__00-10-38__gpt-5-1-codex-mini",
"run_dir": "/Users/ben/dev/flux/.tmp/h2h-zod-w66/runs/2026-02-20__00-10-38__gpt-5-1-codex-mini",
"results_path": "/Users/ben/dev/flux/.tmp/h2h-zod-w66/runs/2026-02-20__00-10-38__gpt-5-1-codex-mini/results.json",
"validation_metrics": {
"validated": 3,
"leaderboard_eligible": 0,
"leaderboard_excluded": 3,
"binary_pass_count": 0,
"binary_pass_rate": null,
"tests_only_pass_count": 0,
"tests_only_pass_rate": null,
"rescue_aware_pass_count": 0,
"rescue_aware_pass_rate": null,
"rescue_delta_rate": null,
"equiv_rate": null,
"equiv_equivalent_count": 0,
"equiv_non_equivalent_count": 0,
"equiv_unknown_count": 0,
"code_review_pass_count": 0,
"code_review_fail_count": 0,
"code_review_unsure_count": 0,
"code_review_fail_rate": null,
"behavioral_robustness_used_count": 0,
"behavioral_robustness_skipped_count": 0,
"behavioral_robustness_unavailable_count": 0,
"probe_accepted_commands_count": 0,
"probe_gold_pass_candidate_pass_count": 0,
"probe_gold_pass_candidate_fail_count": 0,
"probe_review_required_count": 0,
"probe_agreement_rate": null,
"tests_unknown_count": 0,
"tests_unknown_cause_counts": {},
"tests_unknown_rate": 0,
"tests_unknown_rate_threshold": 0.1,
"tests_unknown_threshold_breached": false,
"cache_evaluated_count": 3,
"cache_hit_count": 0,
"cache_miss_count": 3,
"cache_hit_rate": 0,
"cache_miss_reason_counts": {
"missing_pinned_dependencies": 3
},
"total_setup_ms_saved": null,
"total_pinned_bytes": null,
"footprint_risk_mean_score": 0.47063650442368105,
"footprint_risk_median_score": 0.3053732454743462,
"footprint_risk_scored_count": 3,
"footprint_risk_used_count": 3,
"footprint_risk_unavailable_count": 0,
"footprint_risk_missing_count": 0,
"footprint_risk_flagged_count": 1,
"footprint_risk_flagged_rate": 0.3333333333333333,
"footprint_risk_severe_count": 1,
"footprint_risk_severe_rate": 0.3333333333333333,
"footprint_risk_level_low_count": 2,
"footprint_risk_level_medium_count": 0,
"footprint_risk_level_high_count": 1,
"footprint_risk_level_unknown_count": 0,
"total_cost": 2.7896835,
"cost_per_task": 0.9298945000000001,
"tests_only_quality_per_dollar": null,
"equiv_quality_per_dollar": null,
"total_input_tokens": 6483585,
"total_output_tokens": 87491,
"total_tokens": 6571076,
"total_uncached_input_tokens": 957185,
"total_cache_creation_input_tokens": null,
"total_cache_read_input_tokens": 5526400,
"total_cached_input_tokens": 5526400,
"cost_tasks_total": 3,
"cost_tasks_with_tokens": 3,
"cost_tasks_with_cache_tokens": 3,
"cost_tasks_with_cache_aware_pricing": 3,
"cost_tasks_with_legacy_pricing": 0,
"cost_tasks_with_pricing": 3,
"cost_tasks_with_cost": 3,
"cost_tasks_missing_tokens": 0,
"cost_tasks_missing_pricing": 0,
"cost_tasks_missing_cost": 0,
"cost_tasks_with_cost_rate": 1,
"pricing_version": "local-placeholder-2026-02-19",
"pricing_source": "local_static_table",
"pricing_model_key": "gpt-5.1-codex-mini"
},
"publish_exclusions": {
"base_tests_pass_without_patch": 3
},
"publish_guard": {
"publishable": true,
"blocked": false,
"reasons": [],
"tests_unknown_rate": 0,
"tests_unknown_rate_threshold": 0.1,
"tests_unknown_threshold_breached": false
},
"partial_metrics": {
"failed_task_count": 0,
"failed_task_with_partial_score": 0,
"failed_task_partial_threshold": 0.8,
"failed_task_partial_threshold_hits": 0,
"failed_task_partial_ppr": null,
"failed_task_partial_mean_score": null,
"failed_task_partial_coverage": null
},
"footprint_risk_metrics": {
"used_count": 3,
"unavailable_count": 0,
"missing_count": 0,
"scored_count": 3,
"mean_score": 0.47063650442368105,
"median_score": 0.3053732454743462,
"flagged_count": 1,
"flagged_rate": 0.3333333333333333,
"severe_count": 1,
"severe_rate": 0.3333333333333333,
"level_low_count": 2,
"level_medium_count": 0,
"level_high_count": 1,
"level_unknown_count": 0
},
"passRate": null,
"ciLow": null,
"ciHigh": null,
"effectiveN": 0,
"tier": null,
"validation_counts": {
"fail_guardrail": 0,
"fail_high_conf": 0,
"fail_infra": 0,
"fail_likely_equiv": 0,
"fail_with_diag": 0,
"missing": 0,
"pass": 0,
"pass_with_warn": 3
}
}
},
"tasks": {
"flux-pr-4843": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "skipped",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": null,
"probe_gold_pass_candidate_pass_count": null,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 2028666,
"tb_total_output_tokens": 18019,
"tb_total_tokens": 2046685,
"tb_uncached_input_tokens": 223866,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1804800,
"tb_cached_input_tokens": 1804800,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.7146330000000001,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "high",
"footprint_risk_score": 1,
"footprint_risk_flag": true,
"footprint_risk_severe_flag": true
}
}
},
"flux-pr-4861": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "unsure",
"behavioral_robustness_status": "skipped",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": null,
"probe_gold_pass_candidate_pass_count": null,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 692361,
"tb_total_output_tokens": 8019,
"tb_total_tokens": 700380,
"tb_uncached_input_tokens": 116617,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 575744,
"tb_cached_input_tokens": 575744,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.3094011,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.10653626779669706,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
},
"flux-pr-4970": {
"models": {
"gpt-5-1-codex-mini": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "unsure",
"behavioral_robustness_status": "skipped",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": null,
"probe_gold_pass_candidate_pass_count": null,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 3762558,
"tb_total_output_tokens": 61453,
"tb_total_tokens": 3824011,
"tb_uncached_input_tokens": 616702,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 3145856,
"tb_cached_input_tokens": 3145856,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 1.7656494,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": false,
"publish_exclusion_reasons": [
"base_tests_pass_without_patch"
],
"publish_weak_signal_risk": true,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 3,
"partial_score_denominator": 3,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.3053732454743462,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
}
}
}
},
"comparison": {
"partial_score_threshold": 0.8,
"methodology": {
"cache_pricing_mode_field": "cost_pricing_mode",
"code_review_fail_rate_field": "code_review_fail_rate",
"code_review_rate_denominator": "leaderboard_eligible",
"code_review_role": "additive_non_gating",
"cost_per_task_field": "cost_per_task",
"cost_role": "additive_non_gating",
"equiv_quality_per_dollar_field": "equiv_quality_per_dollar",
"equiv_rate_denominator": "leaderboard_eligible",
"equiv_rate_field": "equiv_rate",
"equiv_rate_role": "additive_non_gating",
"footprint_risk_denominator": "validated",
"footprint_risk_role": "additive_non_gating",
"footprint_risk_score_field": "footprint_risk_score",
"leaderboard_rate_field": "tests_only_pass_rate",
"pricing_source": "local_static_table",
"pricing_version": "local-placeholder-2026-02-19",
"probe_agreement_rate_field": "probe_agreement_rate",
"probe_gold_pass_candidate_fail_field": "probe_gold_pass_candidate_fail_count",
"probe_rate_denominator": "probe_gold_pass_candidate_known",
"probe_review_required_field": "probe_review_required_count",
"probe_role": "additive_non_gating",
"publish_filter_default": "include",
"publish_filter_field": "publish.include_in_leaderboard",
"quality_per_dollar_denominator": "cost_per_task",
"rescue_aware_rate_field": "rescue_aware_pass_rate",
"tests_only_quality_per_dollar_field": "tests_only_quality_per_dollar",
"tests_unknown_rate_threshold": 0.1
},
"ranking": [
{
"rank": 1,
"model": "gpt-5.1-codex-mini",
"model_key": "gpt-5-1-codex-mini",
"binary_pass_rate": null,
"binary_pass_count": 0,
"validated": 3,
"failed_task_partial_ppr": null,
"failed_task_partial_mean_score": null,
"failed_task_partial_coverage": null,
"tie_break_basis": null
}
],
"publish_guard": {
"all_runs_publishable": true,
"blocked_runs": [],
"tests_unknown_rate_threshold": 0.1
}
}
}