STET

summary

reports/summary.json

16931 bytes

Back to adjudication
{
  "generated_at": "2026-02-20T16:17:49Z",
  "dataset": "/Users/ben/dev/flux/.tmp/validation-zod-cleaned50-r7-p1-20260208-170124",
  "output_root": "/Users/ben/dev/flux/.tmp/h2h-zod-w66",
  "statistics": {
    "pass_definition": {
      "denominator_statuses": [
        "fail_guardrail",
        "fail_high_conf",
        "fail_infra",
        "fail_likely_equiv",
        "fail_no_patch",
        "fail_with_diag",
        "pass",
        "pass_with_warn"
      ],
      "positive_statuses": [
        "pass",
        "pass_with_warn"
      ]
    },
    "bootstrap": {
      "base_seed": 1337,
      "confidence_level": 0.95,
      "method": "nonparametric_task_bootstrap",
      "resamples": 5000
    },
    "tiering": {
      "rule": "A is strictly superior to B iff passRate(A) > ciHigh(B)",
      "strategy": "conservative_non_superiority_grouping"
    }
  },
  "models": [
    {
      "name": "gpt-5.1-codex-mini",
      "key": "gpt-5-1-codex-mini",
      "run_id": "2026-02-20__00-10-38__gpt-5-1-codex-mini"
    }
  ],
  "runs": {
    "gpt-5-1-codex-mini": {
      "model": "gpt-5.1-codex-mini",
      "requested_model": "gpt-5.1-codex-mini",
      "run_id": "2026-02-20__00-10-38__gpt-5-1-codex-mini",
      "run_dir": "/Users/ben/dev/flux/.tmp/h2h-zod-w66/runs/2026-02-20__00-10-38__gpt-5-1-codex-mini",
      "results_path": "/Users/ben/dev/flux/.tmp/h2h-zod-w66/runs/2026-02-20__00-10-38__gpt-5-1-codex-mini/results.json",
      "validation_metrics": {
        "validated": 3,
        "leaderboard_eligible": 0,
        "leaderboard_excluded": 3,
        "binary_pass_count": 0,
        "binary_pass_rate": null,
        "tests_only_pass_count": 0,
        "tests_only_pass_rate": null,
        "rescue_aware_pass_count": 0,
        "rescue_aware_pass_rate": null,
        "rescue_delta_rate": null,
        "equiv_rate": null,
        "equiv_equivalent_count": 0,
        "equiv_non_equivalent_count": 0,
        "equiv_unknown_count": 0,
        "code_review_pass_count": 0,
        "code_review_fail_count": 0,
        "code_review_unsure_count": 0,
        "code_review_fail_rate": null,
        "behavioral_robustness_used_count": 0,
        "behavioral_robustness_skipped_count": 0,
        "behavioral_robustness_unavailable_count": 0,
        "probe_accepted_commands_count": 0,
        "probe_gold_pass_candidate_pass_count": 0,
        "probe_gold_pass_candidate_fail_count": 0,
        "probe_review_required_count": 0,
        "probe_agreement_rate": null,
        "tests_unknown_count": 0,
        "tests_unknown_cause_counts": {},
        "tests_unknown_rate": 0,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false,
        "cache_evaluated_count": 3,
        "cache_hit_count": 0,
        "cache_miss_count": 3,
        "cache_hit_rate": 0,
        "cache_miss_reason_counts": {
          "missing_pinned_dependencies": 3
        },
        "total_setup_ms_saved": null,
        "total_pinned_bytes": null,
        "footprint_risk_mean_score": 0.47063650442368105,
        "footprint_risk_median_score": 0.3053732454743462,
        "footprint_risk_scored_count": 3,
        "footprint_risk_used_count": 3,
        "footprint_risk_unavailable_count": 0,
        "footprint_risk_missing_count": 0,
        "footprint_risk_flagged_count": 1,
        "footprint_risk_flagged_rate": 0.3333333333333333,
        "footprint_risk_severe_count": 1,
        "footprint_risk_severe_rate": 0.3333333333333333,
        "footprint_risk_level_low_count": 2,
        "footprint_risk_level_medium_count": 0,
        "footprint_risk_level_high_count": 1,
        "footprint_risk_level_unknown_count": 0,
        "total_cost": 2.7896835,
        "cost_per_task": 0.9298945000000001,
        "tests_only_quality_per_dollar": null,
        "equiv_quality_per_dollar": null,
        "total_input_tokens": 6483585,
        "total_output_tokens": 87491,
        "total_tokens": 6571076,
        "total_uncached_input_tokens": 957185,
        "total_cache_creation_input_tokens": null,
        "total_cache_read_input_tokens": 5526400,
        "total_cached_input_tokens": 5526400,
        "cost_tasks_total": 3,
        "cost_tasks_with_tokens": 3,
        "cost_tasks_with_cache_tokens": 3,
        "cost_tasks_with_cache_aware_pricing": 3,
        "cost_tasks_with_legacy_pricing": 0,
        "cost_tasks_with_pricing": 3,
        "cost_tasks_with_cost": 3,
        "cost_tasks_missing_tokens": 0,
        "cost_tasks_missing_pricing": 0,
        "cost_tasks_missing_cost": 0,
        "cost_tasks_with_cost_rate": 1,
        "pricing_version": "local-placeholder-2026-02-19",
        "pricing_source": "local_static_table",
        "pricing_model_key": "gpt-5.1-codex-mini"
      },
      "publish_exclusions": {
        "base_tests_pass_without_patch": 3
      },
      "publish_guard": {
        "publishable": true,
        "blocked": false,
        "reasons": [],
        "tests_unknown_rate": 0,
        "tests_unknown_rate_threshold": 0.1,
        "tests_unknown_threshold_breached": false
      },
      "partial_metrics": {
        "failed_task_count": 0,
        "failed_task_with_partial_score": 0,
        "failed_task_partial_threshold": 0.8,
        "failed_task_partial_threshold_hits": 0,
        "failed_task_partial_ppr": null,
        "failed_task_partial_mean_score": null,
        "failed_task_partial_coverage": null
      },
      "footprint_risk_metrics": {
        "used_count": 3,
        "unavailable_count": 0,
        "missing_count": 0,
        "scored_count": 3,
        "mean_score": 0.47063650442368105,
        "median_score": 0.3053732454743462,
        "flagged_count": 1,
        "flagged_rate": 0.3333333333333333,
        "severe_count": 1,
        "severe_rate": 0.3333333333333333,
        "level_low_count": 2,
        "level_medium_count": 0,
        "level_high_count": 1,
        "level_unknown_count": 0
      },
      "passRate": null,
      "ciLow": null,
      "ciHigh": null,
      "effectiveN": 0,
      "tier": null,
      "validation_counts": {
        "fail_guardrail": 0,
        "fail_high_conf": 0,
        "fail_infra": 0,
        "fail_likely_equiv": 0,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 0,
        "pass_with_warn": 3
      }
    }
  },
  "tasks": {
    "flux-pr-4843": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "non_equivalent",
          "code_review_status": "used",
          "code_review_signal": "fail",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "equiv_warn",
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 2028666,
          "tb_total_output_tokens": 18019,
          "tb_total_tokens": 2046685,
          "tb_uncached_input_tokens": 223866,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 1804800,
          "tb_cached_input_tokens": 1804800,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.7146330000000001,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "high",
          "footprint_risk_score": 1,
          "footprint_risk_flag": true,
          "footprint_risk_severe_flag": true
        }
      }
    },
    "flux-pr-4861": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 692361,
          "tb_total_output_tokens": 8019,
          "tb_total_tokens": 700380,
          "tb_uncached_input_tokens": 116617,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 575744,
          "tb_cached_input_tokens": 575744,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 0.3094011,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.10653626779669706,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    },
    "flux-pr-4970": {
      "models": {
        "gpt-5-1-codex-mini": {
          "matrix_status": "pass_with_warn",
          "tests_outcome": "pass",
          "tests_unknown_cause": null,
          "cache_hit": false,
          "cache_miss_reason": "missing_pinned_dependencies",
          "setup_ms_saved": null,
          "pinned_bytes": null,
          "environment_group_id": "eg_de49b64fec1e79eec7de4bb10f9c5f2f",
          "equivalence_status": "used",
          "equivalence_outcome": "equivalent",
          "code_review_status": "used",
          "code_review_signal": "unsure",
          "behavioral_robustness_status": "skipped",
          "coverage_delta_status": "unavailable",
          "mutation_lite_status": "unavailable",
          "probe_accepted_candidates": null,
          "probe_accepted_commands": null,
          "probe_agreement_rate": null,
          "probe_gold_pass_candidate_pass_count": null,
          "probe_gold_pass_candidate_fail_count": null,
          "probe_review_required_count": null,
          "flags": [
            "review_warn"
          ],
          "tb_is_resolved": true,
          "tb_failure_mode": "unset",
          "tb_total_input_tokens": 3762558,
          "tb_total_output_tokens": 61453,
          "tb_total_tokens": 3824011,
          "tb_uncached_input_tokens": 616702,
          "tb_cache_creation_input_tokens": null,
          "tb_cache_read_input_tokens": 3145856,
          "tb_cached_input_tokens": 3145856,
          "token_status": "present",
          "cache_token_status": "present",
          "token_source": "openai_cached_tokens_usage",
          "task_cost": 1.7656494,
          "cost_status": "present",
          "cost_pricing_mode": "cache_aware",
          "pricing_model_key": "gpt-5.1-codex-mini",
          "equiv_rescue_policy": "on",
          "rescue_candidate": false,
          "rescue_eligible": false,
          "rescue_decision": "not_candidate",
          "publish_include_in_leaderboard": false,
          "publish_exclusion_reasons": [
            "base_tests_pass_without_patch"
          ],
          "publish_weak_signal_risk": true,
          "tests_only_outcome": 1,
          "rescue_aware_outcome": 1,
          "partial_score": 1,
          "partial_score_numerator": 3,
          "partial_score_denominator": 3,
          "partial_score_level": "command",
          "partial_score_provenance": "fallback_command_level",
          "partial_score_reason": "test_case_detail_unavailable",
          "partial_score_unknown_count": 0,
          "footprint_risk_status": "used",
          "footprint_risk_reason": "none",
          "footprint_risk_level": "low",
          "footprint_risk_score": 0.3053732454743462,
          "footprint_risk_flag": false,
          "footprint_risk_severe_flag": false
        }
      }
    }
  },
  "comparison": {
    "partial_score_threshold": 0.8,
    "methodology": {
      "cache_pricing_mode_field": "cost_pricing_mode",
      "code_review_fail_rate_field": "code_review_fail_rate",
      "code_review_rate_denominator": "leaderboard_eligible",
      "code_review_role": "additive_non_gating",
      "cost_per_task_field": "cost_per_task",
      "cost_role": "additive_non_gating",
      "equiv_quality_per_dollar_field": "equiv_quality_per_dollar",
      "equiv_rate_denominator": "leaderboard_eligible",
      "equiv_rate_field": "equiv_rate",
      "equiv_rate_role": "additive_non_gating",
      "footprint_risk_denominator": "validated",
      "footprint_risk_role": "additive_non_gating",
      "footprint_risk_score_field": "footprint_risk_score",
      "leaderboard_rate_field": "tests_only_pass_rate",
      "pricing_source": "local_static_table",
      "pricing_version": "local-placeholder-2026-02-19",
      "probe_agreement_rate_field": "probe_agreement_rate",
      "probe_gold_pass_candidate_fail_field": "probe_gold_pass_candidate_fail_count",
      "probe_rate_denominator": "probe_gold_pass_candidate_known",
      "probe_review_required_field": "probe_review_required_count",
      "probe_role": "additive_non_gating",
      "publish_filter_default": "include",
      "publish_filter_field": "publish.include_in_leaderboard",
      "quality_per_dollar_denominator": "cost_per_task",
      "rescue_aware_rate_field": "rescue_aware_pass_rate",
      "tests_only_quality_per_dollar_field": "tests_only_quality_per_dollar",
      "tests_unknown_rate_threshold": 0.1
    },
    "ranking": [
      {
        "rank": 1,
        "model": "gpt-5.1-codex-mini",
        "model_key": "gpt-5-1-codex-mini",
        "binary_pass_rate": null,
        "binary_pass_count": 0,
        "validated": 3,
        "failed_task_partial_ppr": null,
        "failed_task_partial_mean_score": null,
        "failed_task_partial_coverage": null,
        "tie_break_basis": null
      }
    ],
    "publish_guard": {
      "all_runs_publishable": true,
      "blocked_runs": [],
      "tests_unknown_rate_threshold": 0.1
    }
  }
}