task_detail
validation/gpt-5-1-codex-mini/flux-pr-1435/task_detail.v1.json
13572 bytes
{
"version": "v1",
"materialization": {
"mode": "canonical"
},
"identity": {
"variant_id": "gpt-5.1-codex-mini",
"requested_model": "gpt-5.1-codex-mini",
"model_key": "gpt-5-1-codex-mini",
"run_id": "2026-02-28__18-27-22__gpt-5-1-codex-mini",
"task_id": "flux-pr-1435",
"trial_name": "flux-pr-1435.1-of-1.2026-02-28__18-27-22__gpt-5-1-codex-mini"
},
"decision": {
"status": "pass",
"primary_signal": "tests",
"matrix_status": "pass",
"reason_codes": [
"guardrail_diff_too_large",
"primary_tests",
"tests_passed",
"equivalence_equivalent",
"review_pass"
],
"lane_report": {
"lane": "lane_unknown",
"source": "lane_unknown",
"reasons": [
"lane_unknown"
]
},
"publish": {
"include_in_leaderboard": true,
"weak_signal_risk": false
},
"rescue": {
"candidate": false,
"eligible": false,
"decision": "not_candidate",
"required_artifacts_ok": true
}
},
"tests": {
"outcome": "pass",
"strength": "strong",
"commands": [
"env PATH=/root/.cargo/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin cargo test --all-features"
],
"command_results": [
{
"command": "env PATH=/root/.cargo/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin cargo test --all-features",
"gold_outcome": "pass",
"gold_tb_trial_name": "flux-pr-1435.1-of-1.validate-gold-1772315307749008000",
"gold_failure_mode": "unset",
"agent_outcome": "pass",
"agent_tb_trial_name": "flux-pr-1435.1-of-1.validate-override-1772315424354879000",
"agent_failure_mode": "unset",
"agent_retried": false
}
],
"partial_score": {
"score": 1,
"numerator": 1,
"denominator": 1,
"level": "command",
"provenance": "fallback_command_level",
"reason": "test_case_detail_unavailable"
},
"tb": {
"trial_name": "flux-pr-1435.1-of-1.2026-02-28__18-27-22__gpt-5-1-codex-mini",
"is_resolved": true,
"failure_mode": "unset",
"duration_sec": 231.958556667,
"time_budget_sec": 1200
},
"cache": {
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655"
}
},
"quality": {
"equivalence": {
"status": "used",
"outcome": "equivalent",
"confidence": 0.9,
"issue_type": "stylistic",
"notes": "The agent patch appears to implement the same intended span-tracking infrastructure (docs, `AttachedToken`, AST/helper wiring, and query/token span propagation) as the gold approach. The shown diffs are effectively identical aside from path-prefix formatting in the patch headers, with no visible behavioral deviation."
},
"code_review": {
"status": "used",
"signal": "pass",
"confidence": 0.8,
"issue_type": "unclear",
"source_field": "code_review",
"dimension_scores": {
"correctness": {
"score_0_4": 4,
"confidence_0_1": 0.84,
"rationale": "The shown agent patch aligns with the intended span-tracking infrastructure (docs, new AttachedToken helper, helper module export, and AST/query imports) and matches the provided gold diff excerpts; benchmark results also indicate the target test command passes."
},
"edge_case_handling": {
"score_0_4": 3,
"confidence_0_1": 0.66,
"rationale": "The patch explicitly documents that source span coverage is still work-in-progress, so edge cases with missing/inaccurate spans are acknowledged rather than fully solved in this change."
},
"introduced_bug_risk": {
"score_0_4": 4,
"confidence_0_1": 0.78,
"rationale": "The new wrapper deliberately preserves semantic equality/hash behavior while carrying token location metadata, which reduces regression risk for existing AST comparisons."
},
"maintainability_idioms": {
"score_0_4": 4,
"confidence_0_1": 0.81,
"rationale": "The change is modular and idiomatic: a focused helper type is added and exported through the helpers module, with clear docs describing constraints and contribution guidance."
}
},
"overall": {
"confidence_overall": 0.8,
"summary": "Based on the provided diff excerpts, the agent patch likely satisfies the intended PR change for introducing span infrastructure and appears consistent with the gold patch direction, with acknowledged WIP limitations on complete span accuracy."
},
"notes": "Based on the provided diff excerpts, the agent patch likely satisfies the intended PR change for introducing span infrastructure and appears consistent with the gold patch direction, with acknowledged WIP limitations on complete span accuracy."
},
"behavioral_robustness": {
"status": "used",
"probe_status": "used",
"gold_pass_candidate_pass_count": 1,
"agreement_rate": 1,
"coverage_delta_status": "unavailable",
"coverage_delta_reason": "coverage_adapter_unavailable",
"mutation_lite_status": "unavailable",
"mutation_lite_reason": "mutation_runner_unavailable",
"notes": "derived from command-level outcomes; no accepted generated probes"
},
"footprint_risk": {
"status": "used",
"reason": "none",
"version": "v1",
"level": "medium",
"score": 0.5325472606515167,
"flag": false,
"severe_flag": false
},
"cost": {
"status": "tokens_only",
"token_status": "present",
"cache_token_status": "present",
"total_input_tokens": 1465254,
"total_output_tokens": 18505,
"total_tokens": 1483759,
"uncached_input_tokens": 268966,
"cache_read_input_tokens": 1196288,
"cached_input_tokens": 1196288,
"total_cost_usd": 0.6939221999999999,
"token_source": "openai_cached_tokens_usage",
"pricing_model_key": "gpt-5.1-codex-mini",
"pricing_mode": "cache_aware"
}
},
"summary": {
"task": {
"matrix_status": "pass",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "pass",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1465254,
"tb_total_output_tokens": 18505,
"tb_total_tokens": 1483759,
"tb_uncached_input_tokens": 268966,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1196288,
"tb_cached_input_tokens": 1196288,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.6939221999999999,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 1,
"partial_score_denominator": 1,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.5325472606515167,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"run": {
"model": "gpt-5.1-codex-mini",
"requested_model": "gpt-5.1-codex-mini",
"run_id": "2026-02-28__18-27-22__gpt-5-1-codex-mini",
"passRate": 0.9333333333333333,
"ciLow": 0.8333333333333334,
"ciHigh": 1,
"effectiveN": 30,
"tier": 1,
"validation_counts": {
"fail_guardrail": 0,
"fail_high_conf": 1,
"fail_infra": 0,
"fail_likely_equiv": 1,
"fail_with_diag": 0,
"missing": 0,
"pass": 3,
"pass_with_warn": 25
}
},
"run_metadata": {
"agent_name": "codex",
"model_name": "gpt-5.1-codex-mini",
"start_time": "2026-02-28T20:00:12.722616+00:00",
"end_time": "2026-02-28T21:48:26.352711+00:00",
"commit_hash": "32dbe1950f8d0a4ed6db86745bf0a62da2ee0921",
"accuracy": 0.26666666666666666,
"reasoning_effort": "medium"
}
},
"evidence": {
"task_detail": {
"path": "validation/gpt-5-1-codex-mini/flux-pr-1435/task_detail.v1.json",
"exists": true
},
"trajectory": {
"path": "validation/gpt-5-1-codex-mini/flux-pr-1435/trajectory.v1.json",
"exists": true,
"non_empty": true,
"size_bytes": 15382
},
"validation": {
"path": "validation/gpt-5-1-codex-mini/flux-pr-1435/validation.json",
"exists": true,
"non_empty": true,
"size_bytes": 104541
},
"results": {
"path": "runs/2026-02-28__18-27-22__gpt-5-1-codex-mini/results.json",
"exists": true,
"non_empty": true,
"size_bytes": 61948
},
"run_metadata": {
"path": "runs/2026-02-28__18-27-22__gpt-5-1-codex-mini/run_metadata.json",
"exists": true,
"non_empty": true,
"size_bytes": 1673
},
"agent_patch": {
"path": "runs/2026-02-28__18-27-22__gpt-5-1-codex-mini/flux-pr-1435/flux-pr-1435.1-of-1.2026-02-28__18-27-22__gpt-5-1-codex-mini/agent-logs/agent.patch",
"exists": true,
"non_empty": true,
"size_bytes": 230758
},
"summary": {
"path": "reports/summary.json",
"exists": true,
"non_empty": true,
"size_bytes": 280264
},
"manifest": {
"path": "manifest.json",
"exists": true,
"non_empty": true,
"size_bytes": 675
},
"validation_inputs": {
"required_for_rescue_ok": true,
"gold_patch": {
"exists": true,
"non_empty": true,
"size_bytes": 138971,
"sha256": "705d8df2ee172a8bc40984c3d7de0e724d733bb6cae7d7ae4a77a223e02f7abd",
"required_for_rescue": true
},
"repo_tarball": {
"exists": true,
"non_empty": true,
"size_bytes": 439108,
"sha256": "d19a5cbec64835f4aa21ce54e4e8f004734def12096e03acd93282123f98058c",
"required_for_rescue": true
},
"agent_patch": {
"path": "runs/2026-02-28__18-27-22__gpt-5-1-codex-mini/flux-pr-1435/flux-pr-1435.1-of-1.2026-02-28__18-27-22__gpt-5-1-codex-mini/agent-logs/agent.patch",
"exists": true,
"non_empty": true,
"size_bytes": 230758,
"sha256": "2b0ebcf37404b6ae8046a18cbf03937190d8956883b95ea1e1e6d931c84591b4",
"required_for_rescue": true
}
}
},
"provenance": {
"identity": [
{
"source": "manifest",
"available": true
},
{
"source": "tb_results",
"available": true
}
],
"decision": [
{
"source": "validation",
"available": true
}
],
"tests": [
{
"source": "validation",
"available": true
},
{
"source": "tb_results",
"available": true
}
],
"quality": {
"equivalence": [
{
"source": "validation",
"available": true
}
],
"code_review": [
{
"source": "validation",
"available": true
}
],
"behavioral_robustness": [
{
"source": "validation",
"available": true
}
],
"footprint_risk": [
{
"source": "validation",
"available": true
}
],
"cost": [
{
"source": "validation",
"available": true
},
{
"source": "summary",
"available": true
}
]
},
"summary": {
"task": [
{
"source": "summary",
"available": true
}
],
"run": [
{
"source": "summary",
"available": true
}
],
"run_metadata": [
{
"source": "run_metadata",
"available": true
}
]
},
"evidence": [
{
"source": "filesystem",
"available": true
}
]
}
}