task_detail
validation/gpt-5-4/flux-pr-1034/task_detail.v1.json
16427 bytes
{
"version": "v1",
"materialization": {
"mode": "canonical"
},
"identity": {
"variant_id": "gpt-5.4",
"requested_model": "gpt-5.4",
"model_key": "gpt-5-4",
"run_id": "2026-02-28__13-38-10__gpt-5-4",
"task_id": "flux-pr-1034",
"trial_name": "flux-pr-1034.1-of-1.2026-02-28__13-38-10__gpt-5-4"
},
"decision": {
"status": "pass",
"primary_signal": "tests",
"matrix_status": "pass_with_warn",
"reason_codes": [
"primary_tests",
"tests_passed",
"equivalence_non_equivalent",
"decision_conflict",
"review_fail"
],
"flags": [
"equiv_warn",
"review_warn"
],
"lane_report": {
"lane": "lane_unknown",
"source": "lane_unknown",
"reasons": [
"lane_unknown"
]
},
"publish": {
"include_in_leaderboard": true,
"weak_signal_risk": false
},
"rescue": {
"candidate": false,
"eligible": false,
"decision": "not_candidate",
"required_artifacts_ok": true
}
},
"tests": {
"outcome": "pass",
"strength": "strong",
"commands": [
"go test -C v2 ./... -count=1 -timeout=300s"
],
"command_results": [
{
"command": "go test -C v2 ./... -count=1 -timeout=300s",
"gold_outcome": "pass",
"gold_tb_trial_name": "flux-pr-1034.1-of-1.validate-gold-1772829224076359000",
"gold_failure_mode": "unset",
"agent_outcome": "pass",
"agent_tb_trial_name": "flux-pr-1034.1-of-1.validate-override-1772829550985282000",
"agent_failure_mode": "unset",
"agent_retried": false
}
],
"partial_score": {
"score": 1,
"numerator": 1,
"denominator": 1,
"level": "command",
"provenance": "fallback_command_level",
"reason": "test_case_detail_unavailable"
},
"tb": {
"trial_name": "flux-pr-1034.1-of-1.2026-02-28__13-38-10__gpt-5-4",
"is_resolved": true,
"failure_mode": "unset",
"duration_sec": 420.414611792,
"time_budget_sec": 1200
},
"cache": {
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c"
}
},
"quality": {
"equivalence": {
"status": "used",
"outcome": "non_equivalent",
"evaluator_model": "gpt-5.3-codex",
"confidence": 0.86,
"issue_type": "behavioral"
},
"code_review": {
"status": "used",
"signal": "fail",
"evaluator_model": "gpt-5.3-codex",
"confidence": 0.87,
"issue_type": "unclear",
"source_field": "code_review",
"dimension_scores": {
"correctness": {
"score_0_4": 1,
"confidence_0_1": 0.9,
"rationale": "The remapper skips renaming when a variable already matches the canonical token and does not reorder variable definitions, so structurally identical queries with different variable declaration order can still normalize to different query strings. It also applies collection broadly from any argument visit, not scoped to operation-root usage."
},
"edge_case_handling": {
"score_0_4": 1,
"confidence_0_1": 0.83,
"rationale": "Edge cases around irrelevant document parts are weak: variable discovery is not constrained to operation ancestry, so unused/non-executed argument trees can influence canonical assignment and cache keys."
},
"introduced_bug_risk": {
"score_0_4": 1,
"confidence_0_1": 0.81,
"rationale": "The patch introduces JSON-level mutation of request variables during normalization; non-object variable payloads (e.g. null) now cause unmarshal errors and can abort normalization via internal error paths."
},
"maintainability_idioms": {
"score_0_4": 2,
"confidence_0_1": 0.75,
"rationale": "Implementation is readable but mixes AST normalization concerns with JSON variable payload rewriting, increasing coupling and making future behavior changes harder to reason about."
}
},
"findings": [
{
"id": "f1",
"dimension": "correctness",
"severity": "major",
"title": "Canonicalization misses declaration-order normalization",
"detail": "Variables already named like canonical tokens are skipped, and variable definition refs are never sorted/reordered. A query declared as `($b,$a)` but used as `$a,$b` can remain different from an equivalent `($a,$b)` query, defeating same-shape cache normalization.",
"confidence_0_1": 0.95,
"citations": [
{
"path": "app/v2/pkg/astnormalization/variables_mapper.go",
"side": "new",
"start_line": 83,
"end_line": 89,
"anchor_kind": "diff",
"head_sha": "93dcbff9a9eb294de12c18d5ad541150ce64f210"
},
{
"path": "app/v2/pkg/astnormalization/variables_mapper.go",
"side": "new",
"start_line": 92,
"end_line": 98,
"anchor_kind": "diff",
"head_sha": "93dcbff9a9eb294de12c18d5ad541150ce64f210"
}
]
},
{
"id": "f2",
"dimension": "introduced_bug_risk",
"severity": "major",
"title": "Remap fails on non-object variables payload",
"detail": "Variable input remapping unmarshals into `map[string]json.RawMessage`; payloads like `null` are valid in transport but will error, and the visitor converts that into an internal normalization failure.",
"confidence_0_1": 0.86,
"citations": [
{
"path": "app/v2/pkg/astnormalization/variables_mapper.go",
"side": "new",
"start_line": 136,
"end_line": 142,
"anchor_kind": "diff",
"head_sha": "93dcbff9a9eb294de12c18d5ad541150ce64f210"
},
{
"path": "app/v2/pkg/astnormalization/variables_mapper.go",
"side": "new",
"start_line": 102,
"end_line": 107,
"anchor_kind": "diff",
"head_sha": "93dcbff9a9eb294de12c18d5ad541150ce64f210"
}
]
},
{
"id": "f3",
"dimension": "edge_case_handling",
"severity": "major",
"title": "Variable-order collection is not scoped to active operation context",
"detail": "Argument traversal collects variables without an ancestor/operation check. Combined with remap stage placement before other cleanup stages, irrelevant arguments (e.g., in non-executed sections) can influence canonical names and cache keys.",
"confidence_0_1": 0.82,
"citations": [
{
"path": "app/v2/pkg/astnormalization/variables_mapper.go",
"side": "new",
"start_line": 44,
"end_line": 65,
"anchor_kind": "diff",
"head_sha": "93dcbff9a9eb294de12c18d5ad541150ce64f210"
},
{
"path": "app/v2/pkg/astnormalization/astnormalization.go",
"side": "new",
"start_line": 244,
"end_line": 252,
"anchor_kind": "diff",
"head_sha": "93dcbff9a9eb294de12c18d5ad541150ce64f210"
}
]
}
],
"overall": {
"overall_score_0_100": 30.000000000000004,
"signal": "fail",
"confidence_overall": 0.87,
"summary": "The patch implements substantial remapping plumbing, but it likely does not fully satisfy the intended normalization behavior: declaration-order equivalence can still diverge, and JSON-level variable rewriting introduces new failure modes."
}
},
"behavioral_robustness": {
"status": "used",
"probe_status": "used",
"gold_pass_candidate_pass_count": 1,
"agreement_rate": 1,
"coverage_delta_status": "unavailable",
"coverage_delta_reason": "coverage_adapter_unavailable",
"mutation_lite_status": "unavailable",
"mutation_lite_reason": "mutation_runner_unavailable",
"notes": "derived from command-level outcomes; no accepted generated probes"
},
"footprint_risk": {
"status": "used",
"reason": "none",
"version": "v1",
"level": "low",
"score": 0.26356485337836205,
"flag": false,
"severe_flag": false
},
"cost": {
"status": "tokens_only",
"token_status": "present",
"cache_token_status": "present",
"total_input_tokens": 7590363,
"total_output_tokens": 16438,
"total_tokens": 7606801,
"uncached_input_tokens": 447579,
"cache_read_input_tokens": 7142784,
"cached_input_tokens": 7142784,
"total_cost_usd": 4.598053999999999,
"token_source": "openai_cached_tokens_usage",
"pricing_model_key": "gpt-5.4",
"pricing_mode": "cache_aware"
}
},
"summary": {
"task": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": true,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 7590363,
"tb_total_output_tokens": 16438,
"tb_total_tokens": 7606801,
"tb_uncached_input_tokens": 447579,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 7142784,
"tb_cached_input_tokens": 7142784,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 4.598053999999999,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 1,
"partial_score_denominator": 1,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.26356485337836205,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"run": {
"model": "gpt-5.4",
"requested_model": "gpt-5.4",
"run_id": "2026-02-28__13-38-10__gpt-5-4",
"passRate": 0.896551724137931,
"ciLow": 0.7586206896551724,
"ciHigh": 1,
"effectiveN": 29,
"tier": 1,
"validation_counts": {
"fail_guardrail": 1,
"fail_high_conf": 2,
"fail_infra": 0,
"fail_likely_equiv": 1,
"fail_with_diag": 0,
"missing": 0,
"pass": 5,
"pass_with_warn": 20
}
},
"run_metadata": {
"agent_name": "codex",
"model_name": "gpt-5.4",
"start_time": "2026-03-06T11:06:36.354605+00:00",
"end_time": "2026-03-06T13:16:38.693205+00:00",
"commit_hash": "8f1bc9fe2c3cb8a14c254de41e9577a2ce352ac2",
"accuracy": 0.7586206896551724,
"reasoning_effort": "medium"
}
},
"evidence": {
"task_detail": {
"path": "validation/gpt-5-4/flux-pr-1034/task_detail.v1.json",
"exists": true
},
"trajectory": {
"path": "validation/gpt-5-4/flux-pr-1034/trajectory.v1.json",
"exists": true,
"non_empty": true,
"size_bytes": 15260
},
"validation": {
"path": "validation/gpt-5-4/flux-pr-1034/validation.json",
"exists": true,
"non_empty": true,
"size_bytes": 164042
},
"results": {
"path": "runs/2026-02-28__13-38-10__gpt-5-4/results.json",
"exists": true,
"non_empty": true,
"size_bytes": 61729
},
"run_metadata": {
"path": "runs/2026-02-28__13-38-10__gpt-5-4/run_metadata.json",
"exists": true,
"non_empty": true,
"size_bytes": 1623
},
"agent_patch": {
"path": "runs/2026-02-28__13-38-10__gpt-5-4/flux-pr-1034/flux-pr-1034.1-of-1.2026-02-28__13-38-10__gpt-5-4/agent-logs/agent.patch",
"exists": true,
"non_empty": true,
"size_bytes": 23487
},
"summary": {
"path": "reports/summary.json",
"exists": true,
"non_empty": true,
"size_bytes": 269968
},
"manifest": {
"path": "manifest.json",
"exists": true,
"non_empty": true,
"size_bytes": 677
},
"validation_inputs": {
"required_for_rescue_ok": true,
"gold_patch": {
"exists": true,
"non_empty": true,
"size_bytes": 11956,
"sha256": "5b21daceee9a4c4f67498bc2777798430c0ccea25a935b867d1463ecfd3ebfcd",
"required_for_rescue": true
},
"repo_tarball": {
"exists": true,
"non_empty": true,
"size_bytes": 4155946,
"sha256": "1000f8cb511b58a0eb9145c57c2ee81c89e2bb324ec28a30226ee1749d334bb6",
"required_for_rescue": true
},
"agent_patch": {
"path": "runs/2026-02-28__13-38-10__gpt-5-4/flux-pr-1034/flux-pr-1034.1-of-1.2026-02-28__13-38-10__gpt-5-4/agent-logs/agent.patch",
"exists": true,
"non_empty": true,
"size_bytes": 23487,
"sha256": "84375a55fa478051ef2c0a808a3707661f70985c9c0d2c6b0bbd85ba5f25e5cb",
"required_for_rescue": true
}
}
},
"provenance": {
"identity": [
{
"source": "manifest",
"available": true
},
{
"source": "tb_results",
"available": true
}
],
"decision": [
{
"source": "validation",
"available": true
}
],
"tests": [
{
"source": "validation",
"available": true
},
{
"source": "tb_results",
"available": true
}
],
"quality": {
"equivalence": [
{
"source": "validation",
"available": true
}
],
"code_review": [
{
"source": "validation",
"available": true
}
],
"behavioral_robustness": [
{
"source": "validation",
"available": true
}
],
"footprint_risk": [
{
"source": "validation",
"available": true
}
],
"cost": [
{
"source": "validation",
"available": true
},
{
"source": "summary",
"available": true
}
]
},
"summary": {
"task": [
{
"source": "summary",
"available": true
}
],
"run": [
{
"source": "summary",
"available": true
}
],
"run_metadata": [
{
"source": "run_metadata",
"available": true
}
]
},
"evidence": [
{
"source": "filesystem",
"available": true
}
]
}
}