task_detail
validation/gpt-5-4/flux-pr-2170/task_detail.v1.json
16259 bytes
{
"version": "v1",
"materialization": {
"mode": "canonical"
},
"identity": {
"variant_id": "gpt-5.4",
"requested_model": "gpt-5.4",
"model_key": "gpt-5-4",
"run_id": "2026-02-28__18-27-22__gpt-5-4",
"task_id": "flux-pr-2170",
"trial_name": "flux-pr-2170.1-of-1.2026-02-28__18-27-22__gpt-5-4"
},
"decision": {
"status": "pass",
"primary_signal": "tests",
"matrix_status": "pass_with_warn",
"reason_codes": [
"primary_tests",
"tests_passed",
"equivalence_equivalent"
],
"flags": [
"review_warn"
],
"lane_report": {
"lane": "lane_unknown",
"source": "lane_unknown",
"reasons": [
"lane_unknown"
]
},
"publish": {
"include_in_leaderboard": true,
"weak_signal_risk": false
},
"rescue": {
"candidate": false,
"eligible": false,
"decision": "not_candidate",
"required_artifacts_ok": true
}
},
"tests": {
"outcome": "pass",
"strength": "strong",
"commands": [
"env PATH=/root/.cargo/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin cargo test --all-features"
],
"command_results": [
{
"command": "env PATH=/root/.cargo/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin cargo test --all-features",
"gold_outcome": "pass",
"gold_tb_trial_name": "flux-pr-2170.1-of-1.validate-gold-1772832843845157000",
"gold_failure_mode": "unset",
"agent_outcome": "pass",
"agent_tb_trial_name": "flux-pr-2170.1-of-1.validate-override-1772833021932044000",
"agent_failure_mode": "unset",
"agent_retried": false
}
],
"partial_score": {
"score": 1,
"numerator": 1,
"denominator": 1,
"level": "command",
"provenance": "fallback_command_level",
"reason": "test_case_detail_unavailable"
},
"tb": {
"trial_name": "flux-pr-2170.1-of-1.2026-02-28__18-27-22__gpt-5-4",
"is_resolved": false,
"failure_mode": "unset",
"duration_sec": 347.587282875,
"time_budget_sec": 1200
},
"cache": {
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655"
}
},
"quality": {
"equivalence": {
"status": "used",
"outcome": "equivalent",
"evaluator_model": "gpt-5.3-codex",
"confidence": 0.93,
"issue_type": "stylistic"
},
"code_review": {
"status": "used",
"signal": "fail",
"evaluator_model": "gpt-5.3-codex",
"confidence": 0.82,
"issue_type": "unclear",
"source_field": "code_review",
"dimension_scores": {
"correctness": {
"score_0_4": 1,
"confidence_0_1": 0.84,
"rationale": "Core features were added, but there are likely behavioral regressions versus intended syntax compatibility: Databricks STRUCT now requires colon-only fields, and OPTIMIZE clause ordering was changed in both parse and display paths."
},
"edge_case_handling": {
"score_0_4": 1,
"confidence_0_1": 0.78,
"rationale": "Edge compatibility appears weak: the Databricks STRUCT parser does not accept non-colon form, and OPTIMIZE parsing order can reject or reformat mixed option sequences unexpectedly."
},
"introduced_bug_risk": {
"score_0_4": 1,
"confidence_0_1": 0.8,
"rationale": "The patch introduces dialect-specific parsing branches and reordered OPTIMIZE components, increasing risk of regressions for previously accepted forms and cross-dialect behavior."
},
"maintainability_idioms": {
"score_0_4": 2,
"confidence_0_1": 0.72,
"rationale": "Implementation is readable but introduces separate one-off parsing functions/fields where existing shared helpers could be extended, which increases maintenance surface and inconsistency."
}
},
"findings": [
{
"id": "f1",
"dimension": "correctness",
"severity": "major",
"title": "Databricks STRUCT parsing became colon-only and likely regresses accepted forms",
"detail": "The new Databricks-specific struct field parser requires `field_name: field_type` via `expect_token(Token::Colon)`, so `STRUCT<a INT>` is no longer accepted under Databricks in this path. The intended change is additive support for colon syntax, not necessarily replacement of existing form.",
"confidence_0_1": 0.88,
"citations": [
{
"path": "app/src/parser/mod.rs",
"side": "new",
"start_line": 3415,
"end_line": 3438,
"anchor_kind": "diff",
"head_sha": "0c63e73e58043e382297893b414f7d3760b49a3a"
},
{
"path": "app/src/parser/mod.rs",
"side": "new",
"start_line": 11969,
"end_line": 11978,
"anchor_kind": "diff",
"head_sha": "0c63e73e58043e382297893b414f7d3760b49a3a"
}
]
},
{
"id": "f2",
"dimension": "edge_case_handling",
"severity": "major",
"title": "OPTIMIZE option ordering changed and may reject/reshape valid sequences",
"detail": "Parser now reads `WHERE`/`ZORDER BY` before `FINAL`/`DEDUPLICATE`, and formatter emits that order too. This can break compatibility for sequences where ClickHouse options and Databricks options coexist or are round-tripped in expected order.",
"confidence_0_1": 0.8,
"citations": [
{
"path": "app/src/parser/mod.rs",
"side": "new",
"start_line": 18553,
"end_line": 18581,
"anchor_kind": "diff",
"head_sha": "0c63e73e58043e382297893b414f7d3760b49a3a"
},
{
"path": "app/src/ast/mod.rs",
"side": "new",
"start_line": 6088,
"end_line": 6104,
"anchor_kind": "diff",
"head_sha": "0c63e73e58043e382297893b414f7d3760b49a3a"
}
]
},
{
"id": "f3",
"dimension": "maintainability_idioms",
"severity": "minor",
"title": "Databricks-only branch duplicates struct-field parsing logic instead of extending shared parser",
"detail": "A separate `parse_databricks_struct_field_def` function and special data-type branch were added, creating divergent behavior from existing shared struct parsing and increasing long-term maintenance burden.",
"confidence_0_1": 0.71,
"citations": [
{
"path": "app/src/parser/mod.rs",
"side": "new",
"start_line": 3415,
"end_line": 3438,
"anchor_kind": "diff",
"head_sha": "0c63e73e58043e382297893b414f7d3760b49a3a"
},
{
"path": "app/src/parser/mod.rs",
"side": "new",
"start_line": 11969,
"end_line": 11978,
"anchor_kind": "diff",
"head_sha": "0c63e73e58043e382297893b414f7d3760b49a3a"
}
]
}
],
"overall": {
"overall_score_0_100": 30.000000000000004,
"signal": "fail",
"confidence_overall": 0.82,
"summary": "The patch partially implements the requested Databricks features, but likely does not fully satisfy intended behavior due to STRUCT compatibility regression and OPTIMIZE ordering differences, making it unlikely to match the target change cleanly."
}
},
"behavioral_robustness": {
"status": "used",
"probe_status": "used",
"gold_pass_candidate_pass_count": 1,
"agreement_rate": 1,
"coverage_delta_status": "unavailable",
"coverage_delta_reason": "coverage_adapter_unavailable",
"mutation_lite_status": "unavailable",
"mutation_lite_reason": "mutation_runner_unavailable",
"notes": "derived from command-level outcomes; no accepted generated probes"
},
"footprint_risk": {
"status": "used",
"reason": "none",
"version": "v1",
"level": "low",
"score": 0.3115149921328357,
"flag": false,
"severe_flag": false
},
"cost": {
"status": "tokens_only",
"token_status": "present",
"cache_token_status": "present",
"total_input_tokens": 1142749,
"total_output_tokens": 11557,
"total_tokens": 1154306,
"uncached_input_tokens": 74717,
"cache_read_input_tokens": 1068032,
"cached_input_tokens": 1068032,
"total_cost_usd": 0.7759060000000001,
"token_source": "openai_cached_tokens_usage",
"pricing_model_key": "gpt-5.4",
"pricing_mode": "cache_aware"
}
},
"summary": {
"task": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": "lane_unknown",
"lane_report_source": "lane_unknown",
"lane_report_reasons": [
"lane_unknown"
],
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
"equivalence_status": "used",
"equivalence_outcome": "equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 1142749,
"tb_total_output_tokens": 11557,
"tb_total_tokens": 1154306,
"tb_uncached_input_tokens": 74717,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 1068032,
"tb_cached_input_tokens": 1068032,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 0.7759060000000001,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.4",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 1,
"partial_score_denominator": 1,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "low",
"footprint_risk_score": 0.3115149921328357,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"run": {
"model": "gpt-5.4",
"requested_model": "gpt-5.4",
"run_id": "2026-02-28__18-27-22__gpt-5-4",
"passRate": 1,
"ciLow": 1,
"ciHigh": 1,
"effectiveN": 30,
"tier": 1,
"validation_counts": {
"fail_guardrail": 0,
"fail_high_conf": 0,
"fail_infra": 0,
"fail_likely_equiv": 0,
"fail_with_diag": 0,
"missing": 0,
"pass": 5,
"pass_with_warn": 25
}
},
"run_metadata": {
"agent_name": "codex",
"model_name": "gpt-5.4",
"start_time": "2026-03-06T15:25:43.932417+00:00",
"end_time": "2026-03-06T17:16:29.393831+00:00",
"commit_hash": "8f1bc9fe2c3cb8a14c254de41e9577a2ce352ac2",
"accuracy": 0.2,
"reasoning_effort": "medium"
}
},
"evidence": {
"task_detail": {
"path": "validation/gpt-5-4/flux-pr-2170/task_detail.v1.json",
"exists": true
},
"trajectory": {
"path": "validation/gpt-5-4/flux-pr-2170/trajectory.v1.json",
"exists": true,
"non_empty": true,
"size_bytes": 15278
},
"validation": {
"path": "validation/gpt-5-4/flux-pr-2170/validation.json",
"exists": true,
"non_empty": true,
"size_bytes": 113529
},
"results": {
"path": "runs/2026-02-28__18-27-22__gpt-5-4/results.json",
"exists": true,
"non_empty": true,
"size_bytes": 60973
},
"run_metadata": {
"path": "runs/2026-02-28__18-27-22__gpt-5-4/run_metadata.json",
"exists": true,
"non_empty": true,
"size_bytes": 1635
},
"agent_patch": {
"path": "runs/2026-02-28__18-27-22__gpt-5-4/flux-pr-2170/flux-pr-2170.1-of-1.2026-02-28__18-27-22__gpt-5-4/agent-logs/agent.patch",
"exists": true,
"non_empty": true,
"size_bytes": 11171
},
"summary": {
"path": "reports/summary.json",
"exists": true,
"non_empty": true,
"size_bytes": 280264
},
"manifest": {
"path": "manifest.json",
"exists": true,
"non_empty": true,
"size_bytes": 675
},
"validation_inputs": {
"required_for_rescue_ok": true,
"gold_patch": {
"exists": true,
"non_empty": true,
"size_bytes": 10064,
"sha256": "0f3900eca772989b3058acc763eb9125a420f4a5f12383b3cc74cb9ef1a82d71",
"required_for_rescue": true
},
"repo_tarball": {
"exists": true,
"non_empty": true,
"size_bytes": 906065,
"sha256": "c28decd7d8499adbb9f707fcd926b814e1744a23e0ece7f5d08eef482c8810df",
"required_for_rescue": true
},
"agent_patch": {
"path": "runs/2026-02-28__18-27-22__gpt-5-4/flux-pr-2170/flux-pr-2170.1-of-1.2026-02-28__18-27-22__gpt-5-4/agent-logs/agent.patch",
"exists": true,
"non_empty": true,
"size_bytes": 11171,
"sha256": "a6131c892a0b63d4c439385a5eafe12af305ad3da359a5f301564d91125ffad9",
"required_for_rescue": true
}
}
},
"provenance": {
"identity": [
{
"source": "manifest",
"available": true
},
{
"source": "tb_results",
"available": true
}
],
"decision": [
{
"source": "validation",
"available": true
}
],
"tests": [
{
"source": "validation",
"available": true
},
{
"source": "tb_results",
"available": true
}
],
"quality": {
"equivalence": [
{
"source": "validation",
"available": true
}
],
"code_review": [
{
"source": "validation",
"available": true
}
],
"behavioral_robustness": [
{
"source": "validation",
"available": true
}
],
"footprint_risk": [
{
"source": "validation",
"available": true
}
],
"cost": [
{
"source": "validation",
"available": true
},
{
"source": "summary",
"available": true
}
]
},
"summary": {
"task": [
{
"source": "summary",
"available": true
}
],
"run": [
{
"source": "summary",
"available": true
}
],
"run_metadata": [
{
"source": "run_metadata",
"available": true
}
]
},
"evidence": [
{
"source": "filesystem",
"available": true
}
]
}
}