STET

task_detail

validation/gpt-5-4/flux-pr-1501/task_detail.v1.json

16801 bytes

Back to adjudication
{
  "version": "v1",
  "materialization": {
    "mode": "canonical"
  },
  "identity": {
    "variant_id": "gpt-5.4",
    "requested_model": "gpt-5.4",
    "model_key": "gpt-5-4",
    "run_id": "2026-02-28__18-27-22__gpt-5-4",
    "task_id": "flux-pr-1501",
    "trial_name": "flux-pr-1501.1-of-1.2026-02-28__18-27-22__gpt-5-4"
  },
  "decision": {
    "status": "pass",
    "primary_signal": "tests",
    "matrix_status": "pass_with_warn",
    "reason_codes": [
      "primary_tests",
      "tests_passed",
      "equivalence_non_equivalent",
      "decision_conflict",
      "review_fail"
    ],
    "flags": [
      "equiv_warn",
      "review_warn"
    ],
    "lane_report": {
      "lane": "lane_unknown",
      "source": "lane_unknown",
      "reasons": [
        "lane_unknown"
      ]
    },
    "publish": {
      "include_in_leaderboard": true,
      "weak_signal_risk": false
    },
    "rescue": {
      "candidate": false,
      "eligible": false,
      "decision": "not_candidate",
      "required_artifacts_ok": true
    }
  },
  "tests": {
    "outcome": "pass",
    "strength": "strong",
    "commands": [
      "env PATH=/root/.cargo/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin cargo test --all-features"
    ],
    "command_results": [
      {
        "command": "env PATH=/root/.cargo/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin cargo test --all-features",
        "gold_outcome": "pass",
        "gold_tb_trial_name": "flux-pr-1501.1-of-1.validate-gold-1772831133907885000",
        "gold_failure_mode": "unset",
        "agent_outcome": "pass",
        "agent_tb_trial_name": "flux-pr-1501.1-of-1.validate-override-1772831312941335000",
        "agent_failure_mode": "unset",
        "agent_retried": false
      }
    ],
    "partial_score": {
      "score": 1,
      "numerator": 1,
      "denominator": 1,
      "level": "command",
      "provenance": "fallback_command_level",
      "reason": "test_case_detail_unavailable"
    },
    "tb": {
      "trial_name": "flux-pr-1501.1-of-1.2026-02-28__18-27-22__gpt-5-4",
      "is_resolved": false,
      "failure_mode": "unset",
      "duration_sec": 334.003114125,
      "time_budget_sec": 1200
    },
    "cache": {
      "cache_hit": false,
      "cache_miss_reason": "missing_pinned_dependencies",
      "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655"
    }
  },
  "quality": {
    "equivalence": {
      "status": "used",
      "outcome": "non_equivalent",
      "evaluator_model": "gpt-5.3-codex",
      "confidence": 0.82,
      "issue_type": "behavioral"
    },
    "code_review": {
      "status": "used",
      "signal": "fail",
      "evaluator_model": "gpt-5.3-codex",
      "confidence": 0.9,
      "issue_type": "unclear",
      "source_field": "code_review",
      "dimension_scores": {
        "correctness": {
          "score_0_4": 0,
          "confidence_0_1": 0.94,
          "rationale": "The patch shape diverges from the intended implementation and appears internally inconsistent: SHOW AST variants were collapsed to only `show_options` without adding required Snowflake modifier fields, while parser call sites were changed to new signatures/behavior not aligned with the target design."
        },
        "edge_case_handling": {
          "score_0_4": 1,
          "confidence_0_1": 0.86,
          "rationale": "Some Snowflake scenarios were added in tests, but they target a different data model than the intended one and do not reflect the required LIKE/IN ordering hook semantics in dialect behavior."
        },
        "introduced_bug_risk": {
          "score_0_4": 0,
          "confidence_0_1": 0.9,
          "rationale": "The parser changes introduce broad modifier parsing and altered function call contracts, while dialect files only add imports rather than the needed dialect hook behavior, creating high risk of compile/runtime regressions."
        },
        "maintainability_idioms": {
          "score_0_4": 1,
          "confidence_0_1": 0.84,
          "rationale": "The patch introduces partial API churn (new imports, changed statement fields, changed parser calls) without the cohesive shared options model and dialect hook implementation, making the code harder to reason about and likely brittle."
        }
      },
      "findings": [
        {
          "id": "f1",
          "dimension": "correctness",
          "severity": "major",
          "title": "SHOW AST variants missing required Snowflake modifier fields",
          "detail": "The statement variants were simplified to only `show_options` for databases/schemas/tables/views, but the intended change requires explicit `terse/history/external/materialized` placement in the AST for correct parsing/printing behavior.",
          "confidence_0_1": 0.92,
          "citations": [
            {
              "path": "src/ast/mod.rs",
              "side": "new",
              "start_line": 2787,
              "end_line": 2808,
              "anchor_kind": "diff",
              "head_sha": "c8757576ee97b1ef7260dbf21eb6e77b6055c66a"
            }
          ]
        },
        {
          "id": "f2",
          "dimension": "correctness",
          "severity": "major",
          "title": "Parser modifier handling diverges from target SHOW grammar",
          "detail": "The parser now globally parses `TERSE`/`EXTERNAL` and wires them into calls like `parse_show_columns(..., terse)`, which does not match the intended grammar split and is likely incompatible with existing parser function contracts.",
          "confidence_0_1": 0.91,
          "citations": [
            {
              "path": "src/parser/mod.rs",
              "side": "new",
              "start_line": 9613,
              "end_line": 9638,
              "anchor_kind": "diff",
              "head_sha": "c8757576ee97b1ef7260dbf21eb6e77b6055c66a"
            }
          ]
        },
        {
          "id": "f3",
          "dimension": "edge_case_handling",
          "severity": "major",
          "title": "Dialect LIKE-before-IN hook not implemented",
          "detail": "The change set adds only imports in dialect modules instead of introducing/overriding the dialect capability for `SHOW ... LIKE ... IN ...` ordering, so Snowflake-specific ordering behavior is not properly modeled.",
          "confidence_0_1": 0.86,
          "citations": [
            {
              "path": "src/dialect/mod.rs",
              "side": "new",
              "start_line": 49,
              "end_line": 49,
              "anchor_kind": "diff",
              "head_sha": "c8757576ee97b1ef7260dbf21eb6e77b6055c66a"
            },
            {
              "path": "src/dialect/snowflake.rs",
              "side": "new",
              "start_line": 25,
              "end_line": 29,
              "anchor_kind": "diff",
              "head_sha": "c8757576ee97b1ef7260dbf21eb6e77b6055c66a"
            }
          ]
        },
        {
          "id": "f4",
          "dimension": "maintainability_idioms",
          "severity": "major",
          "title": "Tests target an inconsistent options model",
          "detail": "New Snowflake tests assert fields/types (`scope`, `ShowStatementLimit`, `Default` patterns) that do not align with the intended shared options design, indicating drift between parser/AST contracts and tests.",
          "confidence_0_1": 0.83,
          "citations": [
            {
              "path": "tests/sqlparser_snowflake.rs",
              "side": "new",
              "start_line": 114,
              "end_line": 211,
              "anchor_kind": "diff",
              "head_sha": "c8757576ee97b1ef7260dbf21eb6e77b6055c66a"
            },
            {
              "path": "tests/sqlparser_mysql.rs",
              "side": "new",
              "start_line": 229,
              "end_line": 241,
              "anchor_kind": "diff",
              "head_sha": "c8757576ee97b1ef7260dbf21eb6e77b6055c66a"
            }
          ]
        }
      ],
      "overall": {
        "overall_score_0_100": 10,
        "signal": "fail",
        "confidence_overall": 0.9,
        "summary": "The agent patch likely does not satisfy the intended PR: it only partially refactors SHOW statements, misses the dialect hook implementation pattern, and introduces parser/test model inconsistencies that strongly suggest breakage."
      }
    },
    "behavioral_robustness": {
      "status": "used",
      "probe_status": "used",
      "gold_pass_candidate_pass_count": 1,
      "agreement_rate": 1,
      "coverage_delta_status": "unavailable",
      "coverage_delta_reason": "coverage_adapter_unavailable",
      "mutation_lite_status": "unavailable",
      "mutation_lite_reason": "mutation_runner_unavailable",
      "notes": "derived from command-level outcomes; no accepted generated probes"
    },
    "footprint_risk": {
      "status": "used",
      "reason": "none",
      "version": "v1",
      "level": "medium",
      "score": 0.34456698283904336,
      "flag": false,
      "severe_flag": false
    },
    "cost": {
      "status": "tokens_only",
      "token_status": "present",
      "cache_token_status": "present",
      "total_input_tokens": 2166850,
      "total_output_tokens": 26245,
      "total_tokens": 2193095,
      "uncached_input_tokens": 67266,
      "cache_read_input_tokens": 2099584,
      "cached_input_tokens": 2099584,
      "total_cost_usd": 1.394284,
      "token_source": "openai_cached_tokens_usage",
      "pricing_model_key": "gpt-5.4",
      "pricing_mode": "cache_aware"
    }
  },
  "summary": {
    "task": {
      "matrix_status": "pass_with_warn",
      "tests_outcome": "pass",
      "tests_unknown_cause": null,
      "lane_report_lane": "lane_unknown",
      "lane_report_source": "lane_unknown",
      "lane_report_reasons": [
        "lane_unknown"
      ],
      "cache_hit": false,
      "cache_miss_reason": "missing_pinned_dependencies",
      "setup_ms_saved": null,
      "pinned_bytes": null,
      "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
      "equivalence_status": "used",
      "equivalence_outcome": "non_equivalent",
      "code_review_status": "used",
      "code_review_signal": "fail",
      "behavioral_robustness_status": "used",
      "coverage_delta_status": "unavailable",
      "mutation_lite_status": "unavailable",
      "probe_accepted_candidates": null,
      "probe_accepted_commands": null,
      "probe_agreement_rate": 1,
      "probe_gold_pass_candidate_pass_count": 1,
      "probe_gold_pass_candidate_fail_count": null,
      "probe_review_required_count": null,
      "flags": [
        "equiv_warn",
        "review_warn"
      ],
      "tb_is_resolved": false,
      "tb_failure_mode": "unset",
      "tb_total_input_tokens": 2166850,
      "tb_total_output_tokens": 26245,
      "tb_total_tokens": 2193095,
      "tb_uncached_input_tokens": 67266,
      "tb_cache_creation_input_tokens": null,
      "tb_cache_read_input_tokens": 2099584,
      "tb_cached_input_tokens": 2099584,
      "token_status": "present",
      "cache_token_status": "present",
      "token_source": "openai_cached_tokens_usage",
      "task_cost": 1.394284,
      "cost_status": "present",
      "cost_pricing_mode": "cache_aware",
      "pricing_model_key": "gpt-5.4",
      "equiv_rescue_policy": "on",
      "rescue_candidate": false,
      "rescue_eligible": false,
      "rescue_decision": "not_candidate",
      "publish_include_in_leaderboard": true,
      "publish_exclusion_reasons": [],
      "publish_weak_signal_risk": false,
      "tests_only_outcome": 1,
      "rescue_aware_outcome": 1,
      "partial_score": 1,
      "partial_score_numerator": 1,
      "partial_score_denominator": 1,
      "partial_score_level": "command",
      "partial_score_provenance": "fallback_command_level",
      "partial_score_reason": "test_case_detail_unavailable",
      "partial_score_unknown_count": 0,
      "footprint_risk_status": "used",
      "footprint_risk_reason": "none",
      "footprint_risk_level": "medium",
      "footprint_risk_score": 0.34456698283904336,
      "footprint_risk_flag": false,
      "footprint_risk_severe_flag": false
    },
    "run": {
      "model": "gpt-5.4",
      "requested_model": "gpt-5.4",
      "run_id": "2026-02-28__18-27-22__gpt-5-4",
      "passRate": 1,
      "ciLow": 1,
      "ciHigh": 1,
      "effectiveN": 30,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 0,
        "fail_high_conf": 0,
        "fail_infra": 0,
        "fail_likely_equiv": 0,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 5,
        "pass_with_warn": 25
      }
    },
    "run_metadata": {
      "agent_name": "codex",
      "model_name": "gpt-5.4",
      "start_time": "2026-03-06T15:25:43.932417+00:00",
      "end_time": "2026-03-06T17:16:29.393831+00:00",
      "commit_hash": "8f1bc9fe2c3cb8a14c254de41e9577a2ce352ac2",
      "accuracy": 0.2,
      "reasoning_effort": "medium"
    }
  },
  "evidence": {
    "task_detail": {
      "path": "validation/gpt-5-4/flux-pr-1501/task_detail.v1.json",
      "exists": true
    },
    "trajectory": {
      "path": "validation/gpt-5-4/flux-pr-1501/trajectory.v1.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 15168
    },
    "validation": {
      "path": "validation/gpt-5-4/flux-pr-1501/validation.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 172156
    },
    "results": {
      "path": "runs/2026-02-28__18-27-22__gpt-5-4/results.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 60973
    },
    "run_metadata": {
      "path": "runs/2026-02-28__18-27-22__gpt-5-4/run_metadata.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 1635
    },
    "agent_patch": {
      "path": "runs/2026-02-28__18-27-22__gpt-5-4/flux-pr-1501/flux-pr-1501.1-of-1.2026-02-28__18-27-22__gpt-5-4/agent-logs/agent.patch",
      "exists": true,
      "non_empty": true,
      "size_bytes": 38606
    },
    "summary": {
      "path": "reports/summary.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 280264
    },
    "manifest": {
      "path": "manifest.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 675
    },
    "validation_inputs": {
      "required_for_rescue_ok": true,
      "gold_patch": {
        "exists": true,
        "non_empty": true,
        "size_bytes": 24277,
        "sha256": "f470764b585efb5e24f44b15e8c24936be583c80dc1108eab53b56848a19a4e7",
        "required_for_rescue": true
      },
      "repo_tarball": {
        "exists": true,
        "non_empty": true,
        "size_bytes": 426390,
        "sha256": "dd54d6ddf25e6d65b697a7577c055c0377e5a0d0a51acf1e5cae5fa84b84d326",
        "required_for_rescue": true
      },
      "agent_patch": {
        "path": "runs/2026-02-28__18-27-22__gpt-5-4/flux-pr-1501/flux-pr-1501.1-of-1.2026-02-28__18-27-22__gpt-5-4/agent-logs/agent.patch",
        "exists": true,
        "non_empty": true,
        "size_bytes": 38606,
        "sha256": "89fe9f476cc9349a4c5a80f9e103629355e5fe1b94d921f7df0e9039726e5b1f",
        "required_for_rescue": true
      }
    }
  },
  "provenance": {
    "identity": [
      {
        "source": "manifest",
        "available": true
      },
      {
        "source": "tb_results",
        "available": true
      }
    ],
    "decision": [
      {
        "source": "validation",
        "available": true
      }
    ],
    "tests": [
      {
        "source": "validation",
        "available": true
      },
      {
        "source": "tb_results",
        "available": true
      }
    ],
    "quality": {
      "equivalence": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "code_review": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "behavioral_robustness": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "footprint_risk": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "cost": [
        {
          "source": "validation",
          "available": true
        },
        {
          "source": "summary",
          "available": true
        }
      ]
    },
    "summary": {
      "task": [
        {
          "source": "summary",
          "available": true
        }
      ],
      "run": [
        {
          "source": "summary",
          "available": true
        }
      ],
      "run_metadata": [
        {
          "source": "run_metadata",
          "available": true
        }
      ]
    },
    "evidence": [
      {
        "source": "filesystem",
        "available": true
      }
    ]
  }
}