STET

task_detail

validation/gpt-5-1-codex-mini/flux-pr-1501/task_detail.v1.json

17228 bytes

Back to adjudication
{
  "version": "v1",
  "materialization": {
    "mode": "canonical"
  },
  "identity": {
    "variant_id": "gpt-5.1-codex-mini",
    "requested_model": "gpt-5.1-codex-mini",
    "model_key": "gpt-5-1-codex-mini",
    "run_id": "2026-02-28__18-27-22__gpt-5-1-codex-mini",
    "task_id": "flux-pr-1501",
    "trial_name": "flux-pr-1501.1-of-1.2026-02-28__18-27-22__gpt-5-1-codex-mini"
  },
  "decision": {
    "status": "pass",
    "primary_signal": "tests",
    "matrix_status": "pass_with_warn",
    "reason_codes": [
      "primary_tests",
      "tests_passed",
      "equivalence_non_equivalent",
      "review_fail"
    ],
    "flags": [
      "equiv_warn",
      "review_warn"
    ],
    "lane_report": {
      "lane": "lane_unknown",
      "source": "lane_unknown",
      "reasons": [
        "lane_unknown"
      ]
    },
    "publish": {
      "include_in_leaderboard": true,
      "weak_signal_risk": false
    },
    "rescue": {
      "candidate": false,
      "eligible": false,
      "decision": "not_candidate",
      "required_artifacts_ok": true
    }
  },
  "tests": {
    "outcome": "pass",
    "strength": "strong",
    "commands": [
      "env PATH=/root/.cargo/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin cargo test --all-features"
    ],
    "command_results": [
      {
        "command": "env PATH=/root/.cargo/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin cargo test --all-features",
        "gold_outcome": "pass",
        "gold_tb_trial_name": "flux-pr-1501.1-of-1.validate-gold-1772315762541551000",
        "gold_failure_mode": "unset",
        "agent_outcome": "pass",
        "agent_tb_trial_name": "flux-pr-1501.1-of-1.validate-override-1772315868215347000",
        "agent_failure_mode": "unset",
        "agent_retried": false
      }
    ],
    "partial_score": {
      "score": 1,
      "numerator": 1,
      "denominator": 1,
      "level": "command",
      "provenance": "fallback_command_level",
      "reason": "test_case_detail_unavailable"
    },
    "tb": {
      "trial_name": "flux-pr-1501.1-of-1.2026-02-28__18-27-22__gpt-5-1-codex-mini",
      "is_resolved": false,
      "failure_mode": "unset",
      "duration_sec": 212.843028833,
      "time_budget_sec": 1200
    },
    "cache": {
      "cache_hit": false,
      "cache_miss_reason": "missing_pinned_dependencies",
      "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655"
    }
  },
  "quality": {
    "equivalence": {
      "status": "used",
      "outcome": "non_equivalent",
      "confidence": 0.83,
      "issue_type": "behavioral",
      "notes": "The patch appears to only partially implement the SHOW unification/snowflake support. From the diff shown, `SHOW COLUMNS` still keeps a required `table_name` field (not fully moved into shared options), and Snowflake-specific modifiers/scoping (e.g., `TERSE/HISTORY/EXTERNAL` and full `IN/FROM` parent forms across all SHOW variants) are not clearly integrated in the statement variants like in the intended change. The dialect hook is added, but core behavior for fully supporting Snowflake SHOW DATABASES/SCHEMAS/TABLES/VIEWS/COLUMNS via a unified options model looks incomplete."
    },
    "code_review": {
      "status": "used",
      "signal": "fail",
      "confidence": 0.94,
      "issue_type": "unclear",
      "source_field": "code_review",
      "dimension_scores": {
        "correctness": {
          "score_0_4": 0,
          "confidence_0_1": 0.97,
          "rationale": "The patch appears internally inconsistent and likely non-compiling: it introduces new AST fields and parser calls without corresponding complete parser/AST/type updates."
        },
        "edge_case_handling": {
          "score_0_4": 1,
          "confidence_0_1": 0.87,
          "rationale": "The intended Snowflake edge cases (TERSE/HISTORY/EXTERNAL, STARTS WITH, LIMIT/FROM, LIKE/IN ordering) are only partially represented and not comprehensively wired into parser/keywords."
        },
        "introduced_bug_risk": {
          "score_0_4": 0,
          "confidence_0_1": 0.95,
          "rationale": "High regression risk is introduced by partial refactors across core statement structs and parser logic, with no evidence of complete propagation through formatting/parsing/type definitions."
        },
        "maintainability_idioms": {
          "score_0_4": 1,
          "confidence_0_1": 0.9,
          "rationale": "The patch introduces a divergent ad-hoc model (e.g., generic `options` plus undeclared modifier/scope item types in tests) rather than the coherent shared structure expected by the task, making future extension brittle."
        }
      },
      "findings": [
        {
          "id": "f1",
          "dimension": "correctness",
          "severity": "major",
          "title": "Parser calls an apparently missing function",
          "detail": "SHOW DATABASES/SCHEMAS now call `parse_show_statement_options()`, but no corresponding implementation is shown in the patch, indicating likely compile failure or incomplete refactor.",
          "confidence_0_1": 0.96,
          "citations": [
            {
              "path": "src/parser/mod.rs",
              "side": "new",
              "start_line": 9665,
              "end_line": 9675,
              "anchor_kind": "diff",
              "head_sha": "c8757576ee97b1ef7260dbf21eb6e77b6055c66a"
            }
          ]
        },
        {
          "id": "f2",
          "dimension": "correctness",
          "severity": "major",
          "title": "SHOW AST refactor is structurally incomplete",
          "detail": "`ShowColumns` keeps `table_name` while also replacing `filter` with `options`, and multiple SHOW variants are rewritten without evidence of synchronized Display/parser/type updates, creating likely mismatches.",
          "confidence_0_1": 0.93,
          "citations": [
            {
              "path": "src/ast/mod.rs",
              "side": "new",
              "start_line": 2780,
              "end_line": 2810,
              "anchor_kind": "diff",
              "head_sha": "c8757576ee97b1ef7260dbf21eb6e77b6055c66a"
            }
          ]
        },
        {
          "id": "f3",
          "dimension": "edge_case_handling",
          "severity": "major",
          "title": "Snowflake SHOW syntax support is only partial",
          "detail": "The task requires TERSE/HISTORY/EXTERNAL and ordering-sensitive LIKE/IN plus STARTS WITH and LIMIT/FROM handling across SHOW variants, but the patch shows only limited parser entrypoint changes and dialect hook rename without full syntax plumbing.",
          "confidence_0_1": 0.88,
          "citations": [
            {
              "path": "src/parser/mod.rs",
              "side": "new",
              "start_line": 9665,
              "end_line": 9675,
              "anchor_kind": "diff",
              "head_sha": "c8757576ee97b1ef7260dbf21eb6e77b6055c66a"
            },
            {
              "path": "src/dialect/snowflake.rs",
              "side": "new",
              "start_line": 203,
              "end_line": 210,
              "anchor_kind": "diff",
              "head_sha": "c8757576ee97b1ef7260dbf21eb6e77b6055c66a"
            }
          ]
        },
        {
          "id": "f4",
          "dimension": "introduced_bug_risk",
          "severity": "major",
          "title": "New tests depend on types not evidenced in implementation",
          "detail": "Snowflake tests assert `ShowStatementModifier`, `ShowStatementScope`, `ShowScopeItem`, and `ShowStatementLimit`, but these definitions are not shown in the code changes, suggesting unresolved symbols or architecture drift.",
          "confidence_0_1": 0.94,
          "citations": [
            {
              "path": "tests/sqlparser_snowflake.rs",
              "side": "new",
              "start_line": 1247,
              "end_line": 1307,
              "anchor_kind": "diff",
              "head_sha": "c8757576ee97b1ef7260dbf21eb6e77b6055c66a"
            }
          ]
        }
      ],
      "overall": {
        "confidence_overall": 0.94,
        "summary": "The agent patch likely does not satisfy the intended change: it appears incomplete versus the required unified SHOW options model and Snowflake syntax support, and likely fails correctness/build expectations."
      },
      "notes": "The agent patch likely does not satisfy the intended change: it appears incomplete versus the required unified SHOW options model and Snowflake syntax support, and likely fails correctness/build expectations."
    },
    "behavioral_robustness": {
      "status": "used",
      "probe_status": "used",
      "gold_pass_candidate_pass_count": 1,
      "agreement_rate": 1,
      "coverage_delta_status": "unavailable",
      "coverage_delta_reason": "coverage_adapter_unavailable",
      "mutation_lite_status": "unavailable",
      "mutation_lite_reason": "mutation_runner_unavailable",
      "notes": "derived from command-level outcomes; no accepted generated probes"
    },
    "footprint_risk": {
      "status": "used",
      "reason": "none",
      "version": "v1",
      "level": "low",
      "score": 0.2995330140417937,
      "flag": false,
      "severe_flag": false
    },
    "cost": {
      "status": "tokens_only",
      "token_status": "present",
      "cache_token_status": "present",
      "total_input_tokens": 6406986,
      "total_output_tokens": 84912,
      "total_tokens": 6491898,
      "uncached_input_tokens": 802122,
      "cache_read_input_tokens": 5604864,
      "cached_input_tokens": 5604864,
      "total_cost_usd": 2.5533846000000002,
      "token_source": "openai_cached_tokens_usage",
      "pricing_model_key": "gpt-5.1-codex-mini",
      "pricing_mode": "cache_aware"
    }
  },
  "summary": {
    "task": {
      "matrix_status": "pass_with_warn",
      "tests_outcome": "pass",
      "tests_unknown_cause": null,
      "lane_report_lane": "lane_unknown",
      "lane_report_source": "lane_unknown",
      "lane_report_reasons": [
        "lane_unknown"
      ],
      "cache_hit": false,
      "cache_miss_reason": "missing_pinned_dependencies",
      "setup_ms_saved": null,
      "pinned_bytes": null,
      "environment_group_id": "eg_2d84f25b38a313fb6c13d2663529d655",
      "equivalence_status": "used",
      "equivalence_outcome": "non_equivalent",
      "code_review_status": "used",
      "code_review_signal": "fail",
      "behavioral_robustness_status": "used",
      "coverage_delta_status": "unavailable",
      "mutation_lite_status": "unavailable",
      "probe_accepted_candidates": null,
      "probe_accepted_commands": null,
      "probe_agreement_rate": 1,
      "probe_gold_pass_candidate_pass_count": 1,
      "probe_gold_pass_candidate_fail_count": null,
      "probe_review_required_count": null,
      "flags": [
        "equiv_warn",
        "review_warn"
      ],
      "tb_is_resolved": false,
      "tb_failure_mode": "unset",
      "tb_total_input_tokens": 6406986,
      "tb_total_output_tokens": 84912,
      "tb_total_tokens": 6491898,
      "tb_uncached_input_tokens": 802122,
      "tb_cache_creation_input_tokens": null,
      "tb_cache_read_input_tokens": 5604864,
      "tb_cached_input_tokens": 5604864,
      "token_status": "present",
      "cache_token_status": "present",
      "token_source": "openai_cached_tokens_usage",
      "task_cost": 2.5533846000000002,
      "cost_status": "present",
      "cost_pricing_mode": "cache_aware",
      "pricing_model_key": "gpt-5.1-codex-mini",
      "equiv_rescue_policy": "on",
      "rescue_candidate": false,
      "rescue_eligible": false,
      "rescue_decision": "not_candidate",
      "publish_include_in_leaderboard": true,
      "publish_exclusion_reasons": [],
      "publish_weak_signal_risk": false,
      "tests_only_outcome": 1,
      "rescue_aware_outcome": 1,
      "partial_score": 1,
      "partial_score_numerator": 1,
      "partial_score_denominator": 1,
      "partial_score_level": "command",
      "partial_score_provenance": "fallback_command_level",
      "partial_score_reason": "test_case_detail_unavailable",
      "partial_score_unknown_count": 0,
      "footprint_risk_status": "used",
      "footprint_risk_reason": "none",
      "footprint_risk_level": "low",
      "footprint_risk_score": 0.2995330140417937,
      "footprint_risk_flag": false,
      "footprint_risk_severe_flag": false
    },
    "run": {
      "model": "gpt-5.1-codex-mini",
      "requested_model": "gpt-5.1-codex-mini",
      "run_id": "2026-02-28__18-27-22__gpt-5-1-codex-mini",
      "passRate": 0.9333333333333333,
      "ciLow": 0.8333333333333334,
      "ciHigh": 1,
      "effectiveN": 30,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 0,
        "fail_high_conf": 1,
        "fail_infra": 0,
        "fail_likely_equiv": 1,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 3,
        "pass_with_warn": 25
      }
    },
    "run_metadata": {
      "agent_name": "codex",
      "model_name": "gpt-5.1-codex-mini",
      "start_time": "2026-02-28T20:00:12.722616+00:00",
      "end_time": "2026-02-28T21:48:26.352711+00:00",
      "commit_hash": "32dbe1950f8d0a4ed6db86745bf0a62da2ee0921",
      "accuracy": 0.26666666666666666,
      "reasoning_effort": "medium"
    }
  },
  "evidence": {
    "task_detail": {
      "path": "validation/gpt-5-1-codex-mini/flux-pr-1501/task_detail.v1.json",
      "exists": true
    },
    "trajectory": {
      "path": "validation/gpt-5-1-codex-mini/flux-pr-1501/trajectory.v1.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 15394
    },
    "validation": {
      "path": "validation/gpt-5-1-codex-mini/flux-pr-1501/validation.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 162788
    },
    "results": {
      "path": "runs/2026-02-28__18-27-22__gpt-5-1-codex-mini/results.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 61948
    },
    "run_metadata": {
      "path": "runs/2026-02-28__18-27-22__gpt-5-1-codex-mini/run_metadata.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 1673
    },
    "agent_patch": {
      "path": "runs/2026-02-28__18-27-22__gpt-5-1-codex-mini/flux-pr-1501/flux-pr-1501.1-of-1.2026-02-28__18-27-22__gpt-5-1-codex-mini/agent-logs/agent.patch",
      "exists": true,
      "non_empty": true,
      "size_bytes": 27705
    },
    "summary": {
      "path": "reports/summary.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 280264
    },
    "manifest": {
      "path": "manifest.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 675
    },
    "validation_inputs": {
      "required_for_rescue_ok": true,
      "gold_patch": {
        "exists": true,
        "non_empty": true,
        "size_bytes": 24277,
        "sha256": "f470764b585efb5e24f44b15e8c24936be583c80dc1108eab53b56848a19a4e7",
        "required_for_rescue": true
      },
      "repo_tarball": {
        "exists": true,
        "non_empty": true,
        "size_bytes": 426390,
        "sha256": "dd54d6ddf25e6d65b697a7577c055c0377e5a0d0a51acf1e5cae5fa84b84d326",
        "required_for_rescue": true
      },
      "agent_patch": {
        "path": "runs/2026-02-28__18-27-22__gpt-5-1-codex-mini/flux-pr-1501/flux-pr-1501.1-of-1.2026-02-28__18-27-22__gpt-5-1-codex-mini/agent-logs/agent.patch",
        "exists": true,
        "non_empty": true,
        "size_bytes": 27705,
        "sha256": "7e3b85dd457800ac6c4dcd64a45f8d00fef48d5da2bfca58f8e1ca8a0fa8cc1f",
        "required_for_rescue": true
      }
    }
  },
  "provenance": {
    "identity": [
      {
        "source": "manifest",
        "available": true
      },
      {
        "source": "tb_results",
        "available": true
      }
    ],
    "decision": [
      {
        "source": "validation",
        "available": true
      }
    ],
    "tests": [
      {
        "source": "validation",
        "available": true
      },
      {
        "source": "tb_results",
        "available": true
      }
    ],
    "quality": {
      "equivalence": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "code_review": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "behavioral_robustness": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "footprint_risk": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "cost": [
        {
          "source": "validation",
          "available": true
        },
        {
          "source": "summary",
          "available": true
        }
      ]
    },
    "summary": {
      "task": [
        {
          "source": "summary",
          "available": true
        }
      ],
      "run": [
        {
          "source": "summary",
          "available": true
        }
      ],
      "run_metadata": [
        {
          "source": "run_metadata",
          "available": true
        }
      ]
    },
    "evidence": [
      {
        "source": "filesystem",
        "available": true
      }
    ]
  }
}