STET

task_detail

validation/gpt-5-4/flux-pr-1184/task_detail.v1.json

16233 bytes

Back to adjudication
{
  "version": "v1",
  "materialization": {
    "mode": "canonical"
  },
  "identity": {
    "variant_id": "gpt-5.4",
    "requested_model": "gpt-5.4",
    "model_key": "gpt-5-4",
    "run_id": "2026-02-28__13-38-10__gpt-5-4",
    "task_id": "flux-pr-1184",
    "trial_name": "flux-pr-1184.1-of-1.2026-02-28__13-38-10__gpt-5-4"
  },
  "decision": {
    "status": "pass",
    "primary_signal": "tests",
    "matrix_status": "pass_with_warn",
    "reason_codes": [
      "primary_tests",
      "tests_passed",
      "equivalence_non_equivalent",
      "decision_conflict",
      "review_unsure"
    ],
    "flags": [
      "equiv_warn",
      "review_warn"
    ],
    "lane_report": {
      "lane": "lane_unknown",
      "source": "lane_unknown",
      "reasons": [
        "lane_unknown"
      ]
    },
    "publish": {
      "include_in_leaderboard": true,
      "weak_signal_risk": false
    },
    "rescue": {
      "candidate": false,
      "eligible": false,
      "decision": "not_candidate",
      "required_artifacts_ok": true
    }
  },
  "tests": {
    "outcome": "pass",
    "strength": "strong",
    "commands": [
      "go test -C v2 ./... -count=1 -timeout=300s"
    ],
    "command_results": [
      {
        "command": "go test -C v2 ./... -count=1 -timeout=300s",
        "gold_outcome": "pass",
        "gold_tb_trial_name": "flux-pr-1184.1-of-1.validate-gold-1772829846286210000",
        "gold_failure_mode": "unset",
        "agent_outcome": "pass",
        "agent_tb_trial_name": "flux-pr-1184.1-of-1.validate-override-1772829945137552000",
        "agent_failure_mode": "unset",
        "agent_retried": false
      }
    ],
    "partial_score": {
      "score": 1,
      "numerator": 1,
      "denominator": 1,
      "level": "command",
      "provenance": "fallback_command_level",
      "reason": "test_case_detail_unavailable"
    },
    "tb": {
      "trial_name": "flux-pr-1184.1-of-1.2026-02-28__13-38-10__gpt-5-4",
      "is_resolved": true,
      "failure_mode": "unset",
      "duration_sec": 194.069653375,
      "time_budget_sec": 1200
    },
    "cache": {
      "cache_hit": false,
      "cache_miss_reason": "missing_pinned_dependencies",
      "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c"
    }
  },
  "quality": {
    "equivalence": {
      "status": "used",
      "outcome": "non_equivalent",
      "evaluator_model": "gpt-5.3-codex",
      "confidence": 0.72,
      "issue_type": "behavioral"
    },
    "code_review": {
      "status": "used",
      "signal": "unsure",
      "evaluator_model": "gpt-5.3-codex",
      "confidence": 0.8,
      "issue_type": "unclear",
      "source_field": "code_review",
      "dimension_scores": {
        "correctness": {
          "score_0_4": 2,
          "confidence_0_1": 0.83,
          "rationale": "The patch addresses core pieces (entity filtering fix and union-fragment recursion), but it appears to miss required rewrite behavior in important paths, including union-only fragment cases and full flattening semantics for nested object fragments."
        },
        "edge_case_handling": {
          "score_0_4": 2,
          "confidence_0_1": 0.79,
          "rationale": "Nested union fragments are added, but edge cases around union-only inline fragments and nested object-fragment normalization are not fully handled."
        },
        "introduced_bug_risk": {
          "score_0_4": 2,
          "confidence_0_1": 0.76,
          "rationale": "The new recursive cleanup logic is substantial and mixes parent-valid-type intersections with existing flow, but rewrite gating and flatten behavior are still inconsistent, increasing risk of subtle planner regressions."
        },
        "maintainability_idioms": {
          "score_0_4": 3,
          "confidence_0_1": 0.82,
          "rationale": "Code is generally idiomatic and introduces reusable helpers, but duplicated/parallel cleanup and flatten paths remain difficult to reason about and diverge from the intended simplification."
        }
      },
      "findings": [
        {
          "id": "f1",
          "dimension": "correctness",
          "severity": "major",
          "title": "Union-only inline fragments can skip rewrite due to existing early-return flow",
          "detail": "The union cleanup check was added, but it sits after the pre-existing flow where union rewrite decisions may return before evaluating union fragments when no interface fragments are present. This can miss required rewrites for selections that rely on union nested fragments.",
          "confidence_0_1": 0.8,
          "citations": [
            {
              "path": "v2/pkg/engine/plan/abstract_selection_rewriter.go",
              "side": "new",
              "start_line": 177,
              "end_line": 184,
              "anchor_kind": "diff",
              "head_sha": "ae9f57aca01c7c1e3b8175c487c46b8bb96fb0a9"
            }
          ]
        },
        {
          "id": "f2",
          "dimension": "correctness",
          "severity": "major",
          "title": "Union flattening still copies object fragment selections instead of recursively normalizing them",
          "detail": "In `flattenFragmentOnUnion`, object inline fragments are copied directly (`CopySelection`) rather than recursively flattened through nested abstract fragments. This can preserve invalid nested structure instead of rewriting to concrete-type fragments.",
          "confidence_0_1": 0.86,
          "citations": [
            {
              "path": "v2/pkg/engine/plan/abstract_selection_rewriter.go",
              "side": "new",
              "start_line": 472,
              "end_line": 477,
              "anchor_kind": "diff",
              "head_sha": "ae9f57aca01c7c1e3b8175c487c46b8bb96fb0a9"
            },
            {
              "path": "v2/pkg/engine/plan/abstract_selection_rewriter.go",
              "side": "new",
              "start_line": 454,
              "end_line": 492,
              "anchor_kind": "diff",
              "head_sha": "ae9f57aca01c7c1e3b8175c487c46b8bb96fb0a9"
            }
          ]
        },
        {
          "id": "f3",
          "dimension": "edge_case_handling",
          "severity": "major",
          "title": "Top-level object-fragment cleanup path still may not enforce parent-valid-type membership",
          "detail": "The helper now supports parent-valid-type checks, but the main interface/union rewrite decision flow was not fully switched to the new object cleanup strategy, leaving paths that still rely on datasource existence checks and can miss parent-membership filtering.",
          "confidence_0_1": 0.74,
          "citations": [
            {
              "path": "v2/pkg/engine/plan/abstract_selection_rewriter.go",
              "side": "new",
              "start_line": 360,
              "end_line": 368,
              "anchor_kind": "diff",
              "head_sha": "ae9f57aca01c7c1e3b8175c487c46b8bb96fb0a9"
            },
            {
              "path": "v2/pkg/engine/plan/abstract_selection_rewriter_helpers.go",
              "side": "new",
              "start_line": 195,
              "end_line": 223,
              "anchor_kind": "diff",
              "head_sha": "ae9f57aca01c7c1e3b8175c487c46b8bb96fb0a9"
            }
          ]
        }
      ],
      "overall": {
        "overall_score_0_100": 55.00000000000001,
        "signal": "unsure",
        "confidence_overall": 0.8,
        "summary": "The patch is directionally correct and likely fixes several failing cases, but it does not fully match the intended rewrite semantics; key correctness gaps remain in union rewrite gating and recursive fragment flattening behavior."
      },
      "notes": "review JSON parse recovered after one retry"
    },
    "behavioral_robustness": {
      "status": "used",
      "probe_status": "used",
      "gold_pass_candidate_pass_count": 1,
      "agreement_rate": 1,
      "coverage_delta_status": "unavailable",
      "coverage_delta_reason": "coverage_adapter_unavailable",
      "mutation_lite_status": "unavailable",
      "mutation_lite_reason": "mutation_runner_unavailable",
      "notes": "derived from command-level outcomes; no accepted generated probes"
    },
    "footprint_risk": {
      "status": "used",
      "reason": "none",
      "version": "v1",
      "level": "medium",
      "score": 0.40211387838901175,
      "flag": false,
      "severe_flag": false
    },
    "cost": {
      "status": "tokens_only",
      "token_status": "present",
      "cache_token_status": "present",
      "total_input_tokens": 480777,
      "total_output_tokens": 11030,
      "total_tokens": 491807,
      "uncached_input_tokens": 51977,
      "cache_read_input_tokens": 428800,
      "cached_input_tokens": 428800,
      "total_cost_usd": 0.406594,
      "token_source": "openai_cached_tokens_usage",
      "pricing_model_key": "gpt-5.4",
      "pricing_mode": "cache_aware"
    }
  },
  "summary": {
    "task": {
      "matrix_status": "pass_with_warn",
      "tests_outcome": "pass",
      "tests_unknown_cause": null,
      "lane_report_lane": "lane_unknown",
      "lane_report_source": "lane_unknown",
      "lane_report_reasons": [
        "lane_unknown"
      ],
      "cache_hit": false,
      "cache_miss_reason": "missing_pinned_dependencies",
      "setup_ms_saved": null,
      "pinned_bytes": null,
      "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
      "equivalence_status": "used",
      "equivalence_outcome": "non_equivalent",
      "code_review_status": "used",
      "code_review_signal": "unsure",
      "behavioral_robustness_status": "used",
      "coverage_delta_status": "unavailable",
      "mutation_lite_status": "unavailable",
      "probe_accepted_candidates": null,
      "probe_accepted_commands": null,
      "probe_agreement_rate": 1,
      "probe_gold_pass_candidate_pass_count": 1,
      "probe_gold_pass_candidate_fail_count": null,
      "probe_review_required_count": null,
      "flags": [
        "equiv_warn",
        "review_warn"
      ],
      "tb_is_resolved": true,
      "tb_failure_mode": "unset",
      "tb_total_input_tokens": 480777,
      "tb_total_output_tokens": 11030,
      "tb_total_tokens": 491807,
      "tb_uncached_input_tokens": 51977,
      "tb_cache_creation_input_tokens": null,
      "tb_cache_read_input_tokens": 428800,
      "tb_cached_input_tokens": 428800,
      "token_status": "present",
      "cache_token_status": "present",
      "token_source": "openai_cached_tokens_usage",
      "task_cost": 0.406594,
      "cost_status": "present",
      "cost_pricing_mode": "cache_aware",
      "pricing_model_key": "gpt-5.4",
      "equiv_rescue_policy": "on",
      "rescue_candidate": false,
      "rescue_eligible": false,
      "rescue_decision": "not_candidate",
      "publish_include_in_leaderboard": true,
      "publish_exclusion_reasons": [],
      "publish_weak_signal_risk": false,
      "tests_only_outcome": 1,
      "rescue_aware_outcome": 1,
      "partial_score": 1,
      "partial_score_numerator": 1,
      "partial_score_denominator": 1,
      "partial_score_level": "command",
      "partial_score_provenance": "fallback_command_level",
      "partial_score_reason": "test_case_detail_unavailable",
      "partial_score_unknown_count": 0,
      "footprint_risk_status": "used",
      "footprint_risk_reason": "none",
      "footprint_risk_level": "medium",
      "footprint_risk_score": 0.40211387838901175,
      "footprint_risk_flag": false,
      "footprint_risk_severe_flag": false
    },
    "run": {
      "model": "gpt-5.4",
      "requested_model": "gpt-5.4",
      "run_id": "2026-02-28__13-38-10__gpt-5-4",
      "passRate": 0.896551724137931,
      "ciLow": 0.7586206896551724,
      "ciHigh": 1,
      "effectiveN": 29,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 1,
        "fail_high_conf": 2,
        "fail_infra": 0,
        "fail_likely_equiv": 1,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 5,
        "pass_with_warn": 20
      }
    },
    "run_metadata": {
      "agent_name": "codex",
      "model_name": "gpt-5.4",
      "start_time": "2026-03-06T11:06:36.354605+00:00",
      "end_time": "2026-03-06T13:16:38.693205+00:00",
      "commit_hash": "8f1bc9fe2c3cb8a14c254de41e9577a2ce352ac2",
      "accuracy": 0.7586206896551724,
      "reasoning_effort": "medium"
    }
  },
  "evidence": {
    "task_detail": {
      "path": "validation/gpt-5-4/flux-pr-1184/task_detail.v1.json",
      "exists": true
    },
    "trajectory": {
      "path": "validation/gpt-5-4/flux-pr-1184/trajectory.v1.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 13521
    },
    "validation": {
      "path": "validation/gpt-5-4/flux-pr-1184/validation.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 174118
    },
    "results": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-4/results.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 61729
    },
    "run_metadata": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-4/run_metadata.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 1623
    },
    "agent_patch": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-4/flux-pr-1184/flux-pr-1184.1-of-1.2026-02-28__13-38-10__gpt-5-4/agent-logs/agent.patch",
      "exists": true,
      "non_empty": true,
      "size_bytes": 16301
    },
    "summary": {
      "path": "reports/summary.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 269968
    },
    "manifest": {
      "path": "manifest.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 677
    },
    "validation_inputs": {
      "required_for_rescue_ok": true,
      "gold_patch": {
        "exists": true,
        "non_empty": true,
        "size_bytes": 23450,
        "sha256": "de907438d0c4ce9fc38e20cb9194ebbf86f18973dbbbb4a95457a9f4a80e022e",
        "required_for_rescue": true
      },
      "repo_tarball": {
        "exists": true,
        "non_empty": true,
        "size_bytes": 4281607,
        "sha256": "122b56b076b160f5c1ff920fbf5b851dae1433b3718824dc611b0858cb4a222e",
        "required_for_rescue": true
      },
      "agent_patch": {
        "path": "runs/2026-02-28__13-38-10__gpt-5-4/flux-pr-1184/flux-pr-1184.1-of-1.2026-02-28__13-38-10__gpt-5-4/agent-logs/agent.patch",
        "exists": true,
        "non_empty": true,
        "size_bytes": 16301,
        "sha256": "06d8d5888b4f1016a3d96de38fbc9150e203420e6cb69f0255ed3dc78d25c8a5",
        "required_for_rescue": true
      }
    }
  },
  "provenance": {
    "identity": [
      {
        "source": "manifest",
        "available": true
      },
      {
        "source": "tb_results",
        "available": true
      }
    ],
    "decision": [
      {
        "source": "validation",
        "available": true
      }
    ],
    "tests": [
      {
        "source": "validation",
        "available": true
      },
      {
        "source": "tb_results",
        "available": true
      }
    ],
    "quality": {
      "equivalence": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "code_review": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "behavioral_robustness": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "footprint_risk": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "cost": [
        {
          "source": "validation",
          "available": true
        },
        {
          "source": "summary",
          "available": true
        }
      ]
    },
    "summary": {
      "task": [
        {
          "source": "summary",
          "available": true
        }
      ],
      "run": [
        {
          "source": "summary",
          "available": true
        }
      ],
      "run_metadata": [
        {
          "source": "run_metadata",
          "available": true
        }
      ]
    },
    "evidence": [
      {
        "source": "filesystem",
        "available": true
      }
    ]
  }
}