STET

task_detail

validation/gpt-5-3-codex/flux-pr-1034/task_detail.v1.json

16984 bytes

Back to adjudication
{
  "version": "v1",
  "materialization": {
    "mode": "canonical"
  },
  "identity": {
    "variant_id": "gpt-5.3-codex",
    "requested_model": "gpt-5.3-codex",
    "model_key": "gpt-5-3-codex",
    "run_id": "2026-02-28__13-38-10__gpt-5-3-codex",
    "task_id": "flux-pr-1034",
    "trial_name": "flux-pr-1034.1-of-1.2026-02-28__13-38-10__gpt-5-3-codex"
  },
  "decision": {
    "status": "pass",
    "primary_signal": "tests",
    "matrix_status": "pass_with_warn",
    "reason_codes": [
      "primary_tests",
      "tests_passed",
      "equivalence_non_equivalent",
      "review_fail"
    ],
    "flags": [
      "equiv_warn",
      "review_warn"
    ],
    "publish": {
      "include_in_leaderboard": true,
      "weak_signal_risk": false
    },
    "rescue": {
      "candidate": false,
      "eligible": false,
      "decision": "not_candidate",
      "required_artifacts_ok": true
    }
  },
  "tests": {
    "outcome": "pass",
    "strength": "strong",
    "commands": [
      "go test -C v2 ./... -count=1 -timeout=300s"
    ],
    "command_results": [
      {
        "command": "go test -C v2 ./... -count=1 -timeout=300s",
        "gold_outcome": "pass",
        "gold_tb_trial_name": "flux-pr-1034.1-of-1.validate-gold-1772289784691540000",
        "gold_failure_mode": "unset",
        "agent_outcome": "pass",
        "agent_tb_trial_name": "flux-pr-1034.1-of-1.validate-override-1772289836488674000",
        "agent_failure_mode": "unset",
        "agent_retried": false
      }
    ],
    "partial_score": {
      "score": 1,
      "numerator": 1,
      "denominator": 1,
      "level": "command",
      "provenance": "fallback_command_level",
      "reason": "test_case_detail_unavailable"
    },
    "tb": {
      "trial_name": "flux-pr-1034.1-of-1.2026-02-28__13-38-10__gpt-5-3-codex",
      "is_resolved": true,
      "failure_mode": "unset",
      "duration_sec": 106.387112458,
      "time_budget_sec": 1200
    },
    "cache": {
      "cache_hit": false,
      "cache_miss_reason": "missing_pinned_dependencies",
      "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c"
    }
  },
  "quality": {
    "equivalence": {
      "status": "used",
      "outcome": "non_equivalent",
      "confidence": 0.93,
      "issue_type": "behavioral",
      "notes": "The patch misses core intent. It remaps variables by **variable-definition order** (`operationDefinition.VariableDefinitions.Refs`), not by **order of appearance in the operation arguments**, so structurally identical queries with different declaration order can still normalize differently (e.g. swapped definitions produce swapped variable usage). It also targets `app/pkg/graphql/*` rather than implementing the v2 pipeline changes (normalizer/validator/resolve context) shown by the task scope, so the required end-to-end canonical-name translation behavior is not fully met."
    },
    "code_review": {
      "status": "used",
      "signal": "fail",
      "confidence": 0.91,
      "issue_type": "unclear",
      "source_field": "code_review",
      "dimension_scores": {
        "correctness": {
          "score_0_4": 1,
          "confidence_0_1": 0.94,
          "rationale": "The patch implements a different approach in `app/pkg/graphql` and does not add the v2 pipeline pieces required by the task (AST normalization mapper in v2, validator remap hook, resolve context remap), so it is unlikely to satisfy the intended PR behavior end-to-end."
        },
        "edge_case_handling": {
          "score_0_4": 1,
          "confidence_0_1": 0.84,
          "rationale": "Canonicalization uses variable-definition order rather than variable appearance order in the operation, which misses the stated normalization rule for structurally identical queries."
        },
        "introduced_bug_risk": {
          "score_0_4": 1,
          "confidence_0_1": 0.87,
          "rationale": "It introduces custom JSON key remap/merge logic and dual variable payload handling, increasing risk of mismatches between normalized operation variables and original client variables in execution."
        },
        "maintainability_idioms": {
          "score_0_4": 2,
          "confidence_0_1": 0.78,
          "rationale": "The patch adds substantial bespoke variable remapping logic in the request layer and engine call sites instead of using the normalization/visitor pipeline pattern expected by the codebase, making future behavior harder to reason about."
        }
      },
      "findings": [
        {
          "id": "f1",
          "dimension": "correctness",
          "severity": "major",
          "title": "Implements changes in the wrong subsystem",
          "detail": "The task targets v2 normalization/validation/resolve flow, but this patch only modifies `app/pkg/graphql` with a custom remapper. Required v2 integration points are not present, so intended behavior is likely missing where tests run.",
          "confidence_0_1": 0.96,
          "citations": [
            {
              "path": "app/pkg/graphql/variables_mapper.go",
              "side": "new",
              "start_line": 1,
              "end_line": 153,
              "anchor_kind": "diff",
              "head_sha": "93dcbff9a9eb294de12c18d5ad541150ce64f210"
            }
          ]
        },
        {
          "id": "f2",
          "dimension": "correctness",
          "severity": "major",
          "title": "Canonical naming order does not follow variable appearance",
          "detail": "New names are assigned by iterating variable definitions (`VariableDefinitions.Refs`) instead of depth-first argument occurrence order, which can produce different canonical forms for structurally identical queries.",
          "confidence_0_1": 0.9,
          "citations": [
            {
              "path": "app/pkg/graphql/variables_mapper.go",
              "side": "new",
              "start_line": 25,
              "end_line": 37,
              "anchor_kind": "diff",
              "head_sha": "93dcbff9a9eb294de12c18d5ad541150ce64f210"
            }
          ]
        },
        {
          "id": "f3",
          "dimension": "correctness",
          "severity": "major",
          "title": "Variable remap is not wired into validator/resolve context",
          "detail": "Execution engines switch input JSON source, but there is no validator remap API usage or resolve-context variable-name translation path in this patch, so validation/subgraph rendering can diverge from normalized variable names.",
          "confidence_0_1": 0.88,
          "citations": [
            {
              "path": "app/pkg/graphql/execution_engine.go",
              "side": "new",
              "start_line": 126,
              "end_line": 133,
              "anchor_kind": "diff",
              "head_sha": "93dcbff9a9eb294de12c18d5ad541150ce64f210"
            },
            {
              "path": "app/pkg/graphql/execution_engine_v2.go",
              "side": "new",
              "start_line": 259,
              "end_line": 265,
              "anchor_kind": "diff",
              "head_sha": "93dcbff9a9eb294de12c18d5ad541150ce64f210"
            }
          ]
        },
        {
          "id": "f4",
          "dimension": "introduced_bug_risk",
          "severity": "major",
          "title": "Merging original and canonical variable keys can leak inconsistent payloads",
          "detail": "The merge function preserves original client keys not present in canonical JSON, potentially sending both original and canonical keys downstream and creating ambiguous behavior across components.",
          "confidence_0_1": 0.82,
          "citations": [
            {
              "path": "app/pkg/graphql/variables_mapper.go",
              "side": "new",
              "start_line": 117,
              "end_line": 153,
              "anchor_kind": "diff",
              "head_sha": "93dcbff9a9eb294de12c18d5ad541150ce64f210"
            }
          ]
        }
      ],
      "overall": {
        "confidence_overall": 0.91,
        "summary": "The patch is unlikely to satisfy the intended PR: it implements remapping in `app/pkg/graphql` with different semantics and misses the expected v2 normalization/validation/resolve integration path."
      },
      "notes": "The patch is unlikely to satisfy the intended PR: it implements remapping in `app/pkg/graphql` with different semantics and misses the expected v2 normalization/validation/resolve integration path."
    },
    "behavioral_robustness": {
      "status": "used",
      "probe_status": "used",
      "gold_pass_candidate_pass_count": 1,
      "agreement_rate": 1,
      "coverage_delta_status": "unavailable",
      "coverage_delta_reason": "coverage_adapter_unavailable",
      "mutation_lite_status": "unavailable",
      "mutation_lite_reason": "mutation_runner_unavailable",
      "notes": "derived from command-level outcomes; no accepted generated probes"
    },
    "footprint_risk": {
      "status": "used",
      "reason": "none",
      "version": "v1",
      "level": "low",
      "score": 0.22682349504576949,
      "flag": false,
      "severe_flag": false
    },
    "cost": {
      "status": "tokens_only",
      "token_status": "present",
      "cache_token_status": "present",
      "total_input_tokens": 6923096,
      "total_output_tokens": 23474,
      "total_tokens": 6946570,
      "uncached_input_tokens": 135384,
      "cache_read_input_tokens": 6787712,
      "cached_input_tokens": 6787712,
      "total_cost_usd": 13.620768,
      "token_source": "openai_cached_tokens_usage",
      "pricing_model_key": "gpt-5.3-codex",
      "pricing_mode": "cache_aware"
    }
  },
  "summary": {
    "task": {
      "matrix_status": "pass_with_warn",
      "tests_outcome": "pass",
      "tests_unknown_cause": null,
      "lane_report_lane": null,
      "lane_report_source": null,
      "lane_report_reasons": null,
      "cache_hit": false,
      "cache_miss_reason": "missing_pinned_dependencies",
      "setup_ms_saved": null,
      "pinned_bytes": null,
      "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
      "equivalence_status": "used",
      "equivalence_outcome": "non_equivalent",
      "code_review_status": "used",
      "code_review_signal": "fail",
      "behavioral_robustness_status": "used",
      "coverage_delta_status": "unavailable",
      "mutation_lite_status": "unavailable",
      "probe_accepted_candidates": null,
      "probe_accepted_commands": null,
      "probe_agreement_rate": 1,
      "probe_gold_pass_candidate_pass_count": 1,
      "probe_gold_pass_candidate_fail_count": null,
      "probe_review_required_count": null,
      "flags": [
        "equiv_warn",
        "review_warn"
      ],
      "tb_is_resolved": true,
      "tb_failure_mode": "unset",
      "tb_total_input_tokens": 6923096,
      "tb_total_output_tokens": 23474,
      "tb_total_tokens": 6946570,
      "tb_uncached_input_tokens": 135384,
      "tb_cache_creation_input_tokens": null,
      "tb_cache_read_input_tokens": 6787712,
      "tb_cached_input_tokens": 6787712,
      "token_status": "present",
      "cache_token_status": "present",
      "token_source": "openai_cached_tokens_usage",
      "task_cost": 13.620768,
      "cost_status": "present",
      "cost_pricing_mode": "cache_aware",
      "pricing_model_key": "gpt-5.3-codex",
      "equiv_rescue_policy": "on",
      "rescue_candidate": false,
      "rescue_eligible": false,
      "rescue_decision": "not_candidate",
      "publish_include_in_leaderboard": true,
      "publish_exclusion_reasons": [],
      "publish_weak_signal_risk": false,
      "tests_only_outcome": 1,
      "rescue_aware_outcome": 1,
      "partial_score": 1,
      "partial_score_numerator": 1,
      "partial_score_denominator": 1,
      "partial_score_level": "command",
      "partial_score_provenance": "fallback_command_level",
      "partial_score_reason": "test_case_detail_unavailable",
      "partial_score_unknown_count": 0,
      "footprint_risk_status": "used",
      "footprint_risk_reason": "none",
      "footprint_risk_level": "low",
      "footprint_risk_score": 0.22682349504576949,
      "footprint_risk_flag": false,
      "footprint_risk_severe_flag": false
    },
    "run": {
      "model": "gpt-5.3-codex",
      "requested_model": "gpt-5.3-codex",
      "run_id": "2026-02-28__13-38-10__gpt-5-3-codex",
      "passRate": 0.9310344827586207,
      "ciLow": 0.8275862068965517,
      "ciHigh": 1,
      "effectiveN": 29,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 0,
        "fail_high_conf": 1,
        "fail_infra": 0,
        "fail_likely_equiv": 1,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 6,
        "pass_with_warn": 21
      }
    },
    "run_metadata": {
      "agent_name": "codex",
      "model_name": "gpt-5.3-codex",
      "start_time": "2026-02-28T13:38:12.733292+00:00",
      "end_time": "2026-02-28T14:43:03.470912+00:00",
      "commit_hash": "99ae882e59281de74856a3e7eeb1b7c986b466af",
      "accuracy": 0.896551724137931,
      "reasoning_effort": "medium"
    }
  },
  "evidence": {
    "task_detail": {
      "path": "validation/gpt-5-3-codex/flux-pr-1034/task_detail.v1.json",
      "exists": true
    },
    "trajectory": {
      "path": "validation/gpt-5-3-codex/flux-pr-1034/trajectory.v1.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 15176
    },
    "validation": {
      "path": "validation/gpt-5-3-codex/flux-pr-1034/validation.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 164733
    },
    "results": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-3-codex/results.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 62248
    },
    "run_metadata": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-3-codex/run_metadata.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 1634
    },
    "agent_patch": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-3-codex/flux-pr-1034/flux-pr-1034.1-of-1.2026-02-28__13-38-10__gpt-5-3-codex/agent-logs/agent.patch",
      "exists": true,
      "non_empty": true,
      "size_bytes": 14591
    },
    "summary": {
      "path": "reports/summary.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 269968
    },
    "manifest": {
      "path": "manifest.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 677
    },
    "validation_inputs": {
      "required_for_rescue_ok": true,
      "gold_patch": {
        "exists": true,
        "non_empty": true,
        "size_bytes": 11956,
        "sha256": "5b21daceee9a4c4f67498bc2777798430c0ccea25a935b867d1463ecfd3ebfcd",
        "required_for_rescue": true
      },
      "repo_tarball": {
        "exists": true,
        "non_empty": true,
        "size_bytes": 4155946,
        "sha256": "1000f8cb511b58a0eb9145c57c2ee81c89e2bb324ec28a30226ee1749d334bb6",
        "required_for_rescue": true
      },
      "agent_patch": {
        "path": "runs/2026-02-28__13-38-10__gpt-5-3-codex/flux-pr-1034/flux-pr-1034.1-of-1.2026-02-28__13-38-10__gpt-5-3-codex/agent-logs/agent.patch",
        "exists": true,
        "non_empty": true,
        "size_bytes": 14591,
        "sha256": "51d121720567fecdafd2260f7436c816d1d8d8ad1bc69c4a546e569a78b93a96",
        "required_for_rescue": true
      }
    }
  },
  "provenance": {
    "identity": [
      {
        "source": "manifest",
        "available": true
      },
      {
        "source": "tb_results",
        "available": true
      }
    ],
    "decision": [
      {
        "source": "validation",
        "available": true
      }
    ],
    "tests": [
      {
        "source": "validation",
        "available": true
      },
      {
        "source": "tb_results",
        "available": true
      }
    ],
    "quality": {
      "equivalence": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "code_review": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "behavioral_robustness": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "footprint_risk": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "cost": [
        {
          "source": "validation",
          "available": true
        },
        {
          "source": "summary",
          "available": true
        }
      ]
    },
    "summary": {
      "task": [
        {
          "source": "summary",
          "available": true
        }
      ],
      "run": [
        {
          "source": "summary",
          "available": true
        }
      ],
      "run_metadata": [
        {
          "source": "run_metadata",
          "available": true
        }
      ]
    },
    "evidence": [
      {
        "source": "filesystem",
        "available": true
      }
    ]
  }
}