task_detail

validation/gpt-5-1-codex-mini/flux-pr-1087/task_detail.v1.json
15444 bytes
{
  "version": "v1",
  "materialization": {
    "mode": "canonical"
  },
  "identity": {
    "variant_id": "gpt-5.1-codex-mini",
    "requested_model": "gpt-5.1-codex-mini",
    "model_key": "gpt-5-1-codex-mini",
    "run_id": "2026-02-28__13-38-10__gpt-5-1-codex-mini",
    "task_id": "flux-pr-1087",
    "trial_name": "flux-pr-1087.1-of-1.2026-02-28__13-38-10__gpt-5-1-codex-mini"
  },
  "decision": {
    "status": "pass",
    "primary_signal": "tests",
    "matrix_status": "pass_with_warn",
    "reason_codes": [
      "primary_tests",
      "tests_passed",
      "equivalence_non_equivalent",
      "review_fail"
    ],
    "flags": [
      "equiv_warn",
      "review_warn"
    ],
    "publish": {
      "include_in_leaderboard": true,
      "weak_signal_risk": false
    },
    "rescue": {
      "candidate": false,
      "eligible": false,
      "decision": "not_candidate",
      "required_artifacts_ok": true
    }
  },
  "tests": {
    "outcome": "pass",
    "strength": "strong",
    "commands": [
      "go test -C v2 ./... -count=1 -timeout=300s"
    ],
    "command_results": [
      {
        "command": "go test -C v2 ./... -count=1 -timeout=300s",
        "gold_outcome": "pass",
        "gold_tb_trial_name": "flux-pr-1087.1-of-1.validate-gold-1772297062926058000",
        "gold_failure_mode": "unset",
        "agent_outcome": "pass",
        "agent_tb_trial_name": "flux-pr-1087.1-of-1.validate-override-1772297111143742000",
        "agent_failure_mode": "unset",
        "agent_retried": false
      }
    ],
    "partial_score": {
      "score": 1,
      "numerator": 1,
      "denominator": 1,
      "level": "command",
      "provenance": "fallback_command_level",
      "reason": "test_case_detail_unavailable"
    },
    "tb": {
      "trial_name": "flux-pr-1087.1-of-1.2026-02-28__13-38-10__gpt-5-1-codex-mini",
      "is_resolved": false,
      "failure_mode": "unset",
      "duration_sec": 97.161811709,
      "time_budget_sec": 1200
    },
    "cache": {
      "cache_hit": false,
      "cache_miss_reason": "missing_pinned_dependencies",
      "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c"
    }
  },
  "quality": {
    "equivalence": {
      "status": "used",
      "outcome": "non_equivalent",
      "confidence": 0.93,
      "issue_type": "behavioral",
      "notes": "The patch appears to miss a core functional requirement: it imports `github.com/wundergraph/graphql-go-tools/v2/pkg/testing/permutations` but adds the new package at `pkg/testing/permutations` (missing `v2/`), so the intended reusable permutations utility is not correctly placed. The two planner bug fixes are largely addressed, but this path/package mismatch means the overall intended change is not fully satisfied."
    },
    "code_review": {
      "status": "used",
      "signal": "fail",
      "confidence": 0.88,
      "issue_type": "unclear",
      "source_field": "code_review",
      "dimension_scores": {
        "correctness": {
          "score_0_4": 1,
          "confidence_0_1": 0.91,
          "rationale": "The patch addresses part of the intended logic, but the new permutations package is added under `pkg/...` while call sites import `.../v2/pkg/...`, which is a likely build-break. The parent-selection fix also diverges from the intended helper-based boundary check."
        },
        "edge_case_handling": {
          "score_0_4": 2,
          "confidence_0_1": 0.76,
          "rationale": "The empty-keys case is now handled by returning false when no resolvable keys exist, which matches the intended bugfix. However, parent selection stopping conditions may still miss edge behavior encapsulated in the shared helper."
        },
        "introduced_bug_risk": {
          "score_0_4": 1,
          "confidence_0_1": 0.82,
          "rationale": "Manual external-parent checks were inlined instead of reusing the existing `selectWithExternalCheck` behavior, increasing risk of semantic drift. The package-path mismatch introduces immediate integration risk."
        },
        "maintainability_idioms": {
          "score_0_4": 2,
          "confidence_0_1": 0.74,
          "rationale": "Extracting permutations to a generic package is a maintainability improvement, but the API/path mismatch and bypassing a shared helper reduce consistency and make future behavior harder to reason about."
        }
      },
      "findings": [
        {
          "id": "f1",
          "dimension": "correctness",
          "severity": "major",
          "title": "Generic permutations package added at wrong module path",
          "detail": "Call sites import `github.com/wundergraph/graphql-go-tools/v2/pkg/testing/permutations`, but the new file is created under `pkg/testing/permutations` (no `v2/`), so the import target is missing in the v2 module tree.",
          "confidence_0_1": 0.96,
          "citations": [
            {
              "path": "v2/pkg/engine/datasourcetesting/datasourcetesting.go",
              "side": "new",
              "start_line": 16,
              "end_line": 16,
              "anchor_kind": "diff",
              "head_sha": "f8ccf4745f0012af0bf735978fc80406f413e3de"
            },
            {
              "path": "pkg/testing/permutations/permutations.go",
              "side": "new",
              "start_line": 1,
              "end_line": 3,
              "anchor_kind": "diff",
              "head_sha": "f8ccf4745f0012af0bf735978fc80406f413e3de"
            }
          ]
        },
        {
          "id": "f2",
          "dimension": "introduced_bug_risk",
          "severity": "major",
          "title": "Parent selection logic bypasses existing external-check helper",
          "detail": "The patch introduces an inline `IsExternal && !IsProvided` break and then directly calls `selectWithReason`, instead of using `selectWithExternalCheck` as intended. This risks missing helper-enforced semantics and future consistency.",
          "confidence_0_1": 0.79,
          "citations": [
            {
              "path": "v2/pkg/engine/plan/datasource_filter_visitor.go",
              "side": "new",
              "start_line": 209,
              "end_line": 214,
              "anchor_kind": "diff",
              "head_sha": "f8ccf4745f0012af0bf735978fc80406f413e3de"
            }
          ]
        }
      ],
      "overall": {
        "confidence_overall": 0.88,
        "summary": "The patch partially implements the intended fixes but likely does not satisfy the task end-to-end due to a major package-path mismatch and a non-idiomatic divergence in parent-selection boundary logic."
      },
      "notes": "The patch partially implements the intended fixes but likely does not satisfy the task end-to-end due to a major package-path mismatch and a non-idiomatic divergence in parent-selection boundary logic."
    },
    "behavioral_robustness": {
      "status": "used",
      "probe_status": "used",
      "gold_pass_candidate_pass_count": 1,
      "agreement_rate": 1,
      "coverage_delta_status": "unavailable",
      "coverage_delta_reason": "coverage_adapter_unavailable",
      "mutation_lite_status": "unavailable",
      "mutation_lite_reason": "mutation_runner_unavailable",
      "notes": "derived from command-level outcomes; no accepted generated probes"
    },
    "footprint_risk": {
      "status": "used",
      "reason": "none",
      "version": "v1",
      "level": "low",
      "score": 0.19900566064728448,
      "flag": false,
      "severe_flag": false
    },
    "cost": {
      "status": "tokens_only",
      "token_status": "present",
      "cache_token_status": "present",
      "total_input_tokens": 3660541,
      "total_output_tokens": 22177,
      "total_tokens": 3682718,
      "uncached_input_tokens": 360573,
      "cache_read_input_tokens": 3299968,
      "cached_input_tokens": 3299968,
      "total_cost_usd": 1.1689166999999998,
      "token_source": "openai_cached_tokens_usage",
      "pricing_model_key": "gpt-5.1-codex-mini",
      "pricing_mode": "cache_aware"
    }
  },
  "summary": {
    "task": {
      "matrix_status": "pass_with_warn",
      "tests_outcome": "pass",
      "tests_unknown_cause": null,
      "lane_report_lane": null,
      "lane_report_source": null,
      "lane_report_reasons": null,
      "cache_hit": false,
      "cache_miss_reason": "missing_pinned_dependencies",
      "setup_ms_saved": null,
      "pinned_bytes": null,
      "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
      "equivalence_status": "used",
      "equivalence_outcome": "non_equivalent",
      "code_review_status": "used",
      "code_review_signal": "fail",
      "behavioral_robustness_status": "used",
      "coverage_delta_status": "unavailable",
      "mutation_lite_status": "unavailable",
      "probe_accepted_candidates": null,
      "probe_accepted_commands": null,
      "probe_agreement_rate": 1,
      "probe_gold_pass_candidate_pass_count": 1,
      "probe_gold_pass_candidate_fail_count": null,
      "probe_review_required_count": null,
      "flags": [
        "equiv_warn",
        "review_warn"
      ],
      "tb_is_resolved": false,
      "tb_failure_mode": "unset",
      "tb_total_input_tokens": 3660541,
      "tb_total_output_tokens": 22177,
      "tb_total_tokens": 3682718,
      "tb_uncached_input_tokens": 360573,
      "tb_cache_creation_input_tokens": null,
      "tb_cache_read_input_tokens": 3299968,
      "tb_cached_input_tokens": 3299968,
      "token_status": "present",
      "cache_token_status": "present",
      "token_source": "openai_cached_tokens_usage",
      "task_cost": 1.1689166999999998,
      "cost_status": "present",
      "cost_pricing_mode": "cache_aware",
      "pricing_model_key": "gpt-5.1-codex-mini",
      "equiv_rescue_policy": "on",
      "rescue_candidate": false,
      "rescue_eligible": false,
      "rescue_decision": "not_candidate",
      "publish_include_in_leaderboard": true,
      "publish_exclusion_reasons": [],
      "publish_weak_signal_risk": false,
      "tests_only_outcome": 1,
      "rescue_aware_outcome": 1,
      "partial_score": 1,
      "partial_score_numerator": 1,
      "partial_score_denominator": 1,
      "partial_score_level": "command",
      "partial_score_provenance": "fallback_command_level",
      "partial_score_reason": "test_case_detail_unavailable",
      "partial_score_unknown_count": 0,
      "footprint_risk_status": "used",
      "footprint_risk_reason": "none",
      "footprint_risk_level": "low",
      "footprint_risk_score": 0.19900566064728448,
      "footprint_risk_flag": false,
      "footprint_risk_severe_flag": false
    },
    "run": {
      "model": "gpt-5.1-codex-mini",
      "requested_model": "gpt-5.1-codex-mini",
      "run_id": "2026-02-28__13-38-10__gpt-5-1-codex-mini",
      "passRate": 0.9655172413793104,
      "ciLow": 0.896551724137931,
      "ciHigh": 1,
      "effectiveN": 29,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 0,
        "fail_high_conf": 0,
        "fail_infra": 1,
        "fail_likely_equiv": 0,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 2,
        "pass_with_warn": 26
      }
    },
    "run_metadata": {
      "agent_name": "codex",
      "model_name": "gpt-5.1-codex-mini",
      "start_time": "2026-02-28T15:13:02.887835+00:00",
      "end_time": "2026-02-28T16:42:40.244280+00:00",
      "commit_hash": "99ae882e59281de74856a3e7eeb1b7c986b466af",
      "accuracy": 0.6896551724137931,
      "reasoning_effort": "medium"
    }
  },
  "evidence": {
    "task_detail": {
      "path": "validation/gpt-5-1-codex-mini/flux-pr-1087/task_detail.v1.json",
      "exists": true
    },
    "trajectory": {
      "path": "validation/gpt-5-1-codex-mini/flux-pr-1087/trajectory.v1.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 15393
    },
    "validation": {
      "path": "validation/gpt-5-1-codex-mini/flux-pr-1087/validation.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 114219
    },
    "results": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-1-codex-mini/results.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 57447
    },
    "run_metadata": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-1-codex-mini/run_metadata.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 1645
    },
    "agent_patch": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-1-codex-mini/flux-pr-1087/flux-pr-1087.1-of-1.2026-02-28__13-38-10__gpt-5-1-codex-mini/agent-logs/agent.patch",
      "exists": true,
      "non_empty": true,
      "size_bytes": 6310
    },
    "summary": {
      "path": "reports/summary.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 269968
    },
    "manifest": {
      "path": "manifest.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 677
    },
    "validation_inputs": {
      "required_for_rescue_ok": true,
      "gold_patch": {
        "exists": true,
        "non_empty": true,
        "size_bytes": 5650,
        "sha256": "21defeddeba4ab2272e7a13a2d9fa96b1cfdf2a16d684ebc2f7ba8db7cd45aa7",
        "required_for_rescue": true
      },
      "repo_tarball": {
        "exists": true,
        "non_empty": true,
        "size_bytes": 4172645,
        "sha256": "fcddaba29c9c50c5c431da73e0ae42b2d005df1720c06b2d531ef4981fbee22a",
        "required_for_rescue": true
      },
      "agent_patch": {
        "path": "runs/2026-02-28__13-38-10__gpt-5-1-codex-mini/flux-pr-1087/flux-pr-1087.1-of-1.2026-02-28__13-38-10__gpt-5-1-codex-mini/agent-logs/agent.patch",
        "exists": true,
        "non_empty": true,
        "size_bytes": 6310,
        "sha256": "14b930f94341be80aa4e2d5fb560fcfa5093b1001676933127f3263fd09f0614",
        "required_for_rescue": true
      }
    }
  },
  "provenance": {
    "identity": [
      {
        "source": "manifest",
        "available": true
      },
      {
        "source": "tb_results",
        "available": true
      }
    ],
    "decision": [
      {
        "source": "validation",
        "available": true
      }
    ],
    "tests": [
      {
        "source": "validation",
        "available": true
      },
      {
        "source": "tb_results",
        "available": true
      }
    ],
    "quality": {
      "equivalence": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "code_review": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "behavioral_robustness": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "footprint_risk": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "cost": [
        {
          "source": "validation",
          "available": true
        },
        {
          "source": "summary",
          "available": true
        }
      ]
    },
    "summary": {
      "task": [
        {
          "source": "summary",
          "available": true
        }
      ],
      "run": [
        {
          "source": "summary",
          "available": true
        }
      ],
      "run_metadata": [
        {
          "source": "run_metadata",
          "available": true
        }
      ]
    },
    "evidence": [
      {
        "source": "filesystem",
        "available": true
      }
    ]
  }
}