STET

task_detail

validation/gpt-5-3-codex/flux-pr-828/task_detail.v1.json

17070 bytes

Back to adjudication
{
  "version": "v1",
  "materialization": {
    "mode": "canonical"
  },
  "identity": {
    "variant_id": "gpt-5.3-codex",
    "requested_model": "gpt-5.3-codex",
    "model_key": "gpt-5-3-codex",
    "run_id": "2026-02-28__13-38-10__gpt-5-3-codex",
    "task_id": "flux-pr-828",
    "trial_name": "flux-pr-828.1-of-1.2026-02-28__13-38-10__gpt-5-3-codex"
  },
  "decision": {
    "status": "pass",
    "primary_signal": "tests",
    "matrix_status": "pass_with_warn",
    "reason_codes": [
      "guardrail_diff_too_large",
      "primary_tests",
      "tests_passed",
      "equivalence_non_equivalent",
      "review_fail"
    ],
    "flags": [
      "equiv_warn",
      "review_warn"
    ],
    "publish": {
      "include_in_leaderboard": true,
      "weak_signal_risk": false
    },
    "rescue": {
      "candidate": false,
      "eligible": false,
      "decision": "not_candidate",
      "required_artifacts_ok": true
    }
  },
  "tests": {
    "outcome": "pass",
    "strength": "strong",
    "commands": [
      "go test -C v2 ./... -count=1 -timeout=300s"
    ],
    "command_results": [
      {
        "command": "go test -C v2 ./... -count=1 -timeout=300s",
        "gold_outcome": "pass",
        "gold_tb_trial_name": "flux-pr-828.1-of-1.validate-gold-1772291306451563000",
        "gold_failure_mode": "unset",
        "agent_outcome": "pass",
        "agent_tb_trial_name": "flux-pr-828.1-of-1.validate-override-1772291367839181000",
        "agent_failure_mode": "unset",
        "agent_retried": false
      }
    ],
    "partial_score": {
      "score": 1,
      "numerator": 1,
      "denominator": 1,
      "level": "command",
      "provenance": "fallback_command_level",
      "reason": "test_case_detail_unavailable"
    },
    "tb": {
      "trial_name": "flux-pr-828.1-of-1.2026-02-28__13-38-10__gpt-5-3-codex",
      "is_resolved": true,
      "failure_mode": "unset",
      "duration_sec": 122.264439167,
      "time_budget_sec": 1200
    },
    "cache": {
      "cache_hit": false,
      "cache_miss_reason": "missing_pinned_dependencies",
      "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c"
    }
  },
  "quality": {
    "equivalence": {
      "status": "used",
      "outcome": "non_equivalent",
      "confidence": 0.93,
      "issue_type": "behavioral",
      "notes": "The patch wires an opt-in flag and “only-if-smaller” fallback, but the minifier only deduplicates selection sets of inline fragments with the same type condition. It misses key intended cases where duplicate field selection sets are repeated across different enclosing inline fragments/type branches (the main federation payload-reduction scenario shown by the task/gold behavior)."
    },
    "code_review": {
      "status": "used",
      "signal": "fail",
      "confidence": 0.86,
      "issue_type": "unclear",
      "source_field": "code_review",
      "dimension_scores": {
        "correctness": {
          "score_0_4": 2,
          "confidence_0_1": 0.88,
          "rationale": "The patch wires an opt-in minifier into subgraph operation printing and applies it only when smaller, but the implemented algorithm only deduplicates inline fragments grouped by type and does not target repeated field selection sets, which is a core part of the intended compression behavior."
        },
        "edge_case_handling": {
          "score_0_4": 1,
          "confidence_0_1": 0.89,
          "rationale": "Edge-case coverage is limited: deduplication is restricted to inline fragments and equality relies on strict AST shape/order matching, so many duplicate selection-set patterns are not minimized."
        },
        "introduced_bug_risk": {
          "score_0_4": 2,
          "confidence_0_1": 0.81,
          "rationale": "The change touches planner/execution behavior for all subgraph operations when enabled, and the minifier mutates AST structures directly without broader fixture coverage, increasing risk of semantic or effectiveness regressions in complex operations."
        },
        "maintainability_idioms": {
          "score_0_4": 3,
          "confidence_0_1": 0.78,
          "rationale": "Code is reasonably structured and readable with isolated helper functions and planner flag plumbing, but behavior/limitations are not documented deeply and tests are narrow for such a cross-cutting optimization."
        }
      },
      "findings": [
        {
          "id": "f1",
          "dimension": "correctness",
          "severity": "major",
          "title": "Minifier scope is limited to inline fragments, missing repeated field selection-set deduplication",
          "detail": "The implementation only collects candidates from `doc.InlineFragments` by type and never scans/rewrites repeated selection sets under fields. This misses major duplication patterns in federated subgraph operations and likely under-delivers the intended compression behavior.",
          "confidence_0_1": 0.92,
          "citations": [
            {
              "path": "v2/pkg/astminifier/astminifier.go",
              "side": "new",
              "start_line": 50,
              "end_line": 69,
              "anchor_kind": "diff",
              "head_sha": "1c1c42b5c9c63b2aeae7bd3eecd05ad4525462dc"
            },
            {
              "path": "v2/pkg/astminifier/astminifier.go",
              "side": "new",
              "start_line": 84,
              "end_line": 99,
              "anchor_kind": "diff",
              "head_sha": "1c1c42b5c9c63b2aeae7bd3eecd05ad4525462dc"
            }
          ]
        },
        {
          "id": "f2",
          "dimension": "edge_case_handling",
          "severity": "major",
          "title": "Equality logic is strict positional AST equality and misses semantically duplicate patterns",
          "detail": "Selection-set equality compares selection references in exact order and recursively matches exact node kinds; there is no normalization/canonicalization path. Many real-world duplicates that differ structurally but are semantically equivalent will not be deduplicated.",
          "confidence_0_1": 0.85,
          "citations": [
            {
              "path": "v2/pkg/astminifier/astminifier.go",
              "side": "new",
              "start_line": 139,
              "end_line": 155,
              "anchor_kind": "diff",
              "head_sha": "1c1c42b5c9c63b2aeae7bd3eecd05ad4525462dc"
            },
            {
              "path": "v2/pkg/astminifier/astminifier.go",
              "side": "new",
              "start_line": 157,
              "end_line": 177,
              "anchor_kind": "diff",
              "head_sha": "1c1c42b5c9c63b2aeae7bd3eecd05ad4525462dc"
            }
          ]
        },
        {
          "id": "f3",
          "dimension": "introduced_bug_risk",
          "severity": "major",
          "title": "Test coverage is too narrow for planner-level optimization",
          "detail": "Tests only validate one synthetic success path and one non-shrinking path, without broader fixtures for nested selections, directives, existing fragments, or integration through planner flag behavior. This increases regression risk when feature is enabled.",
          "confidence_0_1": 0.79,
          "citations": [
            {
              "path": "v2/pkg/astminifier/astminifier_test.go",
              "side": "new",
              "start_line": 12,
              "end_line": 45,
              "anchor_kind": "diff",
              "head_sha": "1c1c42b5c9c63b2aeae7bd3eecd05ad4525462dc"
            },
            {
              "path": "v2/pkg/engine/datasource/graphql_datasource/graphql_datasource.go",
              "side": "new",
              "start_line": 1357,
              "end_line": 1368,
              "anchor_kind": "diff",
              "head_sha": "1c1c42b5c9c63b2aeae7bd3eecd05ad4525462dc"
            }
          ]
        }
      ],
      "overall": {
        "confidence_overall": 0.86,
        "summary": "The patch partially implements the feature (opt-in and apply-only-if-smaller) but likely does not satisfy the intended change fully because deduplication is limited to inline fragments and misses broader repeated selection-set compression patterns expected for subgraph requests."
      },
      "notes": "The patch partially implements the feature (opt-in and apply-only-if-smaller) but likely does not satisfy the intended change fully because deduplication is limited to inline fragments and misses broader repeated selection-set compression patterns expected for subgraph requests."
    },
    "behavioral_robustness": {
      "status": "used",
      "probe_status": "used",
      "gold_pass_candidate_pass_count": 1,
      "agreement_rate": 1,
      "coverage_delta_status": "unavailable",
      "coverage_delta_reason": "coverage_adapter_unavailable",
      "mutation_lite_status": "unavailable",
      "mutation_lite_reason": "mutation_runner_unavailable",
      "notes": "derived from command-level outcomes; no accepted generated probes"
    },
    "footprint_risk": {
      "status": "used",
      "reason": "none",
      "version": "v1",
      "level": "low",
      "score": 0.14080882787202723,
      "flag": false,
      "severe_flag": false
    },
    "cost": {
      "status": "tokens_only",
      "token_status": "present",
      "cache_token_status": "present",
      "total_input_tokens": 4222768,
      "total_output_tokens": 18062,
      "total_tokens": 4240830,
      "uncached_input_tokens": 183088,
      "cache_read_input_tokens": 4039680,
      "cached_input_tokens": 4039680,
      "total_cost_usd": 9.88956,
      "token_source": "openai_cached_tokens_usage",
      "pricing_model_key": "gpt-5.3-codex",
      "pricing_mode": "cache_aware"
    }
  },
  "summary": {
    "task": {
      "matrix_status": "pass_with_warn",
      "tests_outcome": "pass",
      "tests_unknown_cause": null,
      "lane_report_lane": null,
      "lane_report_source": null,
      "lane_report_reasons": null,
      "cache_hit": false,
      "cache_miss_reason": "missing_pinned_dependencies",
      "setup_ms_saved": null,
      "pinned_bytes": null,
      "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
      "equivalence_status": "used",
      "equivalence_outcome": "non_equivalent",
      "code_review_status": "used",
      "code_review_signal": "fail",
      "behavioral_robustness_status": "used",
      "coverage_delta_status": "unavailable",
      "mutation_lite_status": "unavailable",
      "probe_accepted_candidates": null,
      "probe_accepted_commands": null,
      "probe_agreement_rate": 1,
      "probe_gold_pass_candidate_pass_count": 1,
      "probe_gold_pass_candidate_fail_count": null,
      "probe_review_required_count": null,
      "flags": [
        "equiv_warn",
        "review_warn"
      ],
      "tb_is_resolved": true,
      "tb_failure_mode": "unset",
      "tb_total_input_tokens": 4222768,
      "tb_total_output_tokens": 18062,
      "tb_total_tokens": 4240830,
      "tb_uncached_input_tokens": 183088,
      "tb_cache_creation_input_tokens": null,
      "tb_cache_read_input_tokens": 4039680,
      "tb_cached_input_tokens": 4039680,
      "token_status": "present",
      "cache_token_status": "present",
      "token_source": "openai_cached_tokens_usage",
      "task_cost": 9.88956,
      "cost_status": "present",
      "cost_pricing_mode": "cache_aware",
      "pricing_model_key": "gpt-5.3-codex",
      "equiv_rescue_policy": "on",
      "rescue_candidate": false,
      "rescue_eligible": false,
      "rescue_decision": "not_candidate",
      "publish_include_in_leaderboard": true,
      "publish_exclusion_reasons": [],
      "publish_weak_signal_risk": false,
      "tests_only_outcome": 1,
      "rescue_aware_outcome": 1,
      "partial_score": 1,
      "partial_score_numerator": 1,
      "partial_score_denominator": 1,
      "partial_score_level": "command",
      "partial_score_provenance": "fallback_command_level",
      "partial_score_reason": "test_case_detail_unavailable",
      "partial_score_unknown_count": 0,
      "footprint_risk_status": "used",
      "footprint_risk_reason": "none",
      "footprint_risk_level": "low",
      "footprint_risk_score": 0.14080882787202723,
      "footprint_risk_flag": false,
      "footprint_risk_severe_flag": false
    },
    "run": {
      "model": "gpt-5.3-codex",
      "requested_model": "gpt-5.3-codex",
      "run_id": "2026-02-28__13-38-10__gpt-5-3-codex",
      "passRate": 0.9310344827586207,
      "ciLow": 0.8275862068965517,
      "ciHigh": 1,
      "effectiveN": 29,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 0,
        "fail_high_conf": 1,
        "fail_infra": 0,
        "fail_likely_equiv": 1,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 6,
        "pass_with_warn": 21
      }
    },
    "run_metadata": {
      "agent_name": "codex",
      "model_name": "gpt-5.3-codex",
      "start_time": "2026-02-28T13:38:12.733292+00:00",
      "end_time": "2026-02-28T14:43:03.470912+00:00",
      "commit_hash": "99ae882e59281de74856a3e7eeb1b7c986b466af",
      "accuracy": 0.896551724137931,
      "reasoning_effort": "medium"
    }
  },
  "evidence": {
    "task_detail": {
      "path": "validation/gpt-5-3-codex/flux-pr-828/task_detail.v1.json",
      "exists": true
    },
    "trajectory": {
      "path": "validation/gpt-5-3-codex/flux-pr-828/trajectory.v1.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 15252
    },
    "validation": {
      "path": "validation/gpt-5-3-codex/flux-pr-828/validation.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 208858
    },
    "results": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-3-codex/results.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 62248
    },
    "run_metadata": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-3-codex/run_metadata.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 1634
    },
    "agent_patch": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-3-codex/flux-pr-828/flux-pr-828.1-of-1.2026-02-28__13-38-10__gpt-5-3-codex/agent-logs/agent.patch",
      "exists": true,
      "non_empty": true,
      "size_bytes": 15339
    },
    "summary": {
      "path": "reports/summary.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 269968
    },
    "manifest": {
      "path": "manifest.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 677
    },
    "validation_inputs": {
      "required_for_rescue_ok": true,
      "gold_patch": {
        "exists": true,
        "non_empty": true,
        "size_bytes": 35413,
        "sha256": "5b8feac8bb191509f042ca05741355c8b65942d53a5b13a94861ed74d41c2816",
        "required_for_rescue": true
      },
      "repo_tarball": {
        "exists": true,
        "non_empty": true,
        "size_bytes": 4092497,
        "sha256": "a64c81070c7529d8049cfc5748f529c279ebd6aef27cc23e61a2a711e9df5014",
        "required_for_rescue": true
      },
      "agent_patch": {
        "path": "runs/2026-02-28__13-38-10__gpt-5-3-codex/flux-pr-828/flux-pr-828.1-of-1.2026-02-28__13-38-10__gpt-5-3-codex/agent-logs/agent.patch",
        "exists": true,
        "non_empty": true,
        "size_bytes": 15339,
        "sha256": "3b8e76020a80eb9d86b0b9fd4a06aaba78396a662e223d82241f10b022ca2a9c",
        "required_for_rescue": true
      }
    }
  },
  "provenance": {
    "identity": [
      {
        "source": "manifest",
        "available": true
      },
      {
        "source": "tb_results",
        "available": true
      }
    ],
    "decision": [
      {
        "source": "validation",
        "available": true
      }
    ],
    "tests": [
      {
        "source": "validation",
        "available": true
      },
      {
        "source": "tb_results",
        "available": true
      }
    ],
    "quality": {
      "equivalence": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "code_review": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "behavioral_robustness": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "footprint_risk": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "cost": [
        {
          "source": "validation",
          "available": true
        },
        {
          "source": "summary",
          "available": true
        }
      ]
    },
    "summary": {
      "task": [
        {
          "source": "summary",
          "available": true
        }
      ],
      "run": [
        {
          "source": "summary",
          "available": true
        }
      ],
      "run_metadata": [
        {
          "source": "run_metadata",
          "available": true
        }
      ]
    },
    "evidence": [
      {
        "source": "filesystem",
        "available": true
      }
    ]
  }
}