STET

task_detail

validation/gpt-5-3-codex/flux-pr-1351/task_detail.v1.json

15376 bytes

Back to adjudication
{
  "version": "v1",
  "materialization": {
    "mode": "canonical"
  },
  "identity": {
    "variant_id": "gpt-5.3-codex",
    "requested_model": "gpt-5.3-codex",
    "model_key": "gpt-5-3-codex",
    "run_id": "2026-02-28__13-38-10__gpt-5-3-codex",
    "task_id": "flux-pr-1351",
    "trial_name": "flux-pr-1351.1-of-1.2026-02-28__13-38-10__gpt-5-3-codex"
  },
  "decision": {
    "status": "pass",
    "primary_signal": "tests",
    "matrix_status": "pass_with_warn",
    "reason_codes": [
      "primary_tests",
      "tests_passed",
      "equivalence_non_equivalent",
      "review_fail"
    ],
    "flags": [
      "equiv_warn",
      "review_warn"
    ],
    "publish": {
      "include_in_leaderboard": true,
      "weak_signal_risk": false
    },
    "rescue": {
      "candidate": false,
      "eligible": false,
      "decision": "not_candidate",
      "required_artifacts_ok": true
    }
  },
  "tests": {
    "outcome": "pass",
    "strength": "strong",
    "commands": [
      "go test -C v2 ./... -count=1 -timeout=300s"
    ],
    "command_results": [
      {
        "command": "go test -C v2 ./... -count=1 -timeout=300s",
        "gold_outcome": "pass",
        "gold_tb_trial_name": "flux-pr-1351.1-of-1.validate-gold-1772291111614156000",
        "gold_failure_mode": "unset",
        "agent_outcome": "pass",
        "agent_tb_trial_name": "flux-pr-1351.1-of-1.validate-override-1772291184582154000",
        "agent_failure_mode": "unset",
        "agent_retried": false
      }
    ],
    "partial_score": {
      "score": 1,
      "numerator": 1,
      "denominator": 1,
      "level": "command",
      "provenance": "fallback_command_level",
      "reason": "test_case_detail_unavailable"
    },
    "tb": {
      "trial_name": "flux-pr-1351.1-of-1.2026-02-28__13-38-10__gpt-5-3-codex",
      "is_resolved": true,
      "failure_mode": "unset",
      "duration_sec": 143.605649708,
      "time_budget_sec": 1200
    },
    "cache": {
      "cache_hit": false,
      "cache_miss_reason": "missing_pinned_dependencies",
      "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c"
    }
  },
  "quality": {
    "equivalence": {
      "status": "used",
      "outcome": "non_equivalent",
      "confidence": 0.88,
      "issue_type": "behavioral",
      "notes": "The patch does add grouped storage (`SubgraphErrorsBySubgraph`) and deterministic joining, but it misses core intended behavior in at least one important path: `loader.newResponseInfo` call sites still pass the combined `SubgraphErrors()` (not per-subgraph error), so per-subgraph attribution in response hook info is not implemented like the task intent. It also changes `Context.clone` to clear subgraph errors instead of preserving/copying them, which is a behavioral regression versus the intended restructuring."
    },
    "code_review": {
      "status": "used",
      "signal": "fail",
      "confidence": 0.81,
      "issue_type": "unclear",
      "source_field": "code_review",
      "dimension_scores": {
        "correctness": {
          "score_0_4": 2,
          "confidence_0_1": 0.84,
          "rationale": "The patch adds per-subgraph storage and deterministic joining, but key call sites still pass the aggregated error to response hooks instead of the subgraph-specific error, and clone now clears tracked subgraph errors rather than preserving them."
        },
        "edge_case_handling": {
          "score_0_4": 2,
          "confidence_0_1": 0.72,
          "rationale": "Deterministic ordering is handled, including key fallback when name is missing, but the hook-facing path still uses global joined errors, so per-subgraph attribution is not consistently available at all consumption points."
        },
        "introduced_bug_risk": {
          "score_0_4": 2,
          "confidence_0_1": 0.78,
          "rationale": "The change introduces new mutable state and synchronization plus a second representation (`subgraphErrors` and `subgraphErrorsBySubgraph`), increasing inconsistency risk. Clearing error state in clone is especially risky for flows using context cloning."
        },
        "maintainability_idioms": {
          "score_0_4": 2,
          "confidence_0_1": 0.74,
          "rationale": "The implementation is more complex than needed (dual structures, recomputation, locking) compared with a simpler map-of-errors approach, making future behavior changes and debugging harder."
        }
      },
      "findings": [
        {
          "id": "f1",
          "dimension": "correctness",
          "severity": "major",
          "title": "Loader hooks still receive aggregated subgraph errors",
          "detail": "Calls to `newResponseInfo` now pass `l.ctx.SubgraphErrors()` (the globally joined error), so hook consumers do not get per-subgraph attribution for the current datasource response.",
          "confidence_0_1": 0.88,
          "citations": [
            {
              "path": "v2/pkg/engine/resolve/loader.go",
              "side": "new",
              "start_line": 243,
              "end_line": 253,
              "anchor_kind": "diff",
              "head_sha": "42024d2d31ac476136dedf2ef63a488b7dd55f63"
            },
            {
              "path": "v2/pkg/engine/resolve/loader.go",
              "side": "new",
              "start_line": 289,
              "end_line": 304,
              "anchor_kind": "diff",
              "head_sha": "42024d2d31ac476136dedf2ef63a488b7dd55f63"
            }
          ]
        },
        {
          "id": "f2",
          "dimension": "correctness",
          "severity": "major",
          "title": "Context cloning drops accumulated subgraph errors",
          "detail": "The clone function explicitly resets `subgraphErrors` and `subgraphErrorsBySubgraph` to nil, which can lose already-collected error state when a context is cloned via `WithContext`.",
          "confidence_0_1": 0.83,
          "citations": [
            {
              "path": "v2/pkg/engine/resolve/context.go",
              "side": "new",
              "start_line": 280,
              "end_line": 283,
              "anchor_kind": "diff",
              "head_sha": "42024d2d31ac476136dedf2ef63a488b7dd55f63"
            }
          ]
        }
      ],
      "overall": {
        "confidence_overall": 0.81,
        "summary": "The patch moves toward grouped/deterministic subgraph error tracking, but it likely does not fully satisfy the intended behavior because downstream hook paths still use the global aggregated error and cloning now drops accumulated subgraph errors."
      },
      "notes": "The patch moves toward grouped/deterministic subgraph error tracking, but it likely does not fully satisfy the intended behavior because downstream hook paths still use the global aggregated error and cloning now drops accumulated subgraph errors."
    },
    "behavioral_robustness": {
      "status": "used",
      "probe_status": "used",
      "gold_pass_candidate_pass_count": 1,
      "agreement_rate": 1,
      "coverage_delta_status": "unavailable",
      "coverage_delta_reason": "coverage_adapter_unavailable",
      "mutation_lite_status": "unavailable",
      "mutation_lite_reason": "mutation_runner_unavailable",
      "notes": "derived from command-level outcomes; no accepted generated probes"
    },
    "footprint_risk": {
      "status": "used",
      "reason": "none",
      "version": "v1",
      "level": "medium",
      "score": 0.45114231648401637,
      "flag": false,
      "severe_flag": false
    },
    "cost": {
      "status": "tokens_only",
      "token_status": "present",
      "cache_token_status": "present",
      "total_input_tokens": 1427587,
      "total_output_tokens": 8619,
      "total_tokens": 1436206,
      "uncached_input_tokens": 62595,
      "cache_read_input_tokens": 1364992,
      "cached_input_tokens": 1364992,
      "total_cost_usd": 3.5035529999999997,
      "token_source": "openai_cached_tokens_usage",
      "pricing_model_key": "gpt-5.3-codex",
      "pricing_mode": "cache_aware"
    }
  },
  "summary": {
    "task": {
      "matrix_status": "pass_with_warn",
      "tests_outcome": "pass",
      "tests_unknown_cause": null,
      "lane_report_lane": null,
      "lane_report_source": null,
      "lane_report_reasons": null,
      "cache_hit": false,
      "cache_miss_reason": "missing_pinned_dependencies",
      "setup_ms_saved": null,
      "pinned_bytes": null,
      "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
      "equivalence_status": "used",
      "equivalence_outcome": "non_equivalent",
      "code_review_status": "used",
      "code_review_signal": "fail",
      "behavioral_robustness_status": "used",
      "coverage_delta_status": "unavailable",
      "mutation_lite_status": "unavailable",
      "probe_accepted_candidates": null,
      "probe_accepted_commands": null,
      "probe_agreement_rate": 1,
      "probe_gold_pass_candidate_pass_count": 1,
      "probe_gold_pass_candidate_fail_count": null,
      "probe_review_required_count": null,
      "flags": [
        "equiv_warn",
        "review_warn"
      ],
      "tb_is_resolved": true,
      "tb_failure_mode": "unset",
      "tb_total_input_tokens": 1427587,
      "tb_total_output_tokens": 8619,
      "tb_total_tokens": 1436206,
      "tb_uncached_input_tokens": 62595,
      "tb_cache_creation_input_tokens": null,
      "tb_cache_read_input_tokens": 1364992,
      "tb_cached_input_tokens": 1364992,
      "token_status": "present",
      "cache_token_status": "present",
      "token_source": "openai_cached_tokens_usage",
      "task_cost": 3.5035529999999997,
      "cost_status": "present",
      "cost_pricing_mode": "cache_aware",
      "pricing_model_key": "gpt-5.3-codex",
      "equiv_rescue_policy": "on",
      "rescue_candidate": false,
      "rescue_eligible": false,
      "rescue_decision": "not_candidate",
      "publish_include_in_leaderboard": true,
      "publish_exclusion_reasons": [],
      "publish_weak_signal_risk": false,
      "tests_only_outcome": 1,
      "rescue_aware_outcome": 1,
      "partial_score": 1,
      "partial_score_numerator": 1,
      "partial_score_denominator": 1,
      "partial_score_level": "command",
      "partial_score_provenance": "fallback_command_level",
      "partial_score_reason": "test_case_detail_unavailable",
      "partial_score_unknown_count": 0,
      "footprint_risk_status": "used",
      "footprint_risk_reason": "none",
      "footprint_risk_level": "medium",
      "footprint_risk_score": 0.45114231648401637,
      "footprint_risk_flag": false,
      "footprint_risk_severe_flag": false
    },
    "run": {
      "model": "gpt-5.3-codex",
      "requested_model": "gpt-5.3-codex",
      "run_id": "2026-02-28__13-38-10__gpt-5-3-codex",
      "passRate": 0.9310344827586207,
      "ciLow": 0.8275862068965517,
      "ciHigh": 1,
      "effectiveN": 29,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 0,
        "fail_high_conf": 1,
        "fail_infra": 0,
        "fail_likely_equiv": 1,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 6,
        "pass_with_warn": 21
      }
    },
    "run_metadata": {
      "agent_name": "codex",
      "model_name": "gpt-5.3-codex",
      "start_time": "2026-02-28T13:38:12.733292+00:00",
      "end_time": "2026-02-28T14:43:03.470912+00:00",
      "commit_hash": "99ae882e59281de74856a3e7eeb1b7c986b466af",
      "accuracy": 0.896551724137931,
      "reasoning_effort": "medium"
    }
  },
  "evidence": {
    "task_detail": {
      "path": "validation/gpt-5-3-codex/flux-pr-1351/task_detail.v1.json",
      "exists": true
    },
    "trajectory": {
      "path": "validation/gpt-5-3-codex/flux-pr-1351/trajectory.v1.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 15197
    },
    "validation": {
      "path": "validation/gpt-5-3-codex/flux-pr-1351/validation.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 133216
    },
    "results": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-3-codex/results.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 62248
    },
    "run_metadata": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-3-codex/run_metadata.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 1634
    },
    "agent_patch": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-3-codex/flux-pr-1351/flux-pr-1351.1-of-1.2026-02-28__13-38-10__gpt-5-3-codex/agent-logs/agent.patch",
      "exists": true,
      "non_empty": true,
      "size_bytes": 14119
    },
    "summary": {
      "path": "reports/summary.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 269968
    },
    "manifest": {
      "path": "manifest.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 677
    },
    "validation_inputs": {
      "required_for_rescue_ok": true,
      "gold_patch": {
        "exists": true,
        "non_empty": true,
        "size_bytes": 6203,
        "sha256": "815afccd36fab8f4369b913dc089d46e94ae65c8735d45f3c11f35dc8fd6b631",
        "required_for_rescue": true
      },
      "repo_tarball": {
        "exists": true,
        "non_empty": true,
        "size_bytes": 2164046,
        "sha256": "88c0509275e56d1a37054565c3744cb8706ad963eae4a1ac17b4457ab092b519",
        "required_for_rescue": true
      },
      "agent_patch": {
        "path": "runs/2026-02-28__13-38-10__gpt-5-3-codex/flux-pr-1351/flux-pr-1351.1-of-1.2026-02-28__13-38-10__gpt-5-3-codex/agent-logs/agent.patch",
        "exists": true,
        "non_empty": true,
        "size_bytes": 14119,
        "sha256": "fc39247c534671542b2fd297eb1f21daeb9be7a1a98432423af471d8b3be2a86",
        "required_for_rescue": true
      }
    }
  },
  "provenance": {
    "identity": [
      {
        "source": "manifest",
        "available": true
      },
      {
        "source": "tb_results",
        "available": true
      }
    ],
    "decision": [
      {
        "source": "validation",
        "available": true
      }
    ],
    "tests": [
      {
        "source": "validation",
        "available": true
      },
      {
        "source": "tb_results",
        "available": true
      }
    ],
    "quality": {
      "equivalence": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "code_review": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "behavioral_robustness": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "footprint_risk": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "cost": [
        {
          "source": "validation",
          "available": true
        },
        {
          "source": "summary",
          "available": true
        }
      ]
    },
    "summary": {
      "task": [
        {
          "source": "summary",
          "available": true
        }
      ],
      "run": [
        {
          "source": "summary",
          "available": true
        }
      ],
      "run_metadata": [
        {
          "source": "run_metadata",
          "available": true
        }
      ]
    },
    "evidence": [
      {
        "source": "filesystem",
        "available": true
      }
    ]
  }
}