task_detail

validation/gpt-5-4/flux-pr-1001/task_detail.v1.json
16137 bytes
{
  "version": "v1",
  "materialization": {
    "mode": "canonical"
  },
  "identity": {
    "variant_id": "gpt-5.4",
    "requested_model": "gpt-5.4",
    "model_key": "gpt-5-4",
    "run_id": "2026-02-28__13-38-10__gpt-5-4",
    "task_id": "flux-pr-1001",
    "trial_name": "flux-pr-1001.1-of-1.2026-02-28__13-38-10__gpt-5-4"
  },
  "decision": {
    "status": "pass",
    "primary_signal": "tests",
    "matrix_status": "pass_with_warn",
    "reason_codes": [
      "primary_tests",
      "tests_passed",
      "equivalence_equivalent"
    ],
    "flags": [
      "review_warn"
    ],
    "lane_report": {
      "lane": "lane_unknown",
      "source": "lane_unknown",
      "reasons": [
        "lane_unknown"
      ]
    },
    "publish": {
      "include_in_leaderboard": true,
      "weak_signal_risk": false
    },
    "rescue": {
      "candidate": false,
      "eligible": false,
      "decision": "not_candidate",
      "required_artifacts_ok": true
    }
  },
  "tests": {
    "outcome": "pass",
    "strength": "strong",
    "commands": [
      "go test -C v2 ./... -count=1 -timeout=300s"
    ],
    "command_results": [
      {
        "command": "go test -C v2 ./... -count=1 -timeout=300s",
        "gold_outcome": "pass",
        "gold_tb_trial_name": "flux-pr-1001.1-of-1.validate-gold-1772829224047618000",
        "gold_failure_mode": "unset",
        "agent_outcome": "pass",
        "agent_tb_trial_name": "flux-pr-1001.1-of-1.validate-override-1772829551014855000",
        "agent_failure_mode": "unset",
        "agent_retried": false
      }
    ],
    "partial_score": {
      "score": 1,
      "numerator": 1,
      "denominator": 1,
      "level": "command",
      "provenance": "fallback_command_level",
      "reason": "test_case_detail_unavailable"
    },
    "tb": {
      "trial_name": "flux-pr-1001.1-of-1.2026-02-28__13-38-10__gpt-5-4",
      "is_resolved": true,
      "failure_mode": "unset",
      "duration_sec": 419.277693709,
      "time_budget_sec": 1200
    },
    "cache": {
      "cache_hit": false,
      "cache_miss_reason": "missing_pinned_dependencies",
      "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c"
    }
  },
  "quality": {
    "equivalence": {
      "status": "used",
      "outcome": "equivalent",
      "evaluator_model": "gpt-5.3-codex",
      "confidence": 0.73,
      "issue_type": "stylistic"
    },
    "code_review": {
      "status": "used",
      "signal": "fail",
      "evaluator_model": "gpt-5.3-codex",
      "confidence": 0.81,
      "issue_type": "unclear",
      "source_field": "code_review",
      "dimension_scores": {
        "correctness": {
          "score_0_4": 2,
          "confidence_0_1": 0.83,
          "rationale": "The patch adds HTTP metadata to hook callbacks, but it diverges from the intended API shape by keeping `err` as a separate callback arg and using a `TraceHTTP` snapshot instead of passing richer request/response data via a unified response info object."
        },
        "edge_case_handling": {
          "score_0_4": 1,
          "confidence_0_1": 0.8,
          "rationale": "Edge handling is incomplete for hook context edge cases: callback execution is skipped when `loaderHookContext` is nil, and no fallback context assignment was added to guarantee `OnFinished` delivery."
        },
        "introduced_bug_risk": {
          "score_0_4": 2,
          "confidence_0_1": 0.78,
          "rationale": "The new trace plumbing is localized and tested, but callback invocation still depends on non-nil hook context, which can suppress `OnFinished` in some flows and create inconsistent hook behavior."
        },
        "maintainability_idioms": {
          "score_0_4": 2,
          "confidence_0_1": 0.74,
          "rationale": "The design reuses `TraceHTTP` (a tracing-oriented type) for hook contracts, which mixes concerns and makes the hook API less clear than a dedicated response info model."
        }
      },
      "findings": [
        {
          "id": "f1",
          "dimension": "correctness",
          "severity": "major",
          "title": "OnFinished callback contract does not match intended enriched response-info model",
          "detail": "The callback still takes a separate `err` argument and introduces `LoaderHookResponseInfo` with `TraceHTTP`, instead of a unified response info object carrying status/error/request/response metadata as the primary payload. This likely misses the intended hook API change.",
          "confidence_0_1": 0.86,
          "citations": [
            {
              "path": "v2/pkg/engine/resolve/loader.go",
              "side": "new",
              "start_line": 37,
              "end_line": 45,
              "anchor_kind": "diff",
              "head_sha": "fcbfba1d6616c5f8d267e5d172268d1662023218"
            },
            {
              "path": "v2/pkg/engine/resolve/loader.go",
              "side": "new",
              "start_line": 560,
              "end_line": 569,
              "anchor_kind": "diff",
              "head_sha": "fcbfba1d6616c5f8d267e5d172268d1662023218"
            }
          ]
        },
        {
          "id": "f2",
          "dimension": "edge_case_handling",
          "severity": "major",
          "title": "OnFinished is still conditionally skipped when hook context is nil",
          "detail": "Multiple call sites still guard `OnFinished` with `loaderHookContext != nil`, so a nil return from `OnLoad` suppresses completion callbacks. The intended behavior should provide a valid context and always invoke OnFinished when hooks are configured.",
          "confidence_0_1": 0.84,
          "citations": [
            {
              "path": "v2/pkg/engine/resolve/loader.go",
              "side": "new",
              "start_line": 127,
              "end_line": 132,
              "anchor_kind": "diff",
              "head_sha": "fcbfba1d6616c5f8d267e5d172268d1662023218"
            },
            {
              "path": "v2/pkg/engine/resolve/loader.go",
              "side": "new",
              "start_line": 168,
              "end_line": 171,
              "anchor_kind": "diff",
              "head_sha": "fcbfba1d6616c5f8d267e5d172268d1662023218"
            }
          ]
        },
        {
          "id": "f3",
          "dimension": "maintainability_idioms",
          "severity": "minor",
          "title": "Hook metadata type is tied to tracing representation",
          "detail": "Using `httpclient.TraceHTTP` in hook response info couples hook semantics to extension-tracing format rather than exposing a dedicated request/response metadata contract.",
          "confidence_0_1": 0.69,
          "citations": [
            {
              "path": "v2/pkg/engine/resolve/loader.go",
              "side": "new",
              "start_line": 42,
              "end_line": 45,
              "anchor_kind": "diff",
              "head_sha": "fcbfba1d6616c5f8d267e5d172268d1662023218"
            },
            {
              "path": "v2/pkg/engine/datasource/httpclient/nethttpclient.go",
              "side": "new",
              "start_line": 84,
              "end_line": 92,
              "anchor_kind": "diff",
              "head_sha": "fcbfba1d6616c5f8d267e5d172268d1662023218"
            }
          ]
        }
      ],
      "overall": {
        "overall_score_0_100": 44.99999999999999,
        "signal": "fail",
        "confidence_overall": 0.81,
        "summary": "The patch moves in the right direction by attaching HTTP metadata, but it likely does not fully satisfy the intended change because the hook API shape and callback behavior differ in material ways from the expected enriched OnFinished contract."
      }
    },
    "behavioral_robustness": {
      "status": "used",
      "probe_status": "used",
      "gold_pass_candidate_pass_count": 1,
      "agreement_rate": 1,
      "coverage_delta_status": "unavailable",
      "coverage_delta_reason": "coverage_adapter_unavailable",
      "mutation_lite_status": "unavailable",
      "mutation_lite_reason": "mutation_runner_unavailable",
      "notes": "derived from command-level outcomes; no accepted generated probes"
    },
    "footprint_risk": {
      "status": "used",
      "reason": "none",
      "version": "v1",
      "level": "low",
      "score": 0.3118850085432988,
      "flag": false,
      "severe_flag": false
    },
    "cost": {
      "status": "tokens_only",
      "token_status": "present",
      "cache_token_status": "present",
      "total_input_tokens": 1249871,
      "total_output_tokens": 6724,
      "total_tokens": 1256595,
      "uncached_input_tokens": 83023,
      "cache_read_input_tokens": 1166848,
      "cached_input_tokens": 1166848,
      "total_cost_usd": 0.803262,
      "token_source": "openai_cached_tokens_usage",
      "pricing_model_key": "gpt-5.4",
      "pricing_mode": "cache_aware"
    }
  },
  "summary": {
    "task": {
      "matrix_status": "pass_with_warn",
      "tests_outcome": "pass",
      "tests_unknown_cause": null,
      "lane_report_lane": "lane_unknown",
      "lane_report_source": "lane_unknown",
      "lane_report_reasons": [
        "lane_unknown"
      ],
      "cache_hit": false,
      "cache_miss_reason": "missing_pinned_dependencies",
      "setup_ms_saved": null,
      "pinned_bytes": null,
      "environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
      "equivalence_status": "used",
      "equivalence_outcome": "equivalent",
      "code_review_status": "used",
      "code_review_signal": "fail",
      "behavioral_robustness_status": "used",
      "coverage_delta_status": "unavailable",
      "mutation_lite_status": "unavailable",
      "probe_accepted_candidates": null,
      "probe_accepted_commands": null,
      "probe_agreement_rate": 1,
      "probe_gold_pass_candidate_pass_count": 1,
      "probe_gold_pass_candidate_fail_count": null,
      "probe_review_required_count": null,
      "flags": [
        "review_warn"
      ],
      "tb_is_resolved": true,
      "tb_failure_mode": "unset",
      "tb_total_input_tokens": 1249871,
      "tb_total_output_tokens": 6724,
      "tb_total_tokens": 1256595,
      "tb_uncached_input_tokens": 83023,
      "tb_cache_creation_input_tokens": null,
      "tb_cache_read_input_tokens": 1166848,
      "tb_cached_input_tokens": 1166848,
      "token_status": "present",
      "cache_token_status": "present",
      "token_source": "openai_cached_tokens_usage",
      "task_cost": 0.803262,
      "cost_status": "present",
      "cost_pricing_mode": "cache_aware",
      "pricing_model_key": "gpt-5.4",
      "equiv_rescue_policy": "on",
      "rescue_candidate": false,
      "rescue_eligible": false,
      "rescue_decision": "not_candidate",
      "publish_include_in_leaderboard": true,
      "publish_exclusion_reasons": [],
      "publish_weak_signal_risk": false,
      "tests_only_outcome": 1,
      "rescue_aware_outcome": 1,
      "partial_score": 1,
      "partial_score_numerator": 1,
      "partial_score_denominator": 1,
      "partial_score_level": "command",
      "partial_score_provenance": "fallback_command_level",
      "partial_score_reason": "test_case_detail_unavailable",
      "partial_score_unknown_count": 0,
      "footprint_risk_status": "used",
      "footprint_risk_reason": "none",
      "footprint_risk_level": "low",
      "footprint_risk_score": 0.3118850085432988,
      "footprint_risk_flag": false,
      "footprint_risk_severe_flag": false
    },
    "run": {
      "model": "gpt-5.4",
      "requested_model": "gpt-5.4",
      "run_id": "2026-02-28__13-38-10__gpt-5-4",
      "passRate": 0.896551724137931,
      "ciLow": 0.7586206896551724,
      "ciHigh": 1,
      "effectiveN": 29,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 1,
        "fail_high_conf": 2,
        "fail_infra": 0,
        "fail_likely_equiv": 1,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 5,
        "pass_with_warn": 20
      }
    },
    "run_metadata": {
      "agent_name": "codex",
      "model_name": "gpt-5.4",
      "start_time": "2026-03-06T11:06:36.354605+00:00",
      "end_time": "2026-03-06T13:16:38.693205+00:00",
      "commit_hash": "8f1bc9fe2c3cb8a14c254de41e9577a2ce352ac2",
      "accuracy": 0.7586206896551724,
      "reasoning_effort": "medium"
    }
  },
  "evidence": {
    "task_detail": {
      "path": "validation/gpt-5-4/flux-pr-1001/task_detail.v1.json",
      "exists": true
    },
    "trajectory": {
      "path": "validation/gpt-5-4/flux-pr-1001/trajectory.v1.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 15142
    },
    "validation": {
      "path": "validation/gpt-5-4/flux-pr-1001/validation.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 135244
    },
    "results": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-4/results.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 61729
    },
    "run_metadata": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-4/run_metadata.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 1623
    },
    "agent_patch": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-4/flux-pr-1001/flux-pr-1001.1-of-1.2026-02-28__13-38-10__gpt-5-4/agent-logs/agent.patch",
      "exists": true,
      "non_empty": true,
      "size_bytes": 13525
    },
    "summary": {
      "path": "reports/summary.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 269968
    },
    "manifest": {
      "path": "manifest.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 677
    },
    "validation_inputs": {
      "required_for_rescue_ok": true,
      "gold_patch": {
        "exists": true,
        "non_empty": true,
        "size_bytes": 9761,
        "sha256": "4f4be1e17e703fc3cc540539ce5f871a5426000898c2b32f153bbf088512a89d",
        "required_for_rescue": true
      },
      "repo_tarball": {
        "exists": true,
        "non_empty": true,
        "size_bytes": 4156764,
        "sha256": "a84f92df8a1b5a1b8b23f6c61187570bb3fb8efb03eed33d78bffb8c3a3f3872",
        "required_for_rescue": true
      },
      "agent_patch": {
        "path": "runs/2026-02-28__13-38-10__gpt-5-4/flux-pr-1001/flux-pr-1001.1-of-1.2026-02-28__13-38-10__gpt-5-4/agent-logs/agent.patch",
        "exists": true,
        "non_empty": true,
        "size_bytes": 13525,
        "sha256": "3c0efc4bb0a3d5749fd78b1f8c9973c55a0df957982690f2e31e417cf3ff5a7d",
        "required_for_rescue": true
      }
    }
  },
  "provenance": {
    "identity": [
      {
        "source": "manifest",
        "available": true
      },
      {
        "source": "tb_results",
        "available": true
      }
    ],
    "decision": [
      {
        "source": "validation",
        "available": true
      }
    ],
    "tests": [
      {
        "source": "validation",
        "available": true
      },
      {
        "source": "tb_results",
        "available": true
      }
    ],
    "quality": {
      "equivalence": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "code_review": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "behavioral_robustness": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "footprint_risk": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "cost": [
        {
          "source": "validation",
          "available": true
        },
        {
          "source": "summary",
          "available": true
        }
      ]
    },
    "summary": {
      "task": [
        {
          "source": "summary",
          "available": true
        }
      ],
      "run": [
        {
          "source": "summary",
          "available": true
        }
      ],
      "run_metadata": [
        {
          "source": "run_metadata",
          "available": true
        }
      ]
    },
    "evidence": [
      {
        "source": "filesystem",
        "available": true
      }
    ]
  }
}