STET

task_detail

validation/gpt-5-1-codex-mini/flux-pr-1309/task_detail.v1.json

15559 bytes

Back to adjudication
{
  "version": "v1",
  "materialization": {
    "mode": "canonical"
  },
  "identity": {
    "variant_id": "gpt-5.1-codex-mini",
    "requested_model": "gpt-5.1-codex-mini",
    "model_key": "gpt-5-1-codex-mini",
    "run_id": "2026-02-28__13-38-10__gpt-5-1-codex-mini",
    "task_id": "flux-pr-1309",
    "trial_name": "flux-pr-1309.1-of-1.2026-02-28__13-38-10__gpt-5-1-codex-mini"
  },
  "decision": {
    "status": "pass",
    "primary_signal": "tests",
    "matrix_status": "pass_with_warn",
    "reason_codes": [
      "primary_tests",
      "tests_passed",
      "equivalence_non_equivalent",
      "review_fail"
    ],
    "flags": [
      "equiv_warn",
      "review_warn"
    ],
    "lane_report": {
      "lane": "lane_unknown",
      "source": "lane_unknown",
      "reasons": [
        "lane_unknown"
      ]
    },
    "publish": {
      "include_in_leaderboard": true,
      "weak_signal_risk": false
    },
    "rescue": {
      "candidate": false,
      "eligible": false,
      "decision": "not_candidate",
      "required_artifacts_ok": true
    }
  },
  "tests": {
    "outcome": "pass",
    "strength": "strong",
    "commands": [
      "go test -C v2 ./... -count=1 -timeout=300s"
    ],
    "command_results": [
      {
        "command": "go test -C v2 ./... -count=1 -timeout=300s",
        "gold_outcome": "pass",
        "gold_tb_trial_name": "flux-pr-1309.1-of-1.validate-gold-1772334443281796000",
        "gold_failure_mode": "unset",
        "agent_outcome": "pass",
        "agent_tb_trial_name": "flux-pr-1309.1-of-1.validate-override-1772334507379513000",
        "agent_failure_mode": "unset",
        "agent_retried": false
      }
    ],
    "partial_score": {
      "score": 1,
      "numerator": 1,
      "denominator": 1,
      "level": "command",
      "provenance": "fallback_command_level",
      "reason": "test_case_detail_unavailable"
    },
    "tb": {
      "trial_name": "flux-pr-1309.1-of-1.2026-02-28__13-38-10__gpt-5-1-codex-mini",
      "is_resolved": true,
      "failure_mode": "unset",
      "duration_sec": 127.014766958,
      "time_budget_sec": 1200
    },
    "cache": {
      "cache_hit": false,
      "cache_miss_reason": "missing_pinned_dependencies",
      "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266"
    }
  },
  "quality": {
    "equivalence": {
      "status": "used",
      "outcome": "non_equivalent",
      "confidence": 0.63,
      "issue_type": "behavioral",
      "notes": "The shown agent changes cover pubsub datasource removal and a test helper for `UpdateSubscription`, but do not demonstrate the core subscription runtime features required by the task (startup hooks on subscription start, per-subscription targeted updates in runtime code, and per-subscription close behavior). Based on the visible patch, key intended behavior appears missing."
    },
    "code_review": {
      "status": "used",
      "signal": "fail",
      "confidence": 0.6,
      "issue_type": "unclear",
      "source_field": "code_review",
      "dimension_scores": {
        "correctness": {
          "score_0_4": 2,
          "confidence_0_1": 0.56,
          "rationale": "The visible patch mostly removes pubsub datasource files and adds a test helper shim; it does not clearly demonstrate end-to-end implementation of startup hooks and per-subscription lifecycle behavior in production paths."
        },
        "edge_case_handling": {
          "score_0_4": 1,
          "confidence_0_1": 0.58,
          "rationale": "A key edge case for this task is targeting one subscription without affecting siblings; the shown test implementation ignores subscription IDs, so this edge condition is not meaningfully exercised."
        },
        "introduced_bug_risk": {
          "score_0_4": 2,
          "confidence_0_1": 0.62,
          "rationale": "The added test adapter method discards the subscription identifier and routes targeted updates through broadcast behavior, which can mask regressions in subscription-specific delivery logic."
        },
        "maintainability_idioms": {
          "score_0_4": 2,
          "confidence_0_1": 0.67,
          "rationale": "The compatibility shim is concise but semantically lossy (ID ignored), which weakens test clarity around intended targeted-update behavior and can make future regressions harder to detect."
        }
      },
      "findings": [
        {
          "id": "f1",
          "dimension": "correctness",
          "severity": "major",
          "title": "Targeted subscription updates are effectively treated as broadcast in test adapter",
          "detail": "The new `UpdateSubscription` helper ignores `id` and forwards to `Update(data)`. For a PR centered on per-subscription targeting, this can let incorrect broadcast behavior pass tests.",
          "confidence_0_1": 0.86,
          "citations": [
            {
              "path": "v2/pkg/engine/datasource/graphql_datasource/graphql_datasource_test.go",
              "side": "new",
              "start_line": 8275,
              "end_line": 8277,
              "anchor_kind": "diff",
              "head_sha": "a4f189176bde4c90a3388ae1dd40f5d014ef1a8b"
            }
          ]
        },
        {
          "id": "f2",
          "dimension": "edge_case_handling",
          "severity": "major",
          "title": "Sibling-subscription isolation behavior is not validated by the shown changes",
          "detail": "The patch excerpt adds only an ID-discarding adapter and removes pubsub datasource code; it does not show explicit validation that updating/closing one subscription leaves siblings unaffected.",
          "confidence_0_1": 0.61,
          "citations": [
            {
              "path": "v2/pkg/engine/datasource/graphql_datasource/graphql_datasource_test.go",
              "side": "new",
              "start_line": 8275,
              "end_line": 8277,
              "anchor_kind": "diff",
              "head_sha": "a4f189176bde4c90a3388ae1dd40f5d014ef1a8b"
            },
            {
              "path": "v2/pkg/engine/datasource/pubsub_datasource/pubsub_datasource.go",
              "side": "old",
              "start_line": 1,
              "end_line": 30,
              "anchor_kind": "diff",
              "head_sha": "a4f189176bde4c90a3388ae1dd40f5d014ef1a8b"
            }
          ]
        }
      ],
      "overall": {
        "confidence_overall": 0.6,
        "summary": "Likely partial: cleanup/removal work is visible and tests pass, but the shown changes do not convincingly demonstrate the new targeted subscription lifecycle semantics, and the new test adapter may mask exactly those regressions."
      },
      "notes": "Likely partial: cleanup/removal work is visible and tests pass, but the shown changes do not convincingly demonstrate the new targeted subscription lifecycle semantics, and the new test adapter may mask exactly those regressions."
    },
    "behavioral_robustness": {
      "status": "used",
      "probe_status": "used",
      "gold_pass_candidate_pass_count": 1,
      "agreement_rate": 1,
      "coverage_delta_status": "unavailable",
      "coverage_delta_reason": "coverage_adapter_unavailable",
      "mutation_lite_status": "unavailable",
      "mutation_lite_reason": "mutation_runner_unavailable",
      "notes": "derived from command-level outcomes; no accepted generated probes"
    },
    "footprint_risk": {
      "status": "used",
      "reason": "none",
      "version": "v1",
      "level": "medium",
      "score": 0.45680361670345676,
      "flag": false,
      "severe_flag": false
    },
    "cost": {
      "status": "tokens_only",
      "token_status": "present",
      "cache_token_status": "present",
      "total_input_tokens": 10739431,
      "total_output_tokens": 64067,
      "total_tokens": 10803498,
      "uncached_input_tokens": 1316199,
      "cache_read_input_tokens": 9423232,
      "cached_input_tokens": 9423232,
      "total_cost_usd": 3.7721853,
      "token_source": "openai_cached_tokens_usage",
      "pricing_model_key": "gpt-5.1-codex-mini",
      "pricing_mode": "cache_aware"
    }
  },
  "summary": {
    "task": {
      "matrix_status": "pass_with_warn",
      "tests_outcome": "pass",
      "tests_unknown_cause": null,
      "lane_report_lane": "lane_unknown",
      "lane_report_source": "lane_unknown",
      "lane_report_reasons": [
        "lane_unknown"
      ],
      "cache_hit": false,
      "cache_miss_reason": "missing_pinned_dependencies",
      "setup_ms_saved": null,
      "pinned_bytes": null,
      "environment_group_id": "eg_b190ace324cafb0f9010852027ba6266",
      "equivalence_status": "used",
      "equivalence_outcome": "non_equivalent",
      "code_review_status": "used",
      "code_review_signal": "fail",
      "behavioral_robustness_status": "used",
      "coverage_delta_status": "unavailable",
      "mutation_lite_status": "unavailable",
      "probe_accepted_candidates": null,
      "probe_accepted_commands": null,
      "probe_agreement_rate": 1,
      "probe_gold_pass_candidate_pass_count": 1,
      "probe_gold_pass_candidate_fail_count": null,
      "probe_review_required_count": null,
      "flags": [
        "equiv_warn",
        "review_warn"
      ],
      "tb_is_resolved": true,
      "tb_failure_mode": "unset",
      "tb_total_input_tokens": 10739431,
      "tb_total_output_tokens": 64067,
      "tb_total_tokens": 10803498,
      "tb_uncached_input_tokens": 1316199,
      "tb_cache_creation_input_tokens": null,
      "tb_cache_read_input_tokens": 9423232,
      "tb_cached_input_tokens": 9423232,
      "token_status": "present",
      "cache_token_status": "present",
      "token_source": "openai_cached_tokens_usage",
      "task_cost": 3.7721853,
      "cost_status": "present",
      "cost_pricing_mode": "cache_aware",
      "pricing_model_key": "gpt-5.1-codex-mini",
      "equiv_rescue_policy": "on",
      "rescue_candidate": false,
      "rescue_eligible": false,
      "rescue_decision": "not_candidate",
      "publish_include_in_leaderboard": true,
      "publish_exclusion_reasons": [],
      "publish_weak_signal_risk": false,
      "tests_only_outcome": 1,
      "rescue_aware_outcome": 1,
      "partial_score": 1,
      "partial_score_numerator": 1,
      "partial_score_denominator": 1,
      "partial_score_level": "command",
      "partial_score_provenance": "fallback_command_level",
      "partial_score_reason": "test_case_detail_unavailable",
      "partial_score_unknown_count": 0,
      "footprint_risk_status": "used",
      "footprint_risk_reason": "none",
      "footprint_risk_level": "medium",
      "footprint_risk_score": 0.45680361670345676,
      "footprint_risk_flag": false,
      "footprint_risk_severe_flag": false
    },
    "run": {
      "model": "gpt-5.1-codex-mini",
      "requested_model": "gpt-5.1-codex-mini",
      "run_id": "2026-02-28__13-38-10__gpt-5-1-codex-mini",
      "passRate": 0.9655172413793104,
      "ciLow": 0.896551724137931,
      "ciHigh": 1,
      "effectiveN": 29,
      "tier": 1,
      "validation_counts": {
        "fail_guardrail": 0,
        "fail_high_conf": 0,
        "fail_infra": 1,
        "fail_likely_equiv": 0,
        "fail_with_diag": 0,
        "missing": 0,
        "pass": 2,
        "pass_with_warn": 26
      }
    },
    "run_metadata": {
      "agent_name": "codex",
      "model_name": "gpt-5.1-codex-mini",
      "start_time": "2026-02-28T15:13:02.887835+00:00",
      "end_time": "2026-02-28T16:42:40.244280+00:00",
      "commit_hash": "99ae882e59281de74856a3e7eeb1b7c986b466af",
      "accuracy": 0.6896551724137931,
      "reasoning_effort": "medium"
    }
  },
  "evidence": {
    "task_detail": {
      "path": "validation/gpt-5-1-codex-mini/flux-pr-1309/task_detail.v1.json",
      "exists": true
    },
    "trajectory": {
      "path": "validation/gpt-5-1-codex-mini/flux-pr-1309/trajectory.v1.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 15393
    },
    "validation": {
      "path": "validation/gpt-5-1-codex-mini/flux-pr-1309/validation.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 219174
    },
    "results": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-1-codex-mini/results.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 57447
    },
    "run_metadata": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-1-codex-mini/run_metadata.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 1645
    },
    "agent_patch": {
      "path": "runs/2026-02-28__13-38-10__gpt-5-1-codex-mini/flux-pr-1309/flux-pr-1309.1-of-1.2026-02-28__13-38-10__gpt-5-1-codex-mini/agent-logs/agent.patch",
      "exists": true,
      "non_empty": true,
      "size_bytes": 64873
    },
    "summary": {
      "path": "reports/summary.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 269968
    },
    "manifest": {
      "path": "manifest.json",
      "exists": true,
      "non_empty": true,
      "size_bytes": 677
    },
    "validation_inputs": {
      "required_for_rescue_ok": true,
      "gold_patch": {
        "exists": true,
        "non_empty": true,
        "size_bytes": 42713,
        "sha256": "6c9373337f55ebcb4dbe540d3ffad4a8b21705c0921cff03033c909c092cf3d1",
        "required_for_rescue": true
      },
      "repo_tarball": {
        "exists": true,
        "non_empty": true,
        "size_bytes": 2139970,
        "sha256": "fd7dadc20d048470aae3bc0e6b7c22fb11db2a32d33400252859802f44596421",
        "required_for_rescue": true
      },
      "agent_patch": {
        "path": "runs/2026-02-28__13-38-10__gpt-5-1-codex-mini/flux-pr-1309/flux-pr-1309.1-of-1.2026-02-28__13-38-10__gpt-5-1-codex-mini/agent-logs/agent.patch",
        "exists": true,
        "non_empty": true,
        "size_bytes": 64873,
        "sha256": "5a052f8d37a88741d426eff5cdb2dcdb9850aa394f6cb159bdeda764effbec5f",
        "required_for_rescue": true
      }
    }
  },
  "provenance": {
    "identity": [
      {
        "source": "manifest",
        "available": true
      },
      {
        "source": "tb_results",
        "available": true
      }
    ],
    "decision": [
      {
        "source": "validation",
        "available": true
      }
    ],
    "tests": [
      {
        "source": "validation",
        "available": true
      },
      {
        "source": "tb_results",
        "available": true
      }
    ],
    "quality": {
      "equivalence": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "code_review": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "behavioral_robustness": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "footprint_risk": [
        {
          "source": "validation",
          "available": true
        }
      ],
      "cost": [
        {
          "source": "validation",
          "available": true
        },
        {
          "source": "summary",
          "available": true
        }
      ]
    },
    "summary": {
      "task": [
        {
          "source": "summary",
          "available": true
        }
      ],
      "run": [
        {
          "source": "summary",
          "available": true
        }
      ],
      "run_metadata": [
        {
          "source": "run_metadata",
          "available": true
        }
      ]
    },
    "evidence": [
      {
        "source": "filesystem",
        "available": true
      }
    ]
  }
}