task_detail
validation/gpt-5-1-codex-mini/flux-pr-1308/task_detail.v1.json
17995 bytes
{
"version": "v1",
"materialization": {
"mode": "canonical"
},
"identity": {
"variant_id": "gpt-5.1-codex-mini",
"requested_model": "gpt-5.1-codex-mini",
"model_key": "gpt-5-1-codex-mini",
"run_id": "2026-02-28__13-38-10__gpt-5-1-codex-mini",
"task_id": "flux-pr-1308",
"trial_name": "flux-pr-1308.1-of-1.2026-02-28__13-38-10__gpt-5-1-codex-mini"
},
"decision": {
"status": "pass",
"primary_signal": "tests",
"matrix_status": "pass_with_warn",
"reason_codes": [
"primary_tests",
"tests_passed",
"equivalence_non_equivalent",
"review_fail"
],
"flags": [
"equiv_warn",
"review_warn"
],
"publish": {
"include_in_leaderboard": true,
"weak_signal_risk": false
},
"rescue": {
"candidate": false,
"eligible": false,
"decision": "not_candidate",
"required_artifacts_ok": true
}
},
"tests": {
"outcome": "pass",
"strength": "strong",
"commands": [
"go test -C v2 ./... -count=1 -timeout=300s"
],
"command_results": [
{
"command": "go test -C v2 ./... -count=1 -timeout=300s",
"gold_outcome": "pass",
"gold_tb_trial_name": "flux-pr-1308.1-of-1.validate-gold-1772298862406624000",
"gold_failure_mode": "unset",
"agent_outcome": "pass",
"agent_tb_trial_name": "flux-pr-1308.1-of-1.validate-override-1772298934293928000",
"agent_failure_mode": "unset",
"agent_retried": false
}
],
"partial_score": {
"score": 1,
"numerator": 1,
"denominator": 1,
"level": "command",
"provenance": "fallback_command_level",
"reason": "test_case_detail_unavailable"
},
"tb": {
"trial_name": "flux-pr-1308.1-of-1.2026-02-28__13-38-10__gpt-5-1-codex-mini",
"is_resolved": false,
"failure_mode": "unset",
"duration_sec": 133.088380875,
"time_budget_sec": 1200
},
"cache": {
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c"
}
},
"quality": {
"equivalence": {
"status": "used",
"outcome": "non_equivalent",
"confidence": 0.85,
"issue_type": "behavioral",
"notes": "The agent patch appears to only partially implement the feature: it adds `@oneOf` to base schema/fixtures and some variable-side helpers, but does not clearly include the full operation-level oneOf enforcement, nullable-variable-in-oneOf checks, or the undefined-variable error position/reporting changes required by the task. It also seems to change introspection shape (`isOneOf`) rather than clearly exposing the built-in `@oneOf` directive in directive introspection output."
},
"code_review": {
"status": "used",
"signal": "fail",
"confidence": 0.9,
"issue_type": "unclear",
"source_field": "code_review",
"dimension_scores": {
"correctness": {
"score_0_4": 1,
"confidence_0_1": 0.93,
"rationale": "The patch appears to implement only a small subset of the required oneOf behavior: it mostly adds schema text/fixtures and constants, but does not show the core operation-level validation and undefined-variable error behavior expected by the task. It also introduces introspection shape changes (isOneOf) that are not aligned with the target change set."
},
"edge_case_handling": {
"score_0_4": 1,
"confidence_0_1": 0.88,
"rationale": "Edge conditions called out by the task (exactly one field, non-null, nullable variable usage in oneOf, and undefined variable location reporting) are not comprehensively implemented in the shown operation validation changes."
},
"introduced_bug_risk": {
"score_0_4": 1,
"confidence_0_1": 0.86,
"rationale": "The patch changes public/output structures and fixtures for introspection while not showing corresponding generator/query updates, creating high risk of test and compatibility regressions. Error constants were added without corresponding full error-construction flow in this patch excerpt."
},
"maintainability_idioms": {
"score_0_4": 2,
"confidence_0_1": 0.77,
"rationale": "There is some structured helper extraction in variables validation, but the patch is inconsistent with the intended change scope (mixed schema/introspection/output changes with missing core validation pieces), which hurts maintainability and traceability."
}
},
"findings": [
{
"id": "f1",
"dimension": "correctness",
"severity": "major",
"title": "Operation-level oneOf validation is effectively missing",
"detail": "The operation values validator only adds a oneOf constant in the shown diff and does not include the required logic to enforce exactly one provided field, non-null value, and nullable-variable rejection at operation validation time.",
"confidence_0_1": 0.95,
"citations": [
{
"path": "v2/pkg/astvalidation/operation_rule_values.go",
"side": "new",
"start_line": 11,
"end_line": 14,
"anchor_kind": "diff",
"head_sha": "49b8a69cab1c77b398a71c157a5d051ebd3e2ee8"
}
]
},
{
"id": "f2",
"dimension": "correctness",
"severity": "major",
"title": "Undefined-variable error reporting improvements are not implemented",
"detail": "The task requires improved undefined-variable errors with source locations; the shown patch does not update the error-construction API or operation validation flow to carry positions for this case.",
"confidence_0_1": 0.9,
"citations": [
{
"path": "v2/pkg/operationreport/externalerror.go",
"side": "new",
"start_line": 26,
"end_line": 31,
"anchor_kind": "diff",
"head_sha": "49b8a69cab1c77b398a71c157a5d051ebd3e2ee8"
},
{
"path": "v2/pkg/astvalidation/operation_rule_values.go",
"side": "new",
"start_line": 11,
"end_line": 14,
"anchor_kind": "diff",
"head_sha": "49b8a69cab1c77b398a71c157a5d051ebd3e2ee8"
}
]
},
{
"id": "f3",
"dimension": "introduced_bug_risk",
"severity": "major",
"title": "Introspection output shape changed in a likely incompatible way",
"detail": "The patch adds `isOneOf` to `FullType` and multiple golden outputs, which is a broader schema/output change than required and likely to break existing introspection expectations if not fully wired end-to-end.",
"confidence_0_1": 0.86,
"citations": [
{
"path": "v2/pkg/introspection/introspection.go",
"side": "new",
"start_line": 68,
"end_line": 72,
"anchor_kind": "diff",
"head_sha": "49b8a69cab1c77b398a71c157a5d051ebd3e2ee8"
},
{
"path": "v2/pkg/introspection/fixtures/starwars_introspected.golden",
"side": "new",
"start_line": 247,
"end_line": 252,
"anchor_kind": "diff",
"head_sha": "49b8a69cab1c77b398a71c157a5d051ebd3e2ee8"
}
]
},
{
"id": "f4",
"dimension": "edge_case_handling",
"severity": "major",
"title": "OneOf validation coverage appears incomplete across variable and operation paths",
"detail": "Although helper logic was added in variables validation, the patch does not demonstrate the full expected behavior across both runtime variables and operation literals, especially nullable variable usage in oneOf fields.",
"confidence_0_1": 0.84,
"citations": [
{
"path": "v2/pkg/variablesvalidation/variablesvalidation.go",
"side": "new",
"start_line": 21,
"end_line": 63,
"anchor_kind": "diff",
"head_sha": "49b8a69cab1c77b398a71c157a5d051ebd3e2ee8"
},
{
"path": "v2/pkg/astvalidation/operation_rule_values.go",
"side": "new",
"start_line": 11,
"end_line": 14,
"anchor_kind": "diff",
"head_sha": "49b8a69cab1c77b398a71c157a5d051ebd3e2ee8"
}
]
}
],
"overall": {
"confidence_overall": 0.9,
"summary": "The agent patch likely does not satisfy the intended PR: it includes partial/schema-level edits and some unrelated introspection output changes, but misses core oneOf enforcement and undefined-variable-location behavior needed for correctness."
},
"notes": "The agent patch likely does not satisfy the intended PR: it includes partial/schema-level edits and some unrelated introspection output changes, but misses core oneOf enforcement and undefined-variable-location behavior needed for correctness."
},
"behavioral_robustness": {
"status": "used",
"probe_status": "used",
"gold_pass_candidate_pass_count": 1,
"agreement_rate": 1,
"coverage_delta_status": "unavailable",
"coverage_delta_reason": "coverage_adapter_unavailable",
"mutation_lite_status": "unavailable",
"mutation_lite_reason": "mutation_runner_unavailable",
"notes": "derived from command-level outcomes; no accepted generated probes"
},
"footprint_risk": {
"status": "used",
"reason": "none",
"version": "v1",
"level": "medium",
"score": 0.4272754775114472,
"flag": false,
"severe_flag": false
},
"cost": {
"status": "tokens_only",
"token_status": "present",
"cache_token_status": "present",
"total_input_tokens": 10881552,
"total_output_tokens": 62689,
"total_tokens": 10944241,
"uncached_input_tokens": 937360,
"cache_read_input_tokens": 9944192,
"cached_input_tokens": 9944192,
"total_cost_usd": 3.2738028,
"token_source": "openai_cached_tokens_usage",
"pricing_model_key": "gpt-5.1-codex-mini",
"pricing_mode": "cache_aware"
}
},
"summary": {
"task": {
"matrix_status": "pass_with_warn",
"tests_outcome": "pass",
"tests_unknown_cause": null,
"lane_report_lane": null,
"lane_report_source": null,
"lane_report_reasons": null,
"cache_hit": false,
"cache_miss_reason": "missing_pinned_dependencies",
"setup_ms_saved": null,
"pinned_bytes": null,
"environment_group_id": "eg_46b6a6c82a4890b6e6e87c39575fd02c",
"equivalence_status": "used",
"equivalence_outcome": "non_equivalent",
"code_review_status": "used",
"code_review_signal": "fail",
"behavioral_robustness_status": "used",
"coverage_delta_status": "unavailable",
"mutation_lite_status": "unavailable",
"probe_accepted_candidates": null,
"probe_accepted_commands": null,
"probe_agreement_rate": 1,
"probe_gold_pass_candidate_pass_count": 1,
"probe_gold_pass_candidate_fail_count": null,
"probe_review_required_count": null,
"flags": [
"equiv_warn",
"review_warn"
],
"tb_is_resolved": false,
"tb_failure_mode": "unset",
"tb_total_input_tokens": 10881552,
"tb_total_output_tokens": 62689,
"tb_total_tokens": 10944241,
"tb_uncached_input_tokens": 937360,
"tb_cache_creation_input_tokens": null,
"tb_cache_read_input_tokens": 9944192,
"tb_cached_input_tokens": 9944192,
"token_status": "present",
"cache_token_status": "present",
"token_source": "openai_cached_tokens_usage",
"task_cost": 3.2738028,
"cost_status": "present",
"cost_pricing_mode": "cache_aware",
"pricing_model_key": "gpt-5.1-codex-mini",
"equiv_rescue_policy": "on",
"rescue_candidate": false,
"rescue_eligible": false,
"rescue_decision": "not_candidate",
"publish_include_in_leaderboard": true,
"publish_exclusion_reasons": [],
"publish_weak_signal_risk": false,
"tests_only_outcome": 1,
"rescue_aware_outcome": 1,
"partial_score": 1,
"partial_score_numerator": 1,
"partial_score_denominator": 1,
"partial_score_level": "command",
"partial_score_provenance": "fallback_command_level",
"partial_score_reason": "test_case_detail_unavailable",
"partial_score_unknown_count": 0,
"footprint_risk_status": "used",
"footprint_risk_reason": "none",
"footprint_risk_level": "medium",
"footprint_risk_score": 0.4272754775114472,
"footprint_risk_flag": false,
"footprint_risk_severe_flag": false
},
"run": {
"model": "gpt-5.1-codex-mini",
"requested_model": "gpt-5.1-codex-mini",
"run_id": "2026-02-28__13-38-10__gpt-5-1-codex-mini",
"passRate": 0.9655172413793104,
"ciLow": 0.896551724137931,
"ciHigh": 1,
"effectiveN": 29,
"tier": 1,
"validation_counts": {
"fail_guardrail": 0,
"fail_high_conf": 0,
"fail_infra": 1,
"fail_likely_equiv": 0,
"fail_with_diag": 0,
"missing": 0,
"pass": 2,
"pass_with_warn": 26
}
},
"run_metadata": {
"agent_name": "codex",
"model_name": "gpt-5.1-codex-mini",
"start_time": "2026-02-28T15:13:02.887835+00:00",
"end_time": "2026-02-28T16:42:40.244280+00:00",
"commit_hash": "99ae882e59281de74856a3e7eeb1b7c986b466af",
"accuracy": 0.6896551724137931,
"reasoning_effort": "medium"
}
},
"evidence": {
"task_detail": {
"path": "validation/gpt-5-1-codex-mini/flux-pr-1308/task_detail.v1.json",
"exists": true
},
"trajectory": {
"path": "validation/gpt-5-1-codex-mini/flux-pr-1308/trajectory.v1.json",
"exists": true,
"non_empty": true,
"size_bytes": 15393
},
"validation": {
"path": "validation/gpt-5-1-codex-mini/flux-pr-1308/validation.json",
"exists": true,
"non_empty": true,
"size_bytes": 201739
},
"results": {
"path": "runs/2026-02-28__13-38-10__gpt-5-1-codex-mini/results.json",
"exists": true,
"non_empty": true,
"size_bytes": 57447
},
"run_metadata": {
"path": "runs/2026-02-28__13-38-10__gpt-5-1-codex-mini/run_metadata.json",
"exists": true,
"non_empty": true,
"size_bytes": 1645
},
"agent_patch": {
"path": "runs/2026-02-28__13-38-10__gpt-5-1-codex-mini/flux-pr-1308/flux-pr-1308.1-of-1.2026-02-28__13-38-10__gpt-5-1-codex-mini/agent-logs/agent.patch",
"exists": true,
"non_empty": true,
"size_bytes": 56158
},
"summary": {
"path": "reports/summary.json",
"exists": true,
"non_empty": true,
"size_bytes": 269968
},
"manifest": {
"path": "manifest.json",
"exists": true,
"non_empty": true,
"size_bytes": 677
},
"validation_inputs": {
"required_for_rescue_ok": true,
"gold_patch": {
"exists": true,
"non_empty": true,
"size_bytes": 24668,
"sha256": "84c6d0a576ca7a0db5e3262170eb3c8fe4bb83901a1e00a39caf52b059caa8a7",
"required_for_rescue": true
},
"repo_tarball": {
"exists": true,
"non_empty": true,
"size_bytes": 2055392,
"sha256": "9abac454baa18318efac0d0c2d411c481be5d4e9ebca3ad04ef3e07b63086960",
"required_for_rescue": true
},
"agent_patch": {
"path": "runs/2026-02-28__13-38-10__gpt-5-1-codex-mini/flux-pr-1308/flux-pr-1308.1-of-1.2026-02-28__13-38-10__gpt-5-1-codex-mini/agent-logs/agent.patch",
"exists": true,
"non_empty": true,
"size_bytes": 56158,
"sha256": "0847225d8112ae3273bde5c4ecf72d96c948ed01257856a42ab547ad92786fe8",
"required_for_rescue": true
}
}
},
"provenance": {
"identity": [
{
"source": "manifest",
"available": true
},
{
"source": "tb_results",
"available": true
}
],
"decision": [
{
"source": "validation",
"available": true
}
],
"tests": [
{
"source": "validation",
"available": true
},
{
"source": "tb_results",
"available": true
}
],
"quality": {
"equivalence": [
{
"source": "validation",
"available": true
}
],
"code_review": [
{
"source": "validation",
"available": true
}
],
"behavioral_robustness": [
{
"source": "validation",
"available": true
}
],
"footprint_risk": [
{
"source": "validation",
"available": true
}
],
"cost": [
{
"source": "validation",
"available": true
},
{
"source": "summary",
"available": true
}
]
},
"summary": {
"task": [
{
"source": "summary",
"available": true
}
],
"run": [
{
"source": "summary",
"available": true
}
],
"run_metadata": [
{
"source": "run_metadata",
"available": true
}
]
},
"evidence": [
{
"source": "filesystem",
"available": true
}
]
}
}