diff --git a/src/omnix/gates/gate6_equivalence/__init__.py b/src/omnix/gates/gate6_equivalence/__init__.py
index 19dba4f..3227672 100644
--- a/src/omnix/gates/gate6_equivalence/__init__.py
+++ b/src/omnix/gates/gate6_equivalence/__init__.py
@@ -1,5 +1,14 @@
 """Gate 6 behavioral equivalence harness."""
 
+from omnix.gates.gate6_equivalence.classifier import (
+    Classification,
+    ClassifiedProbe,
+    Gate6Evaluation,
+    check,
+    classify_results,
+    evaluate,
+    evaluate_results,
+)
 from omnix.gates.gate6_equivalence.harness import ProbeResult, run_harness
 from omnix.gates.gate6_equivalence.probes import (
     DEFAULT_CONSTRUCT_MARKER,
@@ -12,12 +21,19 @@
 )
 
 __all__ = [
+    "Classification",
+    "ClassifiedProbe",
     "DEFAULT_CONSTRUCT_MARKER",
     "FLOAT_MARKER",
+    "Gate6Evaluation",
     "MAX_PROBES_PER_METHOD",
     "ProbeInput",
     "ProbeResult",
     "ProbeSet",
+    "check",
+    "classify_results",
+    "evaluate",
+    "evaluate_results",
     "generate_probe_set",
     "generate_probes",
     "run_harness",
diff --git a/src/omnix/gates/gate6_equivalence/classifier.py b/src/omnix/gates/gate6_equivalence/classifier.py
new file mode 100644
--- /dev/null
+++ b/src/omnix/gates/gate6_equivalence/classifier.py
@@ -0,0 +1,392 @@
+"""Gate 6 behavioral-equivalence classifier."""
+
+from __future__ import annotations
+
+import math
+import os
+from collections.abc import Callable, Mapping, Sequence
+from dataclasses import dataclass
+from typing import Any, Literal, cast
+
+from omnix.gates.gate6_equivalence.harness import ProbeResult, run_harness
+from omnix.gates.gate6_equivalence.probes import ProbeSet, generate_probe_set
+from omnix.gates.result import GateError
+from omnix.semantic.node import SemanticNode, SourceLocation
+
+Classification = Literal[
+    "equivalent",
+    "value_diverge",
+    "exception_diverge",
+    "stdout_diverge_deterministic",
+    "stdout_diverge_stochastic",
+    "wall_clock_diverge",
+    "fp_tolerance_diverge",
+]
+Gate6Status = Literal["passed", "failed", "runtime_crash", "skipped", "inconclusive"]
+# Re-runs the legacy side for one probe and returns the repeated results.
+LegacyReplay = Callable[[ProbeResult], Sequence[ProbeResult]]
+
+_GATE_NUMBER = 6
+_GATE_NAME = "behavioral_equivalence"
+# Below this many classified probes a clean run is only "inconclusive".
+_MIN_PROBE_COUNT = 20
+# Classifications that fail the gate.
+_DIVERGENCE_CLASSES = {
+    "value_diverge",
+    "exception_diverge",
+    "stdout_diverge_deterministic",
+}
+# Classifications that are counted in the details but never fail the gate.
+_ACCEPTED_WITH_NOTE_CLASSES = {
+    "stdout_diverge_stochastic",
+    "wall_clock_diverge",
+    "fp_tolerance_diverge",
+}
+# Rank of each wall-clock bucket label, so buckets can be compared by distance.
+_WALL_BUCKETS = {
+    "<1ms": 0,
+    "<10ms": 1,
+    "<100ms": 2,
+    "<1s": 3,
+    "<10s": 4,
+    ">10s": 5,
+}
+# Relative tolerance used by _within_fp_tolerance.
+_DOUBLE_EPS = 2**-52
+
+
+@dataclass(frozen=True)
+class ClassifiedProbe:
+    """A single probe with its Gate 6 classification."""
+
+    result: ProbeResult
+    classification: Classification
+
+
+@dataclass(frozen=True)
+class Gate6Evaluation:
+    """Detailed Gate 6 result for receipt wiring."""
+
+    status: Gate6Status
+    details: dict[str, Any]
+    error: GateError | None = None
+    classified: tuple[ClassifiedProbe, ...] = ()
+    probe_set: ProbeSet | None = None
+
+
+def classify_results(
+    probe_results: Sequence[ProbeResult],
+    *,
+    legacy_replay: LegacyReplay | None = None,
+) -> list[ClassifiedProbe]:
+    """Classify every probe result into exactly one Gate 6 bucket.
+
+    ``legacy_replay`` is only consulted for return-value and output
+    mismatches, to separate stochastic legacy behavior from real divergence.
+    """
+    return [
+        ClassifiedProbe(result=result, classification=_classify_one(result, legacy_replay))
+        for result in probe_results
+    ]
+
+
+def evaluate_results(
+    probe_results: Sequence[ProbeResult],
+    *,
+    legacy_replay: LegacyReplay | None = None,
+    probe_set: ProbeSet | None = None,
+) -> Gate6Evaluation:
+    """Return the overall Gate 6 status for already-collected probe results.
+
+    A blocking divergence gives ``failed``; otherwise fewer than
+    ``_MIN_PROBE_COUNT`` probes gives ``inconclusive``; otherwise ``passed``.
+    """
+    classified = tuple(classify_results(probe_results, legacy_replay=legacy_replay))
+    divergence = [c for c in classified if c.classification in _DIVERGENCE_CLASSES]
+    accepted = [c for c in classified if c.classification in _ACCEPTED_WITH_NOTE_CLASSES]
+    details: dict[str, Any] = {
+        "status": "passed",
+        "probe_count": len(classified),
+        "divergence_count": len(divergence),
+        "accepted_with_note_count": len(accepted),
+        "classifications": [_classification_detail(c) for c in classified],
+    }
+    if probe_set is not None:
+        details["probe_generation"] = dict(probe_set.details)
+
+    if divergence:
+        # Only the first blocking probe is surfaced at the top level; the full
+        # list stays available under "classifications".
+        first = divergence[0]
+        details.update(
+            {
+                "status": "failed",
+                "classification": first.classification,
+                "diverging_input": first.result.input,
+            }
+        )
+        err = _gate_error("behavioral equivalence divergence", **details)
+        return Gate6Evaluation(
+            status="failed",
+            details=details,
+            error=err,
+            classified=classified,
+            probe_set=probe_set,
+        )
+
+    if len(classified) < _MIN_PROBE_COUNT:
+        details.update({"status": "inconclusive", "reason": "insufficient_probe_count"})
+        err = _gate_error(f"gate 6 needs at least {_MIN_PROBE_COUNT} probes", **details)
+        return Gate6Evaluation(
+            status="inconclusive",
+            details=details,
+            error=err,
+            classified=classified,
+            probe_set=probe_set,
+        )
+
+    return Gate6Evaluation(
+        status="passed",
+        details=details,
+        classified=classified,
+        probe_set=probe_set,
+    )
+
+
+def evaluate(
+    legacy_source: str,
+    rebuilt_source: str,
+    semantic_node: SemanticNode,
+    *,
+    gate5_details: Mapping[str, Any] | None = None,
+    num_random: int | None = None,
+    timeout_s: float = 60.0,
+) -> Gate6Evaluation:
+    """Generate probes, run the harness, classify results, and return receipt details.
+
+    Exceptions raised along the way are converted into a ``failed`` evaluation
+    whose details carry ``reason="gate6_internal_exception"``.
+    """
+    try:
+        random_count = _random_probe_count() if num_random is None else num_random
+        probe_set = generate_probe_set(
+            semantic_node,
+            num_random=random_count,
+            gate5_details=gate5_details,
+        )
+        class_name = _class_name_from_fqn(semantic_node.fqn)
+        method_name = semantic_node.fqn.rsplit(".", 1)[-1]
+        parameter_types = list(semantic_node.resolved_param_types)
+        results = run_harness(
+            legacy_source,
+            rebuilt_source,
+            class_name,
+            method_name,
+            probe_set.probes,
+            parameter_types=parameter_types,
+            timeout_s=timeout_s,
+        )
+
+        def _replay(result: ProbeResult) -> Sequence[ProbeResult]:
+            # Legacy is run against itself, three times on the same input, so
+            # any variation in the results comes from the legacy code alone.
+            return run_harness(
+                legacy_source,
+                legacy_source,
+                class_name,
+                method_name,
+                [result.input, result.input, result.input],
+                parameter_types=parameter_types,
+                timeout_s=timeout_s,
+            )
+
+        return evaluate_results(results, legacy_replay=_replay, probe_set=probe_set)
+    except Exception as exc:
+        err = _gate_error(
+            "gate 6 internal exception",
+            status="failed",
+            reason="gate6_internal_exception",
+            exception=type(exc).__name__,
+            error_message=str(exc),
+        )
+        return Gate6Evaluation(status="failed", details=dict(err.details), error=err)
+
+
+def check(
+    legacy_source: str,
+    rebuilt_source: str,
+    semantic_node: SemanticNode | None = None,
+    *,
+    class_name: str | None = None,
+    method_name: str | None = None,
+    parameter_types: Sequence[str] | None = None,
+    gate5_details: Mapping[str, Any] | None = None,
+    num_random: int | None = None,
+    timeout_s: float = 60.0,
+) -> GateError | None:
+    """Run Gate 6 and return its ``GateError``, or None when the gate passed.
+
+    Without ``semantic_node`` a minimal method node is built from ``class_name``
+    and ``method_name``; ``ValueError`` is raised if either of them is missing.
+    """
+    node = semantic_node
+    if node is None:
+        if class_name is None or method_name is None:
+            raise ValueError("semantic_node or class_name/method_name is required")
+        node = SemanticNode(
+            fqn=f"{class_name}.{method_name}",
+            kind="method",
+            signature="",
+            resolved_param_types=tuple(parameter_types or ("java.lang.String",)),
+            resolved_return_type=None,
+            dependency_edges=(),
+            source_location=SourceLocation(file_path=f"{class_name}.java", line=1),
+        )
+    return evaluate(
+        legacy_source,
+        rebuilt_source,
+        node,
+        gate5_details=gate5_details,
+        num_random=num_random,
+        timeout_s=timeout_s,
+    ).error
+
+
+def _classify_one(
+    result: ProbeResult,
+    legacy_replay: LegacyReplay | None,
+) -> Classification:
+    """Return the first matching bucket: exception, value, output, then timing."""
+    if _exceptions_differ(result):
+        return "exception_diverge"
+    if not _values_equal(result.return_value_legacy, result.return_value_rebuilt):
+        if _within_fp_tolerance(result.return_value_legacy, result.return_value_rebuilt):
+            return "fp_tolerance_diverge"
+        if _legacy_return_is_stochastic(result, legacy_replay):
+            return "stdout_diverge_stochastic"
+        return "value_diverge"
+    if (
+        result.stdout_legacy_sha256 != result.stdout_rebuilt_sha256
+        or result.stderr_legacy_sha256 != result.stderr_rebuilt_sha256
+    ):
+        if _legacy_stdout_is_stochastic(result, legacy_replay):
+            return "stdout_diverge_stochastic"
+        return "stdout_diverge_deterministic"
+    if _wall_clock_distance(result) > 1:
+        return "wall_clock_diverge"
+    return "equivalent"
+
+
+def _exceptions_differ(result: ProbeResult) -> bool:
+    """Return True when the two sides differ in outcome or in exception."""
+    return (
+        result.legacy_outcome != result.rebuilt_outcome
+        or result.exception_legacy != result.exception_rebuilt
+    )
+
+
+def _values_equal(legacy: Any, rebuilt: Any) -> bool:
+    """Return True when both sides produced the same return value.
+
+    Two NaN floats count as equal: ``nan == nan`` is False in Python, yet a
+    method that returns NaN on both sides has not changed behavior.
+    """
+    if isinstance(legacy, float) and isinstance(rebuilt, float):
+        if math.isnan(legacy) and math.isnan(rebuilt):
+            return True
+    return legacy == rebuilt
+
+
+def _within_fp_tolerance(legacy: Any, rebuilt: Any) -> bool:
+    """Return True when two floats differ by at most one relative epsilon."""
+    if not isinstance(legacy, float) or not isinstance(rebuilt, float):
+        return False
+    # The floor of 1.0 keeps the tolerance from collapsing for values near zero.
+    scale = max(abs(legacy), abs(rebuilt), 1.0)
+    return abs(legacy - rebuilt) <= _DOUBLE_EPS * scale
+
+
+def _legacy_stdout_is_stochastic(
+    result: ProbeResult,
+    legacy_replay: LegacyReplay | None,
+) -> bool:
+    """Return True when every diverging output stream varies under legacy replay.
+
+    The caller reaches this check on a stdout *or* stderr mismatch, so each
+    stream is judged on its own: a stream that differs from the rebuild yet
+    stays identical across replays is a deterministic divergence.
+    """
+    if legacy_replay is None:
+        return False
+    replayed = list(legacy_replay(result))
+    if not replayed:
+        return False
+    stdout_hashes = {result.stdout_legacy_sha256, *(r.stdout_legacy_sha256 for r in replayed)}
+    stderr_hashes = {result.stderr_legacy_sha256, *(r.stderr_legacy_sha256 for r in replayed)}
+    stdout_differs = result.stdout_legacy_sha256 != result.stdout_rebuilt_sha256
+    stderr_differs = result.stderr_legacy_sha256 != result.stderr_rebuilt_sha256
+    if stdout_differs and len(stdout_hashes) == 1:
+        return False
+    if stderr_differs and len(stderr_hashes) == 1:
+        return False
+    return stdout_differs or stderr_differs
+
+
+def _legacy_return_is_stochastic(
+    result: ProbeResult,
+    legacy_replay: LegacyReplay | None,
+) -> bool:
+    """Return True when replaying legacy yields more than one distinct return value."""
+    if legacy_replay is None:
+        return False
+    replayed = list(legacy_replay(result))
+    if not replayed:
+        return False
+    # repr() gives a hashable key even if the return value itself is not hashable.
+    values = {repr(result.return_value_legacy)}
+    values.update(repr(r.return_value_legacy) for r in replayed)
+    return len(values) > 1
+
+
+def _wall_clock_distance(result: ProbeResult) -> int:
+    """Return how many buckets apart the two timings are; 0 for unknown labels."""
+    legacy = _WALL_BUCKETS.get(result.wall_clock_bucket_legacy)
+    rebuilt = _WALL_BUCKETS.get(result.wall_clock_bucket_rebuilt)
+    if legacy is None or rebuilt is None:
+        return 0
+    return abs(legacy - rebuilt)
+
+
+def _classification_detail(classified: ClassifiedProbe) -> dict[str, Any]:
+    """Return the per-probe entry stored under ``details["classifications"]``."""
+    return {
+        "input": classified.result.input,
+        "classification": classified.classification,
+    }
+
+
+def _class_name_from_fqn(fqn: str) -> str:
+    """Return everything before the last dot of ``fqn`` (empty when there is no dot)."""
+    owner, _, _method = fqn.rpartition(".")
+    return owner
+
+
+def _random_probe_count() -> int:
+    """Return ``OMNIX_GATE6_RANDOM_PROBES`` as an int >= 0; 100 if unset or invalid."""
+    raw = os.environ.get("OMNIX_GATE6_RANDOM_PROBES", "100")
+    try:
+        return max(0, int(raw))
+    except ValueError:
+        return 100
+
+
+def _gate_error(message: str, **details: Any) -> GateError:
+    """Build a Gate 6 ``GateError``, defaulting the details status to ``failed``."""
+    status = details.get("status", "failed")
+    details["status"] = cast(Gate6Status, status)
+    return GateError(
+        gate_number=_GATE_NUMBER,
+        gate_name=_GATE_NAME,
+        message=message,
+        details=details,
+    )
diff --git a/src/omnix/rebuild/runner.py b/src/omnix/rebuild/runner.py
index eeb613b..86bf668 100644
--- a/src/omnix/rebuild/runner.py
+++ b/src/omnix/rebuild/runner.py
@@ -44,7 +44,6 @@
     GateResult,
     GateStatus,
     RebuildReceipt,
-    default_skipped_gate_results,
     sha256_hex_text,
     sign_rebuild,
 )
@@ -90,15 +89,22 @@ def _run_gates_1_to_5(
     rebuilt_source: str,
     skip_gate_5: bool = False,
 ) -> tuple[GateResult, ...]:
-    """Mechanically run gates 1-5 against a rebuilt source + spec.
+    """Mechanically run gates 1-6 against a rebuilt source + spec.
 
     Each gate returns either `None` (passed) or a `GateError`-shaped dict
     with `details`. We translate that into a `GateResult` with a `passed` or
     `failed` status.
     """
-    from omnix.gates import gate1_syntactic, gate2_typecheck, gate3_signature, gate5_property
+    from omnix.gates import (
+        gate1_syntactic,
+        gate2_typecheck,
+        gate3_signature,
+        gate5_property,
+        gate6_equivalence,
+    )
 
     results: list[GateResult] = []
+    gate5_details: dict[str, Any] = {}
 
     err = gate1_syntactic.check(rebuilt_source)
     results.append(
@@ -149,6 +155,7 @@ def _run_gates_1_to_5(
     )
 
     if skip_gate_5:
+        gate5_details = {"reason": "skipped_by_user", "status": "skipped"}
         results.append(
             GateResult(
                 gate_number=5,
@@ -159,6 +166,7 @@ def _run_gates_1_to_5(
         )
     elif hasattr(gate5_property, "evaluate"):
         evaluation = gate5_property.evaluate(legacy_source, rebuilt_source, node)
+        gate5_details = dict(evaluation.details)
         results.append(
             GateResult(
                 gate_number=5,
@@ -169,7 +177,39 @@ def _run_gates_1_to_5(
         )
     else:  # pragma: no cover — compatibility for older stacked checkouts
         err = gate5_property.check(legacy_source, rebuilt_source, node)
-        results.append(_gate_error_to_result(5, err))
+        gate5_result = _gate_error_to_result(5, err)
+        gate5_details = dict(gate5_result.details)
+        results.append(gate5_result)
+
+    try:
+        gate6_evaluation = gate6_equivalence.evaluate(
+            legacy_source,
+            rebuilt_source,
+            node,
+            gate5_details=gate5_details,
+        )
+        results.append(
+            GateResult(
+                gate_number=6,
+                gate_name=GATE_NAMES[6],
+                status=gate6_evaluation.status,
+                details=dict(gate6_evaluation.details),
+            )
+        )
+    except Exception as exc:  # pragma: no cover — evaluate handles this path
+        results.append(
+            GateResult(
+                gate_number=6,
+                gate_name=GATE_NAMES[6],
+                status="failed",
+                details={
+                    "status": "failed",
+                    "reason": "gate6_internal_exception",
+                    "exception": type(exc).__name__,
+                    "error_message": str(exc),
+                },
+            )
+        )
 
     return tuple(results)
 
@@ -202,10 +242,8 @@ def _build_receipt(
     spec: Any,
     prompt_text_hash: str,
     model: str,
-    gate_results_1_to_5: tuple[GateResult, ...],
+    gate_results_1_to_6: tuple[GateResult, ...],
 ) -> RebuildReceipt:
-    gate6_skipped = tuple(g for g in default_skipped_gate_results() if g.gate_number == 6)
-    full_gates = gate_results_1_to_5 + gate6_skipped
     return RebuildReceipt(
         project_id=project_id,
         node_fqn=node.fqn,
@@ -216,7 +254,7 @@ def _build_receipt(
         prompt_template_version=PROMPT_TEMPLATE_VERSION,
         prompt_text_hash=prompt_text_hash,
         model=model,
-        gate_results=full_gates,
+        gate_results=gate_results_1_to_6,
         timestamp=now_iso8601_utc(),
         omnix_version=_omnix_version(),
     )
@@ -337,7 +375,7 @@ def _run_with_graph(
         prompt_text, prompt_hash = format_prompt(spec, legacy_source)
         rebuilt_source = _invoke(effective_dispatch, prompt_text, model)
 
-        gates_1_to_5 = _run_gates_1_to_5(
+        gates_1_to_6 = _run_gates_1_to_5(
             node=node,
             spec=spec,
             legacy_source=legacy_source,
@@ -353,7 +391,7 @@ def _run_with_graph(
             spec=spec,
             prompt_text_hash=prompt_hash,
             model=model,
-            gate_results_1_to_5=gates_1_to_5,
+            gate_results_1_to_6=gates_1_to_6,
         )
         signature_b64 = sign_rebuild(receipt)
         outputs.append(
diff --git a/tests/gates/test_gate6_classifier.py b/tests/gates/test_gate6_classifier.py
new file mode 100644
--- /dev/null
+++ b/tests/gates/test_gate6_classifier.py
@@ -0,0 +1,186 @@
+"""Tests for Gate 6 result classification."""
+
+from __future__ import annotations
+
+from omnix.gates.gate6_equivalence import (
+    ClassifiedProbe,
+    ProbeResult,
+    classify_results,
+    evaluate_results,
+)
+
+EMPTY_SHA = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
+ALT_SHA = "0" * 64
+OTHER_SHA = "1" * 64
+
+
+def _result(
+    *,
+    input_value: list[object] | None = None,
+    legacy_outcome: str = "returned",
+    rebuilt_outcome: str = "returned",
+    wall_clock_bucket_legacy: str = "<10ms",
+    wall_clock_bucket_rebuilt: str = "<10ms",
+    stdout_legacy_sha256: str = EMPTY_SHA,
+    stdout_rebuilt_sha256: str = EMPTY_SHA,
+    stderr_legacy_sha256: str = EMPTY_SHA,
+    stderr_rebuilt_sha256: str = EMPTY_SHA,
+    return_value_legacy: object = "same",
+    return_value_rebuilt: object = "same",
+    exception_legacy: str | None = None,
+    exception_rebuilt: str | None = None,
+) -> ProbeResult:
+    return ProbeResult(
+        input=[] if input_value is None else input_value,
+        legacy_outcome=legacy_outcome,
+        rebuilt_outcome=rebuilt_outcome,
+        wall_clock_bucket_legacy=wall_clock_bucket_legacy,
+        wall_clock_bucket_rebuilt=wall_clock_bucket_rebuilt,
+        stdout_legacy_sha256=stdout_legacy_sha256,
+        stdout_rebuilt_sha256=stdout_rebuilt_sha256,
+        stderr_legacy_sha256=stderr_legacy_sha256,
+        stderr_rebuilt_sha256=stderr_rebuilt_sha256,
+        return_value_legacy=return_value_legacy,
+        return_value_rebuilt=return_value_rebuilt,
+        exception_legacy=exception_legacy,
+        exception_rebuilt=exception_rebuilt,
+    )
+
+
+def test_classify_results_assigns_exactly_one_bucket_per_probe() -> None:
+    probes = [
+        _result(),
+        _result(return_value_rebuilt="different"),
+        _result(exception_legacy="java.lang.IllegalArgumentException"),
+        _result(stdout_legacy_sha256=ALT_SHA, stdout_rebuilt_sha256=EMPTY_SHA),
+        _result(
+            stdout_legacy_sha256=ALT_SHA,
+            stdout_rebuilt_sha256=EMPTY_SHA,
+            input_value=["random"],
+        ),
+        _result(wall_clock_bucket_legacy="<1ms", wall_clock_bucket_rebuilt="<1s"),
+        _result(return_value_legacy=1e8, return_value_rebuilt=1e8 + 1e-8),
+    ]
+
+    classified = classify_results(
+        probes,
+        legacy_replay=lambda result: [
+            _result(input_value=result.input, stdout_legacy_sha256=ALT_SHA),
+            _result(input_value=result.input, stdout_legacy_sha256=OTHER_SHA),
+            _result(input_value=result.input, stdout_legacy_sha256=ALT_SHA),
+        ]
+        if result.input == ["random"]
+        else [],
+    )
+
+    assert all(isinstance(item, ClassifiedProbe) for item in classified)
+    assert [item.classification for item in classified] == [
+        "equivalent",
+        "value_diverge",
+        "exception_diverge",
+        "stdout_diverge_deterministic",
+        "stdout_diverge_stochastic",
+        "wall_clock_diverge",
+        "fp_tolerance_diverge",
+    ]
+
+
+def test_real_bug_classifications_fail_gate6_overall() -> None:
+    evaluation = evaluate_results(
+        [
+            _result(input_value=["x"], return_value_legacy="x", return_value_rebuilt="y"),
+            _result(input_value=["ok"]),
+        ]
+    )
+
+    assert evaluation.status == "failed"
+    assert evaluation.error is not None
+    assert evaluation.details["divergence_count"] == 1
+    assert evaluation.details["diverging_input"] == ["x"]
+    assert evaluation.details["classification"] == "value_diverge"
+
+
+def test_stochastic_return_value_is_accepted_with_note_after_replay() -> None:
+    probe = _result(
+        input_value=["random"],
+        return_value_legacy="Hi",
+        return_value_rebuilt="Hey",
+    )
+
+    classified = classify_results(
+        [probe],
+        legacy_replay=lambda result: [
+            _result(input_value=result.input, return_value_legacy="Hi"),
+            _result(input_value=result.input, return_value_legacy="Hey"),
+            _result(input_value=result.input, return_value_legacy="Hi"),
+        ],
+    )
+
+    assert classified[0].classification == "stdout_diverge_stochastic"
+
+
+def test_small_probe_set_is_inconclusive_without_real_divergence() -> None:
+    evaluation = evaluate_results([_result()])
+
+    assert evaluation.status == "inconclusive"
+    assert evaluation.error is not None
+    assert evaluation.details["probe_count"] == 1
+    assert evaluation.details["reason"] == "insufficient_probe_count"
+
+
+def test_fp_tolerance_counts_as_accepted_note_not_failure() -> None:
+    probes = [
+        _result(return_value_legacy=1e8, return_value_rebuilt=1e8 + 1e-8)
+        for _ in range(20)
+    ]
+
+    evaluation = evaluate_results(probes)
+
+    assert evaluation.status == "passed"
+    assert evaluation.error is None
+    assert evaluation.details["divergence_count"] == 0
+    assert evaluation.details["accepted_with_note_count"] == 20
+
+
+def test_passed_gate6_details_include_probe_counts() -> None:
+    evaluation = evaluate_results([_result(input_value=[i]) for i in range(20)])
+
+    assert evaluation.status == "passed"
+    assert evaluation.details["probe_count"] == 20
+    assert evaluation.details["divergence_count"] == 0
+    assert evaluation.details["accepted_with_note_count"] == 0
+
+
+def test_stderr_divergence_is_judged_on_stderr_replay() -> None:
+    probe = _result(
+        input_value=["random"],
+        stderr_legacy_sha256=ALT_SHA,
+        stderr_rebuilt_sha256=EMPTY_SHA,
+    )
+
+    stable = classify_results(
+        [probe],
+        legacy_replay=lambda result: [
+            _result(input_value=result.input, stderr_legacy_sha256=ALT_SHA),
+        ],
+    )
+    noisy = classify_results(
+        [probe],
+        legacy_replay=lambda result: [
+            _result(input_value=result.input, stderr_legacy_sha256=OTHER_SHA),
+        ],
+    )
+
+    assert stable[0].classification == "stdout_diverge_deterministic"
+    assert noisy[0].classification == "stdout_diverge_stochastic"
+
+
+def test_nan_returned_by_both_sides_is_equivalent() -> None:
+    probe = _result(
+        return_value_legacy=float("nan"),
+        return_value_rebuilt=float("nan"),
+    )
+
+    classified = classify_results([probe])
+
+    assert classified[0].classification == "equivalent"
diff --git a/tests/rebuild/test_runner.py b/tests/rebuild/test_runner.py
index ad50e5d..0ff7f0f 100644
--- a/tests/rebuild/test_runner.py
+++ b/tests/rebuild/test_runner.py
@@ -80,6 +80,31 @@ def project(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> tuple[Path, _Stu
     return project_root, graph, pub_path
 
 
+@pytest.fixture(autouse=True)
+def fast_gate6(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Keep rebuild-runner tests focused on receipt wiring, not JVM probe cost."""
+    from omnix.gates import gate6_equivalence
+
+    def _passing_gate6(
+        legacy_source: str,
+        rebuilt_source: str,
+        semantic_node: SemanticNode,
+        *,
+        gate5_details: dict[str, object] | None = None,
+    ):
+        return gate6_equivalence.Gate6Evaluation(
+            status="passed",
+            details={
+                "status": "passed",
+                "probe_count": 20,
+                "divergence_count": 0,
+                "accepted_with_note_count": 0,
+            },
+        )
+
+    monkeypatch.setattr(gate6_equivalence, "evaluate", _passing_gate6)
+
+
 def _good_rebuild_source() -> str:
     """A rebuild that should pass gates 1-3."""
     return (
@@ -112,7 +137,7 @@ def test_runner_emits_one_receipt_per_node(project) -> None:
         assert o.receipt_path.parent == o.signature_path.parent == o.rebuilt_source_path.parent
 
 
-def test_receipt_contains_all_six_gates_with_real_gate5_and_gate6_skipped(
+def test_receipt_contains_all_six_gates_with_real_gate5_and_gate6(
     project,
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
@@ -144,9 +169,11 @@ def _passing_gate5(legacy_source: str, rebuilt_source: str, semantic_node: Seman
 
     receipt = RebuildReceipt.from_dict(receipt_dict)
     gate_status_by_number = {g.gate_number: g.status for g in receipt.gate_results}
+    gate_details_by_number = {g.gate_number: g.details for g in receipt.gate_results}
     assert sorted(gate_status_by_number) == [1, 2, 3, 4, 5, 6]
     assert gate_status_by_number[5] == "passed"
-    assert gate_status_by_number[6] == "skipped"
+    assert gate_status_by_number[6] == "passed"
+    assert gate_details_by_number[6]["probe_count"] == 20
     # Gate 4 not yet wired mechanically — emitted as 'skipped'.
     assert gate_status_by_number[4] == "skipped"
 
@@ -200,7 +227,76 @@ def _inconclusive_gate5(
     assert gate5.status == "inconclusive"
     assert gate5.details["examples_used"] == 50
     assert gates_summary(receipt.gate_results) == (
-        "3-passed/0-failed/2-skipped/1-inconclusive/0-deferred_m3"
+        "4-passed/0-failed/1-skipped/1-inconclusive/0-deferred_m3"
+    )
+
+
+def test_receipt_records_gate6_failure_details(
+    project,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    from omnix.gates import gate5_property, gate6_equivalence
+    from omnix.gates.result import GateError
+
+    project_root, graph, _ = project
+
+    def _passing_gate5(
+        legacy_source: str,
+        rebuilt_source: str,
+        semantic_node: SemanticNode,
+    ) -> gate5_property.Gate5Evaluation:
+        return gate5_property.Gate5Evaluation(
+            status="passed",
+            details={"status": "passed", "examples_used": 200},
+        )
+
+    def _failing_gate6(
+        legacy_source: str,
+        rebuilt_source: str,
+        semantic_node: SemanticNode,
+        *,
+        gate5_details: dict[str, object] | None = None,
+    ) -> gate6_equivalence.Gate6Evaluation:
+        err = GateError(
+            gate_number=6,
+            gate_name="behavioral_equivalence",
+            message="behavioral equivalence divergence",
+            details={
+                "status": "failed",
+                "classification": "value_diverge",
+                "diverging_input": ["x"],
+                "probe_count": 20,
+                "divergence_count": 1,
+                "accepted_with_note_count": 0,
+            },
+        )
+        return gate6_equivalence.Gate6Evaluation(
+            status="failed",
+            details=dict(err.details),
+            error=err,
+        )
+
+    monkeypatch.setattr(gate5_property, "evaluate", _passing_gate5)
+    monkeypatch.setattr(gate6_equivalence, "evaluate", _failing_gate6)
+
+    outputs = _run_with_graph(
+        graph=graph,
+        project_path=project_root,
+        target_language="java21",
+        node_filter=None,
+        dispatch_fn=lambda prompt, model="claude-opus-4.7": _good_rebuild_source(),
+        model="claude-opus-4.7",
+        output_root=None,
+    )
+    receipt = RebuildReceipt.from_dict(
+        json.loads(outputs[0].receipt_path.read_text(encoding="utf-8"))
+    )
+    gate6 = next(g for g in receipt.gate_results if g.gate_number == 6)
+    assert gate6.status == "failed"
+    assert gate6.details["classification"] == "value_diverge"
+    assert gate6.details["diverging_input"] == ["x"]
+    assert gates_summary(receipt.gate_results) == (
+        "4-passed/1-failed/1-skipped/0-inconclusive/0-deferred_m3"
     )
 
 