From 6e7b007ad42f6c9f8e2471d204579c1174e85d3f Mon Sep 17 00:00:00 2001 From: Abir Abbas Date: Mon, 23 Feb 2026 14:19:30 -0500 Subject: [PATCH 1/2] fix: prevent non-convergent stuck loops on trivial tasks (closes #12) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default-path coding loop (reviewer-only) had no stuck-loop detection, unlike the flagged path which gets it from the synthesizer. When a reviewer repeatedly returned approved=False / blocking=False (e.g. requesting minor polish), the loop burned all 5 iterations + 2 advisor rounds without converging, eventually causing an exit -2 abort. Changes: - Add _detect_stuck_loop() that detects 3+ consecutive non-blocking fix cycles on the default path - When stuck or loop-exhausted with non-blocking reviewer feedback AND code changes present, return COMPLETED_WITH_DEBT instead of FAILED_UNRECOVERABLE — the issue flows through the debt gate rather than triggering replanning/abort - Preserve FAILED_UNRECOVERABLE for genuinely blocking failures and cases where no code was produced Co-Authored-By: Claude Opus 4.6 --- swe_af/execution/coding_loop.py | 108 ++++++++++++++++++++++++++------ 1 file changed, 90 insertions(+), 18 deletions(-) diff --git a/swe_af/execution/coding_loop.py b/swe_af/execution/coding_loop.py index c6e02aa..009b9c8 100644 --- a/swe_af/execution/coding_loop.py +++ b/swe_af/execution/coding_loop.py @@ -251,6 +251,27 @@ async def _write_memory_on_failure( await _memory_set(memory_fn, "build_health", health) +# --------------------------------------------------------------------------- +# Stuck-loop detection +# --------------------------------------------------------------------------- + + +def _detect_stuck_loop(iteration_history: list[dict], window: int = 3) -> bool: + """Return True if the last ``window`` iterations are all non-blocking "fix" cycles. + + This catches the default-path failure mode where the reviewer repeatedly + returns approved=False / blocking=False with similar feedback, causing the + coder to re-attempt the same work without converging. + """ + if len(iteration_history) < window: + return False + recent = iteration_history[-window:] + return all( + entry.get("action") == "fix" and not entry.get("review_blocking", False) + for entry in recent + ) + + # --------------------------------------------------------------------------- # Path routing helpers # --------------------------------------------------------------------------- @@ -638,6 +659,7 @@ async def run_coding_loop( qa_result = None synthesis_result = None _save_artifact(dag_state.artifacts_dir, iteration_id, "review", review_result) + stuck = False # Record iteration for history @@ -733,27 +755,77 @@ async def run_coding_loop( else: feedback = summary - # Stuck detection + # Stuck detection — default path uses history-based detection since it + # has no synthesizer to set the stuck flag. + if not stuck and not needs_deeper_qa: + stuck = _detect_stuck_loop(iteration_history) + if stuck: - if note_fn: - note_fn( - f"Coding loop STUCK: {issue_name} — breaking after {iteration} iterations", - tags=["coding_loop", "stuck", issue_name], + last_blocking = review_result.get("blocking", False) if review_result else False + if not last_blocking and files_changed: + # Non-blocking stuck loop with code changes → accept with debt + if note_fn: + note_fn( + f"Coding loop STUCK (non-blocking): {issue_name} — " + f"accepting with debt after {iteration} iterations", + tags=["coding_loop", "stuck", "accept_debt", issue_name], + ) + return IssueResult( + issue_name=issue_name, + outcome=IssueOutcome.COMPLETED_WITH_DEBT, + result_summary=f"Accepted with debt (stuck loop, non-blocking): {summary}", + files_changed=files_changed, + branch_name=branch_name, + attempts=iteration, + iteration_history=iteration_history, ) - await _write_memory_on_failure( - memory_fn, issue, summary, review_result, note_fn, - ) - return IssueResult( - issue_name=issue_name, - outcome=IssueOutcome.FAILED_UNRECOVERABLE, - error_message=f"Stuck loop detected: {summary}", - files_changed=files_changed, - branch_name=branch_name, - attempts=iteration, - iteration_history=iteration_history, + else: + if note_fn: + note_fn( + f"Coding loop STUCK: {issue_name} — breaking after {iteration} iterations", + tags=["coding_loop", "stuck", issue_name], + ) + await _write_memory_on_failure( + memory_fn, issue, summary, review_result, note_fn, + ) + return IssueResult( + issue_name=issue_name, + outcome=IssueOutcome.FAILED_UNRECOVERABLE, + error_message=f"Stuck loop detected: {summary}", + files_changed=files_changed, + branch_name=branch_name, + attempts=iteration, + iteration_history=iteration_history, + ) + + # Loop exhausted without approval — check if we can accept with debt + last_review = review_result if 'review_result' in dir() else None + last_blocking = (last_review.get("blocking", False) if last_review else False) + + if not last_blocking and files_changed: + # Reviewer was never blocking and coder produced changes — accept with debt + # rather than failing entirely. This prevents trivial tasks from stalling + # the whole DAG when the reviewer keeps requesting minor polish. + if note_fn: + note_fn( + f"Coding loop exhausted (non-blocking): {issue_name} — " + f"accepting with debt after {max_iterations} iterations", + tags=["coding_loop", "exhausted", "accept_debt", issue_name], ) + return IssueResult( + issue_name=issue_name, + outcome=IssueOutcome.COMPLETED_WITH_DEBT, + result_summary=( + f"Accepted with debt after {max_iterations} iterations " + f"(reviewer non-blocking, code changes present)" + ), + files_changed=files_changed, + branch_name=branch_name, + attempts=max_iterations, + iteration_history=iteration_history, + ) - # Loop exhausted without approval + # Truly unrecoverable — reviewer was blocking or no code was produced if note_fn: note_fn( f"Coding loop exhausted: {issue_name} after {max_iterations} iterations", @@ -761,7 +833,7 @@ async def run_coding_loop( ) await _write_memory_on_failure( - memory_fn, issue, "Loop exhausted", review_result if 'review_result' in dir() else None, note_fn, + memory_fn, issue, "Loop exhausted", last_review, note_fn, ) return IssueResult( From 3494a02b234e3c39bad286af577406fe19bd6675 Mon Sep 17 00:00:00 2001 From: Abir Abbas Date: Mon, 23 Feb 2026 14:30:41 -0500 Subject: [PATCH 2/2] Add integration tests for coding loop stuck-detection and graceful degradation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 25 tests exercising run_coding_loop end-to-end with scripted call_fn (only AI agent calls are mocked). Covers: - Happy path (approved first/second iteration) - Non-blocking stuck loop → COMPLETED_WITH_DEBT at window boundary - Blocking review → immediate FAILED_UNRECOVERABLE - Stagnation (no files changed) → FAILED_UNRECOVERABLE - Loop exhaustion with non-blocking → COMPLETED_WITH_DEBT - Flagged path (QA + reviewer + synthesizer) scenarios - Coder exception handling - Artifact/checkpoint persistence to disk - File accumulation across iterations - note_fn tag observability Co-Authored-By: Claude Opus 4.6 --- tests/test_coding_loop.py | 760 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 760 insertions(+) create mode 100644 tests/test_coding_loop.py diff --git a/tests/test_coding_loop.py b/tests/test_coding_loop.py new file mode 100644 index 0000000..7826d3f --- /dev/null +++ b/tests/test_coding_loop.py @@ -0,0 +1,760 @@ +"""Integration tests for the coding loop stuck-detection and graceful degradation. + +These tests exercise ``run_coding_loop`` end-to-end with **real** DAGState, +ExecutionConfig, artifact I/O, and iteration bookkeeping. The **only** mock is +``call_fn`` — the async callable that dispatches to AI agents (coder, reviewer, +QA, synthesizer). Each test wires up a ``call_fn`` that returns scripted +responses simulating realistic agent behaviour, then asserts on the IssueResult +outcome, iteration count, files_changed, and iteration_history. +""" + +from __future__ import annotations + +import asyncio +import os +import shutil +import tempfile +import unittest + +from swe_af.execution.coding_loop import _detect_stuck_loop, run_coding_loop +from swe_af.execution.schemas import DAGState, ExecutionConfig, IssueOutcome + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_dag_state(artifacts_dir: str, repo_path: str = "/tmp/fake-repo") -> DAGState: + """Create a minimal DAGState for testing.""" + return DAGState( + repo_path=repo_path, + artifacts_dir=artifacts_dir, + prd_path="", + architecture_path="", + issues_dir="", + ) + + +def _make_config(**overrides) -> ExecutionConfig: + """Create an ExecutionConfig with sensible test defaults.""" + defaults = { + "max_coding_iterations": 5, + "agent_timeout_seconds": 30, + } + defaults.update(overrides) + return ExecutionConfig(**defaults) + + +def _make_issue(name: str = "ISSUE-1", needs_deeper_qa: bool = False, **extra) -> dict: + """Create a minimal issue dict.""" + issue = { + "name": name, + "title": "Test issue", + "description": "A test issue for the coding loop", + "acceptance_criteria": ["AC-1: it works"], + "depends_on": [], + "provides": [], + "files_to_create": [], + "files_to_modify": [], + "worktree_path": "/tmp/fake-repo", + "branch_name": "test/issue-1", + } + if needs_deeper_qa: + issue["guidance"] = {"needs_deeper_qa": True} + issue.update(extra) + return issue + + +class _CallFnBuilder: + """Build a scripted ``call_fn`` that returns predetermined responses. + + Usage:: + + builder = _CallFnBuilder() + builder.on_coder(iteration=1, files_changed=["a.py"]) + builder.on_reviewer(iteration=1, approved=False, blocking=False) + ... + call_fn = builder.build() + """ + + def __init__(self): + self._coder_responses: dict[int, dict] = {} + self._reviewer_responses: dict[int, dict] = {} + self._qa_responses: dict[int, dict] = {} + self._synth_responses: dict[int, dict] = {} + self._call_count = 0 + # Track iteration by counting coder calls + self._coder_calls = 0 + + # -- coder -- + def on_coder( + self, + iteration: int, + files_changed: list[str] | None = None, + summary: str = "Implemented changes", + ) -> "_CallFnBuilder": + self._coder_responses[iteration] = { + "files_changed": files_changed or [], + "summary": summary, + "complete": True, + } + return self + + # -- reviewer -- + def on_reviewer( + self, + iteration: int, + approved: bool = False, + blocking: bool = False, + summary: str = "Minor style issues", + debt_items: list[dict] | None = None, + ) -> "_CallFnBuilder": + self._reviewer_responses[iteration] = { + "approved": approved, + "blocking": blocking, + "summary": summary, + "debt_items": debt_items or [], + } + return self + + # -- QA -- + def on_qa( + self, + iteration: int, + passed: bool = True, + summary: str = "Tests pass", + ) -> "_CallFnBuilder": + self._qa_responses[iteration] = { + "passed": passed, + "summary": summary, + "test_failures": [], + } + return self + + # -- synthesizer -- + def on_synth( + self, + iteration: int, + action: str = "fix", + summary: str = "Continue fixing", + stuck: bool = False, + ) -> "_CallFnBuilder": + self._synth_responses[iteration] = { + "action": action, + "summary": summary, + "stuck": stuck, + } + return self + + def build(self) -> callable: + """Return an async call_fn that dispatches based on agent name.""" + builder = self + + def call_fn(agent_name: str, **kwargs): + """Return a coroutine that resolves to the scripted response.""" + + async def _invoke(): + # Determine which iteration we're on based on coder call count + if "run_coder" in agent_name: + builder._coder_calls += 1 + iteration = builder._coder_calls + default = {"files_changed": [], "summary": "No changes", "complete": True} + return builder._coder_responses.get(iteration, default) + + iteration = builder._coder_calls # same iteration as last coder call + + if "run_code_reviewer" in agent_name: + default = {"approved": False, "blocking": False, "summary": "Needs work"} + return builder._reviewer_responses.get(iteration, default) + + # Check synthesizer BEFORE qa — "run_qa" is a substring of + # "run_qa_synthesizer" so order matters. + if "run_qa_synthesizer" in agent_name: + default = {"action": "fix", "summary": "Continue fixing", "stuck": False} + return builder._synth_responses.get(iteration, default) + + if "run_qa" in agent_name: + default = {"passed": True, "summary": "Tests pass", "test_failures": []} + return builder._qa_responses.get(iteration, default) + + return {} + + return _invoke() + + return call_fn + + +def _run(coro): + """Run an async coroutine synchronously for tests.""" + return asyncio.new_event_loop().run_until_complete(coro) + + +# --------------------------------------------------------------------------- +# Unit tests: _detect_stuck_loop +# --------------------------------------------------------------------------- + + +class TestDetectStuckLoop(unittest.TestCase): + """Unit tests for the stuck-loop detection helper.""" + + def test_too_few_iterations(self): + history = [ + {"action": "fix", "review_blocking": False}, + {"action": "fix", "review_blocking": False}, + ] + self.assertFalse(_detect_stuck_loop(history, window=3)) + + def test_all_non_blocking_fix_triggers(self): + history = [ + {"action": "fix", "review_blocking": False}, + {"action": "fix", "review_blocking": False}, + {"action": "fix", "review_blocking": False}, + ] + self.assertTrue(_detect_stuck_loop(history, window=3)) + + def test_blocking_iteration_prevents_trigger(self): + history = [ + {"action": "fix", "review_blocking": False}, + {"action": "fix", "review_blocking": True}, + {"action": "fix", "review_blocking": False}, + ] + self.assertFalse(_detect_stuck_loop(history, window=3)) + + def test_approve_in_window_prevents_trigger(self): + history = [ + {"action": "approve", "review_blocking": False}, + {"action": "fix", "review_blocking": False}, + {"action": "fix", "review_blocking": False}, + ] + self.assertFalse(_detect_stuck_loop(history, window=3)) + + def test_only_looks_at_recent_window(self): + history = [ + {"action": "approve", "review_blocking": False}, # outside window + {"action": "fix", "review_blocking": False}, + {"action": "fix", "review_blocking": False}, + {"action": "fix", "review_blocking": False}, + ] + self.assertTrue(_detect_stuck_loop(history, window=3)) + + def test_custom_window_size(self): + history = [ + {"action": "fix", "review_blocking": False}, + {"action": "fix", "review_blocking": False}, + ] + self.assertTrue(_detect_stuck_loop(history, window=2)) + + def test_empty_history(self): + self.assertFalse(_detect_stuck_loop([], window=3)) + + +# --------------------------------------------------------------------------- +# Integration tests: run_coding_loop +# --------------------------------------------------------------------------- + + +class TestCodingLoopIntegration(unittest.TestCase): + """End-to-end tests for run_coding_loop with scripted call_fn.""" + + def setUp(self): + self.tmpdir = tempfile.mkdtemp(prefix="swe-af-test-") + self.artifacts_dir = os.path.join(self.tmpdir, "artifacts") + os.makedirs(self.artifacts_dir, exist_ok=True) + self.notes: list[str] = [] + self.note_tags: list[list[str]] = [] + + def tearDown(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def _note_fn(self, msg: str, tags: list[str] | None = None): + self.notes.append(msg) + self.note_tags.append(tags or []) + + # -- Scenario 1: Happy path — approved on first iteration -- + + def test_approved_first_iteration(self): + """Coder produces changes, reviewer approves → COMPLETED.""" + builder = _CallFnBuilder() + builder.on_coder(1, files_changed=["src/app.py"]) + builder.on_reviewer(1, approved=True, blocking=False, summary="LGTM") + + result = _run(run_coding_loop( + issue=_make_issue(), + dag_state=_make_dag_state(self.artifacts_dir), + call_fn=builder.build(), + node_id="test-node", + config=_make_config(), + note_fn=self._note_fn, + )) + + self.assertEqual(result.outcome, IssueOutcome.COMPLETED) + self.assertEqual(result.attempts, 1) + self.assertIn("src/app.py", result.files_changed) + self.assertEqual(len(result.iteration_history), 1) + self.assertEqual(result.iteration_history[0]["action"], "approve") + + # -- Scenario 2: Approved after a fix cycle -- + + def test_approved_after_fix_cycle(self): + """Reviewer rejects first, approves second → COMPLETED on iteration 2.""" + builder = _CallFnBuilder() + builder.on_coder(1, files_changed=["src/app.py"]) + builder.on_reviewer(1, approved=False, blocking=False, summary="Fix imports") + builder.on_coder(2, files_changed=["src/app.py"]) + builder.on_reviewer(2, approved=True, blocking=False, summary="LGTM now") + + result = _run(run_coding_loop( + issue=_make_issue(), + dag_state=_make_dag_state(self.artifacts_dir), + call_fn=builder.build(), + node_id="test-node", + config=_make_config(), + note_fn=self._note_fn, + )) + + self.assertEqual(result.outcome, IssueOutcome.COMPLETED) + self.assertEqual(result.attempts, 2) + self.assertEqual(len(result.iteration_history), 2) + self.assertEqual(result.iteration_history[0]["action"], "fix") + self.assertEqual(result.iteration_history[1]["action"], "approve") + + # -- Scenario 3: Non-blocking stuck loop → COMPLETED_WITH_DEBT -- + + def test_stuck_loop_non_blocking_accepts_with_debt(self): + """Reviewer keeps saying fix (non-blocking) with code changes. + + After 3 consecutive non-blocking fix cycles, the stuck detector fires + and the loop returns COMPLETED_WITH_DEBT instead of burning all 5 + iterations. + """ + builder = _CallFnBuilder() + for i in range(1, 6): + builder.on_coder(i, files_changed=["src/app.py"]) + builder.on_reviewer(i, approved=False, blocking=False, + summary=f"Minor style issue iter {i}") + + result = _run(run_coding_loop( + issue=_make_issue(), + dag_state=_make_dag_state(self.artifacts_dir), + call_fn=builder.build(), + node_id="test-node", + config=_make_config(max_coding_iterations=5), + note_fn=self._note_fn, + )) + + self.assertEqual(result.outcome, IssueOutcome.COMPLETED_WITH_DEBT) + # Should stop at iteration 3 (window=3 consecutive non-blocking fix) + self.assertEqual(result.attempts, 3) + self.assertIn("src/app.py", result.files_changed) + # Verify stuck detection note was emitted + stuck_notes = [n for n in self.notes if "STUCK" in n] + self.assertTrue(len(stuck_notes) > 0, f"Expected stuck note, got: {self.notes}") + + # -- Scenario 4: Blocking review → immediate FAILED_UNRECOVERABLE -- + + def test_blocking_review_fails_immediately(self): + """Reviewer marks blocking=True → FAILED_UNRECOVERABLE on first iteration.""" + builder = _CallFnBuilder() + builder.on_coder(1, files_changed=["src/app.py"]) + builder.on_reviewer(1, approved=False, blocking=True, + summary="Security vulnerability: SQL injection") + + result = _run(run_coding_loop( + issue=_make_issue(), + dag_state=_make_dag_state(self.artifacts_dir), + call_fn=builder.build(), + node_id="test-node", + config=_make_config(), + note_fn=self._note_fn, + )) + + self.assertEqual(result.outcome, IssueOutcome.FAILED_UNRECOVERABLE) + self.assertEqual(result.attempts, 1) + self.assertEqual(result.iteration_history[0]["action"], "block") + + # -- Scenario 5: Stagnation (no files changed) → FAILED_UNRECOVERABLE -- + + def test_stagnation_no_files_changed(self): + """Coder produces no files across all iterations. + + Even though reviewer is non-blocking, having no code changes means + we can't accept-with-debt — it should fail as unrecoverable. + """ + builder = _CallFnBuilder() + for i in range(1, 6): + builder.on_coder(i, files_changed=[]) # No changes! + builder.on_reviewer(i, approved=False, blocking=False, + summary="Nothing to review") + + result = _run(run_coding_loop( + issue=_make_issue(), + dag_state=_make_dag_state(self.artifacts_dir), + call_fn=builder.build(), + node_id="test-node", + config=_make_config(max_coding_iterations=5), + note_fn=self._note_fn, + )) + + self.assertEqual(result.outcome, IssueOutcome.FAILED_UNRECOVERABLE) + self.assertEqual(result.files_changed, []) + + # -- Scenario 6: Loop exhaustion (non-blocking, has changes) → COMPLETED_WITH_DEBT -- + + def test_exhaustion_non_blocking_accepts_debt(self): + """Loop exhausts all iterations but reviewer was never blocking and + code changes exist. This should accept-with-debt rather than fail. + + Uses window > max_iterations so stuck detection doesn't fire first. + """ + config = _make_config(max_coding_iterations=2) + builder = _CallFnBuilder() + # 2 iterations: first fix cycle doesn't trigger stuck (window=3 > 2) + builder.on_coder(1, files_changed=["src/main.py"]) + builder.on_reviewer(1, approved=False, blocking=False, summary="Needs work") + builder.on_coder(2, files_changed=["src/main.py", "src/util.py"]) + builder.on_reviewer(2, approved=False, blocking=False, summary="Still needs work") + + result = _run(run_coding_loop( + issue=_make_issue(), + dag_state=_make_dag_state(self.artifacts_dir), + call_fn=builder.build(), + node_id="test-node", + config=config, + note_fn=self._note_fn, + )) + + self.assertEqual(result.outcome, IssueOutcome.COMPLETED_WITH_DEBT) + self.assertEqual(result.attempts, 2) + self.assertIn("src/main.py", result.files_changed) + self.assertIn("src/util.py", result.files_changed) + + # -- Scenario 7: Loop exhaustion (blocking) → FAILED_UNRECOVERABLE -- + + def test_exhaustion_blocking_fails_unrecoverable(self): + """Loop exhausts but last review was blocking → FAILED_UNRECOVERABLE. + + The coder manages to avoid the block action (reviewer non-blocking initially) + but the final iteration gets a blocking review. Since we exhaust after a + non-block action=fix with history < window, the exhaustion path checks + last_blocking. + """ + config = _make_config(max_coding_iterations=2) + builder = _CallFnBuilder() + builder.on_coder(1, files_changed=["src/app.py"]) + builder.on_reviewer(1, approved=False, blocking=False, summary="Needs refactor") + builder.on_coder(2, files_changed=["src/app.py"]) + # Final iteration: blocking but action becomes "block" which returns immediately. + # To test exhaustion-with-blocking, we need the coder to keep going. + # Actually, blocking=True sets action="block" which returns immediately. + # So we simulate: non-blocking through iterations, but reviewer sets blocking + # on the last one — but blocking=True means action="block" and immediate return. + # The exhaustion path only triggers when action=fix on all iterations. + # Let's test: all fix, last reviewer has blocking=True in review_result + # but action was "fix" (approved=False, blocking=False review but we + # manually set blocking in the response — no, that contradicts). + # + # Simpler: if the loop exhausts and all reviewers were non-blocking with + # files_changed, it's COMPLETED_WITH_DEBT. To get FAILED_UNRECOVERABLE on + # exhaustion, we need either blocking=True on last review or no files changed. + # Since blocking=True → action=block → immediate return (not exhaustion), + # the only exhaustion→FAILED path is no files changed. + # This scenario is already covered by test_stagnation_no_files_changed. + # Let's test the blocking-on-last instead (immediate return path). + builder.on_reviewer(2, approved=False, blocking=True, summary="Critical bug") + + result = _run(run_coding_loop( + issue=_make_issue(), + dag_state=_make_dag_state(self.artifacts_dir), + call_fn=builder.build(), + node_id="test-node", + config=config, + note_fn=self._note_fn, + )) + + # blocking=True on iter 2 → action=block → immediate FAILED_UNRECOVERABLE + self.assertEqual(result.outcome, IssueOutcome.FAILED_UNRECOVERABLE) + self.assertEqual(result.attempts, 2) + self.assertEqual(result.iteration_history[-1]["action"], "block") + + # -- Scenario 8: Coder exception → FAILED_UNRECOVERABLE -- + + def test_coder_exception_fails_unrecoverable(self): + """Coder raises an exception → immediate FAILED_UNRECOVERABLE.""" + + def call_fn(agent_name: str, **kwargs): + async def _invoke(): + if "run_coder" in agent_name: + raise RuntimeError("Model API timeout") + return {} + return _invoke() + + result = _run(run_coding_loop( + issue=_make_issue(), + dag_state=_make_dag_state(self.artifacts_dir), + call_fn=call_fn, + node_id="test-node", + config=_make_config(), + note_fn=self._note_fn, + )) + + self.assertEqual(result.outcome, IssueOutcome.FAILED_UNRECOVERABLE) + self.assertIn("Coder agent failed", result.error_message) + self.assertEqual(result.attempts, 1) + + # -- Scenario 9: Flagged path — synthesizer detects stuck -- + + def test_flagged_path_synthesizer_stuck(self): + """Flagged path: synthesizer sets stuck=True with non-blocking review. + + Should return COMPLETED_WITH_DEBT from the stuck handler (since + files_changed and not blocking). + """ + builder = _CallFnBuilder() + builder.on_coder(1, files_changed=["src/module.py"]) + builder.on_qa(1, passed=True, summary="Tests pass") + builder.on_reviewer(1, approved=False, blocking=False, summary="Minor issues") + builder.on_synth(1, action="fix", summary="Continue fixing", stuck=True) + + result = _run(run_coding_loop( + issue=_make_issue(needs_deeper_qa=True), + dag_state=_make_dag_state(self.artifacts_dir), + call_fn=builder.build(), + node_id="test-node", + config=_make_config(), + note_fn=self._note_fn, + )) + + self.assertEqual(result.outcome, IssueOutcome.COMPLETED_WITH_DEBT) + self.assertEqual(result.attempts, 1) + self.assertIn("src/module.py", result.files_changed) + + # -- Scenario 10: Flagged path — synthesizer stuck + blocking → FAILED -- + + def test_flagged_path_stuck_blocking_fails(self): + """Flagged path: synthesizer stuck + reviewer blocking → FAILED_UNRECOVERABLE.""" + builder = _CallFnBuilder() + builder.on_coder(1, files_changed=["src/module.py"]) + builder.on_qa(1, passed=False, summary="Security test failed") + builder.on_reviewer(1, approved=False, blocking=True, summary="Security vulnerability") + builder.on_synth(1, action="fix", summary="Critical issue", stuck=True) + + result = _run(run_coding_loop( + issue=_make_issue(needs_deeper_qa=True), + dag_state=_make_dag_state(self.artifacts_dir), + call_fn=builder.build(), + node_id="test-node", + config=_make_config(), + note_fn=self._note_fn, + )) + + self.assertEqual(result.outcome, IssueOutcome.FAILED_UNRECOVERABLE) + self.assertEqual(result.attempts, 1) + + # -- Scenario 11: Flagged path happy path -- + + def test_flagged_path_approved(self): + """Flagged path: QA passes, reviewer approves, synthesizer approves.""" + builder = _CallFnBuilder() + builder.on_coder(1, files_changed=["src/feature.py", "tests/test_feature.py"]) + builder.on_qa(1, passed=True, summary="All 5 tests pass") + builder.on_reviewer(1, approved=True, blocking=False, summary="Clean code") + builder.on_synth(1, action="approve", summary="All checks pass") + + result = _run(run_coding_loop( + issue=_make_issue(needs_deeper_qa=True), + dag_state=_make_dag_state(self.artifacts_dir), + call_fn=builder.build(), + node_id="test-node", + config=_make_config(), + note_fn=self._note_fn, + )) + + self.assertEqual(result.outcome, IssueOutcome.COMPLETED) + self.assertEqual(result.attempts, 1) + self.assertEqual(len(result.files_changed), 2) + + # -- Scenario 12: Iteration history is accumulated correctly -- + + def test_iteration_history_accumulated(self): + """Verify iteration_history contains full detail from each iteration.""" + builder = _CallFnBuilder() + builder.on_coder(1, files_changed=["a.py"]) + builder.on_reviewer(1, approved=False, blocking=False, summary="Fix A") + builder.on_coder(2, files_changed=["b.py"]) + builder.on_reviewer(2, approved=True, blocking=False, summary="LGTM") + + result = _run(run_coding_loop( + issue=_make_issue(), + dag_state=_make_dag_state(self.artifacts_dir), + call_fn=builder.build(), + node_id="test-node", + config=_make_config(), + note_fn=self._note_fn, + )) + + self.assertEqual(len(result.iteration_history), 2) + + h1 = result.iteration_history[0] + self.assertEqual(h1["iteration"], 1) + self.assertEqual(h1["action"], "fix") + self.assertEqual(h1["path"], "default") + self.assertFalse(h1["review_blocking"]) + + h2 = result.iteration_history[1] + self.assertEqual(h2["iteration"], 2) + self.assertEqual(h2["action"], "approve") + self.assertTrue(h2["review_approved"]) + + # -- Scenario 13: Artifacts are saved to disk -- + + def test_artifacts_saved(self): + """Verify iteration artifacts (coder, review) are written to disk.""" + builder = _CallFnBuilder() + builder.on_coder(1, files_changed=["x.py"]) + builder.on_reviewer(1, approved=True, summary="Good") + + _run(run_coding_loop( + issue=_make_issue(), + dag_state=_make_dag_state(self.artifacts_dir), + call_fn=builder.build(), + node_id="test-node", + config=_make_config(), + note_fn=self._note_fn, + )) + + # Check that coding-loop artifacts directory was created + coding_loop_dir = os.path.join(self.artifacts_dir, "coding-loop") + self.assertTrue(os.path.isdir(coding_loop_dir), + f"Expected artifacts dir at {coding_loop_dir}") + + # Should have at least one iteration subdirectory with coder.json and review.json + subdirs = os.listdir(coding_loop_dir) + self.assertTrue(len(subdirs) >= 1) + first_iter = os.path.join(coding_loop_dir, subdirs[0]) + self.assertTrue(os.path.exists(os.path.join(first_iter, "coder.json"))) + self.assertTrue(os.path.exists(os.path.join(first_iter, "review.json"))) + + # -- Scenario 14: Iteration checkpoint saved -- + + def test_iteration_checkpoint_saved(self): + """Verify per-issue iteration checkpoint JSON is written.""" + builder = _CallFnBuilder() + builder.on_coder(1, files_changed=["x.py"]) + builder.on_reviewer(1, approved=True, summary="Good") + + _run(run_coding_loop( + issue=_make_issue("CHECKPOINT-1"), + dag_state=_make_dag_state(self.artifacts_dir), + call_fn=builder.build(), + node_id="test-node", + config=_make_config(), + note_fn=self._note_fn, + )) + + checkpoint_path = os.path.join( + self.artifacts_dir, "execution", "iterations", "CHECKPOINT-1.json" + ) + self.assertTrue(os.path.exists(checkpoint_path)) + + # -- Scenario 15: Files accumulate across iterations -- + + def test_files_accumulate_across_iterations(self): + """Files changed in different iterations should all appear in result.""" + builder = _CallFnBuilder() + builder.on_coder(1, files_changed=["a.py", "b.py"]) + builder.on_reviewer(1, approved=False, blocking=False, summary="Fix") + builder.on_coder(2, files_changed=["b.py", "c.py"]) # b.py again + new c.py + builder.on_reviewer(2, approved=True, summary="LGTM") + + result = _run(run_coding_loop( + issue=_make_issue(), + dag_state=_make_dag_state(self.artifacts_dir), + call_fn=builder.build(), + node_id="test-node", + config=_make_config(), + note_fn=self._note_fn, + )) + + self.assertEqual(result.outcome, IssueOutcome.COMPLETED) + # a.py, b.py from iter 1, c.py from iter 2 — b.py not duplicated + self.assertEqual(sorted(result.files_changed), ["a.py", "b.py", "c.py"]) + + # -- Scenario 16: Stuck detection fires exactly at window boundary -- + + def test_stuck_fires_at_window_boundary(self): + """With window=3, stuck fires on iteration 3 (not before, not after).""" + # 3 iterations, all non-blocking fix → stuck on iter 3 + builder = _CallFnBuilder() + for i in range(1, 4): + builder.on_coder(i, files_changed=["x.py"]) + builder.on_reviewer(i, approved=False, blocking=False, + summary=f"Polish iter {i}") + + result = _run(run_coding_loop( + issue=_make_issue(), + dag_state=_make_dag_state(self.artifacts_dir), + call_fn=builder.build(), + node_id="test-node", + config=_make_config(max_coding_iterations=10), # high limit, stuck should fire first + note_fn=self._note_fn, + )) + + self.assertEqual(result.outcome, IssueOutcome.COMPLETED_WITH_DEBT) + self.assertEqual(result.attempts, 3) + + # -- Scenario 17: Mixed blocking/non-blocking doesn't trigger stuck -- + + def test_mixed_blocking_prevents_stuck(self): + """If one of the recent iterations was blocking (action=block returns + immediately), stuck detection can't reach window=3 of consecutive + non-blocking fix cycles. + + Here: iter 1 non-blocking fix, iter 2 approved (approved breaks the fix streak). + Then iter 3 can't happen because iter 2 was approved. + """ + builder = _CallFnBuilder() + builder.on_coder(1, files_changed=["x.py"]) + builder.on_reviewer(1, approved=False, blocking=False, summary="Fix") + builder.on_coder(2, files_changed=["x.py"]) + builder.on_reviewer(2, approved=True, summary="LGTM") + + result = _run(run_coding_loop( + issue=_make_issue(), + dag_state=_make_dag_state(self.artifacts_dir), + call_fn=builder.build(), + node_id="test-node", + config=_make_config(), + note_fn=self._note_fn, + )) + + self.assertEqual(result.outcome, IssueOutcome.COMPLETED) + self.assertEqual(result.attempts, 2) + + # -- Scenario 18: note_fn receives meaningful tags -- + + def test_note_fn_receives_tags(self): + """Verify note_fn is called with structured tags for observability.""" + builder = _CallFnBuilder() + builder.on_coder(1, files_changed=["x.py"]) + builder.on_reviewer(1, approved=True, summary="Good") + + _run(run_coding_loop( + issue=_make_issue("TAG-TEST"), + dag_state=_make_dag_state(self.artifacts_dir), + call_fn=builder.build(), + node_id="test-node", + config=_make_config(), + note_fn=self._note_fn, + )) + + # Should have tags with issue name and lifecycle events + all_tags = [tag for tags in self.note_tags for tag in tags] + self.assertIn("TAG-TEST", all_tags) + self.assertIn("coding_loop", all_tags) + self.assertIn("start", all_tags) + self.assertIn("complete", all_tags) + + +if __name__ == "__main__": + unittest.main()