feat: propagate execution evidence across iterations and enhance reports

- Carry execution evidence forward so reviewer/senior prompts in subsequent iterations can inspect prior transcript and command data
- Add {execution_evidence} to REVIEW_ONLY templates (en/ko)
- Add evidence summary table to iteration reports
- Fix test_agentic to match stdin-based prompt delivery for Claude
- Add expanded claim/no-change marker tests and cross-iteration evidence propagation tests

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 23:36:28 +09:00
parent c467222a2a
commit 87bc0ffbfb
5 changed files with 591 additions and 10 deletions
--- a/tests/test_agentic.py
+++ b/tests/test_agentic.py
@@ -233,11 +233,11 @@ class TestBaseRepoIsolation(unittest.TestCase):
 # ===================================================================

 class TestInvokeAgentAgenticClaude(unittest.TestCase):
-    """invoke_agent_agentic builds correct cmd for claude (no -p, prompt as positional arg)."""
+    """invoke_agent_agentic builds correct cmd for claude (no -p, prompt via stdin)."""

    @patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
    @patch("subprocess.run")
-    def test_claude_cmd_has_no_dash_p_and_prompt_as_positional(
+    def test_claude_cmd_has_no_dash_p_and_prompt_via_stdin(
        self, mock_run: MagicMock, mock_diff: MagicMock,
    ) -> None:
        mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
@@ -271,8 +271,10 @@ class TestInvokeAgentAgenticClaude(unittest.TestCase):

        # No -p flag
        self.assertNotIn("-p", cmd)
-        # Last arg is a task file reference (not raw prompt — avoids arg length limits)
-        self.assertIn("task file", cmd[-1].lower())
+        # Prompt is delivered via stdin (input kwarg), not as a positional arg
+        input_data = agent_call[1].get("input")
+        self.assertIsNotNone(input_data)
+        self.assertIn("implement feature X", input_data)


 class TestInvokeAgentAgenticCodex(unittest.TestCase):