feat: propagate execution evidence across iterations and enhance reports

- Carry execution evidence forward so that reviewer/senior prompts in
  subsequent iterations can inspect the prior transcript and command data
- Add {execution_evidence} to REVIEW_ONLY templates (en/ko)
- Add evidence summary table to iteration reports
- Fix test_agentic to match stdin-based prompt delivery for Claude
- Add expanded claim/no-change marker tests and cross-iteration
  evidence propagation tests

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
chungyeong
2026-03-13 23:36:28 +09:00
parent c467222a2a
commit 87bc0ffbfb
5 changed files with 591 additions and 10 deletions

View File

@@ -233,11 +233,11 @@ class TestBaseRepoIsolation(unittest.TestCase):
# ===================================================================
class TestInvokeAgentAgenticClaude(unittest.TestCase):
"""invoke_agent_agentic builds correct cmd for claude (no -p, prompt as positional arg)."""
"""invoke_agent_agentic builds correct cmd for claude (no -p, prompt via stdin)."""
@patch("cross_eval.worktree.capture_diff", return_value="diff --git a/file ...")
@patch("subprocess.run")
def test_claude_cmd_has_no_dash_p_and_prompt_as_positional(
def test_claude_cmd_has_no_dash_p_and_prompt_via_stdin(
self, mock_run: MagicMock, mock_diff: MagicMock,
) -> None:
mock_run.return_value = MagicMock(returncode=0, stdout="ok", stderr="")
@@ -271,8 +271,10 @@ class TestInvokeAgentAgenticClaude(unittest.TestCase):
# No -p flag
self.assertNotIn("-p", cmd)
# Last arg is a task file reference (not raw prompt — avoids arg length limits)
self.assertIn("task file", cmd[-1].lower())
# Prompt is delivered via stdin (input kwarg), not as a positional arg
input_data = agent_call[1].get("input")
self.assertIsNotNone(input_data)
self.assertIn("implement feature X", input_data)
class TestInvokeAgentAgenticCodex(unittest.TestCase):