feat: propagate execution evidence across iterations and enhance reports

- Carry execution evidence forward so reviewer/senior prompts in
  subsequent iterations can inspect prior transcript and command data
- Add the {execution_evidence} placeholder to the REVIEW_ONLY templates (en/ko)
- Add evidence summary table to iteration reports
- Fix test_agentic to match stdin-based prompt delivery for Claude
- Add expanded claim/no-change marker tests and cross-iteration
  evidence propagation tests

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
chungyeong
2026-03-13 23:36:28 +09:00
parent c467222a2a
commit 87bc0ffbfb
5 changed files with 591 additions and 10 deletions

View File

@@ -338,6 +338,13 @@ def _run_simple_pipeline(
if tracker:
input_contents["previous_senior_tracker"] = tracker
# Carry execution evidence forward so subsequent iterations'
# reviewer/senior prompts can inspect prior transcript data.
if step_results:
input_contents["execution_evidence"] = _format_execution_evidence(
step_results,
)
iterations.append(iter_result)
# ESCALATE check (highest priority)
@@ -531,6 +538,13 @@ def _run_phased_pipeline(
if tracker:
input_contents["previous_senior_tracker"] = tracker
# Carry execution evidence forward so subsequent iterations'
# reviewer/senior prompts can inspect prior transcript data.
if step_results:
input_contents["execution_evidence"] = _format_execution_evidence(
step_results,
)
iterations.append(iter_result)
# ESCALATE check
@@ -1133,16 +1147,33 @@ def _build_context(
max_iterations: int,
step_results: dict[str, AgentResult] | None = None,
) -> dict[str, str]:
"""Build the template context dict."""
"""Build the template context dict.
Execution evidence from prior iterations is carried forward in
``input_contents["execution_evidence"]``. When the current iteration
has its own step results, the evidence is merged so reviewers/seniors
see both prior and current data.
"""
context: dict[str, str] = {}
context.update(input_contents)
context.update(step_outputs)
context["feedback"] = feedback
context["iteration"] = str(iteration)
context["max_iterations"] = str(max_iterations)
# Surface execution evidence from prior steps so reviewers can inspect it
# Surface execution evidence from prior steps so reviewers can inspect it.
# Prior-iteration evidence may already live in context via input_contents.
prior_evidence = context.get("execution_evidence", "")
if step_results:
context["execution_evidence"] = _format_execution_evidence(step_results)
current_evidence = _format_execution_evidence(step_results)
if prior_evidence and prior_evidence != "(no prior execution evidence)":
context["execution_evidence"] = (
"# Prior Iteration Evidence\n"
+ prior_evidence
+ "\n\n# Current Iteration Evidence\n"
+ current_evidence
)
else:
context["execution_evidence"] = current_evidence
return context
@@ -1164,6 +1195,7 @@ def _format_execution_evidence(
f"- Command: `{result.command_preview}`" if result.command_preview else "",
f"- Exit code: {result.exit_code}",
f"- Duration: {result.duration_seconds}s",
f"- Output size: {len(result.output)} chars",
]
section = [line for line in section if line]
if result.transcript: