feat: propagate execution evidence across iterations and enhance reports

- Carry execution evidence forward so reviewer/senior prompts in
  subsequent iterations can inspect prior transcript and command data
- Add {execution_evidence} to REVIEW_ONLY templates (en/ko)
- Add evidence summary table to iteration reports
- Fix test_agentic to match stdin-based prompt delivery for Claude
- Add expanded claim/no-change marker tests and cross-iteration
  evidence propagation tests

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -338,6 +338,13 @@ def _run_simple_pipeline(
|
||||
if tracker:
|
||||
input_contents["previous_senior_tracker"] = tracker
|
||||
|
||||
# Carry execution evidence forward so subsequent iterations'
|
||||
# reviewer/senior prompts can inspect prior transcript data.
|
||||
if step_results:
|
||||
input_contents["execution_evidence"] = _format_execution_evidence(
|
||||
step_results,
|
||||
)
|
||||
|
||||
iterations.append(iter_result)
|
||||
|
||||
# ESCALATE check (highest priority)
|
||||
@@ -531,6 +538,13 @@ def _run_phased_pipeline(
|
||||
if tracker:
|
||||
input_contents["previous_senior_tracker"] = tracker
|
||||
|
||||
# Carry execution evidence forward so subsequent iterations'
|
||||
# reviewer/senior prompts can inspect prior transcript data.
|
||||
if step_results:
|
||||
input_contents["execution_evidence"] = _format_execution_evidence(
|
||||
step_results,
|
||||
)
|
||||
|
||||
iterations.append(iter_result)
|
||||
|
||||
# ESCALATE check
|
||||
@@ -1133,16 +1147,33 @@ def _build_context(
|
||||
max_iterations: int,
|
||||
step_results: dict[str, AgentResult] | None = None,
|
||||
) -> dict[str, str]:
|
||||
"""Build the template context dict."""
|
||||
"""Build the template context dict.
|
||||
|
||||
Execution evidence from prior iterations is carried forward in
|
||||
``input_contents["execution_evidence"]``. When the current iteration
|
||||
has its own step results, the evidence is merged so reviewers/seniors
|
||||
see both prior and current data.
|
||||
"""
|
||||
context: dict[str, str] = {}
|
||||
context.update(input_contents)
|
||||
context.update(step_outputs)
|
||||
context["feedback"] = feedback
|
||||
context["iteration"] = str(iteration)
|
||||
context["max_iterations"] = str(max_iterations)
|
||||
# Surface execution evidence from prior steps so reviewers can inspect it
|
||||
# Surface execution evidence from prior steps so reviewers can inspect it.
|
||||
# Prior-iteration evidence may already live in context via input_contents.
|
||||
prior_evidence = context.get("execution_evidence", "")
|
||||
if step_results:
|
||||
context["execution_evidence"] = _format_execution_evidence(step_results)
|
||||
current_evidence = _format_execution_evidence(step_results)
|
||||
if prior_evidence and prior_evidence != "(no prior execution evidence)":
|
||||
context["execution_evidence"] = (
|
||||
"# Prior Iteration Evidence\n"
|
||||
+ prior_evidence
|
||||
+ "\n\n# Current Iteration Evidence\n"
|
||||
+ current_evidence
|
||||
)
|
||||
else:
|
||||
context["execution_evidence"] = current_evidence
|
||||
return context
|
||||
|
||||
|
||||
@@ -1164,6 +1195,7 @@ def _format_execution_evidence(
|
||||
f"- Command: `{result.command_preview}`" if result.command_preview else "",
|
||||
f"- Exit code: {result.exit_code}",
|
||||
f"- Duration: {result.duration_seconds}s",
|
||||
f"- Output size: {len(result.output)} chars",
|
||||
]
|
||||
section = [line for line in section if line]
|
||||
if result.transcript:
|
||||
|
||||
Reference in New Issue
Block a user