feat: propagate execution evidence across iterations and enhance reports

- Carry execution evidence forward so reviewer/senior prompts in subsequent iterations can inspect prior transcript and command data
- Add {execution_evidence} to REVIEW_ONLY templates (en/ko)
- Add evidence summary table to iteration reports
- Fix test_agentic to match stdin-based prompt delivery for Claude
- Add expanded claim/no-change marker tests and cross-iteration evidence propagation tests

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 23:36:28 +09:00
parent c467222a2a
commit 87bc0ffbfb
5 changed files with 591 additions and 10 deletions
--- a/cross_eval/pipeline.py
+++ b/cross_eval/pipeline.py
@@ -338,6 +338,13 @@ def _run_simple_pipeline(
                    if tracker:
                        input_contents["previous_senior_tracker"] = tracker

+            # Carry execution evidence forward so subsequent iterations'
+            # reviewer/senior prompts can inspect prior transcript data.
+            if step_results:
+                input_contents["execution_evidence"] = _format_execution_evidence(
+                    step_results,
+                )
+
            iterations.append(iter_result)

            # ESCALATE check (highest priority)
@@ -531,6 +538,13 @@ def _run_phased_pipeline(
                        if tracker:
                            input_contents["previous_senior_tracker"] = tracker

+                # Carry execution evidence forward so subsequent iterations'
+                # reviewer/senior prompts can inspect prior transcript data.
+                if step_results:
+                    input_contents["execution_evidence"] = _format_execution_evidence(
+                        step_results,
+                    )
+
                iterations.append(iter_result)

                # ESCALATE check
@@ -1133,16 +1147,33 @@ def _build_context(
    max_iterations: int,
    step_results: dict[str, AgentResult] | None = None,
 ) -> dict[str, str]:
-    """Build the template context dict."""
+    """Build the template context dict.
+
+    Execution evidence from prior iterations is carried forward in
+    ``input_contents["execution_evidence"]``.  When the current iteration
+    has its own step results, the evidence is merged so reviewers/seniors
+    see both prior and current data.
+    """
    context: dict[str, str] = {}
    context.update(input_contents)
    context.update(step_outputs)
    context["feedback"] = feedback
    context["iteration"] = str(iteration)
    context["max_iterations"] = str(max_iterations)
-    # Surface execution evidence from prior steps so reviewers can inspect it
+    # Surface execution evidence from prior steps so reviewers can inspect it.
+    # Prior-iteration evidence may already live in context via input_contents.
+    prior_evidence = context.get("execution_evidence", "")
    if step_results:
-        context["execution_evidence"] = _format_execution_evidence(step_results)
+        current_evidence = _format_execution_evidence(step_results)
+        if prior_evidence and prior_evidence != "(no prior execution evidence)":
+            context["execution_evidence"] = (
+                "# Prior Iteration Evidence\n"
+                + prior_evidence
+                + "\n\n# Current Iteration Evidence\n"
+                + current_evidence
+            )
+        else:
+            context["execution_evidence"] = current_evidence
    return context


@@ -1164,6 +1195,7 @@ def _format_execution_evidence(
            f"- Command: `{result.command_preview}`" if result.command_preview else "",
            f"- Exit code: {result.exit_code}",
            f"- Duration: {result.duration_seconds}s",
+            f"- Output size: {len(result.output)} chars",
        ]
        section = [line for line in section if line]
        if result.transcript: