feat: propagate execution evidence across iterations and enhance reports
- Carry execution evidence forward so reviewer/senior prompts in
  subsequent iterations can inspect prior transcript and command data
- Add {execution_evidence} to REVIEW_ONLY templates (en/ko)
- Add evidence summary table to iteration reports
- Fix test_agentic to match stdin-based prompt delivery for Claude
- Add expanded claim/no-change marker tests and cross-iteration
  evidence propagation tests
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -338,6 +338,13 @@ def _run_simple_pipeline(
|
||||
if tracker:
|
||||
input_contents["previous_senior_tracker"] = tracker
|
||||
|
||||
# Carry execution evidence forward so subsequent iterations'
|
||||
# reviewer/senior prompts can inspect prior transcript data.
|
||||
if step_results:
|
||||
input_contents["execution_evidence"] = _format_execution_evidence(
|
||||
step_results,
|
||||
)
|
||||
|
||||
iterations.append(iter_result)
|
||||
|
||||
# ESCALATE check (highest priority)
|
||||
@@ -531,6 +538,13 @@ def _run_phased_pipeline(
|
||||
if tracker:
|
||||
input_contents["previous_senior_tracker"] = tracker
|
||||
|
||||
# Carry execution evidence forward so subsequent iterations'
|
||||
# reviewer/senior prompts can inspect prior transcript data.
|
||||
if step_results:
|
||||
input_contents["execution_evidence"] = _format_execution_evidence(
|
||||
step_results,
|
||||
)
|
||||
|
||||
iterations.append(iter_result)
|
||||
|
||||
# ESCALATE check
|
||||
@@ -1133,16 +1147,33 @@ def _build_context(
|
||||
max_iterations: int,
|
||||
step_results: dict[str, AgentResult] | None = None,
|
||||
) -> dict[str, str]:
|
||||
"""Build the template context dict."""
|
||||
"""Build the template context dict.
|
||||
|
||||
Execution evidence from prior iterations is carried forward in
|
||||
``input_contents["execution_evidence"]``. When the current iteration
|
||||
has its own step results, the evidence is merged so reviewers/seniors
|
||||
see both prior and current data.
|
||||
"""
|
||||
context: dict[str, str] = {}
|
||||
context.update(input_contents)
|
||||
context.update(step_outputs)
|
||||
context["feedback"] = feedback
|
||||
context["iteration"] = str(iteration)
|
||||
context["max_iterations"] = str(max_iterations)
|
||||
# Surface execution evidence from prior steps so reviewers can inspect it
|
||||
# Surface execution evidence from prior steps so reviewers can inspect it.
|
||||
# Prior-iteration evidence may already live in context via input_contents.
|
||||
prior_evidence = context.get("execution_evidence", "")
|
||||
if step_results:
|
||||
context["execution_evidence"] = _format_execution_evidence(step_results)
|
||||
current_evidence = _format_execution_evidence(step_results)
|
||||
if prior_evidence and prior_evidence != "(no prior execution evidence)":
|
||||
context["execution_evidence"] = (
|
||||
"# Prior Iteration Evidence\n"
|
||||
+ prior_evidence
|
||||
+ "\n\n# Current Iteration Evidence\n"
|
||||
+ current_evidence
|
||||
)
|
||||
else:
|
||||
context["execution_evidence"] = current_evidence
|
||||
return context
|
||||
|
||||
|
||||
@@ -1164,6 +1195,7 @@ def _format_execution_evidence(
|
||||
f"- Command: `{result.command_preview}`" if result.command_preview else "",
|
||||
f"- Exit code: {result.exit_code}",
|
||||
f"- Duration: {result.duration_seconds}s",
|
||||
f"- Output size: {len(result.output)} chars",
|
||||
]
|
||||
section = [line for line in section if line]
|
||||
if result.transcript:
|
||||
|
||||
@@ -243,9 +243,14 @@ You are tasked with reviewing existing code against a plan and checklist.
|
||||
## Previous Review (iteration {iteration} of {max_iterations})
|
||||
{feedback}
|
||||
|
||||
## Execution Evidence
|
||||
{execution_evidence}
|
||||
|
||||
## Review Instructions
|
||||
Explore the project directory thoroughly to understand the full codebase, \
|
||||
then evaluate the EXISTING code against ONLY the plan and checklist above.
|
||||
then evaluate the EXISTING code against ONLY the plan and checklist above. \
|
||||
Use the execution evidence above to verify agent claims against actual \
|
||||
command outputs and exit codes.
|
||||
|
||||
You are NOT generating or modifying code. You are auditing what already exists.
|
||||
|
||||
@@ -314,9 +319,13 @@ REVIEW_ONLY_TEMPLATE_KO = """\
|
||||
## 이전 리뷰 결과 ({max_iterations}회 중 {iteration}번째)
|
||||
{feedback}
|
||||
|
||||
## 실행 증거
|
||||
{execution_evidence}
|
||||
|
||||
## 검토 지침
|
||||
프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스를 파악한 뒤, \
|
||||
위 기획서와 체크리스트 기준으로 **기존 코드**를 평가하세요.
|
||||
위 기획서와 체크리스트 기준으로 **기존 코드**를 평가하세요. \
|
||||
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력과 종료 코드로 검증하세요.
|
||||
|
||||
코드를 생성하거나 수정하지 마세요. 이미 존재하는 코드를 감사하는 것이 목적입니다.
|
||||
|
||||
|
||||
@@ -58,6 +58,12 @@ _STRINGS: dict[str, dict[str, str]] = {
|
||||
"metrics_total_issues": "Total Issues",
|
||||
"metrics_na": "N/A",
|
||||
"iteration_details": "Iteration Details",
|
||||
"evidence_summary": "Evidence Summary",
|
||||
"evidence_agent": "Agent",
|
||||
"evidence_exit_code": "Exit Code",
|
||||
"evidence_duration": "Duration",
|
||||
"evidence_output_size": "Output Size",
|
||||
"evidence_transcript": "Execution transcript",
|
||||
},
|
||||
"ko": {
|
||||
"title": "교차 검증 리포트",
|
||||
@@ -99,6 +105,12 @@ _STRINGS: dict[str, dict[str, str]] = {
|
||||
"metrics_total_issues": "총 이슈",
|
||||
"metrics_na": "해당 없음",
|
||||
"iteration_details": "반복 상세",
|
||||
"evidence_summary": "실행 증거 요약",
|
||||
"evidence_agent": "에이전트",
|
||||
"evidence_exit_code": "종료 코드",
|
||||
"evidence_duration": "소요 시간",
|
||||
"evidence_output_size": "출력 크기",
|
||||
"evidence_transcript": "실행 트랜스크립트",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -377,6 +389,30 @@ def _append_iteration_steps(
|
||||
If *skip_extraction* is True, out-of-scope and review-metrics parsing
|
||||
is skipped (useful when a pre-scan already collected that data).
|
||||
"""
|
||||
# Evidence summary table — quick overview of all steps' execution data
|
||||
has_evidence = any(
|
||||
iter_result.step_results.get(s.output_key) for s in steps
|
||||
)
|
||||
if has_evidence:
|
||||
s_step = _t(config, "step")
|
||||
s_agent = _t(config, "evidence_agent")
|
||||
s_exit = _t(config, "evidence_exit_code")
|
||||
s_dur = _t(config, "evidence_duration")
|
||||
s_size = _t(config, "evidence_output_size")
|
||||
lines.append(f"**{_t(config, 'evidence_summary')}**\n")
|
||||
lines.append(f"| {s_step} | {s_agent} | {s_exit} | {s_dur} | {s_size} |")
|
||||
lines.append("|------|-------|-----------|----------|-------------|")
|
||||
for step in steps:
|
||||
ar = iter_result.step_results.get(step.output_key)
|
||||
out = iter_result.step_outputs.get(step.output_key, "")
|
||||
if ar:
|
||||
lines.append(
|
||||
f"| {step.name} | {ar.agent_name} "
|
||||
f"| {ar.exit_code} | {ar.duration_seconds}s "
|
||||
f"| {len(out)} chars |"
|
||||
)
|
||||
lines.append("")
|
||||
|
||||
for step in steps:
|
||||
agent_result = iter_result.step_results.get(step.output_key)
|
||||
output = iter_result.step_outputs.get(step.output_key, "")
|
||||
@@ -410,8 +446,9 @@ def _append_iteration_steps(
|
||||
transcript_preview = agent_result.transcript[:1500]
|
||||
if len(agent_result.transcript) > 1500:
|
||||
transcript_preview += "\n... (truncated)"
|
||||
transcript_label = _t(config, "evidence_transcript")
|
||||
lines.append("<details>")
|
||||
lines.append("<summary>Execution transcript</summary>\n")
|
||||
lines.append(f"<summary>{transcript_label}</summary>\n")
|
||||
lines.append(transcript_preview)
|
||||
lines.append("\n</details>\n")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user