feat: tighten agentic runtime handoffs and quality gates

This commit is contained in:
chungyeong
2026-03-14 10:05:25 +09:00
parent 87bc0ffbfb
commit 7b95233edf
15 changed files with 1148 additions and 167 deletions

View File

@@ -415,11 +415,7 @@ def invoke_agent_agentic(
timeout: int | None = None,
quiet: bool = False,
) -> AgentResult:
"""Invoke an agent in agentic mode (no -p, runs in worktree, captures git diff).
The agent runs without print mode so it can modify files directly.
After the agent exits, git diff (since last commit) is captured as the output.
"""
"""Invoke an agent in agentic mode using the worktree as the source of truth."""
from cross_eval.worktree import capture_diff
# Write prompt to a temp file (outside worktree, won't appear in diffs)
@@ -433,10 +429,10 @@ def invoke_agent_agentic(
if agent.reasoning_effort and _supports_reasoning_effort(agent.command):
cmd.extend(["-c", f'model_reasoning_effort="{agent.reasoning_effort}"'])
# Strip stdin sentinel ("-") from args for agentic mode.
# Keep -p/--print: Claude -p mode still has full tool access (Edit, Write,
# Bash, etc.) and is the correct mode for non-interactive subprocess use.
args = [a for a in agent.args if a != "-"]
# Strip print-mode flags and stdin sentinels for agentic mode.
# Agentic runs should operate on the worktree and return a real git diff,
# not behave as a one-shot text completer.
args = [a for a in agent.args if a not in {"-", "-p", "--print"}]
cmd.extend(args)
# System prompt via flag if supported
@@ -454,8 +450,8 @@ def invoke_agent_agentic(
else:
input_data = prompt
else:
# claude -p: deliver prompt via stdin (same as codex).
# -p mode is non-interactive and reads from stdin, then exits.
# claude: deliver the task through stdin and let the worktree be the
# canonical place where files are read/written.
input_data = prompt
cmd_preview = " ".join(cmd[:6])

View File

@@ -266,7 +266,7 @@ def main(argv: list[str] | None = None) -> int:
type=int,
default=None,
metavar="SEC",
help="에이전트 호출 제한 시간 (--live 전용)",
help="에이전트 1회 호출 제한 시간(초). 0=무제한 (기본: 무제한, --live 전용)",
)
# --- run ---
@@ -981,6 +981,7 @@ def cmd_run(args: argparse.Namespace) -> int:
print(f"No files found in: {docs_dir}", file=sys.stderr)
return 1
config.inputs["docs"] = docs_content
config.inputs["docs_ref"] = str(docs_dir)
if args.env_files:
for env_file in args.env_files:
@@ -1007,7 +1008,6 @@ def cmd_run(args: argparse.Namespace) -> int:
apply_input_overrides(config, overrides)
# 3. Validate after all overrides
from cross_eval.config import validate_config
errors = validate_config(config)
if errors:
print("Config error:\n " + "\n ".join(errors), file=sys.stderr)

View File

@@ -698,9 +698,9 @@ def _validate_unique_step_fields(
def _make_agentic(agent: AgentConfig) -> None:
"""Convert an agent to agentic mode in-place (remove -p, set agentic=True)."""
"""Convert an agent to agentic mode in-place."""
agent.agentic = True
agent.args = [a for a in agent.args if a != "-p"]
agent.args = [a for a in agent.args if a not in {"-p", "--print"}]
def sync_phased_iterations(

View File

@@ -217,7 +217,7 @@ def run_mock_demo(preset: str = "simple", show_escalate: bool = False) -> None:
if show_escalate:
print(f"\n{RED}{BOLD}{'=' * 50}")
print(f" Escalation Report")
print(" Escalation Report")
print(f"{'=' * 50}{RESET}")
print(f"{YELLOW}Human review required.{RESET}")
print(f" {RED}{RESET} Requirements are ambiguous — needs stakeholder clarification")

View File

@@ -3,7 +3,7 @@ from __future__ import annotations
import shutil
import subprocess
from dataclasses import dataclass, field
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

View File

@@ -343,6 +343,8 @@ def _run_simple_pipeline(
if step_results:
input_contents["execution_evidence"] = _format_execution_evidence(
step_results,
run_dir=run_dir,
iteration=i,
)
iterations.append(iter_result)
@@ -543,6 +545,8 @@ def _run_phased_pipeline(
if step_results:
input_contents["execution_evidence"] = _format_execution_evidence(
step_results,
run_dir=run_dir,
iteration=global_iter,
)
iterations.append(iter_result)
@@ -661,10 +665,13 @@ def _load_inputs(config: PipelineConfig) -> dict[str, str]:
"""Load input file contents from config."""
input_contents: dict[str, str] = {}
for key, val in config.inputs.items():
if isinstance(val, str):
if key.endswith("_ref"):
input_contents[key] = str(val)
elif isinstance(val, str):
input_contents[key] = val
else:
input_contents[key] = val.read_text(encoding="utf-8")
_refresh_input_references(config, input_contents)
return input_contents
@@ -673,10 +680,99 @@ def _refresh_inputs(
) -> None:
"""Re-read input files (they may have changed on disk)."""
for key, val in config.inputs.items():
if isinstance(val, str):
if key.endswith("_ref"):
input_contents[key] = str(val)
elif isinstance(val, str):
input_contents[key] = val
elif isinstance(val, Path) and val.exists():
input_contents[key] = val.read_text(encoding="utf-8")
_refresh_input_references(config, input_contents)
def _refresh_input_references(
config: PipelineConfig,
input_contents: dict[str, str],
) -> None:
"""Expose stable file references for canonical planning inputs."""
for key, val in config.inputs.items():
if key.endswith("_ref"):
input_contents[key] = str(val)
continue
ref_key = f"{key}_ref"
if isinstance(val, Path):
input_contents[ref_key] = str(val.resolve())
else:
input_contents.setdefault(ref_key, f"(inline {key}; no file path available)")
def _git_ref(cwd: Path, *args: str) -> str:
"""Best-effort git metadata lookup."""
result = subprocess.run(
["git", *args],
cwd=cwd,
capture_output=True,
text=True,
)
if result.returncode != 0:
return "(unknown)"
return result.stdout.strip() or "(unknown)"
def _collect_markdown_refs(run_dir: Path, iteration: int) -> list[Path]:
"""Collect prior markdown artifacts available to the current step."""
refs: list[Path] = []
for idx in range(1, iteration + 1):
iter_dir = run_dir / f"v{idx}"
if not iter_dir.exists():
continue
refs.extend(sorted(iter_dir.glob("*.md")))
return refs
def _build_artifact_references(
    context: dict[str, str],
    *,
    cwd: Path,
    run_dir: Path,
    iteration: int,
    worktree_path: Path | None,
    step_results: dict[str, AgentResult] | None = None,
) -> str:
    """Render a reference-only markdown handoff block for agentic steps.

    The block lists where the plan, checklist and docs live (taken from the
    ``plan_ref`` / ``checklist_ref`` / ``docs_ref`` keys of *context*), the run
    and iteration directories, and the git branch and commit of the target
    repository. The target repository is *worktree_path* when given,
    otherwise *cwd*.

    A "Markdown Artifacts" section is appended when markdown files exist for
    iterations up to *iteration*, and a "Current Step Artifacts" section when
    *step_results* is non-empty.

    Returns the assembled lines joined with newlines.
    """
    repo_cwd = worktree_path or cwd
    iteration_dir = run_dir / f"v{iteration}"
    branch = _git_ref(repo_cwd, "rev-parse", "--abbrev-ref", "HEAD")
    commit_hash = _git_ref(repo_cwd, "rev-parse", "HEAD")

    plan_ref = context.get("plan_ref", "(missing)")
    checklist_ref = context.get("checklist_ref", "(missing)")
    docs_ref = context.get("docs_ref", "(none)")
    show_cmd = f"`git -C {repo_cwd} show {commit_hash}`"
    diff_cmd = f"`git -C {repo_cwd} diff HEAD`"

    lines = [
        "### Canonical References",
        f"- Plan: {plan_ref}",
        f"- Checklist: {checklist_ref}",
        f"- Docs: {docs_ref}",
        f"- Run directory: {run_dir}",
        f"- Current iteration directory: {iteration_dir}",
        f"- Target repository: {repo_cwd}",
        f"- Git branch: {branch}",
        f"- Git commit: {commit_hash}",
        "",
        "Use git/cat to inspect the referenced files directly instead of relying on inline summaries.",
        f"Suggested git commands: {show_cmd} and {diff_cmd}",
    ]

    markdown_refs = _collect_markdown_refs(run_dir, iteration)
    if markdown_refs:
        lines += ["", "### Markdown Artifacts"]
        for path in markdown_refs:
            lines.append(f"- {path}")

    if step_results:
        lines += ["", "### Current Step Artifacts"]
        for result in step_results.values():
            output_path = iteration_dir / f"{result.step_name}.md"
            lines.append(f"- Output: {output_path}")
            if result.transcript:
                # The transcript path is only listed when the step has one.
                transcript_path = iteration_dir / f"{result.step_name}_transcript.md"
                lines.append(f"- Transcript: {transcript_path}")

    return "\n".join(lines)
# ---------------------------------------------------------------------------
@@ -850,6 +946,9 @@ def _execute_step(
# 2. Build context (include prior step results for evidence)
context = _build_context(
input_contents, step_outputs, feedback, iteration, max_iterations,
cwd=cwd,
run_dir=run_dir,
worktree_path=worktree_path,
step_results=step_results,
)
@@ -1031,6 +1130,9 @@ def _execute_parallel_batch(
template = resolve_template(step.prompt_template)
context = _build_context(
context_snapshot, {}, feedback, iteration, max_iterations,
cwd=cwd,
run_dir=run_dir,
worktree_path=worktree_path,
step_results=results_snapshot,
)
if step.context_override:
@@ -1145,6 +1247,10 @@ def _build_context(
feedback: str,
iteration: int,
max_iterations: int,
*,
cwd: Path | None = None,
run_dir: Path | None = None,
worktree_path: Path | None = None,
step_results: dict[str, AgentResult] | None = None,
) -> dict[str, str]:
"""Build the template context dict.
@@ -1160,11 +1266,25 @@ def _build_context(
context["feedback"] = feedback
context["iteration"] = str(iteration)
context["max_iterations"] = str(max_iterations)
ref_cwd = cwd or Path.cwd()
ref_run_dir = run_dir or ref_cwd / ".cross-eval" / "output" / "ad-hoc"
context["artifact_references"] = _build_artifact_references(
context,
cwd=ref_cwd,
run_dir=ref_run_dir,
iteration=iteration,
worktree_path=worktree_path,
step_results=step_results,
)
# Surface execution evidence from prior steps so reviewers can inspect it.
# Prior-iteration evidence may already live in context via input_contents.
prior_evidence = context.get("execution_evidence", "")
if step_results:
current_evidence = _format_execution_evidence(step_results)
current_evidence = _format_execution_evidence(
step_results,
run_dir=ref_run_dir,
iteration=iteration,
)
if prior_evidence and prior_evidence != "(no prior execution evidence)":
context["execution_evidence"] = (
"# Prior Iteration Evidence\n"
@@ -1179,12 +1299,14 @@ def _build_context(
def _format_execution_evidence(
step_results: dict[str, AgentResult],
*,
run_dir: Path | None = None,
iteration: int | None = None,
) -> str:
"""Format execution evidence from prior steps for reviewer consumption.
Produces a compact summary of command, exit code, duration, and a truncated
transcript excerpt for each completed step so that reviewers and seniors
can verify claims against real execution data.
Produces a compact summary of command, exit code, duration, and artifact
paths so that later agents can read markdown/git state directly.
"""
if not step_results:
return "(no prior execution evidence)"
@@ -1198,12 +1320,12 @@ def _format_execution_evidence(
f"- Output size: {len(result.output)} chars",
]
section = [line for line in section if line]
if result.transcript:
# Include a truncated transcript excerpt for debugging
excerpt = result.transcript[:2000]
if len(result.transcript) > 2000:
excerpt += "\n... (truncated)"
section.append(f"\n<details>\n<summary>Transcript excerpt</summary>\n\n{excerpt}\n</details>")
if run_dir is not None and iteration is not None:
section.append(f"- Output artifact: {run_dir / f'v{iteration}' / f'{result.step_name}.md'}")
if result.transcript:
section.append(
f"- Transcript artifact: {run_dir / f'v{iteration}' / f'{result.step_name}_transcript.md'}"
)
parts.append("\n".join(section))
return "\n\n---\n\n".join(parts)
@@ -1455,7 +1577,7 @@ def _format_runtime_error_markdown(
f"- **Suggested Action**: {exc.suggested_action}",
"",
"## Command",
f"```",
"```",
exc.cmd_preview,
"```",
"",

View File

@@ -15,58 +15,39 @@ from cross_eval.models import PhaseConfig, StepConfig
CODING_TEMPLATE = """\
You are tasked with implementing code based on a plan and checklist.
## Plan
{plan}
## Checklist
{checklist}
## Reference Documents
{docs}
## Previous Review Feedback
{feedback}
## Artifact References
{artifact_references}
## Iteration
This is iteration {iteration} of {max_iterations}.
## Instructions
1. Explore the project directory to understand the existing codebase structure.
2. Implement ONLY what the plan specifies. Do NOT add extra features, \
1. Read the referenced plan/checklist/docs/review artifacts directly from disk.
2. Explore the project directory and git state to understand the current codebase structure.
3. Implement ONLY what the plan specifies. Do NOT add extra features, \
unnecessary abstractions, or premature optimizations.
3. Follow every item in the checklist.
4. If there is previous feedback, address ONLY the specific issues mentioned.
5. If previous feedback contains items marked as DISMISSED or false positive, \
4. Follow every item in the checklist.
5. If there is previous feedback in the referenced markdown artifacts, address ONLY those issues.
6. If previous feedback contains items marked as DISMISSED or false positive, \
IGNORE those items — they have been verified as correct.
6. Output the complete implementation.
7. Prefer git and markdown artifacts as the source of truth. Use commit hashes, `git show`, `git diff`, and referenced markdown files instead of relying on inline summaries.
8. Output the complete implementation.
"""
REVIEW_TEMPLATE = """\
You are tasked with reviewing code against a plan and checklist.
## Plan
{plan}
## Checklist
{checklist}
## Reference Documents
{docs}
## Coding Output / Previous Step Output
{coding_output}
## Previous Review Feedback
{feedback}
## Artifact References
{artifact_references}
## Execution Evidence
{execution_evidence}
## Review Instructions
Explore the project directory to understand the full codebase context, \
then evaluate the code against ONLY the plan and checklist above. \
Use the execution evidence above to verify agent claims against actual \
command outputs and exit codes.
Read the referenced plan/checklist/docs/review artifacts directly from disk. \
Inspect the referenced commit/git state and markdown artifacts, then evaluate \
the code against ONLY the plan and checklist. Use the execution evidence above \
to verify agent claims against actual command outputs, artifact paths, and exit codes.
For each issue found, classify it with BOTH severity AND category:
@@ -127,55 +108,36 @@ Otherwise output: VERDICT: FAIL
CODING_TEMPLATE_KO = """\
당신은 기획서와 체크리스트를 기반으로 코드를 구현하는 개발자입니다.
## 기획서
{plan}
## 체크리스트
{checklist}
## 참고 문서
{docs}
## 이전 리뷰 피드백
{feedback}
## 참조 아티팩트
{artifact_references}
## 반복 정보
현재 {max_iterations}회 중 {iteration}번째 반복입니다.
## 지침
1. 프로젝트 디렉토리를 탐색하여 기존 코드베이스 구조를 파악하세요.
2. 기획서에 명시된 것만 구현하세요. 추가 기능, 불필요한 추상화, 과도한 최적화를 하지 마세요.
3. 체크리스트의 모든 항목을 충족하세요.
4. 이전 리뷰 피드백이 있다면 해당 이슈만 해결하세요.
5. 이전 피드백에서 DISMISSED 또는 오탐으로 표시된 항목은 무시하세요 — 이미 올바른 것으로 검증되었습니다.
6. 완전한 구현을 출력하세요.
1. 참조된 plan/checklist/docs/review markdown를 직접 읽으세요.
2. 프로젝트 디렉토리와 git 상태를 탐색하여 현재 코드베이스 구조를 파악하세요.
3. 기획서에 명시된 것만 구현하세요. 추가 기능, 불필요한 추상화, 과도한 최적화를 하지 마세요.
4. 체크리스트의 모든 항목을 충족하세요.
5. 참조된 이전 리뷰 피드백이 있다면 해당 이슈만 해결하세요.
6. 이전 피드백에서 DISMISSED 또는 오탐으로 표시된 항목은 무시하세요 — 이미 올바른 것으로 검증되었습니다.
7. inline 요약보다 git commit hash, `git show`, `git diff`, markdown 아티팩트를 우선 사용하세요.
8. 완전한 구현을 출력하세요.
"""
REVIEW_TEMPLATE_KO = """\
당신은 기획서와 체크리스트 기준으로 코드를 검토하는 리뷰어입니다.
## 기획서
{plan}
## 체크리스트
{checklist}
## 참고 문서
{docs}
## 검토 대상 코드
{coding_output}
## 이전 리뷰 피드백
{feedback}
## 참조 아티팩트
{artifact_references}
## 실행 증거
{execution_evidence}
## 검토 지침
프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스 맥락을 파악한 뒤, \
위 기획서와 체크리스트 기준으로만 코드를 평가하세요. \
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력 종료 코드로 검증하세요.
참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽고, \
그 내용을 기준으로만 코드를 평가하세요. \
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요.
발견된 각 이슈에 심각도와 카테고리를 모두 부여하세요:
@@ -307,25 +269,16 @@ Otherwise output: VERDICT: FAIL
REVIEW_ONLY_TEMPLATE_KO = """\
당신은 기존 코드를 기획서와 체크리스트 기준으로 감사하는 리뷰어입니다.
## 기획서
{plan}
## 체크리스트
{checklist}
## 참고 문서
{docs}
## 이전 리뷰 결과 ({max_iterations}회 중 {iteration}번째)
{feedback}
## 참조 아티팩트
{artifact_references}
## 실행 증거
{execution_evidence}
## 검토 지침
프로젝트 디렉토리를 직접 탐색하여 전체 코드베이스를 파악한 뒤, \
위 기획서와 체크리스트 기준으로 **기존 코드**를 평가하세요. \
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력 종료 코드로 검증하세요.
참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽고, \
그 내용을 기준으로 **기존 코드**를 평가하세요. \
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요.
코드를 생성하거나 수정하지 마세요. 이미 존재하는 코드를 감사하는 것이 목적입니다.
@@ -522,23 +475,8 @@ PLAN_REVIEW_TEMPLATE_KO = """\
AGGREGATE_REVIEW_TEMPLATE = """\
You are adjudicating multiple review results and turning them into an actionable decision.
## Plan
{plan}
## Checklist
{checklist}
## Reference Documents
{docs}
## Candidate Outputs
{candidate_outputs}
## Reviewer Findings
{reviews_bundle}
## Previous Verification Feedback
{feedback}
## Artifact References
{artifact_references}
## Previous Issue Tracker
{previous_senior_tracker}
@@ -547,9 +485,10 @@ You are adjudicating multiple review results and turning them into an actionable
{execution_evidence}
## Instructions
Explore the project directory to confirm the current codebase state. \
Use the execution evidence above to verify claims against actual command \
outputs and exit codes. Then:
Read the referenced plan/checklist/docs/review artifacts directly from disk. \
Explore the project directory and the referenced git commit/diff to confirm the \
current codebase state. Use the execution evidence above to verify claims against \
actual command outputs, artifact paths, and exit codes. Then:
1. Deduplicate overlapping issues across reviewers.
2. Resolve disagreements explicitly.
3. Keep only issues supported by the plan, checklist, code, or reviewer evidence.
@@ -594,23 +533,8 @@ VERDICT: PASS or VERDICT: FAIL or VERDICT: ESCALATE
AGGREGATE_REVIEW_TEMPLATE_KO = """\
당신은 여러 리뷰 결과를 판정하고 coder가 수정할 액션으로 정리하는 시니어 리뷰어입니다.
## 기획서
{plan}
## 체크리스트
{checklist}
## 참고 문서
{docs}
## 후보 결과물
{candidate_outputs}
## 개별 리뷰 결과
{reviews_bundle}
## 이전 검증 피드백
{feedback}
## 참조 아티팩트
{artifact_references}
## 이전 이슈 트래커
{previous_senior_tracker}
@@ -619,8 +543,8 @@ AGGREGATE_REVIEW_TEMPLATE_KO = """\
{execution_evidence}
## 지침
프로젝트 디렉토리를 탐색하여 현재 코드베이스 상태를 확인한 뒤, \
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력 종료 코드로 검증하세요. \
참조된 plan/checklist/docs/review markdown와 git 상태를 직접 읽어 현재 코드베이스 상태를 확인한 뒤, \
위 실행 증거를 활용하여 에이전트의 주장을 실제 명령어 출력, 아티팩트 경로, 종료 코드로 검증하세요. \
그런 다음 아래를 수행하세요.
1. 리뷰어들 사이에 중복되는 이슈를 합치세요.
2. 의견 충돌은 명시적으로 정리하세요.