feat: harden runtime evidence and claude agentic validation

This commit is contained in:
chungyeong
2026-03-13 22:29:22 +09:00
parent 28dd794f54
commit 3fb19e90c0
5 changed files with 655 additions and 59 deletions

View File

@@ -19,6 +19,34 @@ logger = logging.getLogger(__name__)
# CLI tools that support --system-prompt flag natively
_SYSTEM_PROMPT_AGENTS = ("claude",)
_REASONING_EFFORT_AGENTS = ("codex",)
# Phrases by which an agent acknowledges that it changed nothing. They are
# matched against lower-cased agent output by `_claims_file_changes`, so every
# entry must itself be lower-case. A hit here overrides any change-claim
# marker found in the same text.
_NO_CHANGE_ACK_MARKERS = (
    # Explicit "no ... changes" statements.
    "no changes",
    "no code changes",
    "no file changes",
    "did not make any changes",
    # Statements that nothing needed to be done.
    "nothing to change",
    "no modifications were necessary",
    "no update was necessary",
    "already satisfied",
)
# Phrases by which an agent claims that it changed code. They are matched
# against lower-cased agent output by `_claims_file_changes`, so every entry
# must itself be lower-case.
_CHANGE_CLAIM_MARKERS = (
    # Summary headings.
    "summary of all changes made",
    "here's a summary of all changes made",
    # Past-tense action verbs, bare and in first person.
    "implemented",
    "i implemented",
    "added",
    "i added",
    "updated",
    "i updated",
    "modified",
    "i modified",
    "created",
    "i created",
    "fixed",
    "i fixed",
    # Completion statements.
    "completed the changes",
    "finished the changes",
)
class AgentInvocationError(RuntimeError):
@@ -106,6 +134,16 @@ def _classify_agent_failure(detail: str) -> tuple[str, str]:
)
def _claims_file_changes(output: str) -> bool:
    """Heuristically decide whether agent text claims that code changes were made.

    Matching is case-insensitive and anchored to whole words/phrases, so a
    marker such as "modified" does not fire on "unmodified" and "fixed" does
    not fire on "prefixed". Without that anchoring, text that merely reports
    leaving something untouched would be misread as a change claim.

    Args:
        output: Free-form text emitted by the agent (typically its stdout).

    Returns:
        ``False`` when the text is empty or whitespace-only, or when it
        contains any ``_NO_CHANGE_ACK_MARKERS`` phrase; otherwise ``True`` if
        it contains any ``_CHANGE_CLAIM_MARKERS`` phrase, else ``False``.
    """
    # Function-scope import so this helper does not depend on the module
    # already importing `re`.
    import re

    # `or ""` makes a missing output behave like an empty one instead of raising.
    normalized = (output or "").lower()
    if not normalized.strip():
        return False

    def _mentions_any(markers) -> bool:
        # Lookarounds rather than `\b`: they stay correct even if a marker
        # begins or ends with a non-word character such as an apostrophe.
        return any(
            re.search(rf"(?<!\w){re.escape(marker)}(?!\w)", normalized)
            for marker in markers
        )

    # An explicit "nothing changed" acknowledgement wins over any claim marker.
    if _mentions_any(_NO_CHANGE_ACK_MARKERS):
        return False
    return _mentions_any(_CHANGE_CLAIM_MARKERS)
class _Spinner:
"""Animated spinner for long-running agent calls."""
@@ -302,6 +340,9 @@ def invoke_agent(
command_preview=cmd_preview,
stdout=result.stdout,
stderr=result.stderr,
exit_code=result.returncode,
duration_seconds=round(duration, 1),
cwd=str(cwd) if cwd else "",
)
return AgentResult(
@@ -424,6 +465,28 @@ def invoke_agent_agentic(
diff_output = capture_diff(worktree_path)
if not diff_output:
stdout_excerpt = (result.stdout or "").strip()
stderr_excerpt = (result.stderr or "").strip()
if _claims_file_changes(stdout_excerpt):
if spinner:
spinner.stop(f"[{step_name}] FAILED (empty diff)")
raw_error = stdout_excerpt or "(stdout empty)"
if stderr_excerpt:
raw_error = f"{raw_error}\n\n[stderr]\n{stderr_excerpt}"
if len(raw_error) > 2000:
raw_error = raw_error[:2000] + "..."
raise AgentInvocationError(
agent_name=agent.name,
step_name=step_name,
cmd_preview=cmd_preview,
raw_error=raw_error,
failure_type="EMPTY_DIFF",
suggested_action=(
"Agent reported code changes but produced no git diff. "
"Treat this run as failed and require a real worktree diff before continuing."
),
)
diff_output = "(no changes)"
logger.warning(
"Agent '%s' made no file changes at step '%s'",
@@ -438,6 +501,9 @@ def invoke_agent_agentic(
command_preview=cmd_preview,
stdout=result.stdout,
stderr=result.stderr,
exit_code=result.returncode,
duration_seconds=round(duration, 1),
cwd=str(worktree_path),
)
return AgentResult(
@@ -456,6 +522,9 @@ def _build_transcript(
command_preview: str,
stdout: str,
stderr: str,
exit_code: int = 0,
duration_seconds: float = 0.0,
cwd: str = "",
) -> str:
"""Build a compact execution transcript for debugging/audit output."""
sections = [
@@ -466,6 +535,16 @@ def _build_transcript(
command_preview or "(unknown command)",
"```",
"",
]
if cwd:
sections.extend(["## Working Directory", f"`{cwd}`", ""])
sections.extend([
f"## Exit Code: {exit_code}",
"",
])
if duration_seconds > 0:
sections.extend([f"## Duration: {duration_seconds}s", ""])
sections.extend([
"## Stdout",
"```",
(stdout or "(empty)").strip(),
@@ -476,5 +555,5 @@ def _build_transcript(
(stderr or "(empty)").strip(),
"```",
"",
]
])
return "\n".join(sections)