feat: harden runtime evidence and claude agentic validation
This commit is contained in:
@@ -19,6 +19,34 @@ logger = logging.getLogger(__name__)
|
||||
# CLI tools that support --system-prompt flag natively
_SYSTEM_PROMPT_AGENTS = ("claude",)
# NOTE(review): presumably the CLI tools that accept a reasoning-effort
# option; the usage site is not visible here, so confirm before relying on it.
_REASONING_EFFORT_AGENTS = ("codex",)
|
||||
# Lowercase phrases by which an agent acknowledges that it changed nothing.
# `_claims_file_changes` checks these against lowercased agent output; a hit
# means the output is NOT treated as a claim of code changes.
_NO_CHANGE_ACK_MARKERS = (
    "no changes",
    "no code changes",
    "no file changes",
    "did not make any changes",
    "nothing to change",
    "no modifications were necessary",
    "no update was necessary",
    "already satisfied",
)
|
||||
# Lowercase phrases that suggest an agent is claiming it edited files.
# `_claims_file_changes` checks these against lowercased agent output.
# Entries must stay lowercase because the text is lowercased before matching.
# The "i <verb>" variants are already covered by the bare verbs; they are kept
# for readability of the intent.
_CHANGE_CLAIM_MARKERS = (
    "summary of all changes made",
    "here's a summary of all changes made",
    "implemented",
    "i implemented",
    "added",
    "i added",
    "updated",
    "i updated",
    "modified",
    "i modified",
    "created",
    "i created",
    "fixed",
    "i fixed",
    "completed the changes",
    "finished the changes",
)
|
||||
|
||||
|
||||
class AgentInvocationError(RuntimeError):
|
||||
@@ -106,6 +134,16 @@ def _classify_agent_failure(detail: str) -> tuple[str, str]:
|
||||
)
|
||||
|
||||
|
||||
def _claims_file_changes(output: str) -> bool:
    """Heuristic for agent text that claims code changes were made.

    Args:
        output: Text emitted by the agent; matching is case-insensitive.

    Returns:
        ``False`` when the text is empty or whitespace-only, or when it
        contains any ``_NO_CHANGE_ACK_MARKERS`` phrase. Otherwise ``True``
        if any ``_CHANGE_CLAIM_MARKERS`` phrase occurs as a whole word or
        phrase, else ``False``.
    """
    # Function-scope import: the module's import block is not visible from
    # this block, so the dependency is brought into scope locally.
    import re

    normalized = (output or "").lower()
    if not normalized.strip():
        return False

    # An explicit "nothing changed" acknowledgement wins over claim wording.
    if any(marker in normalized for marker in _NO_CHANGE_ACK_MARKERS):
        return False

    # Claim markers must match on word boundaries. A raw substring test would
    # treat "unmodified", "unimplemented" or "prefixed" as "modified",
    # "implemented" or "fixed" and report a change claim that was never made.
    return any(
        re.search(rf"(?<!\w){re.escape(marker)}(?!\w)", normalized)
        for marker in _CHANGE_CLAIM_MARKERS
    )
|
||||
|
||||
|
||||
class _Spinner:
|
||||
"""Animated spinner for long-running agent calls."""
|
||||
|
||||
@@ -302,6 +340,9 @@ def invoke_agent(
|
||||
command_preview=cmd_preview,
|
||||
stdout=result.stdout,
|
||||
stderr=result.stderr,
|
||||
exit_code=result.returncode,
|
||||
duration_seconds=round(duration, 1),
|
||||
cwd=str(cwd) if cwd else "",
|
||||
)
|
||||
|
||||
return AgentResult(
|
||||
@@ -424,6 +465,28 @@ def invoke_agent_agentic(
|
||||
diff_output = capture_diff(worktree_path)
|
||||
|
||||
if not diff_output:
|
||||
stdout_excerpt = (result.stdout or "").strip()
|
||||
stderr_excerpt = (result.stderr or "").strip()
|
||||
if _claims_file_changes(stdout_excerpt):
|
||||
if spinner:
|
||||
spinner.stop(f"[{step_name}] FAILED (empty diff)")
|
||||
raw_error = stdout_excerpt or "(stdout empty)"
|
||||
if stderr_excerpt:
|
||||
raw_error = f"{raw_error}\n\n[stderr]\n{stderr_excerpt}"
|
||||
if len(raw_error) > 2000:
|
||||
raw_error = raw_error[:2000] + "..."
|
||||
raise AgentInvocationError(
|
||||
agent_name=agent.name,
|
||||
step_name=step_name,
|
||||
cmd_preview=cmd_preview,
|
||||
raw_error=raw_error,
|
||||
failure_type="EMPTY_DIFF",
|
||||
suggested_action=(
|
||||
"Agent reported code changes but produced no git diff. "
|
||||
"Treat this run as failed and require a real worktree diff before continuing."
|
||||
),
|
||||
)
|
||||
|
||||
diff_output = "(no changes)"
|
||||
logger.warning(
|
||||
"Agent '%s' made no file changes at step '%s'",
|
||||
@@ -438,6 +501,9 @@ def invoke_agent_agentic(
|
||||
command_preview=cmd_preview,
|
||||
stdout=result.stdout,
|
||||
stderr=result.stderr,
|
||||
exit_code=result.returncode,
|
||||
duration_seconds=round(duration, 1),
|
||||
cwd=str(worktree_path),
|
||||
)
|
||||
|
||||
return AgentResult(
|
||||
@@ -456,6 +522,9 @@ def _build_transcript(
|
||||
command_preview: str,
|
||||
stdout: str,
|
||||
stderr: str,
|
||||
exit_code: int = 0,
|
||||
duration_seconds: float = 0.0,
|
||||
cwd: str = "",
|
||||
) -> str:
|
||||
"""Build a compact execution transcript for debugging/audit output."""
|
||||
sections = [
|
||||
@@ -466,6 +535,16 @@ def _build_transcript(
|
||||
command_preview or "(unknown command)",
|
||||
"```",
|
||||
"",
|
||||
]
|
||||
if cwd:
|
||||
sections.extend(["## Working Directory", f"`{cwd}`", ""])
|
||||
sections.extend([
|
||||
f"## Exit Code: {exit_code}",
|
||||
"",
|
||||
])
|
||||
if duration_seconds > 0:
|
||||
sections.extend([f"## Duration: {duration_seconds}s", ""])
|
||||
sections.extend([
|
||||
"## Stdout",
|
||||
"```",
|
||||
(stdout or "(empty)").strip(),
|
||||
@@ -476,5 +555,5 @@ def _build_transcript(
|
||||
(stderr or "(empty)").strip(),
|
||||
"```",
|
||||
"",
|
||||
]
|
||||
])
|
||||
return "\n".join(sections)
|
||||
|
||||
Reference in New Issue
Block a user