feat: tighten agentic runtime handoffs and quality gates

This commit is contained in:
chungyeong
2026-03-14 10:05:25 +09:00
parent 87bc0ffbfb
commit 7b95233edf
15 changed files with 1148 additions and 167 deletions

View File

@@ -26,10 +26,9 @@ from cross_eval.models import (
IterationResult,
PipelineConfig,
PipelineResult,
ReviewMetrics,
StepConfig,
)
from cross_eval.pipeline import _format_execution_evidence, run_pipeline
from cross_eval.pipeline import _build_artifact_references, _format_execution_evidence, run_pipeline
from cross_eval.report import build_report
@@ -59,7 +58,7 @@ class TestFormatExecutionEvidence(unittest.TestCase):
self.assertIn("Exit code: 0", evidence)
self.assertIn("12.3s", evidence)
self.assertIn("claude --setting-sources user", evidence)
self.assertIn("Transcript excerpt", evidence)
self.assertNotIn("Transcript excerpt", evidence)
def test_multiple_results_separated(self) -> None:
r1 = AgentResult(
@@ -88,10 +87,60 @@ class TestFormatExecutionEvidence(unittest.TestCase):
transcript=long_transcript,
)
evidence = _format_execution_evidence({"key": result})
self.assertIn("truncated", evidence)
# The full 3000-char transcript should NOT appear
self.assertNotIn("x" * 3000, evidence)
def test_artifact_paths_included_when_run_dir_provided(self) -> None:
    # When a run directory and an iteration number are supplied, the
    # formatted evidence is expected to mention the per-iteration artifact
    # files for the step (output file and transcript file under "v2/").
    coding_result = AgentResult(
        output="diff",
        exit_code=0,
        agent_name="coder",
        step_name="coding",
        duration_seconds=1.2,
        transcript="stdout",
        command_preview="claude ...",
    )
    results_by_key = {"coding_output": coding_result}

    with tempfile.TemporaryDirectory() as scratch_dir:
        rendered = _format_execution_evidence(
            results_by_key,
            run_dir=Path(scratch_dir),
            iteration=2,
        )

    # Checked in the same order as before; the first missing path fails.
    for expected_path in ("v2/coding.md", "v2/coding_transcript.md"):
        self.assertIn(expected_path, rendered)
class TestArtifactReferences(unittest.TestCase):
    """Artifact references should point at file paths and git state, not inline text."""

    def test_contains_input_refs_and_git_context(self) -> None:
        # Imported locally, as in the original test, so the module does not
        # need a top-level subprocess import.
        import subprocess

        # Fixture files written into the throwaway repository before the
        # initial commit.
        fixture_files = (
            ("plan.md", "plan"),
            ("checklist.md", "checklist"),
        )
        # Minimal git bootstrap: init, set a commit identity, stage, commit.
        git_bootstrap = (
            ["git", "init"],
            ["git", "config", "user.email", "test@test.com"],
            ["git", "config", "user.name", "Test"],
            ["git", "add", "."],
            ["git", "commit", "-m", "init"],
        )

        with tempfile.TemporaryDirectory() as scratch_dir:
            repo = Path(scratch_dir) / "repo"
            repo.mkdir()
            for file_name, content in fixture_files:
                (repo / file_name).write_text(content, encoding="utf-8")

            for argv in git_bootstrap:
                # check=True makes any failing git command abort the test.
                subprocess.run(argv, cwd=repo, capture_output=True, check=True)

            input_refs = {
                "plan_ref": str((repo / "plan.md").resolve()),
                "checklist_ref": str((repo / "checklist.md").resolve()),
                "docs_ref": "(none)",
            }
            output_dir = repo / ".cross-eval" / "output" / "run"
            refs = _build_artifact_references(
                input_refs,
                cwd=repo,
                run_dir=output_dir,
                iteration=1,
                worktree_path=None,
            )

            for expected_fragment in (
                "Plan:",
                "Git commit:",
                "Suggested git commands",
            ):
                self.assertIn(expected_fragment, refs)
# ---------------------------------------------------------------------------
# 2. Evidence in reviewer prompts (integration)
@@ -162,7 +211,7 @@ class TestEvidenceInReviewerPrompt(unittest.TestCase):
]
self.assertTrue(len(review_prompts) >= 1)
review_prompt = review_prompts[0]["prompt"]
# Evidence section should reference the coding step's command
self.assertIn("Artifact References", review_prompt)
self.assertIn("Execution Evidence", review_prompt)
self.assertIn("claude-coder", review_prompt)