feat: tighten agentic runtime handoffs and quality gates

2026-03-14 10:05:25 +09:00
parent 87bc0ffbfb
commit 7b95233edf
15 changed files with 1148 additions and 167 deletions
--- a/tests/test_evidence.py
+++ b/tests/test_evidence.py
@@ -26,10 +26,9 @@ from cross_eval.models import (
    IterationResult,
    PipelineConfig,
    PipelineResult,
-    ReviewMetrics,
    StepConfig,
 )
-from cross_eval.pipeline import _format_execution_evidence, run_pipeline
+from cross_eval.pipeline import _build_artifact_references, _format_execution_evidence, run_pipeline
 from cross_eval.report import build_report


@@ -59,7 +58,7 @@ class TestFormatExecutionEvidence(unittest.TestCase):
        self.assertIn("Exit code: 0", evidence)
        self.assertIn("12.3s", evidence)
        self.assertIn("claude --setting-sources user", evidence)
-        self.assertIn("Transcript excerpt", evidence)
+        self.assertNotIn("Transcript excerpt", evidence)

    def test_multiple_results_separated(self) -> None:
        r1 = AgentResult(
@@ -88,10 +87,60 @@ class TestFormatExecutionEvidence(unittest.TestCase):
            transcript=long_transcript,
        )
        evidence = _format_execution_evidence({"key": result})
-        self.assertIn("truncated", evidence)
-        # The full 3000-char transcript should NOT appear
        self.assertNotIn("x" * 3000, evidence)

+    def test_artifact_paths_included_when_run_dir_provided(self) -> None:  # evidence must name artifact files when run_dir/iteration are passed
+        with tempfile.TemporaryDirectory() as tmpdir:  # throwaway directory used as the run dir
+            result = AgentResult(
+                output="diff",
+                exit_code=0,
+                agent_name="coder",
+                step_name="coding",  # presumably the source of the "coding" file stem asserted below - TODO confirm
+                duration_seconds=1.2,
+                transcript="stdout",
+                command_preview="claude ...",
+            )
+            evidence = _format_execution_evidence(
+                {"coding_output": result},
+                run_dir=Path(tmpdir),
+                iteration=2,  # presumably yields the "v2/" prefix asserted below - TODO confirm
+            )
+            self.assertIn("v2/coding.md", evidence)  # path of the step's output file is expected in the evidence text
+            self.assertIn("v2/coding_transcript.md", evidence)  # path of the step's transcript file is expected as well
+
+
+class TestArtifactReferences(unittest.TestCase):
+    """Artifact references should prefer file paths and git state over inline text."""
+
+    def test_contains_input_refs_and_git_context(self) -> None:  # checks plan ref, git commit and git command hints appear
+        with tempfile.TemporaryDirectory() as tmpdir:
+            repo = Path(tmpdir) / "repo"  # isolated repository root inside the temp dir
+            repo.mkdir()
+            (repo / "plan.md").write_text("plan", encoding="utf-8")
+            (repo / "checklist.md").write_text("checklist", encoding="utf-8")
+
+            import subprocess  # NOTE(review): function-scope import; consider moving to the module's import block
+            subprocess.run(["git", "init"], cwd=repo, capture_output=True, check=True)  # requires a git executable on PATH
+            subprocess.run(["git", "config", "user.email", "test@test.com"], cwd=repo, capture_output=True, check=True)  # repo-local identity
+            subprocess.run(["git", "config", "user.name", "Test"], cwd=repo, capture_output=True, check=True)  # repo-local identity
+            subprocess.run(["git", "add", "."], cwd=repo, capture_output=True, check=True)  # stage plan.md and checklist.md
+            subprocess.run(["git", "commit", "-m", "init"], cwd=repo, capture_output=True, check=True)  # gives the repo a HEAD commit
+
+            refs = _build_artifact_references(
+                {
+                    "plan_ref": str((repo / "plan.md").resolve()),  # absolute path string
+                    "checklist_ref": str((repo / "checklist.md").resolve()),  # absolute path string
+                    "docs_ref": "(none)",  # presumably a "no docs" placeholder understood by the helper - TODO confirm
+                },
+                cwd=repo,
+                run_dir=repo / ".cross-eval" / "output" / "run",  # this directory is not created by the test itself
+                iteration=1,
+                worktree_path=None,  # no separate worktree is supplied in this scenario
+            )
+            self.assertIn("Plan:", refs)  # label for the plan input reference
+            self.assertIn("Git commit:", refs)  # git state label; the commit made above is presumably what it reports
+            self.assertIn("Suggested git commands", refs)  # section heading expected in the output
+

 # ---------------------------------------------------------------------------
 # 2. Evidence in reviewer prompts (integration)
@@ -162,7 +211,7 @@ class TestEvidenceInReviewerPrompt(unittest.TestCase):
            ]
            self.assertTrue(len(review_prompts) >= 1)
            review_prompt = review_prompts[0]["prompt"]
-            # Evidence section should reference the coding step's command
+            self.assertIn("Artifact References", review_prompt)
            self.assertIn("Execution Evidence", review_prompt)
            self.assertIn("claude-coder", review_prompt)