feat: tighten agentic runtime handoffs and quality gates

2026-03-14 10:05:25 +09:00
parent 87bc0ffbfb
commit 7b95233edf
15 changed files with 1148 additions and 167 deletions
--- a/cross_eval/pipeline.py
+++ b/cross_eval/pipeline.py
@@ -343,6 +343,8 @@ def _run_simple_pipeline(
            if step_results:
                input_contents["execution_evidence"] = _format_execution_evidence(
                    step_results,
+                    run_dir=run_dir,
+                    iteration=i,
                )

            iterations.append(iter_result)
@@ -543,6 +545,8 @@ def _run_phased_pipeline(
                if step_results:
                    input_contents["execution_evidence"] = _format_execution_evidence(
                        step_results,
+                        run_dir=run_dir,
+                        iteration=global_iter,
                    )

                iterations.append(iter_result)
@@ -661,10 +665,13 @@ def _load_inputs(config: PipelineConfig) -> dict[str, str]:
    """Load input file contents from config."""
    input_contents: dict[str, str] = {}
    for key, val in config.inputs.items():
-        if isinstance(val, str):
+        if key.endswith("_ref"):
+            input_contents[key] = str(val)
+        elif isinstance(val, str):
            input_contents[key] = val
        else:
            input_contents[key] = val.read_text(encoding="utf-8")
+    _refresh_input_references(config, input_contents)
    return input_contents


@@ -673,10 +680,99 @@ def _refresh_inputs(
 ) -> None:
    """Re-read input files (they may have changed on disk)."""
    for key, val in config.inputs.items():
-        if isinstance(val, str):
+        if key.endswith("_ref"):
+            input_contents[key] = str(val)
+        elif isinstance(val, str):
            input_contents[key] = val
        elif isinstance(val, Path) and val.exists():
            input_contents[key] = val.read_text(encoding="utf-8")
+    _refresh_input_references(config, input_contents)
+
+
+def _refresh_input_references(
+    config: PipelineConfig,
+    input_contents: dict[str, str],
+) -> None:
+    """Expose `<key>_ref` file references; explicit `*_ref` inputs always win."""
+    for key, val in config.inputs.items():
+        if key.endswith("_ref"):
+            input_contents[key] = str(val)
+        elif (ref_key := f"{key}_ref") in config.inputs:
+            continue  # an explicit ref is configured; never clobber it with a derived path
+        elif isinstance(val, Path):
+            input_contents[ref_key] = str(val.resolve())
+        else:
+            input_contents.setdefault(ref_key, f"(inline {key}; no file path available)")
+
+
+def _git_ref(cwd: Path, *args: str) -> str:
+    """Best-effort git metadata lookup; returns "(unknown)" on any failure."""
+    try:
+        result = subprocess.run(
+            ["git", *args], cwd=cwd, capture_output=True, text=True, check=False
+        )
+    except OSError:  # git is not installed, or cwd is missing / not a directory
+        return "(unknown)"
+    if result.returncode != 0:
+        return "(unknown)"
+    return result.stdout.strip() or "(unknown)"
+
+
+def _collect_markdown_refs(run_dir: Path, iteration: int) -> list[Path]:
+    """List `*.md` files found in `v1`..`v<iteration>` under run_dir, sorted per directory."""
+    candidates = (run_dir / f"v{number}" for number in range(1, iteration + 1))
+    return [
+        markdown
+        for version_dir in candidates
+        if version_dir.exists()
+        for markdown in sorted(version_dir.glob("*.md"))
+    ]
+
+
+def _build_artifact_references(
+    context: dict[str, str],
+    *,
+    cwd: Path,
+    run_dir: Path,
+    iteration: int,
+    worktree_path: Path | None,
+    step_results: dict[str, AgentResult] | None = None,
+) -> str:
+    """Build a markdown handoff that lists paths and git refs (not file contents) for agentic steps."""
+    repo_cwd = worktree_path or cwd  # prefer the worktree when one is given
+    branch = _git_ref(repo_cwd, "rev-parse", "--abbrev-ref", "HEAD")
+    commit_hash = _git_ref(repo_cwd, "rev-parse", "HEAD")
+    # Header section: plan/checklist/docs refs are read from context, with placeholder fallbacks.
+    lines = [
+        "### Canonical References",
+        f"- Plan: {context.get('plan_ref', '(missing)')}",
+        f"- Checklist: {context.get('checklist_ref', '(missing)')}",
+        f"- Docs: {context.get('docs_ref', '(none)')}",
+        f"- Run directory: {run_dir}",
+        f"- Current iteration directory: {run_dir / f'v{iteration}'}",
+        f"- Target repository: {repo_cwd}",
+        f"- Git branch: {branch}",
+        f"- Git commit: {commit_hash}",
+        "",
+        "Use git/cat to inspect the referenced files directly instead of relying on inline summaries.",
+        f"Suggested git commands: `git -C {repo_cwd} show {commit_hash}` and `git -C {repo_cwd} diff HEAD`",
+    ]
+    # Markdown files already on disk under v1..v<iteration> of the run directory.
+    markdown_refs = _collect_markdown_refs(run_dir, iteration)
+    if markdown_refs:
+        lines.extend(["", "### Markdown Artifacts"])
+        lines.extend(f"- {path}" for path in markdown_refs)
+    # Per-step output/transcript paths are constructed here; their existence is not checked.
+    if step_results:
+        lines.extend(["", "### Current Step Artifacts"])
+        for result in step_results.values():
+            lines.append(f"- Output: {run_dir / f'v{iteration}' / f'{result.step_name}.md'}")
+            if result.transcript:  # only list a transcript path when the step has a transcript
+                lines.append(
+                    f"- Transcript: {run_dir / f'v{iteration}' / f'{result.step_name}_transcript.md'}"
+                )
+
+    return "\n".join(lines)


 # ---------------------------------------------------------------------------
@@ -850,6 +946,9 @@ def _execute_step(
    # 2. Build context (include prior step results for evidence)
    context = _build_context(
        input_contents, step_outputs, feedback, iteration, max_iterations,
+        cwd=cwd,
+        run_dir=run_dir,
+        worktree_path=worktree_path,
        step_results=step_results,
    )

@@ -1031,6 +1130,9 @@ def _execute_parallel_batch(
        template = resolve_template(step.prompt_template)
        context = _build_context(
            context_snapshot, {}, feedback, iteration, max_iterations,
+            cwd=cwd,
+            run_dir=run_dir,
+            worktree_path=worktree_path,
            step_results=results_snapshot,
        )
        if step.context_override:
@@ -1145,6 +1247,10 @@ def _build_context(
    feedback: str,
    iteration: int,
    max_iterations: int,
+    *,
+    cwd: Path | None = None,
+    run_dir: Path | None = None,
+    worktree_path: Path | None = None,
    step_results: dict[str, AgentResult] | None = None,
 ) -> dict[str, str]:
    """Build the template context dict.
@@ -1160,11 +1266,25 @@ def _build_context(
    context["feedback"] = feedback
    context["iteration"] = str(iteration)
    context["max_iterations"] = str(max_iterations)
+    ref_cwd = cwd or Path.cwd()
+    ref_run_dir = run_dir or ref_cwd / ".cross-eval" / "output" / "ad-hoc"
+    context["artifact_references"] = _build_artifact_references(
+        context,
+        cwd=ref_cwd,
+        run_dir=ref_run_dir,
+        iteration=iteration,
+        worktree_path=worktree_path,
+        step_results=step_results,
+    )
    # Surface execution evidence from prior steps so reviewers can inspect it.
    # Prior-iteration evidence may already live in context via input_contents.
    prior_evidence = context.get("execution_evidence", "")
    if step_results:
-        current_evidence = _format_execution_evidence(step_results)
+        current_evidence = _format_execution_evidence(
+            step_results,
+            run_dir=ref_run_dir,
+            iteration=iteration,
+        )
        if prior_evidence and prior_evidence != "(no prior execution evidence)":
            context["execution_evidence"] = (
                "# Prior Iteration Evidence\n"
@@ -1179,12 +1299,14 @@ def _build_context(

 def _format_execution_evidence(
    step_results: dict[str, AgentResult],
+    *,
+    run_dir: Path | None = None,
+    iteration: int | None = None,
 ) -> str:
    """Format execution evidence from prior steps for reviewer consumption.

-    Produces a compact summary of command, exit code, duration, and a truncated
-    transcript excerpt for each completed step so that reviewers and seniors
-    can verify claims against real execution data.
+    Produces a compact summary of command, exit code, duration, and artifact
+    paths so that later agents can read markdown/git state directly.
    """
    if not step_results:
        return "(no prior execution evidence)"
@@ -1198,12 +1320,12 @@ def _format_execution_evidence(
            f"- Output size: {len(result.output)} chars",
        ]
        section = [line for line in section if line]
-        if result.transcript:
-            # Include a truncated transcript excerpt for debugging
-            excerpt = result.transcript[:2000]
-            if len(result.transcript) > 2000:
-                excerpt += "\n... (truncated)"
-            section.append(f"\n<details>\n<summary>Transcript excerpt</summary>\n\n{excerpt}\n</details>")
+        if run_dir is not None and iteration is not None:
+            section.append(f"- Output artifact: {run_dir / f'v{iteration}' / f'{result.step_name}.md'}")
+            if result.transcript:
+                section.append(
+                    f"- Transcript artifact: {run_dir / f'v{iteration}' / f'{result.step_name}_transcript.md'}"
+                )
        parts.append("\n".join(section))
    return "\n\n---\n\n".join(parts)

@@ -1455,7 +1577,7 @@ def _format_runtime_error_markdown(
                f"- **Suggested Action**: {exc.suggested_action}",
                "",
                "## Command",
-                f"```",
+                "```",
                exc.cmd_preview,
                "```",
                "",