# dev-puppeteer/my-deepagent/src/my_deepagent/compaction.py

"""Context compaction (v0.3 PR #2).

When `total_input_tokens + total_output_tokens` reaches ~70% of the active
model's context window, we summarise the oldest non-system, non-archived
messages with a cheap model and insert a single `MessageRow(is_summary=True)`
in their place.  Original messages are marked `archived=True` (they stay in
the DB and can be inspected via `sessions show <id> --all`).

The session's LangGraph thread is also rolled forward (caller bumps
`InteractiveSession._thread_suffix`) so the next `ainvoke` starts a clean
state with only the summary + recent K messages.
"""

from __future__ import annotations

import asyncio
import logging
from datetime import UTC, datetime

from langchain_openai import ChatOpenAI
from sqlalchemy import select, update
from sqlalchemy.ext.asyncio import AsyncSession

from .config import Config
from .monitoring.token_budget import (
    count_tokens,
    is_over_threshold,
    model_context_limit,
)
from .persistence.db import Database
from .persistence.models import InteractiveSessionRow, MessageRow
from .secrets import resolve_openrouter_api_key

# Module-level logger, named after this module's import path.
_LOG = logging.getLogger(__name__)

#: Number of most recent non-system, non-summary, non-archived messages that
#: `_collect_messages_for_compaction` leaves untouched (kept verbatim).
KEEP_RECENT_K = 10

#: Minimum number of messages older than the kept tail that must exist before
#: `compact_session` does any work; below this it returns a
#: non-compacted result with reason `insufficient_messages (...)`.
MIN_COMPACTABLE = 4

#: Per-session asyncio locks keyed by session_id, created lazily by
#: `_session_lock`.  `compact_session` holds the lock for the whole run so at
#: most one compaction per session_id is in flight.  Entries are never removed
#: within this module.
_SESSION_LOCKS: dict[str, asyncio.Lock] = {}


def _now_iso() -> str:
    return datetime.now(UTC).isoformat(timespec="seconds")


def _session_lock(session_id: str) -> asyncio.Lock:
    """Return the lock registered for ``session_id``, creating it on first use.

    The lock is stored in the module-level ``_SESSION_LOCKS`` mapping, so
    repeated calls with the same id hand back the same ``asyncio.Lock``.
    """
    try:
        return _SESSION_LOCKS[session_id]
    except KeyError:
        # First request for this session: register a fresh lock.
        created = asyncio.Lock()
        _SESSION_LOCKS[session_id] = created
        return created


class CompactionResult:
    """Plain value object describing how a compaction attempt ended.

    Attributes:
        compacted: Whether messages were actually summarised and archived.
        archived: Number of original messages that were archived.
        summary_tokens: Token count recorded for the generated summary.
        reason: Short machine-readable explanation of the outcome.

    Instances are treated as read-only by convention; nothing enforces it.
    """

    def __init__(
        self,
        *,
        compacted: bool,
        archived: int = 0,
        summary_tokens: int = 0,
        reason: str = "",
    ) -> None:
        # All arguments are keyword-only; only ``compacted`` is required.
        self.compacted = compacted
        self.archived = archived
        self.summary_tokens = summary_tokens
        self.reason = reason

    def __repr__(self) -> str:
        # ``reason`` is rendered with repr() so empty strings stay visible.
        fields = " ".join(
            (
                f"compacted={self.compacted}",
                f"archived={self.archived}",
                f"summary_tokens={self.summary_tokens}",
                f"reason={self.reason!r}",
            )
        )
        return f"<CompactionResult {fields}>"


def should_compact(session_row: InteractiveSessionRow) -> bool:
    """Report whether the session's token usage calls for compaction.

    Usage is the sum of the row's input and output token totals (missing
    values count as 0).  The decision itself is delegated to
    ``is_over_threshold`` together with the row's model name (empty string
    when unset); the module docstring describes the threshold as ~70% of the
    model's context window.
    """
    input_tokens = session_row.total_input_tokens or 0
    output_tokens = session_row.total_output_tokens or 0
    model_name = session_row.model or ""
    return is_over_threshold(input_tokens + output_tokens, model_name)


async def _collect_messages_for_compaction(
    s: AsyncSession, session_id: str
) -> tuple[list[MessageRow], list[MessageRow]]:
    """Split a session's live messages into ``(to_compact, to_keep)``.

    Only rows that are not archived, not summaries, and not ``role=system``
    are considered (the original author notes that system rows are
    MYDEEPAGENT.md / memory / skills injections owned by the next ainvoke,
    not by compaction).  Candidates are ordered by ``seq``; the newest
    ``KEEP_RECENT_K`` of them form ``to_keep`` and everything older forms
    ``to_compact``.  With ``KEEP_RECENT_K`` or fewer candidates,
    ``to_compact`` is empty.
    """
    stmt = (
        select(MessageRow)
        .where(
            MessageRow.session_id == session_id,
            MessageRow.archived.is_(False),
            MessageRow.is_summary.is_(False),
            MessageRow.role != "system",
        )
        .order_by(MessageRow.seq)
    )
    result = await s.execute(stmt)
    candidates = list(result.scalars().all())

    # Index of the first message that stays verbatim; 0 means "keep all".
    split_at = max(0, len(candidates) - KEEP_RECENT_K)
    return (candidates[:split_at], candidates[split_at:])


def _format_for_summary(messages: list[MessageRow]) -> str:
    """Build the transcript text handed to the summariser.

    Each message becomes ``[<seq>] <ROLE>: <content>`` with the role
    upper-cased and the content stripped of surrounding whitespace (a missing
    content is rendered as an empty string).  Entries are separated by a
    blank line; an empty input yields an empty string.
    """
    entries = [
        f"[{msg.seq}] {msg.role.upper()}: {(msg.content or '').strip()}"
        for msg in messages
    ]
    return "\n\n".join(entries)


async def _run_summary_llm(config: Config, model: str, transcript: str) -> str:
    """Ask the summary model for a condensed version of ``transcript``.

    Makes exactly one chat call through ``ChatOpenAI`` pointed at
    ``config.openrouter_base_url`` (60 s timeout, 600 completion tokens max).
    A leading ``openrouter:`` prefix on ``model`` is dropped before the call.

    Returns:
        The reply text with surrounding whitespace removed.  When the reply
        content is a list, its parts are joined with newlines first.
    """
    api_key = resolve_openrouter_api_key(config)
    bare_model = model.removeprefix("openrouter:")
    llm = ChatOpenAI(
        model=bare_model,
        base_url=config.openrouter_base_url,
        api_key=api_key,
        timeout=60.0,
        max_completion_tokens=600,
    )

    system_prompt = (
        "You are compressing an interactive conversation history for a developer "
        "agent. Produce a single concise summary (Korean if the conversation is "
        "Korean, otherwise English) that captures: (1) the user's intent and "
        "any key decisions, (2) artefacts mentioned by name/path, (3) any "
        "open questions or pending follow-ups. Aim for ≤ 300 tokens. Do NOT "
        "invent details that are not in the transcript."
    )
    prompt_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Transcript:\n\n{transcript}"},
    ]
    reply = await llm.ainvoke(prompt_messages)

    raw = reply.content
    if isinstance(raw, list):
        # Multi-part content: take each dict's "text" (falling back to its
        # string form) and stringify anything else.
        parts = []
        for chunk in raw:
            if isinstance(chunk, dict):
                parts.append(chunk.get("text", str(chunk)))
            else:
                parts.append(str(chunk))
        raw = "\n".join(parts)
    return str(raw).strip()


async def compact_session(
    db: Database,
    config: Config,
    session_id: str,
    *,
    summary_model: str | None = None,
) -> CompactionResult:
    """Compact the oldest portion of a session's history.

    Runs under the per-session lock, so at most one compaction per
    ``session_id`` is in flight.  The work happens in three phases:

    1. In a short DB session, pick the messages to compact and render them
       as a transcript.
    2. Outside any DB session, ask the summary model for a summary.
    3. In a second DB session, archive the originals, insert the summary
       row in their place, and adjust the session's input-token total.

    The caller is responsible for bumping the LangGraph thread_id (e.g.
    `InteractiveSession.clear_agent_cache()` style) AFTER this returns so the
    next ainvoke starts fresh.

    Args:
        db: Database handle providing ``session()`` async context managers.
        config: Application config, forwarded to the summariser.
        session_id: Id of the interactive session to compact.
        summary_model: Model used for the summary; defaults to
            ``"openrouter:deepseek/deepseek-chat"`` when not given.

    Returns:
        A ``CompactionResult``.  ``compacted`` is True only on success
        (``reason="ok"``).  Otherwise ``reason`` is one of
        ``session_not_found``, ``insufficient_messages (...)``,
        ``summariser_failed:<error>``, ``empty_summary`` or
        ``session_disappeared``.  Summariser exceptions are logged and
        reported through ``reason`` rather than raised.
    """
    lock = _session_lock(session_id)
    async with lock:
        # Phase 1: decide what to compact.
        async with db.session() as s:
            row = await s.get(InteractiveSessionRow, session_id)
            if row is None:
                return CompactionResult(compacted=False, reason="session_not_found")

            to_compact, _to_keep = await _collect_messages_for_compaction(s, session_id)
            if len(to_compact) < MIN_COMPACTABLE:
                return CompactionResult(
                    compacted=False,
                    reason=f"insufficient_messages ({len(to_compact)} < {MIN_COMPACTABLE})",
                )
            transcript = _format_for_summary(to_compact)
            active_model = row.model or ""
            model_for_summary = summary_model or "openrouter:deepseek/deepseek-chat"

            # Snapshot everything phase 3 needs as plain values while the rows
            # are still attached, so no ORM object is touched after its
            # session has closed.
            archive_plan = [(m.id, m.seq) for m in to_compact]
            archived_token_total = sum(int(m.token_count or 0) for m in to_compact)

        # Phase 2: the LLM call lives OUTSIDE the DB session — it can take
        # seconds and we don't want to hold a DB connection during the wait.
        try:
            summary_text = await _run_summary_llm(config, model_for_summary, transcript)
        except Exception as e:
            _LOG.exception("compaction summariser failed for session %s", session_id)
            return CompactionResult(compacted=False, reason=f"summariser_failed:{e}")

        if not summary_text:
            return CompactionResult(compacted=False, reason="empty_summary")

        summary_tokens = count_tokens(summary_text, active_model)

        # Phase 3: persist the result.
        async with db.session() as s:
            # Re-fetch under a fresh session in case the session row was
            # removed during the LLM call.
            row = await s.get(InteractiveSessionRow, session_id)
            if row is None:
                return CompactionResult(compacted=False, reason="session_disappeared")

            now = _now_iso()

            # The summary takes over the smallest seq among the compacted
            # messages.  seq is an INTEGER, so there is no fractional slot in
            # front of the kept tail, and shifting every kept row by +1 would
            # be too expensive.  Moving the originals into the negative-seq
            # band frees their positive seqs, and the summary then sorts in
            # front of the kept tail for readers that
            # `ORDER BY seq ASC WHERE archived=False`.
            summary_seq = archive_plan[0][1]

            # Archive the originals FIRST.  The summary reuses `summary_seq`,
            # which one of these rows still occupies.  If the summary were
            # added to the session before these UPDATEs, an autoflushing
            # session (SQLAlchemy's default) would emit its INSERT ahead of
            # the first UPDATE and collide on UNIQUE (session_id, seq).
            #
            # Each row moves to -(seq + 1): archived rows stay ordered
            # relative to each other and seq 0 still maps to a negative
            # value.  Done per row for portability across SQLite/Postgres.
            for original_id, original_seq in archive_plan:
                await s.execute(
                    update(MessageRow)
                    .where(MessageRow.id == original_id)
                    .values(archived=True, seq=-(original_seq + 1))
                )

            # Now that summary_seq is free, insert the summary.  It is stored
            # as role="system" with is_summary=True so readers can tell it
            # apart from ordinary system rows.
            s.add(
                MessageRow(
                    session_id=session_id,
                    seq=summary_seq,
                    role="system",
                    content=summary_text,
                    tool_calls=None,
                    token_count=summary_tokens,
                    is_summary=True,
                    archived=False,
                    ts=now,
                )
            )

            # Update the aggregate counter on the session row: subtract the
            # archived contribution and add the summary tokens (assigned to
            # system → input bucket), never going below zero.
            # NOTE(review): tokens of archived messages of every role are
            # subtracted from the input total only; confirm this matches how
            # total_output_tokens is accounted elsewhere.
            row.total_input_tokens = max(
                0,
                int(row.total_input_tokens or 0) - archived_token_total + summary_tokens,
            )
            row.last_message_at = now

            await s.commit()

        _LOG.info(
            "compacted session=%s archived=%d summary_tokens=%d",
            session_id,
            len(archive_plan),
            summary_tokens,
        )
        return CompactionResult(
            compacted=True,
            archived=len(archive_plan),
            summary_tokens=summary_tokens,
            reason="ok",
        )


def context_usage_fraction(session_row: InteractiveSessionRow) -> float:
    """Return the fraction of the model's context limit currently in use.

    The numerator is the row's input plus output token totals (missing values
    count as 0); the denominator comes from ``model_context_limit`` for the
    row's model (empty string when unset).  A non-positive limit yields 0.0
    instead of dividing.
    """
    input_tokens = session_row.total_input_tokens or 0
    output_tokens = session_row.total_output_tokens or 0
    limit = model_context_limit(session_row.model or "")
    return (input_tokens + output_tokens) / limit if limit > 0 else 0.0