"""Context compaction (v0.3 PR #2). When `total_input_tokens + total_output_tokens` reaches ~70% of the active model's context window, we summarise the oldest non-system, non-archived messages with a cheap model and insert a single `MessageRow(is_summary=True)` in their place. Original messages are marked `archived=True` (they stay in the DB and can be inspected via `sessions show --all`). The session's LangGraph thread is also rolled forward (caller bumps `InteractiveSession._thread_suffix`) so the next `ainvoke` starts a clean state with only the summary + recent K messages. """ from __future__ import annotations import asyncio import logging from datetime import UTC, datetime from langchain_openai import ChatOpenAI from sqlalchemy import select, update from sqlalchemy.ext.asyncio import AsyncSession from .config import Config from .monitoring.token_budget import ( count_tokens, is_over_threshold, model_context_limit, ) from .persistence.db import Database from .persistence.models import InteractiveSessionRow, MessageRow from .secrets import resolve_openrouter_api_key _LOG = logging.getLogger(__name__) #: Number of recent non-archived messages kept verbatim during compaction. KEEP_RECENT_K = 10 #: Minimum number of compactable messages required for `/compact`. MIN_COMPACTABLE = 4 #: One concurrent compaction per session_id. _SESSION_LOCKS: dict[str, asyncio.Lock] = {} def _now_iso() -> str: return datetime.now(UTC).isoformat(timespec="seconds") def _session_lock(session_id: str) -> asyncio.Lock: lock = _SESSION_LOCKS.get(session_id) if lock is None: lock = asyncio.Lock() _SESSION_LOCKS[session_id] = lock return lock class CompactionResult: """Outcome of a compaction call. Read-only by convention.""" def __init__( self, *, compacted: bool, archived: int = 0, summary_tokens: int = 0, reason: str = "", ) -> None: self.compacted = compacted self.archived = archived self.summary_tokens = summary_tokens self.reason = reason def __repr__(self) -> str: return ( f"" ) def should_compact(session_row: InteractiveSessionRow) -> bool: """True when total used tokens >= 70% of the active model's window.""" used = (session_row.total_input_tokens or 0) + (session_row.total_output_tokens or 0) model = session_row.model or "" return is_over_threshold(used, model) async def _collect_messages_for_compaction( s: AsyncSession, session_id: str ) -> tuple[list[MessageRow], list[MessageRow]]: """Return (to_compact, to_keep) for the session. Strategy: - Skip all `is_summary` and `archived` rows. - Skip `role=system` rows (they are MYDEEPAGENT.md / memory / skills injections — owned by the next ainvoke, not by compaction). - Keep the last KEEP_RECENT_K non-system, non-archived, non-summary messages verbatim. - Everything older than that is to_compact. """ rows = ( ( await s.execute( select(MessageRow) .where(MessageRow.session_id == session_id) .where(MessageRow.archived.is_(False)) .where(MessageRow.is_summary.is_(False)) .where(MessageRow.role != "system") .order_by(MessageRow.seq) ) ) .scalars() .all() ) if len(rows) <= KEEP_RECENT_K: return ([], list(rows)) cutoff = len(rows) - KEEP_RECENT_K return (list(rows[:cutoff]), list(rows[cutoff:])) def _format_for_summary(messages: list[MessageRow]) -> str: """Render messages as a compact transcript the summariser can consume.""" lines: list[str] = [] for m in messages: role = m.role.upper() content = (m.content or "").strip() lines.append(f"[{m.seq}] {role}: {content}") return "\n\n".join(lines) async def _run_summary_llm(config: Config, model: str, transcript: str) -> str: """Single LLM call producing a compact paragraph-level summary.""" api_key = resolve_openrouter_api_key(config) llm = ChatOpenAI( model=model.removeprefix("openrouter:"), base_url=config.openrouter_base_url, api_key=api_key, timeout=60.0, max_completion_tokens=600, ) system_prompt = ( "You are compressing an interactive conversation history for a developer " "agent. Produce a single concise summary (Korean if the conversation is " "Korean, otherwise English) that captures: (1) the user's intent and " "any key decisions, (2) artefacts mentioned by name/path, (3) any " "open questions or pending follow-ups. Aim for ≤ 300 tokens. Do NOT " "invent details that are not in the transcript." ) response = await llm.ainvoke( [ {"role": "system", "content": system_prompt}, {"role": "user", "content": f"Transcript:\n\n{transcript}"}, ] ) content = response.content if isinstance(content, list): content = "\n".join( (c.get("text", str(c)) if isinstance(c, dict) else str(c)) for c in content ) return str(content).strip() async def compact_session( db: Database, config: Config, session_id: str, *, summary_model: str | None = None, ) -> CompactionResult: """Compact the oldest portion of a session's history. Idempotent under lock. The caller is responsible for bumping the LangGraph thread_id (e.g. `InteractiveSession.clear_agent_cache()` style) AFTER this returns so the next ainvoke starts fresh. """ lock = _session_lock(session_id) async with lock: async with db.session() as s: row = await s.get(InteractiveSessionRow, session_id) if row is None: return CompactionResult(compacted=False, reason="session_not_found") to_compact, _to_keep = await _collect_messages_for_compaction(s, session_id) if len(to_compact) < MIN_COMPACTABLE: return CompactionResult( compacted=False, reason=f"insufficient_messages ({len(to_compact)} < {MIN_COMPACTABLE})", ) transcript = _format_for_summary(to_compact) active_model = row.model or "" model_for_summary = summary_model or "openrouter:deepseek/deepseek-chat" # The LLM call lives OUTSIDE the DB session — it can take seconds and # we don't want to hold an asyncpg connection during the wait. try: summary_text = await _run_summary_llm(config, model_for_summary, transcript) except Exception as e: _LOG.exception("compaction summariser failed for session %s", session_id) return CompactionResult(compacted=False, reason=f"summariser_failed:{e}") if not summary_text: return CompactionResult(compacted=False, reason="empty_summary") summary_tokens = count_tokens(summary_text, active_model) async with db.session() as s: # Re-fetch under fresh session in case state changed during LLM call. row = await s.get(InteractiveSessionRow, session_id) if row is None: return CompactionResult(compacted=False, reason="session_disappeared") # Insert the summary as a new MessageRow before the kept tail. # - seq: 1 + max(seq of to_compact) → places summary just before # the first kept message in the ordering, which is what readers # want for "render history in seq order". # - role: "system" + is_summary=True so the next ainvoke includes # it and the GUI can render it distinctly. # # Then archive the originals in one UPDATE. archive_ids = [m.id for m in to_compact] archived_token_total = sum(int(m.token_count or 0) for m in to_compact) now = _now_iso() # Summary seq = smallest archived seq. We can't pick a fractional # position (INTEGER seq) and shifting every kept row by +1 is too # expensive. Instead we archive originals into the negative-seq # band (see below) which frees their positive seqs; then the # summary at to_compact[0].seq slots naturally in front of the # kept tail when readers `ORDER BY seq ASC WHERE archived=False`. summary_seq = to_compact[0].seq s.add( MessageRow( session_id=session_id, seq=summary_seq, role="system", content=summary_text, tool_calls=None, token_count=summary_tokens, is_summary=True, archived=False, ts=now, ) ) # Mark originals archived — bump their seq into a high band so the # UNIQUE (session_id, seq) constraint doesn't trip with the new # summary at summary_seq. We shift archived rows to negative seq # space (id-based) to keep them addressable but out of the way. # # Postgres lets us do this in one UPDATE with FROM clause; SQLite # doesn't, so do it per-row for portability. for original in to_compact: # Use a negative offset based on original.seq so they remain # ordered relative to each other. new_seq = -(original.seq + 1) await s.execute( update(MessageRow) .where(MessageRow.id == original.id) .values(archived=True, seq=new_seq) ) # Update aggregate token counters on the session row. Subtract # archived contribution and add the summary tokens (assigned to # system → input bucket). row.total_input_tokens = max( 0, int(row.total_input_tokens or 0) - archived_token_total + summary_tokens, ) row.last_message_at = now await s.commit() _LOG.info( "compacted session=%s archived=%d summary_tokens=%d", session_id, len(archive_ids), summary_tokens, ) return CompactionResult( compacted=True, archived=len(archive_ids), summary_tokens=summary_tokens, reason="ok", ) def context_usage_fraction(session_row: InteractiveSessionRow) -> float: """Return current used_tokens / context_limit (0.0 if no model set).""" used = (session_row.total_input_tokens or 0) + (session_row.total_output_tokens or 0) limit = model_context_limit(session_row.model or "") if limit <= 0: return 0.0 return used / limit