feat: enhance vision autocomplete service and UI feedback

- Optimized the vision autocomplete service by starting the SSE stream immediately and deriving KB search queries directly from the window title and app name (no separate query-extraction LLM call).
- Refactored the service to run KB filesystem pre-computation and agent graph compilation in parallel, improving performance.
- Updated the SuggestionPage component to handle new agent step data, displaying a progress indicator for each step.
- Enhanced the CSS for the suggestion tooltip and agent activity indicators, improving the user interface and experience.
2026-06-20 21:18:13 +02:00 · 2026-04-07 02:49:24 -07:00 · 2026-04-07 02:49:24 -07:00 · bb1dcd32b6
commit bb1dcd32b6
parent 49441233e7
6 changed files with 686 additions and 228 deletions
--- a/surfsense_backend/app/agents/autocomplete/__init__.py
+++ b/surfsense_backend/app/agents/autocomplete/__init__.py
@ -0,0 +1,11 @@
+"""Agent-based vision autocomplete with scoped filesystem exploration."""
+
+from app.agents.autocomplete.autocomplete_agent import (
+    create_autocomplete_agent,
+    stream_autocomplete_agent,
+)
+
+__all__ = [
+    "create_autocomplete_agent",
+    "stream_autocomplete_agent",
+]
--- a/surfsense_backend/app/agents/autocomplete/autocomplete_agent.py
+++ b/surfsense_backend/app/agents/autocomplete/autocomplete_agent.py
@ -0,0 +1,429 @@
+"""Vision autocomplete agent with scoped filesystem exploration.
+
+Converts the stateless single-shot vision autocomplete into an agent that
+seeds a virtual filesystem from KB search results and lets the vision LLM
+explore documents via ``ls``, ``read_file``, ``glob``, ``grep``, etc.
+before generating the final completion.
+
+Performance: KB search and agent graph compilation run in parallel so
+the only sequential latency is KB-search (or agent compile, whichever is
+slower) + the agent's LLM turns.  There is no separate "query extraction"
+LLM call — the window title is used directly as the KB search query.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import uuid
+from typing import Any, AsyncGenerator
+
+from deepagents.graph import BASE_AGENT_PROMPT
+from deepagents.middleware.patch_tool_calls import PatchToolCallsMiddleware
+from langchain.agents import create_agent
+from langchain_anthropic.middleware import AnthropicPromptCachingMiddleware
+from langchain_core.language_models import BaseChatModel
+from langchain_core.messages import AIMessage, ToolMessage
+
+from app.agents.new_chat.middleware.filesystem import SurfSenseFilesystemMiddleware
+from app.agents.new_chat.middleware.knowledge_search import (
+    build_scoped_filesystem,
+    search_knowledge_base,
+)
+from app.services.new_streaming_service import VercelStreamingService
+
+logger = logging.getLogger(__name__)
+
+KB_TOP_K = 10
+
+# ---------------------------------------------------------------------------
+# System prompt
+# ---------------------------------------------------------------------------
+
+AUTOCOMPLETE_SYSTEM_PROMPT = """You are a smart writing assistant that analyzes the user's screen to draft or complete text.
+
+You will receive a screenshot of the user's screen. Your PRIMARY source of truth is the screenshot itself — the visual context determines what to write.
+
+Your job:
+1. Analyze the ENTIRE screenshot to understand what the user is working on (email thread, chat conversation, document, code editor, form, etc.).
+2. Identify the text area where the user will type.
+3. Generate the text the user most likely wants to write based on the visual context.
+
+You also have access to the user's knowledge base documents via filesystem tools. However:
+- ONLY consult the knowledge base if the screenshot clearly involves a topic where your KB documents are DIRECTLY relevant (e.g., the user is writing about a specific project/topic that matches a document title).
+- Do NOT explore documents just because they exist. Most autocomplete requests can be answered purely from the screenshot.
+- If you do read a document, only incorporate information that is 100% relevant to what the user is typing RIGHT NOW. Do not add extra details, background, or tangential information from the KB.
+- Keep your output SHORT — autocomplete should feel like a natural continuation, not an essay.
+
+Key behavior:
+- If the text area is EMPTY, draft a concise response or message based on what you see on screen (e.g., reply to an email, respond to a chat message, continue a document).
+- If the text area already has text, continue it naturally — typically just a sentence or two.
+
+Rules:
+- Output ONLY the text to be inserted. No quotes, no explanations, no meta-commentary.
+- Be CONCISE. Prefer a single paragraph or a few sentences. Autocomplete is a quick assist, not a full draft.
+- Match the tone and formality of the surrounding context.
+- If the screen shows code, write code. If it shows a casual chat, be casual. If it shows a formal email, be formal.
+- Do NOT describe the screenshot or explain your reasoning.
+- Do NOT cite or reference documents explicitly — just let the knowledge inform your writing naturally.
+- If you cannot determine what to write, output nothing.
+
+## Filesystem Tools `ls`, `read_file`, `write_file`, `edit_file`, `glob`, `grep`
+
+All file paths must start with a `/`.
+- ls: list files and directories at a given path.
+- read_file: read a file from the filesystem.
+- write_file: create a temporary file in the session (not persisted).
+- edit_file: edit a file in the session (not persisted for /documents/ files).
+- glob: find files matching a pattern (e.g., "**/*.xml").
+- grep: search for text within files.
+
+## When to Use Filesystem Tools
+
+BEFORE reaching for any tool, ask yourself: "Can I write a good completion purely from the screenshot?" If yes, just write it — do NOT explore the KB.
+
+Only use tools when:
+- The user is clearly writing about a specific topic that likely has detailed information in their KB.
+- You need a specific fact, name, number, or reference that the screenshot doesn't provide.
+
+When you do use tools, be surgical:
+- Check the `ls` output first. If no document title looks relevant, stop — do not read files just to see what's there.
+- If a title looks relevant, read only the `<chunk_index>` (first ~20 lines) and jump to matched chunks. Do not read entire documents.
+- Extract only the specific information you need and move on to generating the completion.
+
+## Reading Documents Efficiently
+
+Documents are formatted as XML. Each document contains:
+- `<document_metadata>` — title, type, URL, etc.
+- `<chunk_index>` — a table of every chunk with its **line range** and a
+  `matched="true"` flag for chunks that matched the search query.
+- `<document_content>` — the actual chunks in original document order.
+
+**Workflow**: read the first ~20 lines to see the `<chunk_index>`, identify
+chunks marked `matched="true"`, then use `read_file(path, offset=<start_line>,
+limit=<lines>)` to jump directly to those sections."""
+
+APP_CONTEXT_BLOCK = """
+
+The user is currently working in "{app_name}" (window: "{window_title}"). Use this to understand the type of application and adapt your tone and format accordingly."""
+
+
+def _build_autocomplete_system_prompt(app_name: str, window_title: str) -> str:
+    """Return the autocomplete system prompt, with app context when known.
+
+    The base prompt is returned unchanged when ``app_name`` is empty; in that
+    case ``window_title`` is not used either. Otherwise the app-context block,
+    formatted with both values, is appended to the base prompt.
+    """
+    if not app_name:
+        return AUTOCOMPLETE_SYSTEM_PROMPT
+    app_context = APP_CONTEXT_BLOCK.format(app_name=app_name, window_title=window_title)
+    return AUTOCOMPLETE_SYSTEM_PROMPT + app_context
+
+
+# ---------------------------------------------------------------------------
+# Pre-compute KB filesystem (runs in parallel with agent compilation)
+# ---------------------------------------------------------------------------
+
+
+class _KBResult:
+    """Holds the outcome of the KB pre-computation step.
+
+    All three fields default to ``None``, which is how an empty result
+    (no query, no hits, or a failed search) is represented.
+    """
+
+    # files:       path -> file payload mapping produced for the agent state
+    # ls_ai_msg:   synthetic assistant message carrying the ``ls`` tool call
+    # ls_tool_msg: synthetic tool reply paired with ``ls_ai_msg``
+    __slots__ = ("files", "ls_ai_msg", "ls_tool_msg")
+
+    def __init__(
+        self,
+        files: dict[str, Any] | None = None,
+        ls_ai_msg: AIMessage | None = None,
+        ls_tool_msg: ToolMessage | None = None,
+    ) -> None:
+        self.files, self.ls_ai_msg, self.ls_tool_msg = files, ls_ai_msg, ls_tool_msg
+
+    @property
+    def has_documents(self) -> bool:
+        """True when ``files`` is a non-empty mapping."""
+        if not self.files:
+            return False
+        return True
+
+
+async def precompute_kb_filesystem(
+    search_space_id: int,
+    query: str,
+    top_k: int = KB_TOP_K,
+) -> _KBResult:
+    """Search the KB and build the scoped filesystem outside the agent.
+
+    This is designed to be called via ``asyncio.gather`` alongside agent
+    graph compilation so the two run concurrently.
+
+    Args:
+        search_space_id: Search space passed to both the KB search and the
+            filesystem builder.
+        query: Free-text search query. Blank or whitespace-only queries
+            short-circuit to an empty result without touching the KB.
+        top_k: Maximum number of search results to request.
+
+    Returns:
+        A ``_KBResult`` carrying the file mapping plus a synthetic
+        ``ls /documents`` tool-call/tool-reply message pair, or an empty
+        ``_KBResult`` when there is nothing for the agent to read (no query,
+        no hits, no readable ``/documents/`` entries, or any failure).
+
+    This function never raises for search/build failures: they are logged
+    and treated as "no KB" so autocomplete can proceed from the screenshot.
+    """
+    # A whitespace-only query would otherwise trigger a pointless KB search.
+    query = (query or "").strip()
+    if not query:
+        return _KBResult()
+
+    try:
+        search_results = await search_knowledge_base(
+            query=query,
+            search_space_id=search_space_id,
+            top_k=top_k,
+        )
+
+        if not search_results:
+            return _KBResult()
+
+        new_files, _ = await build_scoped_filesystem(
+            documents=search_results,
+            search_space_id=search_space_id,
+        )
+
+        doc_paths = [
+            p for p, v in (new_files or {}).items()
+            if p.startswith("/documents/") and v is not None
+        ]
+        # Without at least one readable document there is nothing to list.
+        # Returning an empty result keeps ``has_documents`` False instead of
+        # pairing a "No documents found." listing with a non-empty file map.
+        if not doc_paths:
+            return _KBResult()
+
+        # Fabricate the ls call/reply pair so the agent starts with the
+        # listing already in its message history.
+        tool_call_id = f"auto_ls_{uuid.uuid4().hex[:12]}"
+        ai_msg = AIMessage(
+            content="",
+            tool_calls=[{"name": "ls", "args": {"path": "/documents"}, "id": tool_call_id}],
+        )
+        tool_msg = ToolMessage(
+            content=str(doc_paths),
+            tool_call_id=tool_call_id,
+        )
+        return _KBResult(files=new_files, ls_ai_msg=ai_msg, ls_tool_msg=tool_msg)
+
+    except Exception:
+        # Deliberate best-effort: KB context is optional for autocomplete.
+        logger.warning("KB pre-computation failed, proceeding without KB", exc_info=True)
+        return _KBResult()
+
+
+# ---------------------------------------------------------------------------
+# Filesystem middleware — no save_document, no persistence
+# ---------------------------------------------------------------------------
+
+
+class AutocompleteFilesystemMiddleware(SurfSenseFilesystemMiddleware):
+    """Filesystem middleware variant used by the autocomplete agent.
+
+    Initialises the base middleware with ``search_space_id=None`` and
+    ``created_by_id=None``, then drops the ``save_document`` tool so the
+    agent has no way to persist anything to the knowledge base.
+    ``write_file`` / ``edit_file`` are kept; per the original author's note
+    they stay ephemeral when no search space is supplied.
+    """
+
+    def __init__(self) -> None:
+        super().__init__(search_space_id=None, created_by_id=None)
+        kept_tools = []
+        for tool in self.tools:
+            if tool.name == "save_document":
+                continue
+            kept_tools.append(tool)
+        self.tools = kept_tools
+
+
+# ---------------------------------------------------------------------------
+# Agent factory
+# ---------------------------------------------------------------------------
+
+
+async def _compile_agent(
+    llm: BaseChatModel,
+    app_name: str,
+    window_title: str,
+) -> Any:
+    """Build the autocomplete agent graph for ``llm`` in a worker thread.
+
+    The system prompt is the autocomplete prompt (with optional app context)
+    followed by ``BASE_AGENT_PROMPT``. ``create_agent`` is dispatched through
+    ``asyncio.to_thread`` so the event loop stays free while it runs. The
+    returned agent is configured with a recursion limit of 200.
+    """
+    prompt_sections = (
+        _build_autocomplete_system_prompt(app_name, window_title),
+        BASE_AGENT_PROMPT,
+    )
+    compiled = await asyncio.to_thread(
+        create_agent,
+        llm,
+        system_prompt="\n\n".join(prompt_sections),
+        tools=[],
+        middleware=[
+            AutocompleteFilesystemMiddleware(),
+            PatchToolCallsMiddleware(),
+            AnthropicPromptCachingMiddleware(unsupported_model_behavior="ignore"),
+        ],
+    )
+    return compiled.with_config({"recursion_limit": 200})
+
+
+async def create_autocomplete_agent(
+    llm: BaseChatModel,
+    *,
+    search_space_id: int,
+    kb_query: str,
+    app_name: str = "",
+    window_title: str = "",
+) -> tuple[Any, _KBResult]:
+    """Compile the agent and pre-compute the KB filesystem concurrently.
+
+    Both coroutines are awaited together with ``asyncio.gather``. The result
+    is ``(agent, kb_result)``; the caller is expected to seed the agent's
+    initial state from ``kb_result``.
+    """
+    compile_coro = _compile_agent(llm, app_name, window_title)
+    kb_coro = precompute_kb_filesystem(search_space_id, kb_query)
+    outcomes = await asyncio.gather(compile_coro, kb_coro)
+    return outcomes[0], outcomes[1]
+
+
+# ---------------------------------------------------------------------------
+# Streaming helper
+# ---------------------------------------------------------------------------
+
+
+async def stream_autocomplete_agent(
+    agent: Any,
+    input_data: dict[str, Any],
+    streaming_service: VercelStreamingService,
+    *,
+    emit_message_start: bool = True,
+) -> AsyncGenerator[str, None]:
+    """Stream agent events as Vercel SSE, with thinking steps for tool calls.
+
+    When ``emit_message_start`` is False the caller has already sent the
+    ``message_start`` event (e.g. to show preparation steps before the agent
+    runs).
+
+    Event handling:
+    - ``on_chat_model_stream``: string chunk content is forwarded as text
+      deltas, except while a tool is running or when the event carries the
+      ``surfsense:internal`` tag.
+    - ``on_tool_start`` / ``on_tool_end``: open and close one thinking step
+      per tool run, keyed by the event's ``run_id``.
+
+    On success the stream ends with ``finish`` then ``done``. On any
+    exception the error is logged, an open text block is closed, and a
+    generic error followed by ``done`` is emitted; the exception is not
+    re-raised.
+    """
+    # Fresh thread id per invocation, passed to the agent via its config.
+    thread_id = uuid.uuid4().hex
+    config = {"configurable": {"thread_id": thread_id}}
+
+    # id of the text block currently open on the stream, if any
+    current_text_id: str | None = None
+    # number of tool runs started but not yet ended
+    active_tool_depth = 0
+    thinking_step_counter = 0
+    # run_id -> thinking step id for tool runs in flight
+    tool_step_ids: dict[str, str] = {}
+    # step id -> title, so the "complete" event can repeat the title
+    step_titles: dict[str, str] = {}
+    completed_step_ids: set[str] = set()
+    # most recently opened step that has not been completed yet
+    last_active_step_id: str | None = None
+
+    def next_thinking_step_id() -> str:
+        nonlocal thinking_step_counter
+        thinking_step_counter += 1
+        return f"autocomplete-step-{thinking_step_counter}"
+
+    def complete_current_step() -> str | None:
+        # Returns the "complete" event for the active step, or None when
+        # there is no active step or it was already completed.
+        nonlocal last_active_step_id
+        if last_active_step_id and last_active_step_id not in completed_step_ids:
+            completed_step_ids.add(last_active_step_id)
+            title = step_titles.get(last_active_step_id, "Done")
+            event = streaming_service.format_thinking_step(
+                step_id=last_active_step_id,
+                title=title,
+                status="complete",
+            )
+            last_active_step_id = None
+            return event
+        return None
+
+    if emit_message_start:
+        yield streaming_service.format_message_start()
+
+    # Emit an initial "Generating completion" step so the UI immediately
+    # shows activity once the agent starts its first LLM call.
+    gen_step_id = next_thinking_step_id()
+    last_active_step_id = gen_step_id
+    step_titles[gen_step_id] = "Generating completion"
+    yield streaming_service.format_thinking_step(
+        step_id=gen_step_id,
+        title="Generating completion",
+        status="in_progress",
+    )
+
+    try:
+        async for event in agent.astream_events(input_data, config=config, version="v2"):
+            event_type = event.get("event", "")
+
+            if event_type == "on_chat_model_stream":
+                # Drop model output produced while a tool run is in flight.
+                if active_tool_depth > 0:
+                    continue
+                if "surfsense:internal" in event.get("tags", []):
+                    continue
+                chunk = event.get("data", {}).get("chunk")
+                if chunk and hasattr(chunk, "content"):
+                    content = chunk.content
+                    # Only plain-string content is streamed; any other
+                    # content shape is skipped.
+                    if content and isinstance(content, str):
+                        if current_text_id is None:
+                            # First text of a new block: close the active
+                            # step before opening the text block.
+                            step_event = complete_current_step()
+                            if step_event:
+                                yield step_event
+                            current_text_id = streaming_service.generate_text_id()
+                            yield streaming_service.format_text_start(current_text_id)
+                        yield streaming_service.format_text_delta(current_text_id, content)
+
+            elif event_type == "on_tool_start":
+                active_tool_depth += 1
+                tool_name = event.get("name", "unknown_tool")
+                run_id = event.get("run_id", "")
+                tool_input = event.get("data", {}).get("input", {})
+
+                # Close any open text block before the tool step starts.
+                if current_text_id is not None:
+                    yield streaming_service.format_text_end(current_text_id)
+                    current_text_id = None
+
+                step_event = complete_current_step()
+                if step_event:
+                    yield step_event
+
+                tool_step_id = next_thinking_step_id()
+                tool_step_ids[run_id] = tool_step_id
+                last_active_step_id = tool_step_id
+
+                title, items = _describe_tool_call(tool_name, tool_input)
+                step_titles[tool_step_id] = title
+                yield streaming_service.format_thinking_step(
+                    step_id=tool_step_id,
+                    title=title,
+                    status="in_progress",
+                    items=items,
+                )
+
+            elif event_type == "on_tool_end":
+                # Clamp at zero in case an end arrives without a matching start.
+                active_tool_depth = max(0, active_tool_depth - 1)
+                run_id = event.get("run_id", "")
+                step_id = tool_step_ids.pop(run_id, None)
+                if step_id and step_id not in completed_step_ids:
+                    completed_step_ids.add(step_id)
+                    title = step_titles.get(step_id, "Done")
+                    yield streaming_service.format_thinking_step(
+                        step_id=step_id,
+                        title=title,
+                        status="complete",
+                    )
+                    if last_active_step_id == step_id:
+                        last_active_step_id = None
+
+        # Normal end of stream: close whatever is still open, then finish.
+        if current_text_id is not None:
+            yield streaming_service.format_text_end(current_text_id)
+        step_event = complete_current_step()
+        if step_event:
+            yield step_event
+
+        yield streaming_service.format_finish()
+        yield streaming_service.format_done()
+
+    except Exception as e:
+        # NOTE(review): every exception is handled here and never re-raised,
+        # so a caller wrapping this generator in try/except will not observe
+        # model errors (e.g. to map them to a more specific message) — confirm
+        # that is intended. The error path also leaves the active thinking
+        # step uncompleted and emits no ``finish`` event.
+        logger.error(f"Autocomplete agent streaming error: {e}", exc_info=True)
+        if current_text_id is not None:
+            yield streaming_service.format_text_end(current_text_id)
+        yield streaming_service.format_error("Autocomplete failed. Please try again.")
+        yield streaming_service.format_done()
+
+
+def _describe_tool_call(tool_name: str, tool_input: Any) -> tuple[str, list[str]]:
+    """Return a human-readable (title, items) for a tool call thinking step.
+
+    Args:
+        tool_name: Name of the tool being invoked.
+        tool_input: Tool arguments; anything that is not a dict is treated
+            as "no arguments".
+
+    Returns:
+        ``(title, items)`` where ``items`` is a list of display strings.
+        Unknown tools yield ``("Using <tool_name>", [])``.
+
+    Argument values that are missing or ``None`` fall back to the per-key
+    default, and non-string values are coerced with ``str`` so a malformed
+    tool call can never raise from this display-only helper.
+    """
+    inp = tool_input if isinstance(tool_input, dict) else {}
+
+    def text_arg(key: str, default: str = "") -> str:
+        # An explicit None is treated the same as a missing key.
+        value = inp.get(key)
+        return default if value is None else str(value)
+
+    # The three single-file tools share one display rule.
+    file_tool_titles = {
+        "read_file": "Reading file",
+        "write_file": "Writing file",
+        "edit_file": "Editing file",
+    }
+
+    if tool_name == "ls":
+        return "Listing files", [text_arg("path", "/")]
+    if tool_name in file_tool_titles:
+        fp = text_arg("file_path")
+        # Keep the tail of long paths: the file name matters more than the root.
+        display = fp if len(fp) <= 80 else "…" + fp[-77:]
+        return file_tool_titles[tool_name], [display]
+    if tool_name == "glob":
+        pat = text_arg("pattern")
+        base = text_arg("path", "/")
+        return "Searching files", [f"{pat} in {base}"]
+    if tool_name == "grep":
+        pat = text_arg("pattern")
+        path = text_arg("path")
+        display_pat = pat[:60] + ("…" if len(pat) > 60 else "")
+        return "Searching content", [f'"{display_pat}"' + (f" in {path}" if path else "")]
+    return f"Using {tool_name}", []
--- a/surfsense_backend/app/services/vision_autocomplete_service.py
+++ b/surfsense_backend/app/services/vision_autocomplete_service.py
@ -1,139 +1,40 @@
+"""Vision autocomplete service — agent-based with scoped filesystem.
+
+Optimized pipeline:
+1. Start the SSE stream immediately so the UI shows progress.
+2. Derive a KB search query from window_title (no separate LLM call).
+3. Run KB filesystem pre-computation and agent graph compilation in PARALLEL.
+4. Inject pre-computed KB files as initial state and stream the agent.
+"""
+
 import logging
 from typing import AsyncGenerator

-from langchain_core.messages import HumanMessage, SystemMessage
+from langchain_core.messages import HumanMessage
 from sqlalchemy.ext.asyncio import AsyncSession

-from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever
+from app.agents.autocomplete import create_autocomplete_agent, stream_autocomplete_agent
 from app.services.llm_service import get_vision_llm
 from app.services.new_streaming_service import VercelStreamingService

 logger = logging.getLogger(__name__)

-KB_TOP_K = 5
-KB_MAX_CHARS = 4000
-
-EXTRACT_QUERY_PROMPT = """Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""
-
-EXTRACT_QUERY_PROMPT_WITH_APP = """The user is currently in the application "{app_name}" with the window titled "{window_title}".
-
-Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""
-
-VISION_SYSTEM_PROMPT = """You are a smart writing assistant that analyzes the user's screen to draft or complete text.
-
-You will receive a screenshot of the user's screen. Your job:
-1. Analyze the ENTIRE screenshot to understand what the user is working on (email thread, chat conversation, document, code editor, form, etc.).
-2. Identify the text area where the user will type.
-3. Based on the full visual context, generate the text the user most likely wants to write.
-
-Key behavior:
- If the text area is EMPTY, draft a full response or message based on what you see on screen (e.g., reply to an email, respond to a chat message, continue a document).
- If the text area already has text, continue it naturally.
-
-Rules:
- Output ONLY the text to be inserted. No quotes, no explanations, no meta-commentary.
- Be concise but complete — a full thought, not a fragment.
- Match the tone and formality of the surrounding context.
- If the screen shows code, write code. If it shows a casual chat, be casual. If it shows a formal email, be formal.
- Do NOT describe the screenshot or explain your reasoning.
- If you cannot determine what to write, output nothing."""
-
-APP_CONTEXT_BLOCK = """
-
-The user is currently working in "{app_name}" (window: "{window_title}"). Use this to understand the type of application and adapt your tone and format accordingly."""
-
-KB_CONTEXT_BLOCK = """
-
-You also have access to the user's knowledge base documents below. Use them to write more accurate, informed, and contextually relevant text. Do NOT cite or reference the documents explicitly — just let the knowledge inform your writing naturally.
-
-<knowledge_base>
-{kb_context}
-</knowledge_base>"""
+PREP_STEP_ID = "autocomplete-prep"


-def _build_system_prompt(app_name: str, window_title: str, kb_context: str) -> str:
-    """Assemble the system prompt from optional context blocks."""
-    prompt = VISION_SYSTEM_PROMPT
-    if app_name:
-        prompt += APP_CONTEXT_BLOCK.format(app_name=app_name, window_title=window_title)
-    if kb_context:
-        prompt += KB_CONTEXT_BLOCK.format(kb_context=kb_context)
-    return prompt
+def _derive_kb_query(app_name: str, window_title: str) -> str:
+    """Build the KB search query from the window title and app name.
+
+    The window title comes first, followed by the app name. Each part is
+    stripped, and parts that are empty, whitespace-only or ``None`` are
+    dropped, so the result is ``""`` exactly when there is nothing
+    meaningful to search for.
+    """
+    candidates = ((part or "").strip() for part in (window_title, app_name))
+    return " ".join(part for part in candidates if part)


 def _is_vision_unsupported_error(e: Exception) -> bool:
-    """Check if an exception indicates the model doesn't support vision/images."""
    msg = str(e).lower()
    return "content must be a string" in msg or "does not support image" in msg


-async def _extract_query_from_screenshot(
-    llm, screenshot_data_url: str,
-    app_name: str = "", window_title: str = "",
-) -> str | None:
-    """Ask the Vision LLM to describe what the user is working on.
-
-    Raises vision-unsupported errors so the caller can return a
-    friendly message immediately instead of retrying with astream.
-    """
-    if app_name:
-        prompt_text = EXTRACT_QUERY_PROMPT_WITH_APP.format(
-            app_name=app_name, window_title=window_title,
-        )
-    else:
-        prompt_text = EXTRACT_QUERY_PROMPT
-
-    try:
-        response = await llm.ainvoke([
-            HumanMessage(content=[
-                {"type": "text", "text": prompt_text},
-                {"type": "image_url", "image_url": {"url": screenshot_data_url}},
-            ]),
-        ])
-        query = response.content.strip() if hasattr(response, "content") else ""
-        return query if query else None
-    except Exception as e:
-        if _is_vision_unsupported_error(e):
-            raise
-        logger.warning(f"Failed to extract query from screenshot: {e}")
-        return None
-
-
-async def _search_knowledge_base(
-    session: AsyncSession, search_space_id: int, query: str
-) -> str:
-    """Search the KB and return formatted context string."""
-    try:
-        retriever = ChucksHybridSearchRetriever(session)
-        results = await retriever.hybrid_search(
-            query_text=query,
-            top_k=KB_TOP_K,
-            search_space_id=search_space_id,
-        )
-
-        if not results:
-            return ""
-
-        parts: list[str] = []
-        char_count = 0
-        for doc in results:
-            title = doc.get("document", {}).get("title", "Untitled")
-            for chunk in doc.get("chunks", []):
-                content = chunk.get("content", "").strip()
-                if not content:
-                    continue
-                entry = f"[{title}]\n{content}"
-                if char_count + len(entry) > KB_MAX_CHARS:
-                    break
-                parts.append(entry)
-                char_count += len(entry)
-            if char_count >= KB_MAX_CHARS:
-                break
-
-        return "\n\n---\n\n".join(parts)
-    except Exception as e:
-        logger.warning(f"KB search failed, proceeding without context: {e}")
-        return ""
+# ---------------------------------------------------------------------------
+# Main entry point
+# ---------------------------------------------------------------------------


 async def stream_vision_autocomplete(
@ -144,13 +45,7 @@ async def stream_vision_autocomplete(
    app_name: str = "",
    window_title: str = "",
 ) -> AsyncGenerator[str, None]:
-    """Analyze a screenshot with the vision LLM and stream a text completion.
-
-    Pipeline:
-    1. Extract a search query from the screenshot (non-streaming)
-    2. Search the knowledge base for relevant context
-    3. Stream the final completion with screenshot + KB + app context
-    """
+    """Analyze a screenshot with a vision-LLM agent and stream a text completion."""
    streaming = VercelStreamingService()
    vision_error_msg = (
        "The selected model does not support vision. "
@ -164,62 +59,89 @@ async def stream_vision_autocomplete(
        yield streaming.format_done()
        return

-    kb_context = ""
+    # Start SSE stream immediately so the UI has something to show
+    yield streaming.format_message_start()
+
+    kb_query = _derive_kb_query(app_name, window_title)
+
+    # Show a preparation step while KB search + agent compile run
+    yield streaming.format_thinking_step(
+        step_id=PREP_STEP_ID,
+        title="Searching knowledge base",
+        status="in_progress",
+        items=[kb_query] if kb_query else [],
+    )
+
    try:
-        query = await _extract_query_from_screenshot(
-            llm, screenshot_data_url, app_name=app_name, window_title=window_title,
+        agent, kb = await create_autocomplete_agent(
+            llm,
+            search_space_id=search_space_id,
+            kb_query=kb_query,
+            app_name=app_name,
+            window_title=window_title,
        )
    except Exception as e:
-        logger.warning(f"Vision autocomplete: selected model does not support vision: {e}")
-        yield streaming.format_message_start()
-        yield streaming.format_error(vision_error_msg)
+        if _is_vision_unsupported_error(e):
+            logger.warning("Vision autocomplete: model does not support vision: %s", e)
+            yield streaming.format_error(vision_error_msg)
+            yield streaming.format_done()
+            return
+        logger.error("Failed to create autocomplete agent: %s", e, exc_info=True)
+        yield streaming.format_error("Autocomplete failed. Please try again.")
        yield streaming.format_done()
        return

-    if query:
-        kb_context = await _search_knowledge_base(session, search_space_id, query)
+    has_kb = kb.has_documents
+    doc_count = len(kb.files) if has_kb else 0  # type: ignore[arg-type]

-    system_prompt = _build_system_prompt(app_name, window_title, kb_context)
+    yield streaming.format_thinking_step(
+        step_id=PREP_STEP_ID,
+        title="Searching knowledge base",
+        status="complete",
+        items=[f"Found {doc_count} document{'s' if doc_count != 1 else ''}"] if kb_query else ["Skipped"],
+    )

-    messages = [
-        SystemMessage(content=system_prompt),
-        HumanMessage(content=[
-            {
-                "type": "text",
-                "text": "Analyze this screenshot. Understand the full context of what the user is working on, then generate the text they most likely want to write in the active text area.",
-            },
-            {
-                "type": "image_url",
-                "image_url": {"url": screenshot_data_url},
-            },
-        ]),
-    ]
+    # Build agent input with pre-computed KB as initial state
+    if has_kb:
+        instruction = (
+            "Analyze this screenshot, then explore the knowledge base documents "
+            "listed above — read the chunk index of any document whose title "
+            "looks relevant and check matched chunks for useful facts. "
+            "Finally, generate a concise autocomplete for the active text area, "
+            "enhanced with any relevant KB information you found."
+        )
+    else:
+        instruction = (
+            "Analyze this screenshot and generate a concise autocomplete "
+            "for the active text area based on what you see."
+        )

-    text_started = False
-    text_id = ""
+    user_message = HumanMessage(content=[
+        {"type": "text", "text": instruction},
+        {"type": "image_url", "image_url": {"url": screenshot_data_url}},
+    ])
+
+    input_data: dict = {"messages": [user_message]}
+
+    if has_kb:
+        input_data["files"] = kb.files
+        input_data["messages"] = [kb.ls_ai_msg, kb.ls_tool_msg, user_message]
+        logger.info("Autocomplete: injected %d KB files into agent initial state", doc_count)
+    else:
+        logger.info("Autocomplete: no KB documents found, proceeding with screenshot only")
+
+    # Stream the agent (message_start already sent above)
    try:
-        yield streaming.format_message_start()
-        text_id = streaming.generate_text_id()
-        yield streaming.format_text_start(text_id)
-        text_started = True
-
-        async for chunk in llm.astream(messages):
-            token = chunk.content if hasattr(chunk, "content") else str(chunk)
-            if token:
-                yield streaming.format_text_delta(text_id, token)
-
-        yield streaming.format_text_end(text_id)
-        yield streaming.format_finish()
-        yield streaming.format_done()
-
+        async for sse in stream_autocomplete_agent(
+            agent, input_data, streaming, emit_message_start=False,
+        ):
+            yield sse
    except Exception as e:
-        if text_started:
-            yield streaming.format_text_end(text_id)
-
        if _is_vision_unsupported_error(e):
-            logger.warning(f"Vision autocomplete: selected model does not support vision: {e}")
+            logger.warning("Vision autocomplete: model does not support vision: %s", e)
            yield streaming.format_error(vision_error_msg)
+            yield streaming.format_done()
        else:
-            logger.error(f"Vision autocomplete streaming error: {e}", exc_info=True)
+            logger.error("Vision autocomplete streaming error: %s", e, exc_info=True)
            yield streaming.format_error("Autocomplete failed. Please try again.")
-        yield streaming.format_done()
+            yield streaming.format_done()