diff --git a/surfsense_backend/app/agents/autocomplete/__init__.py b/surfsense_backend/app/agents/autocomplete/__init__.py new file mode 100644 index 000000000..55d7a692d --- /dev/null +++ b/surfsense_backend/app/agents/autocomplete/__init__.py @@ -0,0 +1,11 @@ +"""Agent-based vision autocomplete with scoped filesystem exploration.""" + +from app.agents.autocomplete.autocomplete_agent import ( + create_autocomplete_agent, + stream_autocomplete_agent, +) + +__all__ = [ + "create_autocomplete_agent", + "stream_autocomplete_agent", +] diff --git a/surfsense_backend/app/agents/autocomplete/autocomplete_agent.py b/surfsense_backend/app/agents/autocomplete/autocomplete_agent.py new file mode 100644 index 000000000..928a133cc --- /dev/null +++ b/surfsense_backend/app/agents/autocomplete/autocomplete_agent.py @@ -0,0 +1,429 @@ +"""Vision autocomplete agent with scoped filesystem exploration. + +Converts the stateless single-shot vision autocomplete into an agent that +seeds a virtual filesystem from KB search results and lets the vision LLM +explore documents via ``ls``, ``read_file``, ``glob``, ``grep``, etc. +before generating the final completion. + +Performance: KB search and agent graph compilation run in parallel so +the only sequential latency is KB-search (or agent compile, whichever is +slower) + the agent's LLM turns. There is no separate "query extraction" +LLM call — the window title is used directly as the KB search query. +""" + +from __future__ import annotations + +import asyncio +import logging +import uuid +from typing import Any, AsyncGenerator + +from deepagents.graph import BASE_AGENT_PROMPT +from deepagents.middleware.patch_tool_calls import PatchToolCallsMiddleware +from langchain.agents import create_agent +from langchain_anthropic.middleware import AnthropicPromptCachingMiddleware +from langchain_core.language_models import BaseChatModel +from langchain_core.messages import AIMessage, ToolMessage + +from app.agents.new_chat.middleware.filesystem import SurfSenseFilesystemMiddleware +from app.agents.new_chat.middleware.knowledge_search import ( + build_scoped_filesystem, + search_knowledge_base, +) +from app.services.new_streaming_service import VercelStreamingService + +logger = logging.getLogger(__name__) + +KB_TOP_K = 10 + +# --------------------------------------------------------------------------- +# System prompt +# --------------------------------------------------------------------------- + +AUTOCOMPLETE_SYSTEM_PROMPT = """You are a smart writing assistant that analyzes the user's screen to draft or complete text. + +You will receive a screenshot of the user's screen. Your PRIMARY source of truth is the screenshot itself — the visual context determines what to write. + +Your job: +1. Analyze the ENTIRE screenshot to understand what the user is working on (email thread, chat conversation, document, code editor, form, etc.). +2. Identify the text area where the user will type. +3. Generate the text the user most likely wants to write based on the visual context. + +You also have access to the user's knowledge base documents via filesystem tools. However: +- ONLY consult the knowledge base if the screenshot clearly involves a topic where your KB documents are DIRECTLY relevant (e.g., the user is writing about a specific project/topic that matches a document title). +- Do NOT explore documents just because they exist. Most autocomplete requests can be answered purely from the screenshot. 
+- If you do read a document, only incorporate information that is 100% relevant to what the user is typing RIGHT NOW. Do not add extra details, background, or tangential information from the KB. +- Keep your output SHORT — autocomplete should feel like a natural continuation, not an essay. + +Key behavior: +- If the text area is EMPTY, draft a concise response or message based on what you see on screen (e.g., reply to an email, respond to a chat message, continue a document). +- If the text area already has text, continue it naturally — typically just a sentence or two. + +Rules: +- Output ONLY the text to be inserted. No quotes, no explanations, no meta-commentary. +- Be CONCISE. Prefer a single paragraph or a few sentences. Autocomplete is a quick assist, not a full draft. +- Match the tone and formality of the surrounding context. +- If the screen shows code, write code. If it shows a casual chat, be casual. If it shows a formal email, be formal. +- Do NOT describe the screenshot or explain your reasoning. +- Do NOT cite or reference documents explicitly — just let the knowledge inform your writing naturally. +- If you cannot determine what to write, output nothing. + +## Filesystem Tools `ls`, `read_file`, `write_file`, `edit_file`, `glob`, `grep` + +All file paths must start with a `/`. +- ls: list files and directories at a given path. +- read_file: read a file from the filesystem. +- write_file: create a temporary file in the session (not persisted). +- edit_file: edit a file in the session (not persisted for /documents/ files). +- glob: find files matching a pattern (e.g., "**/*.xml"). +- grep: search for text within files. + +## When to Use Filesystem Tools + +BEFORE reaching for any tool, ask yourself: "Can I write a good completion purely from the screenshot?" If yes, just write it — do NOT explore the KB. + +Only use tools when: +- The user is clearly writing about a specific topic that likely has detailed information in their KB. +- You need a specific fact, name, number, or reference that the screenshot doesn't provide. + +When you do use tools, be surgical: +- Check the `ls` output first. If no document title looks relevant, stop — do not read files just to see what's there. +- If a title looks relevant, read only the `` (first ~20 lines) and jump to matched chunks. Do not read entire documents. +- Extract only the specific information you need and move on to generating the completion. + +## Reading Documents Efficiently + +Documents are formatted as XML. Each document contains: +- `` — title, type, URL, etc. +- `` — a table of every chunk with its **line range** and a + `matched="true"` flag for chunks that matched the search query. +- `` — the actual chunks in original document order. + +**Workflow**: read the first ~20 lines to see the ``, identify +chunks marked `matched="true"`, then use `read_file(path, offset=, +limit=)` to jump directly to those sections.""" + +APP_CONTEXT_BLOCK = """ + +The user is currently working in "{app_name}" (window: "{window_title}"). 
Use this to understand the type of application and adapt your tone and format accordingly.""" + + +def _build_autocomplete_system_prompt(app_name: str, window_title: str) -> str: + prompt = AUTOCOMPLETE_SYSTEM_PROMPT + if app_name: + prompt += APP_CONTEXT_BLOCK.format(app_name=app_name, window_title=window_title) + return prompt + + +# --------------------------------------------------------------------------- +# Pre-compute KB filesystem (runs in parallel with agent compilation) +# --------------------------------------------------------------------------- + + +class _KBResult: + """Container for pre-computed KB filesystem results.""" + __slots__ = ("files", "ls_ai_msg", "ls_tool_msg") + + def __init__( + self, + files: dict[str, Any] | None = None, + ls_ai_msg: AIMessage | None = None, + ls_tool_msg: ToolMessage | None = None, + ) -> None: + self.files = files + self.ls_ai_msg = ls_ai_msg + self.ls_tool_msg = ls_tool_msg + + @property + def has_documents(self) -> bool: + return bool(self.files) + + +async def precompute_kb_filesystem( + search_space_id: int, + query: str, + top_k: int = KB_TOP_K, +) -> _KBResult: + """Search the KB and build the scoped filesystem outside the agent. + + This is designed to be called via ``asyncio.gather`` alongside agent + graph compilation so the two run concurrently. + """ + if not query: + return _KBResult() + + try: + search_results = await search_knowledge_base( + query=query, + search_space_id=search_space_id, + top_k=top_k, + ) + + if not search_results: + return _KBResult() + + new_files, _ = await build_scoped_filesystem( + documents=search_results, + search_space_id=search_space_id, + ) + + if not new_files: + return _KBResult() + + doc_paths = [ + p for p, v in new_files.items() + if p.startswith("/documents/") and v is not None + ] + tool_call_id = f"auto_ls_{uuid.uuid4().hex[:12]}" + ai_msg = AIMessage( + content="", + tool_calls=[{"name": "ls", "args": {"path": "/documents"}, "id": tool_call_id}], + ) + tool_msg = ToolMessage( + content=str(doc_paths) if doc_paths else "No documents found.", + tool_call_id=tool_call_id, + ) + return _KBResult(files=new_files, ls_ai_msg=ai_msg, ls_tool_msg=tool_msg) + + except Exception: + logger.warning("KB pre-computation failed, proceeding without KB", exc_info=True) + return _KBResult() + + +# --------------------------------------------------------------------------- +# Filesystem middleware — no save_document, no persistence +# --------------------------------------------------------------------------- + + +class AutocompleteFilesystemMiddleware(SurfSenseFilesystemMiddleware): + """Filesystem middleware for autocomplete — read-only exploration only. + + Strips ``save_document`` (permanent KB persistence) and passes + ``search_space_id=None`` so ``write_file`` / ``edit_file`` stay ephemeral. 
+ """ + + def __init__(self) -> None: + super().__init__(search_space_id=None, created_by_id=None) + self.tools = [t for t in self.tools if t.name != "save_document"] + + +# --------------------------------------------------------------------------- +# Agent factory +# --------------------------------------------------------------------------- + + +async def _compile_agent( + llm: BaseChatModel, + app_name: str, + window_title: str, +) -> Any: + """Compile the agent graph (CPU-bound, runs in a thread).""" + system_prompt = _build_autocomplete_system_prompt(app_name, window_title) + final_system_prompt = system_prompt + "\n\n" + BASE_AGENT_PROMPT + + middleware = [ + AutocompleteFilesystemMiddleware(), + PatchToolCallsMiddleware(), + AnthropicPromptCachingMiddleware(unsupported_model_behavior="ignore"), + ] + + agent = await asyncio.to_thread( + create_agent, + llm, + system_prompt=final_system_prompt, + tools=[], + middleware=middleware, + ) + return agent.with_config({"recursion_limit": 200}) + + +async def create_autocomplete_agent( + llm: BaseChatModel, + *, + search_space_id: int, + kb_query: str, + app_name: str = "", + window_title: str = "", +) -> tuple[Any, _KBResult]: + """Create the autocomplete agent and pre-compute KB in parallel. + + Returns ``(agent, kb_result)`` so the caller can inject the pre-computed + filesystem into the agent's initial state without any middleware delay. + """ + agent, kb = await asyncio.gather( + _compile_agent(llm, app_name, window_title), + precompute_kb_filesystem(search_space_id, kb_query), + ) + return agent, kb + + +# --------------------------------------------------------------------------- +# Streaming helper +# --------------------------------------------------------------------------- + + +async def stream_autocomplete_agent( + agent: Any, + input_data: dict[str, Any], + streaming_service: VercelStreamingService, + *, + emit_message_start: bool = True, +) -> AsyncGenerator[str, None]: + """Stream agent events as Vercel SSE, with thinking steps for tool calls. + + When ``emit_message_start`` is False the caller has already sent the + ``message_start`` event (e.g. to show preparation steps before the agent + runs). + """ + thread_id = uuid.uuid4().hex + config = {"configurable": {"thread_id": thread_id}} + + current_text_id: str | None = None + active_tool_depth = 0 + thinking_step_counter = 0 + tool_step_ids: dict[str, str] = {} + step_titles: dict[str, str] = {} + completed_step_ids: set[str] = set() + last_active_step_id: str | None = None + + def next_thinking_step_id() -> str: + nonlocal thinking_step_counter + thinking_step_counter += 1 + return f"autocomplete-step-{thinking_step_counter}" + + def complete_current_step() -> str | None: + nonlocal last_active_step_id + if last_active_step_id and last_active_step_id not in completed_step_ids: + completed_step_ids.add(last_active_step_id) + title = step_titles.get(last_active_step_id, "Done") + event = streaming_service.format_thinking_step( + step_id=last_active_step_id, + title=title, + status="complete", + ) + last_active_step_id = None + return event + return None + + if emit_message_start: + yield streaming_service.format_message_start() + + # Emit an initial "Generating completion" step so the UI immediately + # shows activity once the agent starts its first LLM call. 
+ gen_step_id = next_thinking_step_id() + last_active_step_id = gen_step_id + step_titles[gen_step_id] = "Generating completion" + yield streaming_service.format_thinking_step( + step_id=gen_step_id, + title="Generating completion", + status="in_progress", + ) + + try: + async for event in agent.astream_events(input_data, config=config, version="v2"): + event_type = event.get("event", "") + + if event_type == "on_chat_model_stream": + if active_tool_depth > 0: + continue + if "surfsense:internal" in event.get("tags", []): + continue + chunk = event.get("data", {}).get("chunk") + if chunk and hasattr(chunk, "content"): + content = chunk.content + if content and isinstance(content, str): + if current_text_id is None: + step_event = complete_current_step() + if step_event: + yield step_event + current_text_id = streaming_service.generate_text_id() + yield streaming_service.format_text_start(current_text_id) + yield streaming_service.format_text_delta(current_text_id, content) + + elif event_type == "on_tool_start": + active_tool_depth += 1 + tool_name = event.get("name", "unknown_tool") + run_id = event.get("run_id", "") + tool_input = event.get("data", {}).get("input", {}) + + if current_text_id is not None: + yield streaming_service.format_text_end(current_text_id) + current_text_id = None + + step_event = complete_current_step() + if step_event: + yield step_event + + tool_step_id = next_thinking_step_id() + tool_step_ids[run_id] = tool_step_id + last_active_step_id = tool_step_id + + title, items = _describe_tool_call(tool_name, tool_input) + step_titles[tool_step_id] = title + yield streaming_service.format_thinking_step( + step_id=tool_step_id, + title=title, + status="in_progress", + items=items, + ) + + elif event_type == "on_tool_end": + active_tool_depth = max(0, active_tool_depth - 1) + run_id = event.get("run_id", "") + step_id = tool_step_ids.pop(run_id, None) + if step_id and step_id not in completed_step_ids: + completed_step_ids.add(step_id) + title = step_titles.get(step_id, "Done") + yield streaming_service.format_thinking_step( + step_id=step_id, + title=title, + status="complete", + ) + if last_active_step_id == step_id: + last_active_step_id = None + + if current_text_id is not None: + yield streaming_service.format_text_end(current_text_id) + step_event = complete_current_step() + if step_event: + yield step_event + + yield streaming_service.format_finish() + yield streaming_service.format_done() + + except Exception as e: + logger.error(f"Autocomplete agent streaming error: {e}", exc_info=True) + if current_text_id is not None: + yield streaming_service.format_text_end(current_text_id) + yield streaming_service.format_error("Autocomplete failed. 
Please try again.") + yield streaming_service.format_done() + + +def _describe_tool_call(tool_name: str, tool_input: Any) -> tuple[str, list[str]]: + """Return a human-readable (title, items) for a tool call thinking step.""" + inp = tool_input if isinstance(tool_input, dict) else {} + if tool_name == "ls": + path = inp.get("path", "/") + return "Listing files", [path] + if tool_name == "read_file": + fp = inp.get("file_path", "") + display = fp if len(fp) <= 80 else "…" + fp[-77:] + return "Reading file", [display] + if tool_name == "write_file": + fp = inp.get("file_path", "") + display = fp if len(fp) <= 80 else "…" + fp[-77:] + return "Writing file", [display] + if tool_name == "edit_file": + fp = inp.get("file_path", "") + display = fp if len(fp) <= 80 else "…" + fp[-77:] + return "Editing file", [display] + if tool_name == "glob": + pat = inp.get("pattern", "") + base = inp.get("path", "/") + return "Searching files", [f"{pat} in {base}"] + if tool_name == "grep": + pat = inp.get("pattern", "") + path = inp.get("path", "") + display_pat = pat[:60] + ("…" if len(pat) > 60 else "") + return "Searching content", [f'"{display_pat}"' + (f" in {path}" if path else "")] + return f"Using {tool_name}", [] diff --git a/surfsense_backend/app/services/vision_autocomplete_service.py b/surfsense_backend/app/services/vision_autocomplete_service.py index f24a5c848..7d16c5864 100644 --- a/surfsense_backend/app/services/vision_autocomplete_service.py +++ b/surfsense_backend/app/services/vision_autocomplete_service.py @@ -1,139 +1,40 @@ +"""Vision autocomplete service — agent-based with scoped filesystem. + +Optimized pipeline: +1. Start the SSE stream immediately so the UI shows progress. +2. Derive a KB search query from window_title (no separate LLM call). +3. Run KB filesystem pre-computation and agent graph compilation in PARALLEL. +4. Inject pre-computed KB files as initial state and stream the agent. +""" + import logging from typing import AsyncGenerator -from langchain_core.messages import HumanMessage, SystemMessage +from langchain_core.messages import HumanMessage from sqlalchemy.ext.asyncio import AsyncSession -from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever +from app.agents.autocomplete import create_autocomplete_agent, stream_autocomplete_agent from app.services.llm_service import get_vision_llm from app.services.new_streaming_service import VercelStreamingService logger = logging.getLogger(__name__) -KB_TOP_K = 5 -KB_MAX_CHARS = 4000 - -EXTRACT_QUERY_PROMPT = """Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else.""" - -EXTRACT_QUERY_PROMPT_WITH_APP = """The user is currently in the application "{app_name}" with the window titled "{window_title}". - -Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else.""" - -VISION_SYSTEM_PROMPT = """You are a smart writing assistant that analyzes the user's screen to draft or complete text. - -You will receive a screenshot of the user's screen. Your job: -1. Analyze the ENTIRE screenshot to understand what the user is working on (email thread, chat conversation, document, code editor, form, etc.). -2. Identify the text area where the user will type. -3. 
Based on the full visual context, generate the text the user most likely wants to write. - -Key behavior: -- If the text area is EMPTY, draft a full response or message based on what you see on screen (e.g., reply to an email, respond to a chat message, continue a document). -- If the text area already has text, continue it naturally. - -Rules: -- Output ONLY the text to be inserted. No quotes, no explanations, no meta-commentary. -- Be concise but complete — a full thought, not a fragment. -- Match the tone and formality of the surrounding context. -- If the screen shows code, write code. If it shows a casual chat, be casual. If it shows a formal email, be formal. -- Do NOT describe the screenshot or explain your reasoning. -- If you cannot determine what to write, output nothing.""" - -APP_CONTEXT_BLOCK = """ - -The user is currently working in "{app_name}" (window: "{window_title}"). Use this to understand the type of application and adapt your tone and format accordingly.""" - -KB_CONTEXT_BLOCK = """ - -You also have access to the user's knowledge base documents below. Use them to write more accurate, informed, and contextually relevant text. Do NOT cite or reference the documents explicitly — just let the knowledge inform your writing naturally. - - -{kb_context} -""" +PREP_STEP_ID = "autocomplete-prep" -def _build_system_prompt(app_name: str, window_title: str, kb_context: str) -> str: - """Assemble the system prompt from optional context blocks.""" - prompt = VISION_SYSTEM_PROMPT - if app_name: - prompt += APP_CONTEXT_BLOCK.format(app_name=app_name, window_title=window_title) - if kb_context: - prompt += KB_CONTEXT_BLOCK.format(kb_context=kb_context) - return prompt +def _derive_kb_query(app_name: str, window_title: str) -> str: + parts = [p for p in (window_title, app_name) if p] + return " ".join(parts) def _is_vision_unsupported_error(e: Exception) -> bool: - """Check if an exception indicates the model doesn't support vision/images.""" msg = str(e).lower() return "content must be a string" in msg or "does not support image" in msg -async def _extract_query_from_screenshot( - llm, screenshot_data_url: str, - app_name: str = "", window_title: str = "", -) -> str | None: - """Ask the Vision LLM to describe what the user is working on. - - Raises vision-unsupported errors so the caller can return a - friendly message immediately instead of retrying with astream. 
- """ - if app_name: - prompt_text = EXTRACT_QUERY_PROMPT_WITH_APP.format( - app_name=app_name, window_title=window_title, - ) - else: - prompt_text = EXTRACT_QUERY_PROMPT - - try: - response = await llm.ainvoke([ - HumanMessage(content=[ - {"type": "text", "text": prompt_text}, - {"type": "image_url", "image_url": {"url": screenshot_data_url}}, - ]), - ]) - query = response.content.strip() if hasattr(response, "content") else "" - return query if query else None - except Exception as e: - if _is_vision_unsupported_error(e): - raise - logger.warning(f"Failed to extract query from screenshot: {e}") - return None - - -async def _search_knowledge_base( - session: AsyncSession, search_space_id: int, query: str -) -> str: - """Search the KB and return formatted context string.""" - try: - retriever = ChucksHybridSearchRetriever(session) - results = await retriever.hybrid_search( - query_text=query, - top_k=KB_TOP_K, - search_space_id=search_space_id, - ) - - if not results: - return "" - - parts: list[str] = [] - char_count = 0 - for doc in results: - title = doc.get("document", {}).get("title", "Untitled") - for chunk in doc.get("chunks", []): - content = chunk.get("content", "").strip() - if not content: - continue - entry = f"[{title}]\n{content}" - if char_count + len(entry) > KB_MAX_CHARS: - break - parts.append(entry) - char_count += len(entry) - if char_count >= KB_MAX_CHARS: - break - - return "\n\n---\n\n".join(parts) - except Exception as e: - logger.warning(f"KB search failed, proceeding without context: {e}") - return "" +# --------------------------------------------------------------------------- +# Main entry point +# --------------------------------------------------------------------------- async def stream_vision_autocomplete( @@ -144,13 +45,7 @@ async def stream_vision_autocomplete( app_name: str = "", window_title: str = "", ) -> AsyncGenerator[str, None]: - """Analyze a screenshot with the vision LLM and stream a text completion. - - Pipeline: - 1. Extract a search query from the screenshot (non-streaming) - 2. Search the knowledge base for relevant context - 3. Stream the final completion with screenshot + KB + app context - """ + """Analyze a screenshot with a vision-LLM agent and stream a text completion.""" streaming = VercelStreamingService() vision_error_msg = ( "The selected model does not support vision. 
" @@ -164,62 +59,89 @@ async def stream_vision_autocomplete( yield streaming.format_done() return - kb_context = "" + # Start SSE stream immediately so the UI has something to show + yield streaming.format_message_start() + + kb_query = _derive_kb_query(app_name, window_title) + + # Show a preparation step while KB search + agent compile run + yield streaming.format_thinking_step( + step_id=PREP_STEP_ID, + title="Searching knowledge base", + status="in_progress", + items=[kb_query] if kb_query else [], + ) + try: - query = await _extract_query_from_screenshot( - llm, screenshot_data_url, app_name=app_name, window_title=window_title, + agent, kb = await create_autocomplete_agent( + llm, + search_space_id=search_space_id, + kb_query=kb_query, + app_name=app_name, + window_title=window_title, ) except Exception as e: - logger.warning(f"Vision autocomplete: selected model does not support vision: {e}") - yield streaming.format_message_start() - yield streaming.format_error(vision_error_msg) + if _is_vision_unsupported_error(e): + logger.warning("Vision autocomplete: model does not support vision: %s", e) + yield streaming.format_error(vision_error_msg) + yield streaming.format_done() + return + logger.error("Failed to create autocomplete agent: %s", e, exc_info=True) + yield streaming.format_error("Autocomplete failed. Please try again.") yield streaming.format_done() return - if query: - kb_context = await _search_knowledge_base(session, search_space_id, query) + has_kb = kb.has_documents + doc_count = len(kb.files) if has_kb else 0 # type: ignore[arg-type] - system_prompt = _build_system_prompt(app_name, window_title, kb_context) + yield streaming.format_thinking_step( + step_id=PREP_STEP_ID, + title="Searching knowledge base", + status="complete", + items=[f"Found {doc_count} document{'s' if doc_count != 1 else ''}"] if kb_query else ["Skipped"], + ) - messages = [ - SystemMessage(content=system_prompt), - HumanMessage(content=[ - { - "type": "text", - "text": "Analyze this screenshot. Understand the full context of what the user is working on, then generate the text they most likely want to write in the active text area.", - }, - { - "type": "image_url", - "image_url": {"url": screenshot_data_url}, - }, - ]), - ] + # Build agent input with pre-computed KB as initial state + if has_kb: + instruction = ( + "Analyze this screenshot, then explore the knowledge base documents " + "listed above — read the chunk index of any document whose title " + "looks relevant and check matched chunks for useful facts. " + "Finally, generate a concise autocomplete for the active text area, " + "enhanced with any relevant KB information you found." + ) + else: + instruction = ( + "Analyze this screenshot and generate a concise autocomplete " + "for the active text area based on what you see." 
+ ) - text_started = False - text_id = "" + user_message = HumanMessage(content=[ + {"type": "text", "text": instruction}, + {"type": "image_url", "image_url": {"url": screenshot_data_url}}, + ]) + + input_data: dict = {"messages": [user_message]} + + if has_kb: + input_data["files"] = kb.files + input_data["messages"] = [kb.ls_ai_msg, kb.ls_tool_msg, user_message] + logger.info("Autocomplete: injected %d KB files into agent initial state", doc_count) + else: + logger.info("Autocomplete: no KB documents found, proceeding with screenshot only") + + # Stream the agent (message_start already sent above) try: - yield streaming.format_message_start() - text_id = streaming.generate_text_id() - yield streaming.format_text_start(text_id) - text_started = True - - async for chunk in llm.astream(messages): - token = chunk.content if hasattr(chunk, "content") else str(chunk) - if token: - yield streaming.format_text_delta(text_id, token) - - yield streaming.format_text_end(text_id) - yield streaming.format_finish() - yield streaming.format_done() - + async for sse in stream_autocomplete_agent( + agent, input_data, streaming, emit_message_start=False, + ): + yield sse except Exception as e: - if text_started: - yield streaming.format_text_end(text_id) - if _is_vision_unsupported_error(e): - logger.warning(f"Vision autocomplete: selected model does not support vision: {e}") + logger.warning("Vision autocomplete: model does not support vision: %s", e) yield streaming.format_error(vision_error_msg) + yield streaming.format_done() else: - logger.error(f"Vision autocomplete streaming error: {e}", exc_info=True) + logger.error("Vision autocomplete streaming error: %s", e, exc_info=True) yield streaming.format_error("Autocomplete failed. Please try again.") - yield streaming.format_done() + yield streaming.format_done() diff --git a/surfsense_web/app/desktop/suggestion/page.tsx b/surfsense_web/app/desktop/suggestion/page.tsx index fb83e2113..42ce025a8 100644 --- a/surfsense_web/app/desktop/suggestion/page.tsx +++ b/surfsense_web/app/desktop/suggestion/page.tsx @@ -10,7 +10,18 @@ type SSEEvent = | { type: "text-end"; id: string } | { type: "start"; messageId: string } | { type: "finish" } - | { type: "error"; errorText: string }; + | { type: "error"; errorText: string } + | { + type: "data-thinking-step"; + data: { id: string; title: string; status: string; items: string[] }; + }; + +interface AgentStep { + id: string; + title: string; + status: string; + items: string[]; +} function friendlyError(raw: string | number): string { if (typeof raw === "number") { @@ -34,11 +45,24 @@ function friendlyError(raw: string | number): string { const AUTO_DISMISS_MS = 3000; +function StepIcon({ status }: { status: string }) { + if (status === "complete") { + return ( + + + + + ); + } + return ; +} + export default function SuggestionPage() { const api = useElectronAPI(); const [suggestion, setSuggestion] = useState(""); const [isLoading, setIsLoading] = useState(true); const [error, setError] = useState(null); + const [steps, setSteps] = useState([]); const abortRef = useRef(null); const isDesktop = !!api?.onAutocompleteContext; @@ -66,6 +90,7 @@ export default function SuggestionPage() { setIsLoading(true); setSuggestion(""); setError(null); + setSteps([]); let token = getBearerToken(); if (!token) { @@ -137,6 +162,17 @@ export default function SuggestionPage() { setSuggestion((prev) => prev + parsed.delta); } else if (parsed.type === "error") { setError(friendlyError(parsed.errorText)); + } else if (parsed.type === 
"data-thinking-step") { + const { id, title, status, items } = parsed.data; + setSteps((prev) => { + const existing = prev.findIndex((s) => s.id === id); + if (existing >= 0) { + const updated = [...prev]; + updated[existing] = { id, title, status, items }; + return updated; + } + return [...prev, { id, title, status, items }]; + }); } } catch { continue; @@ -185,13 +221,33 @@ export default function SuggestionPage() { ); } - if (isLoading && !suggestion) { + const showLoading = isLoading && !suggestion; + + if (showLoading) { return (
-				<div className="suggestion-loading">
-					<span className="suggestion-dot" />
-					<span className="suggestion-dot" />
-					<span className="suggestion-dot" />
+				<div className="agent-activity">
+					{steps.length === 0 && (
+						<div className="activity-initial">
+							<span className="step-spinner" />
+							<span className="activity-label">Preparing…</span>
+						</div>
+					)}
+					{steps.length > 0 && (
+						<div className="activity-steps">
+							{steps.map((step) => (
+								<div key={step.id} className="activity-step">
+									<StepIcon status={step.status} />
+									<span className="step-label">
+										{step.title}
+										{step.items.length > 0 && (
+											<span className="step-detail"> · {step.items[0]}</span>
+										)}
+									</span>
+								</div>
+							))}
+						</div>
+					)}
); diff --git a/surfsense_web/app/desktop/suggestion/suggestion.css b/surfsense_web/app/desktop/suggestion/suggestion.css index 62f4d2ea7..d2213fefd 100644 --- a/surfsense_web/app/desktop/suggestion/suggestion.css +++ b/surfsense_web/app/desktop/suggestion/suggestion.css @@ -19,13 +19,21 @@ body:has(.suggestion-body) { } .suggestion-tooltip { + box-sizing: border-box; background: #1e1e1e; border: 1px solid #3c3c3c; border-radius: 8px; padding: 8px 12px; margin: 4px; max-width: 400px; + /* MAX_HEIGHT in suggestion-window.ts is 400px. Subtract 8px for margin + (4px * 2) so the tooltip + margin fits within the Electron window. + box-sizing: border-box ensures padding + border are included. */ + max-height: 392px; box-shadow: 0 4px 16px rgba(0, 0, 0, 0.5); + display: flex; + flex-direction: column; + overflow: hidden; } .suggestion-text { @@ -35,6 +43,26 @@ body:has(.suggestion-body) { margin: 0 0 6px 0; word-wrap: break-word; white-space: pre-wrap; + overflow-y: auto; + flex: 1 1 auto; + min-height: 0; +} + +.suggestion-text::-webkit-scrollbar { + width: 5px; +} + +.suggestion-text::-webkit-scrollbar-track { + background: transparent; +} + +.suggestion-text::-webkit-scrollbar-thumb { + background: #555; + border-radius: 3px; +} + +.suggestion-text::-webkit-scrollbar-thumb:hover { + background: #777; } .suggestion-actions { @@ -43,6 +71,7 @@ body:has(.suggestion-body) { gap: 4px; border-top: 1px solid #2a2a2a; padding-top: 6px; + flex-shrink: 0; } .suggestion-btn { @@ -86,36 +115,77 @@ body:has(.suggestion-body) { font-size: 12px; } -.suggestion-loading { +/* --- Agent activity indicator --- */ + +.agent-activity { display: flex; - gap: 5px; + flex-direction: column; + gap: 4px; + overflow-y: auto; + max-height: 340px; +} + +.activity-initial { + display: flex; + align-items: center; + gap: 8px; padding: 2px 0; - justify-content: center; } -.suggestion-dot { - width: 4px; - height: 4px; +.activity-label { + color: #a1a1aa; + font-size: 12px; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; +} + +.activity-steps { + display: flex; + flex-direction: column; + gap: 3px; +} + +.activity-step { + display: flex; + align-items: center; + gap: 6px; + min-height: 18px; +} + +.step-label { + color: #d4d4d4; + font-size: 12px; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; +} + +.step-detail { + color: #71717a; + font-size: 11px; +} + +/* Spinner (in_progress) */ +.step-spinner { + width: 14px; + height: 14px; + flex-shrink: 0; + border: 1.5px solid #3f3f46; + border-top-color: #a78bfa; border-radius: 50%; - background: #666; - animation: suggestion-pulse 1.2s infinite ease-in-out; + animation: step-spin 0.7s linear infinite; } -.suggestion-dot:nth-child(2) { - animation-delay: 0.15s; +/* Checkmark icon (complete) */ +.step-icon { + width: 14px; + height: 14px; + flex-shrink: 0; } -.suggestion-dot:nth-child(3) { - animation-delay: 0.3s; -} - -@keyframes suggestion-pulse { - 0%, 80%, 100% { - opacity: 0.3; - transform: scale(0.8); - } - 40% { - opacity: 1; - transform: scale(1.1); +@keyframes step-spin { + to { + transform: rotate(360deg); } } diff --git a/surfsense_web/components/assistant-ui/thread.tsx b/surfsense_web/components/assistant-ui/thread.tsx index 7d8765399..6c8c619b2 100644 --- a/surfsense_web/components/assistant-ui/thread.tsx +++ b/surfsense_web/components/assistant-ui/thread.tsx @@ -92,15 +92,7 @@ import { useMediaQuery } from "@/hooks/use-media-query"; import { useElectronAPI } from "@/hooks/use-platform"; import { cn } from "@/lib/utils"; -/** 
Placeholder texts that cycle in new chats when input is empty */ -const CYCLING_PLACEHOLDERS = [ - "Ask SurfSense anything or @mention docs", - "Generate a podcast from my vacation ideas in Notion", - "Sum up last week's meeting notes from Drive in a bulleted list", - "Give me a brief overview of the most urgent tickets in Jira and Linear", - "Briefly, what are today's top ten important emails and calendar events?", - "Check if this week's Slack messages reference any GitHub issues", -]; +const COMPOSER_PLACEHOLDER = "Ask anything · Type / for prompts · Type @ to mention docs"; export const Thread: FC = () => { return ; @@ -380,29 +372,7 @@ const Composer: FC = () => { const isThreadEmpty = useAuiState(({ thread }) => thread.isEmpty); const isThreadRunning = useAuiState(({ thread }) => thread.isRunning); - // Cycling placeholder state - only cycles in new chats - const [placeholderIndex, setPlaceholderIndex] = useState(0); - - // Cycle through placeholders every 4 seconds when thread is empty (new chat) - useEffect(() => { - // Only cycle when thread is empty (new chat) - if (!isThreadEmpty) { - // Reset to first placeholder when chat becomes active - setPlaceholderIndex(0); - return; - } - - const intervalId = setInterval(() => { - setPlaceholderIndex((prev) => (prev + 1) % CYCLING_PLACEHOLDERS.length); - }, 6000); - - return () => clearInterval(intervalId); - }, [isThreadEmpty]); - - // Compute current placeholder - only cycle in new chats - const currentPlaceholder = isThreadEmpty - ? CYCLING_PLACEHOLDERS[placeholderIndex] - : CYCLING_PLACEHOLDERS[0]; + const currentPlaceholder = COMPOSER_PLACEHOLDER; // Live collaboration state const { data: currentUser } = useAtomValue(currentUserAtom);
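
Reviewer note: a minimal caller-side sketch of the new pipeline, assembled from the pieces this diff adds. It mirrors what `stream_vision_autocomplete` does; the `llm`, screenshot URL, window title, and search-space values are placeholders, not part of the change, and this is an illustration rather than a prescribed usage.

```python
"""Sketch: driving the agent-based autocomplete end to end (assumed harness)."""

from langchain_core.language_models import BaseChatModel
from langchain_core.messages import HumanMessage

from app.agents.autocomplete import create_autocomplete_agent, stream_autocomplete_agent
from app.services.new_streaming_service import VercelStreamingService


async def demo(llm: BaseChatModel, screenshot_data_url: str, search_space_id: int) -> None:
    streaming = VercelStreamingService()

    # KB search + scoped-filesystem build and agent graph compilation run
    # concurrently inside create_autocomplete_agent (asyncio.gather).
    agent, kb = await create_autocomplete_agent(
        llm,
        search_space_id=search_space_id,
        kb_query="Quarterly report - Google Docs",  # placeholder, derived from window title
        app_name="Google Chrome",                   # placeholder
        window_title="Quarterly report - Google Docs",  # placeholder
    )

    user_message = HumanMessage(content=[
        {"type": "text", "text": "Analyze this screenshot and generate a concise autocomplete."},
        {"type": "image_url", "image_url": {"url": screenshot_data_url}},
    ])

    input_data: dict = {"messages": [user_message]}
    if kb.has_documents:
        # Inject the pre-computed virtual filesystem plus the synthetic
        # `ls /documents` exchange so the agent starts with the listing in context,
        # exactly as stream_vision_autocomplete does in this PR.
        input_data["files"] = kb.files
        input_data["messages"] = [kb.ls_ai_msg, kb.ls_tool_msg, user_message]

    # Yields Vercel-style SSE strings, including data-thinking-step events
    # that the suggestion window renders as agent activity.
    async for sse in stream_autocomplete_agent(agent, input_data, streaming):
        print(sse, end="")
```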