feat: enhance vision autocomplete service and UI feedback

- Optimized the vision autocomplete service by starting the SSE stream immediately and deriving KB search queries directly from window titles.
- Refactored the service to run KB filesystem pre-computation and agent graph compilation in parallel, improving performance.
- Updated the SuggestionPage component to handle new agent step data, displaying progress indicators for each step.
- Enhanced the CSS for the suggestion tooltip and agent activity indicators, improving the user interface and experience.
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-04-07 02:49:24 -07:00
parent 49441233e7
commit bb1dcd32b6
6 changed files with 686 additions and 228 deletions

View file

@@ -1,139 +1,40 @@
"""Vision autocomplete service — agent-based with scoped filesystem.
Optimized pipeline:
1. Start the SSE stream immediately so the UI shows progress.
2. Derive a KB search query from window_title (no separate LLM call).
3. Run KB filesystem pre-computation and agent graph compilation in PARALLEL.
4. Inject pre-computed KB files as initial state and stream the agent.
"""
import logging
from typing import AsyncGenerator
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.messages import HumanMessage
from sqlalchemy.ext.asyncio import AsyncSession
from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever
from app.agents.autocomplete import create_autocomplete_agent, stream_autocomplete_agent
from app.services.llm_service import get_vision_llm
from app.services.new_streaming_service import VercelStreamingService
logger = logging.getLogger(__name__)
# Maximum number of KB documents pulled into the prompt per autocomplete request.
KB_TOP_K = 5
# Character budget for the formatted KB context block (see _search_knowledge_base).
KB_MAX_CHARS = 4000
# Vision prompt used when no app/window metadata is available.
EXTRACT_QUERY_PROMPT = """Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""
# Variant of the extraction prompt that injects the active app name and window title.
EXTRACT_QUERY_PROMPT_WITH_APP = """The user is currently in the application "{app_name}" with the window titled "{window_title}".
Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""
# Base system prompt for the final completion pass; optional blocks below are appended.
VISION_SYSTEM_PROMPT = """You are a smart writing assistant that analyzes the user's screen to draft or complete text.
You will receive a screenshot of the user's screen. Your job:
1. Analyze the ENTIRE screenshot to understand what the user is working on (email thread, chat conversation, document, code editor, form, etc.).
2. Identify the text area where the user will type.
3. Based on the full visual context, generate the text the user most likely wants to write.
Key behavior:
- If the text area is EMPTY, draft a full response or message based on what you see on screen (e.g., reply to an email, respond to a chat message, continue a document).
- If the text area already has text, continue it naturally.
Rules:
- Output ONLY the text to be inserted. No quotes, no explanations, no meta-commentary.
- Be concise but complete a full thought, not a fragment.
- Match the tone and formality of the surrounding context.
- If the screen shows code, write code. If it shows a casual chat, be casual. If it shows a formal email, be formal.
- Do NOT describe the screenshot or explain your reasoning.
- If you cannot determine what to write, output nothing."""
# Appended to VISION_SYSTEM_PROMPT when the caller supplies app_name (see _build_system_prompt).
APP_CONTEXT_BLOCK = """
The user is currently working in "{app_name}" (window: "{window_title}"). Use this to understand the type of application and adapt your tone and format accordingly."""
# Appended to VISION_SYSTEM_PROMPT when KB search returned context (see _build_system_prompt).
KB_CONTEXT_BLOCK = """
You also have access to the user's knowledge base documents below. Use them to write more accurate, informed, and contextually relevant text. Do NOT cite or reference the documents explicitly — just let the knowledge inform your writing naturally.
<knowledge_base>
{kb_context}
</knowledge_base>"""
# Stable step id for the "preparation" progress indicator in the SSE stream.
PREP_STEP_ID = "autocomplete-prep"
def _build_system_prompt(app_name: str, window_title: str, kb_context: str) -> str:
    """Compose the vision system prompt, appending optional app and KB blocks.

    The base prompt is always included; the app block is added only when an
    app name is known, and the KB block only when KB context text is present.
    """
    sections = [VISION_SYSTEM_PROMPT]
    if app_name:
        sections.append(
            APP_CONTEXT_BLOCK.format(app_name=app_name, window_title=window_title)
        )
    if kb_context:
        sections.append(KB_CONTEXT_BLOCK.format(kb_context=kb_context))
    return "".join(sections)
def _derive_kb_query(app_name: str, window_title: str) -> str:
parts = [p for p in (window_title, app_name) if p]
return " ".join(parts)
def _is_vision_unsupported_error(e: Exception) -> bool:
"""Check if an exception indicates the model doesn't support vision/images."""
msg = str(e).lower()
return "content must be a string" in msg or "does not support image" in msg
async def _extract_query_from_screenshot(
    llm, screenshot_data_url: str,
    app_name: str = "", window_title: str = "",
) -> str | None:
    """Ask the vision LLM to describe what the user is working on.

    Returns the description text, or None when the call fails or the model
    produced nothing usable. Vision-unsupported errors are re-raised so the
    caller can return a friendly message immediately instead of retrying
    with astream.
    """
    prompt_text = (
        EXTRACT_QUERY_PROMPT_WITH_APP.format(
            app_name=app_name, window_title=window_title,
        )
        if app_name
        else EXTRACT_QUERY_PROMPT
    )
    message = HumanMessage(content=[
        {"type": "text", "text": prompt_text},
        {"type": "image_url", "image_url": {"url": screenshot_data_url}},
    ])
    try:
        response = await llm.ainvoke([message])
        # Non-standard responses without .content fall through to None.
        description = response.content.strip() if hasattr(response, "content") else ""
        return description or None
    except Exception as e:
        if _is_vision_unsupported_error(e):
            raise
        logger.warning(f"Failed to extract query from screenshot: {e}")
        return None
async def _search_knowledge_base(
    session: AsyncSession, search_space_id: int, query: str
) -> str:
    """Run a hybrid KB search and format the hits as a size-capped context block.

    Chunks are rendered as "[title]\\nbody" entries joined by separators, and
    accumulation stops once KB_MAX_CHARS would be exceeded. Returns "" when
    there are no hits or the search fails — KB context is strictly best-effort.
    """
    try:
        retriever = ChucksHybridSearchRetriever(session)
        hits = await retriever.hybrid_search(
            query_text=query,
            top_k=KB_TOP_K,
            search_space_id=search_space_id,
        )
        if not hits:
            return ""
        entries: list[str] = []
        used = 0
        for doc in hits:
            doc_title = doc.get("document", {}).get("title", "Untitled")
            for chunk in doc.get("chunks", []):
                body = chunk.get("content", "").strip()
                if not body:
                    continue
                rendered = f"[{doc_title}]\n{body}"
                # Too big to fit: stop taking chunks from this document.
                if used + len(rendered) > KB_MAX_CHARS:
                    break
                entries.append(rendered)
                used += len(rendered)
            # Budget exhausted: stop scanning further documents.
            if used >= KB_MAX_CHARS:
                break
        return "\n\n---\n\n".join(entries)
    except Exception as e:
        logger.warning(f"KB search failed, proceeding without context: {e}")
        return ""
# ---------------------------------------------------------------------------
# Main entry point
# ---------------------------------------------------------------------------
async def stream_vision_autocomplete(
@@ -144,13 +45,7 @@ async def stream_vision_autocomplete(
app_name: str = "",
window_title: str = "",
) -> AsyncGenerator[str, None]:
"""Analyze a screenshot with the vision LLM and stream a text completion.
Pipeline:
1. Extract a search query from the screenshot (non-streaming)
2. Search the knowledge base for relevant context
3. Stream the final completion with screenshot + KB + app context
"""
"""Analyze a screenshot with a vision-LLM agent and stream a text completion."""
streaming = VercelStreamingService()
vision_error_msg = (
"The selected model does not support vision. "
@@ -164,62 +59,89 @@
yield streaming.format_done()
return
kb_context = ""
# Start SSE stream immediately so the UI has something to show
yield streaming.format_message_start()
kb_query = _derive_kb_query(app_name, window_title)
# Show a preparation step while KB search + agent compile run
yield streaming.format_thinking_step(
step_id=PREP_STEP_ID,
title="Searching knowledge base",
status="in_progress",
items=[kb_query] if kb_query else [],
)
try:
query = await _extract_query_from_screenshot(
llm, screenshot_data_url, app_name=app_name, window_title=window_title,
agent, kb = await create_autocomplete_agent(
llm,
search_space_id=search_space_id,
kb_query=kb_query,
app_name=app_name,
window_title=window_title,
)
except Exception as e:
logger.warning(f"Vision autocomplete: selected model does not support vision: {e}")
yield streaming.format_message_start()
yield streaming.format_error(vision_error_msg)
if _is_vision_unsupported_error(e):
logger.warning("Vision autocomplete: model does not support vision: %s", e)
yield streaming.format_error(vision_error_msg)
yield streaming.format_done()
return
logger.error("Failed to create autocomplete agent: %s", e, exc_info=True)
yield streaming.format_error("Autocomplete failed. Please try again.")
yield streaming.format_done()
return
if query:
kb_context = await _search_knowledge_base(session, search_space_id, query)
has_kb = kb.has_documents
doc_count = len(kb.files) if has_kb else 0 # type: ignore[arg-type]
system_prompt = _build_system_prompt(app_name, window_title, kb_context)
yield streaming.format_thinking_step(
step_id=PREP_STEP_ID,
title="Searching knowledge base",
status="complete",
items=[f"Found {doc_count} document{'s' if doc_count != 1 else ''}"] if kb_query else ["Skipped"],
)
messages = [
SystemMessage(content=system_prompt),
HumanMessage(content=[
{
"type": "text",
"text": "Analyze this screenshot. Understand the full context of what the user is working on, then generate the text they most likely want to write in the active text area.",
},
{
"type": "image_url",
"image_url": {"url": screenshot_data_url},
},
]),
]
# Build agent input with pre-computed KB as initial state
if has_kb:
instruction = (
"Analyze this screenshot, then explore the knowledge base documents "
"listed above — read the chunk index of any document whose title "
"looks relevant and check matched chunks for useful facts. "
"Finally, generate a concise autocomplete for the active text area, "
"enhanced with any relevant KB information you found."
)
else:
instruction = (
"Analyze this screenshot and generate a concise autocomplete "
"for the active text area based on what you see."
)
text_started = False
text_id = ""
user_message = HumanMessage(content=[
{"type": "text", "text": instruction},
{"type": "image_url", "image_url": {"url": screenshot_data_url}},
])
input_data: dict = {"messages": [user_message]}
if has_kb:
input_data["files"] = kb.files
input_data["messages"] = [kb.ls_ai_msg, kb.ls_tool_msg, user_message]
logger.info("Autocomplete: injected %d KB files into agent initial state", doc_count)
else:
logger.info("Autocomplete: no KB documents found, proceeding with screenshot only")
# Stream the agent (message_start already sent above)
try:
yield streaming.format_message_start()
text_id = streaming.generate_text_id()
yield streaming.format_text_start(text_id)
text_started = True
async for chunk in llm.astream(messages):
token = chunk.content if hasattr(chunk, "content") else str(chunk)
if token:
yield streaming.format_text_delta(text_id, token)
yield streaming.format_text_end(text_id)
yield streaming.format_finish()
yield streaming.format_done()
async for sse in stream_autocomplete_agent(
agent, input_data, streaming, emit_message_start=False,
):
yield sse
except Exception as e:
if text_started:
yield streaming.format_text_end(text_id)
if _is_vision_unsupported_error(e):
logger.warning(f"Vision autocomplete: selected model does not support vision: {e}")
logger.warning("Vision autocomplete: model does not support vision: %s", e)
yield streaming.format_error(vision_error_msg)
yield streaming.format_done()
else:
logger.error(f"Vision autocomplete streaming error: {e}", exc_info=True)
logger.error("Vision autocomplete streaming error: %s", e, exc_info=True)
yield streaming.format_error("Autocomplete failed. Please try again.")
yield streaming.format_done()
yield streaming.format_done()