mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-28 18:36:23 +02:00
Merge pull request #1130 from CREDO23/feat/vision-autocomplete
[Feat] Vision-based autocomplete with KB grounding
This commit is contained in:
commit
74bf3df880
32 changed files with 1482 additions and 33 deletions
|
|
@ -32,6 +32,7 @@ logger = logging.getLogger(__name__)
|
|||
class LLMRole:
    """Role names selecting which per-search-space LLM configuration to use."""

    AGENT = "agent"  # For agent/chat operations
    DOCUMENT_SUMMARY = "document_summary"  # For document summarization
    VISION = "vision"  # For vision/screenshot analysis
|
||||
|
||||
|
||||
def get_global_llm_config(llm_config_id: int) -> dict | None:
|
||||
|
|
@ -187,7 +188,7 @@ async def get_search_space_llm_instance(
|
|||
Args:
|
||||
session: Database session
|
||||
search_space_id: Search Space ID
|
||||
role: LLM role ('agent' or 'document_summary')
|
||||
role: LLM role ('agent', 'document_summary', or 'vision')
|
||||
|
||||
Returns:
|
||||
ChatLiteLLM or ChatLiteLLMRouter instance, or None if not found
|
||||
|
|
@ -209,6 +210,8 @@ async def get_search_space_llm_instance(
|
|||
llm_config_id = search_space.agent_llm_id
|
||||
elif role == LLMRole.DOCUMENT_SUMMARY:
|
||||
llm_config_id = search_space.document_summary_llm_id
|
||||
elif role == LLMRole.VISION:
|
||||
llm_config_id = search_space.vision_llm_id
|
||||
else:
|
||||
logger.error(f"Invalid LLM role: {role}")
|
||||
return None
|
||||
|
|
@ -405,6 +408,13 @@ async def get_document_summary_llm(
|
|||
)
|
||||
|
||||
|
||||
async def get_vision_llm(
    session: AsyncSession, search_space_id: int
) -> ChatLiteLLM | ChatLiteLLMRouter | None:
    """Resolve the vision-role LLM configured for a search space.

    Thin convenience wrapper around ``get_search_space_llm_instance`` with
    the ``LLMRole.VISION`` role; yields ``None`` when no vision model is
    configured for the search space.
    """
    instance = await get_search_space_llm_instance(
        session, search_space_id, LLMRole.VISION
    )
    return instance
|
||||
|
||||
|
||||
# Backward-compatible alias (LLM preferences are now per-search-space, not per-user)
|
||||
async def get_user_long_context_llm(
|
||||
session: AsyncSession,
|
||||
|
|
|
|||
225
surfsense_backend/app/services/vision_autocomplete_service.py
Normal file
225
surfsense_backend/app/services/vision_autocomplete_service.py
Normal file
|
|
@ -0,0 +1,225 @@
|
|||
import logging
from typing import AsyncGenerator

from langchain_core.messages import HumanMessage, SystemMessage
from sqlalchemy.ext.asyncio import AsyncSession

from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever
from app.services.llm_service import get_vision_llm
from app.services.new_streaming_service import VercelStreamingService

# Module-level logger for the vision autocomplete pipeline.
logger = logging.getLogger(__name__)

# Knowledge-base grounding limits.
KB_TOP_K = 5  # number of hybrid-search results requested per query
KB_MAX_CHARS = 4000  # soft cap on total characters of KB context injected

# Prompt used (non-streaming) to turn the screenshot into a short
# description that doubles as the knowledge-base search query.
EXTRACT_QUERY_PROMPT = """Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""

# Variant of the query-extraction prompt used when the caller also knows
# the foreground application name and window title.
EXTRACT_QUERY_PROMPT_WITH_APP = """The user is currently in the application "{app_name}" with the window titled "{window_title}".

Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""

# System prompt for the final, streamed completion pass.
VISION_SYSTEM_PROMPT = """You are a smart writing assistant that analyzes the user's screen to draft or complete text.

You will receive a screenshot of the user's screen. Your job:
1. Analyze the ENTIRE screenshot to understand what the user is working on (email thread, chat conversation, document, code editor, form, etc.).
2. Identify the text area where the user will type.
3. Based on the full visual context, generate the text the user most likely wants to write.

Key behavior:
- If the text area is EMPTY, draft a full response or message based on what you see on screen (e.g., reply to an email, respond to a chat message, continue a document).
- If the text area already has text, continue it naturally.

Rules:
- Output ONLY the text to be inserted. No quotes, no explanations, no meta-commentary.
- Be concise but complete — a full thought, not a fragment.
- Match the tone and formality of the surrounding context.
- If the screen shows code, write code. If it shows a casual chat, be casual. If it shows a formal email, be formal.
- Do NOT describe the screenshot or explain your reasoning.
- If you cannot determine what to write, output nothing."""

# Appended to the system prompt when the foreground app/window is known.
APP_CONTEXT_BLOCK = """

The user is currently working in "{app_name}" (window: "{window_title}"). Use this to understand the type of application and adapt your tone and format accordingly."""

# Appended to the system prompt when knowledge-base context was retrieved.
KB_CONTEXT_BLOCK = """

You also have access to the user's knowledge base documents below. Use them to write more accurate, informed, and contextually relevant text. Do NOT cite or reference the documents explicitly — just let the knowledge inform your writing naturally.

<knowledge_base>
{kb_context}
</knowledge_base>"""
|
||||
|
||||
|
||||
def _build_system_prompt(app_name: str, window_title: str, kb_context: str) -> str:
    """Compose the vision system prompt plus any optional context blocks.

    The app-context block is appended only when *app_name* is non-empty,
    and the knowledge-base block only when *kb_context* is non-empty.
    """
    segments = [VISION_SYSTEM_PROMPT]
    if app_name:
        segments.append(
            APP_CONTEXT_BLOCK.format(app_name=app_name, window_title=window_title)
        )
    if kb_context:
        segments.append(KB_CONTEXT_BLOCK.format(kb_context=kb_context))
    return "".join(segments)
|
||||
|
||||
|
||||
def _is_vision_unsupported_error(e: Exception) -> bool:
|
||||
"""Check if an exception indicates the model doesn't support vision/images."""
|
||||
msg = str(e).lower()
|
||||
return "content must be a string" in msg or "does not support image" in msg
|
||||
|
||||
|
||||
async def _extract_query_from_screenshot(
    llm, screenshot_data_url: str,
    app_name: str = "", window_title: str = "",
) -> str | None:
    """Describe, via the vision LLM, what the user is working on.

    Returns a short description suitable as a knowledge-base search query,
    or ``None`` when the model produced nothing usable or a non-vision
    failure occurred. Vision-unsupported errors are re-raised so the
    caller can return a friendly message immediately instead of retrying
    with astream.
    """
    prompt_text = (
        EXTRACT_QUERY_PROMPT_WITH_APP.format(
            app_name=app_name, window_title=window_title,
        )
        if app_name
        else EXTRACT_QUERY_PROMPT
    )

    try:
        response = await llm.ainvoke([
            HumanMessage(content=[
                {"type": "text", "text": prompt_text},
                {"type": "image_url", "image_url": {"url": screenshot_data_url}},
            ]),
        ])
        # NOTE(review): assumes response.content is a string when present —
        # list-shaped multimodal content would fall into the except path.
        extracted = response.content.strip() if hasattr(response, "content") else ""
    except Exception as e:
        # Only propagate errors that mean "this model cannot see images".
        if _is_vision_unsupported_error(e):
            raise
        logger.warning(f"Failed to extract query from screenshot: {e}")
        return None
    return extracted or None
|
||||
|
||||
|
||||
async def _search_knowledge_base(
    session: AsyncSession, search_space_id: int, query: str
) -> str:
    """Run a hybrid KB search and format the hits as one context string.

    Each hit becomes a "[title]\\n<chunk>" entry; entries are joined with
    "---" separators and capped near KB_MAX_CHARS total characters. Any
    failure is logged and reported as an empty string so autocomplete can
    proceed without KB grounding.
    """
    try:
        retriever = ChucksHybridSearchRetriever(session)
        hits = await retriever.hybrid_search(
            query_text=query,
            top_k=KB_TOP_K,
            search_space_id=search_space_id,
        )
        if not hits:
            return ""

        entries: list[str] = []
        used = 0
        for hit in hits:
            doc_title = hit.get("document", {}).get("title", "Untitled")
            for chunk in hit.get("chunks", []):
                text = chunk.get("content", "").strip()
                if not text:
                    continue
                candidate = f"[{doc_title}]\n{text}"
                # An over-budget entry ends this document's chunks; later
                # documents may still contribute smaller entries.
                if used + len(candidate) > KB_MAX_CHARS:
                    break
                entries.append(candidate)
                used += len(candidate)
            if used >= KB_MAX_CHARS:
                break

        return "\n\n---\n\n".join(entries)
    except Exception as e:
        logger.warning(f"KB search failed, proceeding without context: {e}")
        return ""
|
||||
|
||||
|
||||
async def stream_vision_autocomplete(
    screenshot_data_url: str,
    search_space_id: int,
    session: AsyncSession,
    *,
    app_name: str = "",
    window_title: str = "",
) -> AsyncGenerator[str, None]:
    """Analyze a screenshot with the vision LLM and stream a text completion.

    Pipeline:
    1. Extract a search query from the screenshot (non-streaming)
    2. Search the knowledge base for relevant context
    3. Stream the final completion with screenshot + KB + app context

    Yields Vercel-protocol frames; errors are emitted as error frames
    followed by a done frame rather than raised to the caller.
    """
    vercel = VercelStreamingService()
    vision_error_msg = (
        "The selected model does not support vision. "
        "Please set a vision-capable model (e.g. GPT-4o, Gemini) in your search space settings."
    )

    llm = await get_vision_llm(session, search_space_id)
    if not llm:
        yield vercel.format_message_start()
        yield vercel.format_error("No Vision LLM configured for this search space")
        yield vercel.format_done()
        return

    kb_context = ""
    try:
        query = await _extract_query_from_screenshot(
            llm, screenshot_data_url, app_name=app_name, window_title=window_title,
        )
    except Exception as e:
        # _extract_query_from_screenshot only propagates vision-unsupported
        # errors; everything else it swallows and returns None.
        logger.warning(f"Vision autocomplete: selected model does not support vision: {e}")
        yield vercel.format_message_start()
        yield vercel.format_error(vision_error_msg)
        yield vercel.format_done()
        return

    if query:
        kb_context = await _search_knowledge_base(session, search_space_id, query)

    user_message = HumanMessage(content=[
        {
            "type": "text",
            "text": "Analyze this screenshot. Understand the full context of what the user is working on, then generate the text they most likely want to write in the active text area.",
        },
        {
            "type": "image_url",
            "image_url": {"url": screenshot_data_url},
        },
    ])
    messages = [
        SystemMessage(content=_build_system_prompt(app_name, window_title, kb_context)),
        user_message,
    ]

    text_started = False
    text_id = ""
    try:
        yield vercel.format_message_start()
        text_id = vercel.generate_text_id()
        yield vercel.format_text_start(text_id)
        text_started = True

        async for chunk in llm.astream(messages):
            token = chunk.content if hasattr(chunk, "content") else str(chunk)
            if token:
                yield vercel.format_text_delta(text_id, token)

        yield vercel.format_text_end(text_id)
        yield vercel.format_finish()
        yield vercel.format_done()

    except Exception as e:
        # Close the open text part before emitting the error frame.
        if text_started:
            yield vercel.format_text_end(text_id)

        if _is_vision_unsupported_error(e):
            logger.warning(f"Vision autocomplete: selected model does not support vision: {e}")
            yield vercel.format_error(vision_error_msg)
        else:
            logger.error(f"Vision autocomplete streaming error: {e}", exc_info=True)
            yield vercel.format_error("Autocomplete failed. Please try again.")
        yield vercel.format_done()
|
||||
Loading…
Add table
Add a link
Reference in a new issue