replace text-based autocomplete with vision-based endpoint

2026-07-24 23:41:10 +02:00 · 2026-04-03 18:17:50 +02:00 · 2026-04-03 18:17:50 +02:00 · aeb3f13f91
commit aeb3f13f91
parent ced7f7562a
6 changed files with 102 additions and 133 deletions
--- a/surfsense_backend/app/routes/autocomplete_routes.py
+++ b/surfsense_backend/app/routes/autocomplete_routes.py
@ -1,28 +1,29 @@
-from fastapi import APIRouter, Depends, Query
+from fastapi import APIRouter, Depends
 from fastapi.responses import StreamingResponse
+from pydantic import BaseModel
 from sqlalchemy.ext.asyncio import AsyncSession

 from app.db import User, get_async_session
-from app.services.autocomplete_service import stream_autocomplete
 from app.services.new_streaming_service import VercelStreamingService
+from app.services.vision_autocomplete_service import stream_vision_autocomplete
 from app.users import current_active_user

 router = APIRouter(prefix="/autocomplete", tags=["autocomplete"])


-@router.post("/stream")
-async def autocomplete_stream(
-    text: str = Query(..., description="Current text in the input field"),
-    cursor_position: int = Query(-1, description="Cursor position in the text (-1 for end)"),
-    search_space_id: int = Query(..., description="Search space ID for KB context and LLM config"),
+class VisionAutocompleteRequest(BaseModel):
+    screenshot: str
+    search_space_id: int
+
+
+@router.post("/vision/stream")
+async def vision_autocomplete_stream(
+    body: VisionAutocompleteRequest,
    user: User = Depends(current_active_user),
    session: AsyncSession = Depends(get_async_session),
 ):
-    if cursor_position < 0:
-        cursor_position = len(text)
-
    return StreamingResponse(
-        stream_autocomplete(text, cursor_position, search_space_id, session),
+        stream_vision_autocomplete(body.screenshot, body.search_space_id, session),
        media_type="text/event-stream",
        headers={
            **VercelStreamingService.get_response_headers(),
--- a/surfsense_backend/app/services/autocomplete_service.py
+++ b/surfsense_backend/app/services/autocomplete_service.py
@ -1,110 +0,0 @@
-import logging
-from typing import AsyncGenerator
-
-from langchain_core.messages import HumanMessage, SystemMessage
-from sqlalchemy.ext.asyncio import AsyncSession
-
-from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever
-from app.services.llm_service import get_agent_llm
-from app.services.new_streaming_service import VercelStreamingService
-
-logger = logging.getLogger(__name__)
-
-SYSTEM_PROMPT = """You are an inline text autocomplete engine. Your job is to complete the user's text naturally.
-
-Rules:
- Output ONLY the continuation text. Do NOT repeat what the user already typed.
- Keep completions concise: 1-3 sentences maximum.
- Match the user's tone, style, and language.
- If knowledge base context is provided, use it to make the completion factually accurate and personalized.
- Do NOT add quotes, explanations, or meta-commentary.
- Do NOT start with a space unless grammatically required.
- If you cannot produce a useful completion, output nothing."""
-
-KB_CONTEXT_TEMPLATE = """
-Relevant knowledge base context (use this to personalize the completion):
---
-{kb_context}
---
-"""
-
-
-async def _retrieve_kb_context(
-    session: AsyncSession,
-    text: str,
-    search_space_id: int,
-) -> str:
-    try:
-        retriever = ChucksHybridSearchRetriever(session)
-        chunks = await retriever.vector_search(
-            query_text=text[-200:],
-            top_k=3,
-            search_space_id=search_space_id,
-        )
-        if not chunks:
-            return ""
-        snippets = []
-        for chunk in chunks:
-            content = getattr(chunk, "content", None) or getattr(chunk, "chunk_text", "")
-            if content:
-                snippets.append(content[:300])
-        if not snippets:
-            return ""
-        return KB_CONTEXT_TEMPLATE.format(kb_context="\n\n".join(snippets))
-    except Exception as e:
-        logger.warning(f"KB search failed for autocomplete, proceeding without context: {e}")
-        return ""
-
-
-async def stream_autocomplete(
-    text: str,
-    cursor_position: int,
-    search_space_id: int,
-    session: AsyncSession,
-) -> AsyncGenerator[str, None]:
-    """Build context, call the LLM, and yield SSE-formatted tokens."""
-    streaming = VercelStreamingService()
-    text_before_cursor = text[:cursor_position] if cursor_position >= 0 else text
-
-    if not text_before_cursor.strip():
-        yield streaming.format_message_start()
-        yield streaming.format_finish()
-        yield streaming.format_done()
-        return
-
-    kb_context = await _retrieve_kb_context(session, text_before_cursor, search_space_id)
-
-    llm = await get_agent_llm(session, search_space_id)
-    if not llm:
-        yield streaming.format_message_start()
-        yield streaming.format_error("No LLM configured for this search space")
-        yield streaming.format_done()
-        return
-
-    system_prompt = SYSTEM_PROMPT
-    if kb_context:
-        system_prompt += kb_context
-
-    messages = [
-        SystemMessage(content=system_prompt),
-        HumanMessage(content=f"Complete this text:\n{text_before_cursor}"),
-    ]
-
-    try:
-        yield streaming.format_message_start()
-        text_id = streaming.generate_text_id()
-        yield streaming.format_text_start(text_id)
-
-        async for chunk in llm.astream(messages):
-            token = chunk.content if hasattr(chunk, "content") else str(chunk)
-            if token:
-                yield streaming.format_text_delta(text_id, token)
-
-        yield streaming.format_text_end(text_id)
-        yield streaming.format_finish()
-        yield streaming.format_done()
-
-    except Exception as e:
-        logger.error(f"Autocomplete streaming error: {e}")
-        yield streaming.format_error(str(e))
-        yield streaming.format_done()
--- a/surfsense_backend/app/services/vision_autocomplete_service.py
+++ b/surfsense_backend/app/services/vision_autocomplete_service.py
@ -0,0 +1,78 @@
+import logging
+from typing import AsyncGenerator
+
+from langchain_core.messages import HumanMessage, SystemMessage
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.services.llm_service import get_vision_llm
+from app.services.new_streaming_service import VercelStreamingService
+
+logger = logging.getLogger(__name__)
+
+VISION_SYSTEM_PROMPT = """You are a smart writing assistant that analyzes the user's screen to draft or complete text.
+
+You will receive a screenshot of the user's screen. Your job:
+1. Analyze the ENTIRE screenshot to understand what the user is working on (email thread, chat conversation, document, code editor, form, etc.).
+2. Identify the text area where the user will type.
+3. Based on the full visual context, generate the text the user most likely wants to write.
+
+Key behavior:
+- If the text area is EMPTY, draft a full response or message based on what you see on screen (e.g., reply to an email, respond to a chat message, continue a document).
+- If the text area already has text, continue it naturally.
+
+Rules:
+- Output ONLY the text to be inserted. No quotes, no explanations, no meta-commentary.
+- Be concise but complete — a full thought, not a fragment.
+- Match the tone and formality of the surrounding context.
+- If the screen shows code, write code. If it shows a casual chat, be casual. If it shows a formal email, be formal.
+- Do NOT describe the screenshot or explain your reasoning.
+- If you cannot determine what to write, output nothing."""
+
+
+async def stream_vision_autocomplete(
+    screenshot_data_url: str,
+    search_space_id: int,
+    session: AsyncSession,
+) -> AsyncGenerator[str, None]:
+    """Analyze a screenshot with the vision LLM and stream a text completion."""
+    streaming = VercelStreamingService()
+
+    llm = await get_vision_llm(session, search_space_id)
+    if not llm:
+        yield streaming.format_message_start()
+        yield streaming.format_error("No Vision LLM configured for this search space")
+        yield streaming.format_done()
+        return
+
+    messages = [
+        SystemMessage(content=VISION_SYSTEM_PROMPT),
+        HumanMessage(content=[
+            {
+                "type": "text",
+                "text": "Analyze this screenshot. Understand the full context of what the user is working on, then generate the text they most likely want to write in the active text area.",
+            },
+            {
+                "type": "image_url",
+                "image_url": {"url": screenshot_data_url},
+            },
+        ]),
+    ]
+
+    try:
+        yield streaming.format_message_start()
+        text_id = streaming.generate_text_id()
+        yield streaming.format_text_start(text_id)
+
+        async for chunk in llm.astream(messages):
+            token = chunk.content if hasattr(chunk, "content") else str(chunk)
+            if token:
+                yield streaming.format_text_delta(text_id, token)
+
+        yield streaming.format_text_end(text_id)
+        yield streaming.format_finish()
+        yield streaming.format_done()
+
+    except Exception as e:
+        logger.error(f"Vision autocomplete streaming error: {e}")
+        yield streaming.format_error(str(e))
+        yield streaming.format_done()
--- a/surfsense_desktop/src/preload.ts
+++ b/surfsense_desktop/src/preload.ts
@ -26,8 +26,8 @@ contextBridge.exposeInMainWorld('electronAPI', {
  requestAccessibility: () => ipcRenderer.invoke(IPC_CHANNELS.REQUEST_ACCESSIBILITY),
  restartApp: () => ipcRenderer.invoke(IPC_CHANNELS.RESTART_APP),
  // Autocomplete
-  onAutocompleteContext: (callback: (data: { text: string; cursorPosition: number; searchSpaceId?: string }) => void) => {
-    const listener = (_event: unknown, data: { text: string; cursorPosition: number; searchSpaceId?: string }) => callback(data);
+  onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string }) => void) => {
+    const listener = (_event: unknown, data: { screenshot: string; searchSpaceId?: string }) => callback(data);
    ipcRenderer.on(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener);
    return () => {
      ipcRenderer.removeListener(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener);
--- a/surfsense_web/app/desktop/suggestion/page.tsx
+++ b/surfsense_web/app/desktop/suggestion/page.tsx
@ -18,7 +18,7 @@ export default function SuggestionPage() {
 	const abortRef = useRef<AbortController | null>(null);

 	const fetchSuggestion = useCallback(
-		async (text: string, cursorPosition: number, searchSpaceId: string) => {
+		async (screenshot: string, searchSpaceId: string) => {
 			abortRef.current?.abort();
 			const controller = new AbortController();
 			abortRef.current = controller;
@ -37,21 +37,19 @@ export default function SuggestionPage() {
 			const backendUrl =
 				process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000";

-			const params = new URLSearchParams({
-				text,
-				cursor_position: String(cursorPosition),
-				search_space_id: searchSpaceId,
-			});
-
 			try {
 				const response = await fetch(
-					`${backendUrl}/api/v1/autocomplete/stream?${params}`,
+					`${backendUrl}/api/v1/autocomplete/vision/stream`,
 					{
 						method: "POST",
 						headers: {
 							Authorization: `Bearer ${token}`,
 							"Content-Type": "application/json",
 						},
+						body: JSON.stringify({
+							screenshot,
+							search_space_id: parseInt(searchSpaceId, 10),
+						}),
 						signal: controller.signal,
 					},
 				);
@ -119,7 +117,9 @@ export default function SuggestionPage() {

 		const cleanup = window.electronAPI.onAutocompleteContext((data) => {
 			const searchSpaceId = data.searchSpaceId || "1";
-			fetchSuggestion(data.text, data.cursorPosition, searchSpaceId);
+			if (data.screenshot) {
+				fetchSuggestion(data.screenshot, searchSpaceId);
+			}
 		});

 		return cleanup;
--- a/surfsense_web/types/window.d.ts
+++ b/surfsense_web/types/window.d.ts
@ -21,7 +21,7 @@ interface ElectronAPI {
 	requestAccessibility: () => Promise<void>;
 	restartApp: () => Promise<void>;
 	// Autocomplete
-	onAutocompleteContext: (callback: (data: { text: string; cursorPosition: number; searchSpaceId?: string }) => void) => () => void;
+	onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string }) => void) => () => void;
 	acceptSuggestion: (text: string) => Promise<void>;
 	dismissSuggestion: () => Promise<void>;
 	updateSuggestionText: (text: string) => Promise<void>;