add app context and KB grounding to autocomplete suggestions

2026-07-24 23:41:10 +02:00 · 2026-04-03 21:34:01 +02:00 · 2026-04-03 21:34:01 +02:00 · 960b8fc012
commit 960b8fc012
parent 080acf5e0a
7 changed files with 150 additions and 9 deletions
--- a/surfsense_backend/app/routes/autocomplete_routes.py
+++ b/surfsense_backend/app/routes/autocomplete_routes.py
@ -14,6 +14,8 @@ router = APIRouter(prefix="/autocomplete", tags=["autocomplete"])
 class VisionAutocompleteRequest(BaseModel):
    screenshot: str
    search_space_id: int
+    app_name: str = ""  # optional; forwarded to stream_vision_autocomplete, "" when the client sends none
+    window_title: str = ""  # optional; forwarded alongside app_name, "" when the client sends none


@router.post("/vision/stream")
@ -23,7 +25,10 @@ async def vision_autocomplete_stream(
    session: AsyncSession = Depends(get_async_session),
 ):
    return StreamingResponse(
-        stream_vision_autocomplete(body.screenshot, body.search_space_id, session),
+        stream_vision_autocomplete(
+            body.screenshot, body.search_space_id, session,
+            app_name=body.app_name, window_title=body.window_title,
+        ),
        media_type="text/event-stream",
        headers={
            **VercelStreamingService.get_response_headers(),
--- a/surfsense_backend/app/services/vision_autocomplete_service.py
+++ b/surfsense_backend/app/services/vision_autocomplete_service.py
@ -4,11 +4,21 @@ from typing import AsyncGenerator
 from langchain_core.messages import HumanMessage, SystemMessage
 from sqlalchemy.ext.asyncio import AsyncSession

+from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever
 from app.services.llm_service import get_vision_llm
 from app.services.new_streaming_service import VercelStreamingService

 logger = logging.getLogger(__name__)

+KB_TOP_K = 5  # passed as top_k to the hybrid search in _search_knowledge_base
+KB_MAX_CHARS = 4000  # cap on the summed length of KB entries (joiners between entries are not counted)
+
+EXTRACT_QUERY_PROMPT = """Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""  # used when no app_name is supplied
+
+EXTRACT_QUERY_PROMPT_WITH_APP = """The user is currently in the application "{app_name}" with the window titled "{window_title}".
+
+Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""  # str.format()-ed with app_name/window_title when app_name is non-empty
+
 VISION_SYSTEM_PROMPT = """You are a smart writing assistant that analyzes the user's screen to draft or complete text.

 You will receive a screenshot of the user's screen. Your job:
@ -28,13 +38,107 @@ Rules:
 - Do NOT describe the screenshot or explain your reasoning.
 - If you cannot determine what to write, output nothing."""

+APP_CONTEXT_BLOCK = """
+
+The user is currently working in "{app_name}" (window: "{window_title}"). Use this to understand the type of application and adapt your tone and format accordingly."""  # appended by _build_system_prompt when app_name is non-empty
+
+KB_CONTEXT_BLOCK = """
+
+You also have access to the user's knowledge base documents below. Use them to write more accurate, informed, and contextually relevant text. Do NOT cite or reference the documents explicitly — just let the knowledge inform your writing naturally.
+
+<knowledge_base>
+{kb_context}
+</knowledge_base>"""  # appended by _build_system_prompt when kb_context is non-empty
+
+
+def _build_system_prompt(app_name: str, window_title: str, kb_context: str) -> str:
+    """Return VISION_SYSTEM_PROMPT with the app and KB blocks appended when non-empty."""
+    prompt = VISION_SYSTEM_PROMPT
+    if app_name:  # gate is app_name only: a window_title without an app_name adds nothing
+        prompt += APP_CONTEXT_BLOCK.format(app_name=app_name, window_title=window_title)
+    if kb_context:  # "" (no hits / failed search) leaves the prompt without a KB section
+        prompt += KB_CONTEXT_BLOCK.format(kb_context=kb_context)
+    return prompt
+
+
+async def _extract_query_from_screenshot(
+    llm, screenshot_data_url: str,
+    app_name: str = "", window_title: str = "",
+) -> str | None:
+    """Ask the vision LLM what the user is working on; return None on failure or empty reply."""
+    if app_name:  # window_title is only used together with a non-empty app_name
+        prompt_text = EXTRACT_QUERY_PROMPT_WITH_APP.format(
+            app_name=app_name, window_title=window_title,
+        )
+    else:
+        prompt_text = EXTRACT_QUERY_PROMPT
+
+    try:
+        response = await llm.ainvoke([  # one awaited call carrying the prompt text and the screenshot
+            HumanMessage(content=[
+                {"type": "text", "text": prompt_text},
+                {"type": "image_url", "image_url": {"url": screenshot_data_url}},
+            ]),
+        ])
+        query = response.content.strip() if hasattr(response, "content") else ""  # assumes content is a str - TODO confirm
+        return query if query else None  # an empty/whitespace-only reply is reported as None
+    except Exception as e:  # best-effort: log and return None instead of failing the request
+        logger.warning(f"Failed to extract query from screenshot: {e}")
+        return None
+
+
+async def _search_knowledge_base(
+    session: AsyncSession, search_space_id: int, query: str
+) -> str:
+    """Hybrid-search the KB and return matching chunks as one context string ("" if none or on error)."""
+    try:
+        retriever = ChucksHybridSearchRetriever(session)
+        results = await retriever.hybrid_search(
+            query_text=query,
+            top_k=KB_TOP_K,
+            search_space_id=search_space_id,
+        )
+
+        if not results:
+            return ""
+
+        parts: list[str] = []
+        char_count = 0  # running total of entry lengths only
+        for doc in results:
+            title = doc.get("document", {}).get("title", "Untitled")  # result shape assumed from usage - TODO confirm
+            for chunk in doc.get("chunks", []):
+                content = chunk.get("content", "").strip()
+                if not content:
+                    continue
+                entry = f"[{title}]\n{content}"
+                if char_count + len(entry) > KB_MAX_CHARS:  # the joiners added below are not part of this budget
+                    break  # only leaves this document; later documents may still contribute smaller chunks
+                parts.append(entry)
+                char_count += len(entry)
+            if char_count >= KB_MAX_CHARS:  # NOTE(review): char_count never exceeds the cap, so this fires only on an exact fit
+                break
+
+        return "\n\n---\n\n".join(parts)
+    except Exception as e:  # best-effort: a failed search degrades to "no KB context"
+        logger.warning(f"KB search failed, proceeding without context: {e}")
+        return ""
+

 async def stream_vision_autocomplete(
    screenshot_data_url: str,
    search_space_id: int,
    session: AsyncSession,
+    *,
+    app_name: str = "",
+    window_title: str = "",
 ) -> AsyncGenerator[str, None]:
-    """Analyze a screenshot with the vision LLM and stream a text completion."""
+    """Analyze a screenshot with the vision LLM and stream a text completion.
+
+    Pipeline:
+    1. Extract a search query from the screenshot (non-streaming)
+    2. Search the knowledge base for relevant context
+    3. Stream the final completion with screenshot + KB + app context
+    """
    streaming = VercelStreamingService()

    llm = await get_vision_llm(session, search_space_id)
@ -44,8 +148,17 @@ async def stream_vision_autocomplete(
        yield streaming.format_done()
        return

+    kb_context = ""
+    query = await _extract_query_from_screenshot(
+        llm, screenshot_data_url, app_name=app_name, window_title=window_title,
+    )
+    if query:
+        kb_context = await _search_knowledge_base(session, search_space_id, query)
+
+    system_prompt = _build_system_prompt(app_name, window_title, kb_context)
+
    messages = [
-        SystemMessage(content=VISION_SYSTEM_PROMPT),
+        SystemMessage(content=system_prompt),
        HumanMessage(content=[
            {
                "type": "text",
--- a/surfsense_desktop/src/modules/autocomplete/index.ts
+++ b/surfsense_desktop/src/modules/autocomplete/index.ts
@ -1,6 +1,6 @@
 import { clipboard, globalShortcut, ipcMain, screen } from 'electron';
 import { IPC_CHANNELS } from '../../ipc/channels';
-import { getFrontmostApp, hasAccessibilityPermission, simulatePaste } from '../platform';
+import { getFrontmostApp, getWindowTitle, hasAccessibilityPermission, simulatePaste } from '../platform';
 import { hasScreenRecordingPermission, requestAccessibility, requestScreenRecording } from '../permissions';
 import { getMainWindow } from '../window';
 import { captureScreen } from './screenshot';
@ -27,6 +27,7 @@ async function triggerAutocomplete(): Promise<void> {
  }

  sourceApp = getFrontmostApp();
+  const windowTitle = getWindowTitle();
  savedClipboard = clipboard.readText();

  const screenshot = await captureScreen();
@ -55,6 +56,8 @@ async function triggerAutocomplete(): Promise<void> {
        sw.webContents.send(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, {
          screenshot,
          searchSpaceId,
+          appName: sourceApp,
+          windowTitle,
        });
      }
    }, 300);
--- a/surfsense_desktop/src/modules/platform.ts
+++ b/surfsense_desktop/src/modules/platform.ts
@ -32,6 +32,24 @@ export function checkAccessibilityPermission(): boolean {
  return systemPreferences.isTrustedAccessibilityClient(true);
 }

+export function getWindowTitle(): string {  // title of the frontmost window, or '' when it cannot be determined
+  try {
+    if (process.platform === 'darwin') {
+      return execSync(  // synchronous call (presumably child_process.execSync): blocks until osascript exits
+        'osascript -e \'tell application "System Events" to get title of front window of first application process whose frontmost is true\''
+      ).toString().trim();
+    }
+    if (process.platform === 'win32') {
+      return execSync(  // synchronous as well; asks PowerShell for the process owning the foreground window handle
+        'powershell -command "(Get-Process | Where-Object { $_.MainWindowHandle -eq (Add-Type -MemberDefinition \'[DllImport(\\\"user32.dll\\\")] public static extern IntPtr GetForegroundWindow();\' -Name W -PassThru)::GetForegroundWindow() }).MainWindowTitle"'
+      ).toString().trim();
+    }
+  } catch {  // any failure of the external command is swallowed and reported as ''
+    return '';
+  }
+  return '';  // platforms other than darwin/win32 have no lookup here
+}
+
 export function hasAccessibilityPermission(): boolean {
  if (process.platform !== 'darwin') return true;
  return systemPreferences.isTrustedAccessibilityClient(false);
--- a/surfsense_desktop/src/preload.ts
+++ b/surfsense_desktop/src/preload.ts
@ -27,8 +27,8 @@ contextBridge.exposeInMainWorld('electronAPI', {
  requestScreenRecording: () => ipcRenderer.invoke(IPC_CHANNELS.REQUEST_SCREEN_RECORDING),
  restartApp: () => ipcRenderer.invoke(IPC_CHANNELS.RESTART_APP),
  // Autocomplete
-  onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string }) => void) => {
-    const listener = (_event: unknown, data: { screenshot: string; searchSpaceId?: string }) => callback(data);
+  onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => void) => {
+    const listener = (_event: unknown, data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => callback(data);
    ipcRenderer.on(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener);
    return () => {
      ipcRenderer.removeListener(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener);
--- a/surfsense_web/app/desktop/suggestion/page.tsx
+++ b/surfsense_web/app/desktop/suggestion/page.tsx
@ -46,7 +46,7 @@ export default function SuggestionPage() {
 	}, [error]);

 	const fetchSuggestion = useCallback(
-		async (screenshot: string, searchSpaceId: string) => {
+		async (screenshot: string, searchSpaceId: string, appName?: string, windowTitle?: string) => {
 			abortRef.current?.abort();
 			const controller = new AbortController();
 			abortRef.current = controller;
@ -77,6 +77,8 @@ export default function SuggestionPage() {
 						body: JSON.stringify({
 							screenshot,
 							search_space_id: parseInt(searchSpaceId, 10),
+							app_name: appName || "",
+							window_title: windowTitle || "",
 						}),
 						signal: controller.signal,
 					},
@ -142,7 +144,7 @@ export default function SuggestionPage() {
 		const cleanup = window.electronAPI.onAutocompleteContext((data) => {
 			const searchSpaceId = data.searchSpaceId || "1";
 			if (data.screenshot) {
-				fetchSuggestion(data.screenshot, searchSpaceId);
+				fetchSuggestion(data.screenshot, searchSpaceId, data.appName, data.windowTitle);
 			}
 		});

--- a/surfsense_web/types/window.d.ts
+++ b/surfsense_web/types/window.d.ts
@ -23,7 +23,7 @@ interface ElectronAPI {
 	requestScreenRecording: () => Promise<void>;
 	restartApp: () => Promise<void>;
 	// Autocomplete
-	onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string }) => void) => () => void;
+	onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => void) => () => void;
 	acceptSuggestion: (text: string) => Promise<void>;
 	dismissSuggestion: () => Promise<void>;
 	setAutocompleteEnabled: (enabled: boolean) => Promise<void>;