diff --git a/surfsense_backend/app/routes/autocomplete_routes.py b/surfsense_backend/app/routes/autocomplete_routes.py
index 329476ca1..a6f66f408 100644
--- a/surfsense_backend/app/routes/autocomplete_routes.py
+++ b/surfsense_backend/app/routes/autocomplete_routes.py
@@ -14,6 +14,8 @@ router = APIRouter(prefix="/autocomplete", tags=["autocomplete"])
 class VisionAutocompleteRequest(BaseModel):
     screenshot: str
     search_space_id: int
+    app_name: str = ""
+    window_title: str = ""
 
 
 @router.post("/vision/stream")
@@ -23,7 +25,10 @@ async def vision_autocomplete_stream(
     session: AsyncSession = Depends(get_async_session),
 ):
     return StreamingResponse(
-        stream_vision_autocomplete(body.screenshot, body.search_space_id, session),
+        stream_vision_autocomplete(
+            body.screenshot, body.search_space_id, session,
+            app_name=body.app_name, window_title=body.window_title,
+        ),
         media_type="text/event-stream",
         headers={
             **VercelStreamingService.get_response_headers(),
diff --git a/surfsense_backend/app/services/vision_autocomplete_service.py b/surfsense_backend/app/services/vision_autocomplete_service.py
index 526b0d35c..0804df7fb 100644
--- a/surfsense_backend/app/services/vision_autocomplete_service.py
+++ b/surfsense_backend/app/services/vision_autocomplete_service.py
@@ -4,11 +4,21 @@ from typing import AsyncGenerator
 
 from langchain_core.messages import HumanMessage, SystemMessage
 from sqlalchemy.ext.asyncio import AsyncSession
 
+from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever
 from app.services.llm_service import get_vision_llm
 from app.services.new_streaming_service import VercelStreamingService
 
 logger = logging.getLogger(__name__)
 
+KB_TOP_K = 5
+KB_MAX_CHARS = 4000
+
+EXTRACT_QUERY_PROMPT = """Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""
+
+EXTRACT_QUERY_PROMPT_WITH_APP = """The user is currently in the application "{app_name}" with the window titled "{window_title}".
+
+Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""
+
 VISION_SYSTEM_PROMPT = """You are a smart writing assistant that analyzes the user's screen to draft or complete text.
 
 You will receive a screenshot of the user's screen. Your job:
@@ -28,13 +38,107 @@ Rules:
 - Do NOT describe the screenshot or explain your reasoning.
 - If you cannot determine what to write, output nothing."""
 
+APP_CONTEXT_BLOCK = """
+
+The user is currently working in "{app_name}" (window: "{window_title}"). Use this to understand the type of application and adapt your tone and format accordingly."""
+
+KB_CONTEXT_BLOCK = """
+
+You also have access to the user's knowledge base documents below. Use them to write more accurate, informed, and contextually relevant text. Do NOT cite or reference the documents explicitly — just let the knowledge inform your writing naturally.
+
+{kb_context}
+"""
+
+
+def _build_system_prompt(app_name: str, window_title: str, kb_context: str) -> str:
+    """Assemble the system prompt from optional context blocks."""
+    prompt = VISION_SYSTEM_PROMPT
+    if app_name:
+        prompt += APP_CONTEXT_BLOCK.format(app_name=app_name, window_title=window_title)
+    if kb_context:
+        prompt += KB_CONTEXT_BLOCK.format(kb_context=kb_context)
+    return prompt
+
+
+async def _extract_query_from_screenshot(
+    llm, screenshot_data_url: str,
+    app_name: str = "", window_title: str = "",
+) -> str | None:
+    """Ask the vision LLM to describe what the user is working on."""
+    if app_name:
+        prompt_text = EXTRACT_QUERY_PROMPT_WITH_APP.format(
+            app_name=app_name, window_title=window_title,
+        )
+    else:
+        prompt_text = EXTRACT_QUERY_PROMPT
+
+    try:
+        response = await llm.ainvoke([
+            HumanMessage(content=[
+                {"type": "text", "text": prompt_text},
+                {"type": "image_url", "image_url": {"url": screenshot_data_url}},
+            ]),
+        ])
+        query = response.content.strip() if hasattr(response, "content") else ""
+        return query if query else None
+    except Exception as e:
+        logger.warning(f"Failed to extract query from screenshot: {e}")
+        return None
+
+
+async def _search_knowledge_base(
+    session: AsyncSession, search_space_id: int, query: str
+) -> str:
+    """Search the KB and return a formatted context string."""
+    try:
+        retriever = ChucksHybridSearchRetriever(session)
+        results = await retriever.hybrid_search(
+            query_text=query,
+            top_k=KB_TOP_K,
+            search_space_id=search_space_id,
+        )
+
+        if not results:
+            return ""
+
+        parts: list[str] = []
+        char_count = 0
+        for doc in results:
+            title = doc.get("document", {}).get("title", "Untitled")
+            for chunk in doc.get("chunks", []):
+                content = chunk.get("content", "").strip()
+                if not content:
+                    continue
+                entry = f"[{title}]\n{content}"
+                if char_count + len(entry) > KB_MAX_CHARS:
+                    break
+                parts.append(entry)
+                char_count += len(entry)
+            if char_count >= KB_MAX_CHARS:
+                break
+
+        return "\n\n---\n\n".join(parts)
+    except Exception as e:
+        logger.warning(f"KB search failed, proceeding without context: {e}")
+        return ""
+
+
 async def stream_vision_autocomplete(
     screenshot_data_url: str,
     search_space_id: int,
     session: AsyncSession,
+    *,
+    app_name: str = "",
+    window_title: str = "",
 ) -> AsyncGenerator[str, None]:
-    """Analyze a screenshot with the vision LLM and stream a text completion."""
+    """Analyze a screenshot with the vision LLM and stream a text completion.
+
+    Pipeline:
+    1. Extract a search query from the screenshot (non-streaming)
+    2. Search the knowledge base for relevant context
+    3. Stream the final completion with screenshot + KB + app context
+    """
     streaming = VercelStreamingService()
 
     llm = await get_vision_llm(session, search_space_id)
@@ -44,8 +148,17 @@ async def stream_vision_autocomplete(
         yield streaming.format_done()
         return
 
+    kb_context = ""
+    query = await _extract_query_from_screenshot(
+        llm, screenshot_data_url, app_name=app_name, window_title=window_title,
+    )
+    if query:
+        kb_context = await _search_knowledge_base(session, search_space_id, query)
+
+    system_prompt = _build_system_prompt(app_name, window_title, kb_context)
+
     messages = [
-        SystemMessage(content=VISION_SYSTEM_PROMPT),
+        SystemMessage(content=system_prompt),
         HumanMessage(content=[
             {
                 "type": "text",
diff --git a/surfsense_desktop/src/modules/autocomplete/index.ts b/surfsense_desktop/src/modules/autocomplete/index.ts
index 3ed9c4a00..0d5073de4 100644
--- a/surfsense_desktop/src/modules/autocomplete/index.ts
+++ b/surfsense_desktop/src/modules/autocomplete/index.ts
@@ -1,6 +1,6 @@
 import { clipboard, globalShortcut, ipcMain, screen } from 'electron';
 import { IPC_CHANNELS } from '../../ipc/channels';
-import { getFrontmostApp, hasAccessibilityPermission, simulatePaste } from '../platform';
+import { getFrontmostApp, getWindowTitle, hasAccessibilityPermission, simulatePaste } from '../platform';
 import { hasScreenRecordingPermission, requestAccessibility, requestScreenRecording } from '../permissions';
 import { getMainWindow } from '../window';
 import { captureScreen } from './screenshot';
@@ -27,6 +27,7 @@ async function triggerAutocomplete(): Promise<void> {
   }
 
   sourceApp = getFrontmostApp();
+  const windowTitle = getWindowTitle();
   savedClipboard = clipboard.readText();
 
   const screenshot = await captureScreen();
@@ -55,6 +56,8 @@ async function triggerAutocomplete(): Promise<void> {
       sw.webContents.send(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, {
         screenshot,
         searchSpaceId,
+        appName: sourceApp,
+        windowTitle,
       });
     }
   }, 300);
diff --git a/surfsense_desktop/src/modules/platform.ts b/surfsense_desktop/src/modules/platform.ts
index 1e6ac74e4..122e2efed 100644
--- a/surfsense_desktop/src/modules/platform.ts
+++ b/surfsense_desktop/src/modules/platform.ts
@@ -32,6 +32,24 @@ export function checkAccessibilityPermission(): boolean {
   return systemPreferences.isTrustedAccessibilityClient(true);
 }
 
+export function getWindowTitle(): string {
+  try {
+    if (process.platform === 'darwin') {
+      return execSync(
+        'osascript -e \'tell application "System Events" to get title of front window of first application process whose frontmost is true\''
+      ).toString().trim();
+    }
+    if (process.platform === 'win32') {
+      return execSync(
+        'powershell -command "Add-Type -MemberDefinition \'[DllImport(\\\"user32.dll\\\")] public static extern IntPtr GetForegroundWindow();\' -Name W -Namespace Native; $h = [Native.W]::GetForegroundWindow(); (Get-Process | Where-Object { $_.MainWindowHandle -eq $h }).MainWindowTitle"'
+      ).toString().trim();
+    }
+  } catch {
+    return '';
+  }
+  return '';
+}
+
 export function hasAccessibilityPermission(): boolean {
   if (process.platform !== 'darwin') return true;
   return systemPreferences.isTrustedAccessibilityClient(false);
diff --git a/surfsense_desktop/src/preload.ts b/surfsense_desktop/src/preload.ts
index 31c5ca865..2bd09f13c 100644
--- a/surfsense_desktop/src/preload.ts
+++ b/surfsense_desktop/src/preload.ts
@@ -27,8 +27,8 @@ contextBridge.exposeInMainWorld('electronAPI', {
   requestScreenRecording: () => ipcRenderer.invoke(IPC_CHANNELS.REQUEST_SCREEN_RECORDING),
   restartApp: () => ipcRenderer.invoke(IPC_CHANNELS.RESTART_APP),
   // Autocomplete
-  onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string }) => void) => {
-    const listener = (_event: unknown, data: { screenshot: string; searchSpaceId?: string }) => callback(data);
+  onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => void) => {
+    const listener = (_event: unknown, data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => callback(data);
     ipcRenderer.on(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener);
     return () => {
       ipcRenderer.removeListener(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener);
diff --git a/surfsense_web/app/desktop/suggestion/page.tsx b/surfsense_web/app/desktop/suggestion/page.tsx
index b68fe450d..b7d9b97bd 100644
--- a/surfsense_web/app/desktop/suggestion/page.tsx
+++ b/surfsense_web/app/desktop/suggestion/page.tsx
@@ -46,7 +46,7 @@ export default function SuggestionPage() {
   }, [error]);
 
   const fetchSuggestion = useCallback(
-    async (screenshot: string, searchSpaceId: string) => {
+    async (screenshot: string, searchSpaceId: string, appName?: string, windowTitle?: string) => {
       abortRef.current?.abort();
       const controller = new AbortController();
       abortRef.current = controller;
@@ -77,6 +77,8 @@ export default function SuggestionPage() {
           body: JSON.stringify({
             screenshot,
             search_space_id: parseInt(searchSpaceId, 10),
+            app_name: appName || "",
+            window_title: windowTitle || "",
           }),
           signal: controller.signal,
         },
@@ -142,7 +144,7 @@ export default function SuggestionPage() {
     const cleanup = window.electronAPI.onAutocompleteContext((data) => {
       const searchSpaceId = data.searchSpaceId || "1";
       if (data.screenshot) {
-        fetchSuggestion(data.screenshot, searchSpaceId);
+        fetchSuggestion(data.screenshot, searchSpaceId, data.appName, data.windowTitle);
       }
     });
diff --git a/surfsense_web/types/window.d.ts b/surfsense_web/types/window.d.ts
index 2fc550306..85b6bdf51 100644
--- a/surfsense_web/types/window.d.ts
+++ b/surfsense_web/types/window.d.ts
@@ -23,7 +23,7 @@ interface ElectronAPI {
   requestScreenRecording: () => Promise<boolean>;
   restartApp: () => Promise<void>;
   // Autocomplete
-  onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string }) => void) => () => void;
+  onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => void) => () => void;
   acceptSuggestion: (text: string) => Promise<void>;
   dismissSuggestion: () => Promise<void>;
   setAutocompleteEnabled: (enabled: boolean) => Promise<void>;
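
A note for reviewers: the prompt assembly is pure string composition, so it is easy to sanity-check in isolation. Below is a minimal sketch that exercises the new _build_system_prompt helper with made-up values; "Notion", the window title, and the KB snippet are hypothetical examples, not output from the real pipeline:

# Sketch only: exercises _build_system_prompt with hypothetical values.
from app.services.vision_autocomplete_service import (
    VISION_SYSTEM_PROMPT,
    _build_system_prompt,
)

# Both optional blocks present: the app block and the KB block are
# appended, in that order, after the base prompt.
prompt = _build_system_prompt(
    app_name="Notion",                 # hypothetical frontmost app
    window_title="Q3 Planning Notes",  # hypothetical window title
    kb_context="[Roadmap]\nShip the desktop autocomplete beta in Q3.",
)
assert prompt.startswith(VISION_SYSTEM_PROMPT)

# Empty strings skip both blocks, so callers that pass no
# app_name/window_title/kb_context get the old prompt unchanged.
assert _build_system_prompt("", "", "") == VISION_SYSTEM_PROMPT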
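The API change itself is backward compatible, since both new request fields default to empty strings. As a rough illustration of the updated request shape, here is a client sketch: the endpoint path, field names, and event-stream response come from this diff, while the host, port, bearer-token auth, and the use of httpx are assumptions for illustration only:

# Sketch: stream a suggestion from the updated endpoint.
import asyncio

import httpx

async def fetch_suggestion() -> None:
    payload = {
        "screenshot": "data:image/png;base64,...",  # placeholder, truncated
        "search_space_id": 1,
        "app_name": "Notion",           # new optional field, defaults to ""
        "window_title": "Q3 Planning",  # new optional field, defaults to ""
    }
    async with httpx.AsyncClient(timeout=60) as client:
        async with client.stream(
            "POST",
            "http://localhost:8000/autocomplete/vision/stream",  # assumed host
            json=payload,
            headers={"Authorization": "Bearer <token>"},  # assumed auth scheme
        ) as resp:
            async for line in resp.aiter_lines():
                print(line)  # raw text/event-stream lines

asyncio.run(fetch_suggestion())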