diff --git a/surfsense_backend/app/routes/autocomplete_routes.py b/surfsense_backend/app/routes/autocomplete_routes.py
index 329476ca1..a6f66f408 100644
--- a/surfsense_backend/app/routes/autocomplete_routes.py
+++ b/surfsense_backend/app/routes/autocomplete_routes.py
@@ -14,6 +14,8 @@ router = APIRouter(prefix="/autocomplete", tags=["autocomplete"])
 class VisionAutocompleteRequest(BaseModel):
     screenshot: str
     search_space_id: int
+    app_name: str = ""
+    window_title: str = ""


 @router.post("/vision/stream")
@@ -23,7 +25,10 @@ async def vision_autocomplete_stream(
     session: AsyncSession = Depends(get_async_session),
 ):
     return StreamingResponse(
-        stream_vision_autocomplete(body.screenshot, body.search_space_id, session),
+        stream_vision_autocomplete(
+            body.screenshot, body.search_space_id, session,
+            app_name=body.app_name, window_title=body.window_title,
+        ),
         media_type="text/event-stream",
         headers={
             **VercelStreamingService.get_response_headers(),
diff --git a/surfsense_backend/app/services/vision_autocomplete_service.py b/surfsense_backend/app/services/vision_autocomplete_service.py
index 526b0d35c..0804df7fb 100644
--- a/surfsense_backend/app/services/vision_autocomplete_service.py
+++ b/surfsense_backend/app/services/vision_autocomplete_service.py
@@ -4,11 +4,22 @@ from typing import AsyncGenerator

 from langchain_core.messages import HumanMessage, SystemMessage
 from sqlalchemy.ext.asyncio import AsyncSession

+from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever
 from app.services.llm_service import get_vision_llm
 from app.services.new_streaming_service import VercelStreamingService

 logger = logging.getLogger(__name__)

+KB_TOP_K = 5
+KB_MAX_CHARS = 4000
+
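+# Pass 1 prompt: turns the screenshot into a short description used as the KB search query.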
+EXTRACT_QUERY_PROMPT = """Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""
+
+EXTRACT_QUERY_PROMPT_WITH_APP = """The user is currently in the application "{app_name}" with the window titled "{window_title}".
+
+Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""
+
 VISION_SYSTEM_PROMPT = """You are a smart writing assistant that analyzes the user's screen to draft or complete text.
 You will receive a screenshot of the user's screen. Your job:
@@ -28,13 +39,109 @@ Rules:
 - Do NOT describe the screenshot or explain your reasoning.
 - If you cannot determine what to write, output nothing."""


+APP_CONTEXT_BLOCK = """
+
+The user is currently working in "{app_name}" (window: "{window_title}"). Use this to understand the type of application and adapt your tone and format accordingly."""
+
+KB_CONTEXT_BLOCK = """
+
+You also have access to the user's knowledge base documents below. Use them to write more accurate, informed, and contextually relevant text. Do NOT cite or reference the documents explicitly — just let the knowledge inform your writing naturally.
+
+
+{kb_context}
+"""
+
+
+def _build_system_prompt(app_name: str, window_title: str, kb_context: str) -> str:
+    """Assemble the system prompt from optional context blocks."""
+    prompt = VISION_SYSTEM_PROMPT
+    if app_name:
+        prompt += APP_CONTEXT_BLOCK.format(app_name=app_name, window_title=window_title)
+    if kb_context:
+        prompt += KB_CONTEXT_BLOCK.format(kb_context=kb_context)
+    return prompt
+
+
+async def _extract_query_from_screenshot(
+    llm, screenshot_data_url: str,
+    app_name: str = "", window_title: str = "",
+) -> str | None:
+    """Ask the vision LLM to describe what the user is working on."""
+    if app_name:
+        prompt_text = EXTRACT_QUERY_PROMPT_WITH_APP.format(
+            app_name=app_name, window_title=window_title,
+        )
+    else:
+        prompt_text = EXTRACT_QUERY_PROMPT
+
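+    # Single non-streaming vision call; any failure degrades gracefully to no KB context.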
+    try:
+        response = await llm.ainvoke([
+            HumanMessage(content=[
+                {"type": "text", "text": prompt_text},
+                {"type": "image_url", "image_url": {"url": screenshot_data_url}},
+            ]),
+        ])
+        query = response.content.strip() if isinstance(response.content, str) else ""
+        return query if query else None
+    except Exception as e:
+        logger.warning(f"Failed to extract query from screenshot: {e}")
+        return None
+
+
+async def _search_knowledge_base(
+    session: AsyncSession, search_space_id: int, query: str
+) -> str:
+    """Search the KB and return a formatted context string."""
+    try:
+        retriever = ChucksHybridSearchRetriever(session)
+        results = await retriever.hybrid_search(
+            query_text=query,
+            top_k=KB_TOP_K,
+            search_space_id=search_space_id,
+        )
+
+        if not results:
+            return ""
+
+        parts: list[str] = []
+        char_count = 0
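+        # Greedily pack chunk texts until the KB_MAX_CHARS budget is used up.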
+        for doc in results:
+            title = doc.get("document", {}).get("title", "Untitled")
+            for chunk in doc.get("chunks", []):
+                content = chunk.get("content", "").strip()
+                if not content:
+                    continue
+                entry = f"[{title}]\n{content}"
+                if char_count + len(entry) > KB_MAX_CHARS:
+                    break
+                parts.append(entry)
+                char_count += len(entry)
+            if char_count >= KB_MAX_CHARS:
+                break
+
+        return "\n\n---\n\n".join(parts)
+    except Exception as e:
+        logger.warning(f"KB search failed, proceeding without context: {e}")
+        return ""
+
 async def stream_vision_autocomplete(
     screenshot_data_url: str,
     search_space_id: int,
     session: AsyncSession,
+    *,
+    app_name: str = "",
+    window_title: str = "",
 ) -> AsyncGenerator[str, None]:
-    """Analyze a screenshot with the vision LLM and stream a text completion."""
+    """Analyze a screenshot with the vision LLM and stream a text completion.
+
+    Pipeline:
+    1. Extract a search query from the screenshot (non-streaming)
+    2. Search the knowledge base for relevant context
+    3. Stream the final completion with screenshot + KB + app context
+    """

     streaming = VercelStreamingService()
     llm = await get_vision_llm(session, search_space_id)
@@ -44,8 +151,17 @@ async def stream_vision_autocomplete(
         yield streaming.format_done()
         return

+    kb_context = ""
+    query = await _extract_query_from_screenshot(
+        llm, screenshot_data_url, app_name=app_name, window_title=window_title,
+    )
+    if query:
+        kb_context = await _search_knowledge_base(session, search_space_id, query)
+
+    system_prompt = _build_system_prompt(app_name, window_title, kb_context)
+
     messages = [
-        SystemMessage(content=VISION_SYSTEM_PROMPT),
+        SystemMessage(content=system_prompt),
         HumanMessage(content=[
             {
                 "type": "text",
diff --git a/surfsense_desktop/src/modules/autocomplete/index.ts b/surfsense_desktop/src/modules/autocomplete/index.ts
index 3ed9c4a00..0d5073de4 100644
--- a/surfsense_desktop/src/modules/autocomplete/index.ts
+++ b/surfsense_desktop/src/modules/autocomplete/index.ts
@@ -1,6 +1,6 @@
 import { clipboard, globalShortcut, ipcMain, screen } from 'electron';
 import { IPC_CHANNELS } from '../../ipc/channels';
-import { getFrontmostApp, hasAccessibilityPermission, simulatePaste } from '../platform';
+import { getFrontmostApp, getWindowTitle, hasAccessibilityPermission, simulatePaste } from '../platform';
 import { hasScreenRecordingPermission, requestAccessibility, requestScreenRecording } from '../permissions';
 import { getMainWindow } from '../window';
 import { captureScreen } from './screenshot';
@@ -27,6 +27,8 @@ async function triggerAutocomplete(): Promise<void> {
 	}

 	sourceApp = getFrontmostApp();
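+	// Window title is sent to the backend as extra context about where the user is writing.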
+	const windowTitle = getWindowTitle();
 	savedClipboard = clipboard.readText();

 	const screenshot = await captureScreen();
@@ -55,6 +57,8 @@ async function triggerAutocomplete(): Promise<void> {
 			sw.webContents.send(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, {
 				screenshot,
 				searchSpaceId,
+				appName: sourceApp,
+				windowTitle,
 			});
 		}
 	}, 300);
diff --git a/surfsense_desktop/src/modules/platform.ts b/surfsense_desktop/src/modules/platform.ts
index 1e6ac74e4..122e2efed 100644
--- a/surfsense_desktop/src/modules/platform.ts
+++ b/surfsense_desktop/src/modules/platform.ts
@@ -32,6 +32,25 @@ export function checkAccessibilityPermission(): boolean {
 	return systemPreferences.isTrustedAccessibilityClient(true);
 }

+export function getWindowTitle(): string {
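+	// Best-effort: shells out synchronously and returns '' on error or unsupported platforms.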
+	try {
+		if (process.platform === 'darwin') {
+			return execSync(
+				'osascript -e \'tell application "System Events" to get title of front window of first application process whose frontmost is true\''
+			).toString().trim();
+		}
+		if (process.platform === 'win32') {
+			return execSync(
+				'powershell -command "$t = Add-Type -MemberDefinition \'[DllImport(\\\"user32.dll\\\")] public static extern IntPtr GetForegroundWindow();\' -Name W -PassThru; $h = $t::GetForegroundWindow(); (Get-Process | Where-Object { $_.MainWindowHandle -eq $h }).MainWindowTitle"'
+			).toString().trim();
+		}
+	} catch {
+		return '';
+	}
+	return '';
+}
+
 export function hasAccessibilityPermission(): boolean {
 	if (process.platform !== 'darwin') return true;
 	return systemPreferences.isTrustedAccessibilityClient(false);
diff --git a/surfsense_desktop/src/preload.ts b/surfsense_desktop/src/preload.ts
index 31c5ca865..2bd09f13c 100644
--- a/surfsense_desktop/src/preload.ts
+++ b/surfsense_desktop/src/preload.ts
@@ -27,8 +27,8 @@ contextBridge.exposeInMainWorld('electronAPI', {
 	requestScreenRecording: () => ipcRenderer.invoke(IPC_CHANNELS.REQUEST_SCREEN_RECORDING),
 	restartApp: () => ipcRenderer.invoke(IPC_CHANNELS.RESTART_APP),
 	// Autocomplete
-	onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string }) => void) => {
-		const listener = (_event: unknown, data: { screenshot: string; searchSpaceId?: string }) => callback(data);
+	onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => void) => {
+		const listener = (_event: unknown, data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => callback(data);
 		ipcRenderer.on(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener);
 		return () => {
 			ipcRenderer.removeListener(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener);
diff --git a/surfsense_web/app/desktop/suggestion/page.tsx b/surfsense_web/app/desktop/suggestion/page.tsx
index b68fe450d..b7d9b97bd 100644
--- a/surfsense_web/app/desktop/suggestion/page.tsx
+++ b/surfsense_web/app/desktop/suggestion/page.tsx
@@ -46,7 +46,7 @@ export default function SuggestionPage() {
 	}, [error]);

 	const fetchSuggestion = useCallback(
-		async (screenshot: string, searchSpaceId: string) => {
+		async (screenshot: string, searchSpaceId: string, appName?: string, windowTitle?: string) => {
 			abortRef.current?.abort();
 			const controller = new AbortController();
 			abortRef.current = controller;
@@ -77,6 +77,8 @@ export default function SuggestionPage() {
 				body: JSON.stringify({
 					screenshot,
 					search_space_id: parseInt(searchSpaceId, 10),
+					app_name: appName || "",
+					window_title: windowTitle || "",
 				}),
 				signal: controller.signal,
 			},
@@ -142,7 +144,7 @@ export default function SuggestionPage() {
 		const cleanup = window.electronAPI.onAutocompleteContext((data) => {
 			const searchSpaceId = data.searchSpaceId || "1";
 			if (data.screenshot) {
-				fetchSuggestion(data.screenshot, searchSpaceId);
+				fetchSuggestion(data.screenshot, searchSpaceId, data.appName, data.windowTitle);
 			}
 		});

diff --git a/surfsense_web/types/window.d.ts b/surfsense_web/types/window.d.ts
index 2fc550306..85b6bdf51 100644
--- a/surfsense_web/types/window.d.ts
+++ b/surfsense_web/types/window.d.ts
@@ -23,7 +23,7 @@ interface ElectronAPI {
 	requestScreenRecording: () => Promise<void>;
 	restartApp: () => Promise<void>;
 	// Autocomplete
-	onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string }) => void) => () => void;
+	onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => void) => () => void;
 	acceptSuggestion: (text: string) => Promise<void>;
 	dismissSuggestion: () => Promise<void>;
 	setAutocompleteEnabled: (enabled: boolean) => Promise<void>;
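
Below is a minimal client-side sketch of the updated endpoint contract. The path and field names come from the routes diff above; the base URL, port, bearer token, and example field values are illustrative assumptions, not part of this change.

```python
# Sketch: stream a suggestion from /autocomplete/vision/stream, passing the
# optional app_name / window_title context fields added in this diff.
# localhost:8000 and the Authorization header are assumptions for illustration.
import httpx

payload = {
    "screenshot": "data:image/png;base64,...",  # data-URL screenshot, as the desktop app sends it
    "search_space_id": 1,
    "app_name": "Notion",           # optional; the backend defaults to ""
    "window_title": "Q3 planning",  # optional; the backend defaults to ""
}

with httpx.stream(
    "POST",
    "http://localhost:8000/autocomplete/vision/stream",
    json=payload,
    headers={"Authorization": "Bearer <token>"},  # auth scheme assumed
    timeout=60.0,
) as response:
    for line in response.iter_lines():  # text/event-stream lines
        if line:
            print(line)
```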