diff --git a/surfsense_backend/app/routes/autocomplete_routes.py b/surfsense_backend/app/routes/autocomplete_routes.py
index 68c56d0e0..329476ca1 100644
--- a/surfsense_backend/app/routes/autocomplete_routes.py
+++ b/surfsense_backend/app/routes/autocomplete_routes.py
@@ -1,28 +1,36 @@
-from fastapi import APIRouter, Depends, Query
+from fastapi import APIRouter, Depends
 from fastapi.responses import StreamingResponse
+from pydantic import BaseModel, Field
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from app.db import User, get_async_session
-from app.services.autocomplete_service import stream_autocomplete
 from app.services.new_streaming_service import VercelStreamingService
+from app.services.vision_autocomplete_service import stream_vision_autocomplete
 from app.users import current_active_user
 
 router = APIRouter(prefix="/autocomplete", tags=["autocomplete"])
 
 
-@router.post("/stream")
-async def autocomplete_stream(
-    text: str = Query(..., description="Current text in the input field"),
-    cursor_position: int = Query(-1, description="Cursor position in the text (-1 for end)"),
-    search_space_id: int = Query(..., description="Search space ID for KB context and LLM config"),
+class VisionAutocompleteRequest(BaseModel):
+    """Request body for the vision autocomplete endpoint."""
+
+    # Screenshot handed to the vision LLM as an image URL; must not be empty.
+    screenshot: str = Field(..., min_length=1)
+    # Search space whose vision LLM configuration is used.
+    search_space_id: int = Field(..., gt=0)
+
+
+@router.post("/vision/stream")
+async def vision_autocomplete_stream(
+    body: VisionAutocompleteRequest,
     user: User = Depends(current_active_user),
     session: AsyncSession = Depends(get_async_session),
 ):
-    if cursor_position < 0:
-        cursor_position = len(text)
-
+    """Stream a completion generated from the posted screenshot."""
+    # NOTE(review): `user` only enforces authentication here; confirm that
+    # access to `body.search_space_id` is authorized for this user downstream.
     return StreamingResponse(
-        stream_autocomplete(text, cursor_position, search_space_id, session),
+        stream_vision_autocomplete(body.screenshot, body.search_space_id, session),
         media_type="text/event-stream",
         headers={
             **VercelStreamingService.get_response_headers(),
diff --git a/surfsense_backend/app/services/autocomplete_service.py b/surfsense_backend/app/services/autocomplete_service.py
deleted file mode 100644
index 7c172275d..000000000
--- 
a/surfsense_backend/app/services/autocomplete_service.py +++ /dev/null @@ -1,110 +0,0 @@ -import logging -from typing import AsyncGenerator - -from langchain_core.messages import HumanMessage, SystemMessage -from sqlalchemy.ext.asyncio import AsyncSession - -from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever -from app.services.llm_service import get_agent_llm -from app.services.new_streaming_service import VercelStreamingService - -logger = logging.getLogger(__name__) - -SYSTEM_PROMPT = """You are an inline text autocomplete engine. Your job is to complete the user's text naturally. - -Rules: -- Output ONLY the continuation text. Do NOT repeat what the user already typed. -- Keep completions concise: 1-3 sentences maximum. -- Match the user's tone, style, and language. -- If knowledge base context is provided, use it to make the completion factually accurate and personalized. -- Do NOT add quotes, explanations, or meta-commentary. -- Do NOT start with a space unless grammatically required. 
-- If you cannot produce a useful completion, output nothing.""" - -KB_CONTEXT_TEMPLATE = """ -Relevant knowledge base context (use this to personalize the completion): ---- -{kb_context} ---- -""" - - -async def _retrieve_kb_context( - session: AsyncSession, - text: str, - search_space_id: int, -) -> str: - try: - retriever = ChucksHybridSearchRetriever(session) - chunks = await retriever.vector_search( - query_text=text[-200:], - top_k=3, - search_space_id=search_space_id, - ) - if not chunks: - return "" - snippets = [] - for chunk in chunks: - content = getattr(chunk, "content", None) or getattr(chunk, "chunk_text", "") - if content: - snippets.append(content[:300]) - if not snippets: - return "" - return KB_CONTEXT_TEMPLATE.format(kb_context="\n\n".join(snippets)) - except Exception as e: - logger.warning(f"KB search failed for autocomplete, proceeding without context: {e}") - return "" - - -async def stream_autocomplete( - text: str, - cursor_position: int, - search_space_id: int, - session: AsyncSession, -) -> AsyncGenerator[str, None]: - """Build context, call the LLM, and yield SSE-formatted tokens.""" - streaming = VercelStreamingService() - text_before_cursor = text[:cursor_position] if cursor_position >= 0 else text - - if not text_before_cursor.strip(): - yield streaming.format_message_start() - yield streaming.format_finish() - yield streaming.format_done() - return - - kb_context = await _retrieve_kb_context(session, text_before_cursor, search_space_id) - - llm = await get_agent_llm(session, search_space_id) - if not llm: - yield streaming.format_message_start() - yield streaming.format_error("No LLM configured for this search space") - yield streaming.format_done() - return - - system_prompt = SYSTEM_PROMPT - if kb_context: - system_prompt += kb_context - - messages = [ - SystemMessage(content=system_prompt), - HumanMessage(content=f"Complete this text:\n{text_before_cursor}"), - ] - - try: - yield streaming.format_message_start() - text_id = 
streaming.generate_text_id()
-        yield streaming.format_text_start(text_id)
-
-        async for chunk in llm.astream(messages):
-            token = chunk.content if hasattr(chunk, "content") else str(chunk)
-            if token:
-                yield streaming.format_text_delta(text_id, token)
-
-        yield streaming.format_text_end(text_id)
-        yield streaming.format_finish()
-        yield streaming.format_done()
-
-    except Exception as e:
-        logger.error(f"Autocomplete streaming error: {e}")
-        yield streaming.format_error(str(e))
-        yield streaming.format_done()
diff --git a/surfsense_backend/app/services/vision_autocomplete_service.py b/surfsense_backend/app/services/vision_autocomplete_service.py
new file mode 100644
index 000000000..526b0d35c
--- /dev/null
+++ b/surfsense_backend/app/services/vision_autocomplete_service.py
@@ -0,0 +1,128 @@
+import logging
+from typing import AsyncGenerator
+
+from langchain_core.messages import HumanMessage, SystemMessage
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.services.llm_service import get_vision_llm
+from app.services.new_streaming_service import VercelStreamingService
+
+logger = logging.getLogger(__name__)
+
+VISION_SYSTEM_PROMPT = """You are a smart writing assistant that analyzes the user's screen to draft or complete text.
+
+You will receive a screenshot of the user's screen. Your job:
+1. Analyze the ENTIRE screenshot to understand what the user is working on (email thread, chat conversation, document, code editor, form, etc.).
+2. Identify the text area where the user will type.
+3. Based on the full visual context, generate the text the user most likely wants to write.
+
+Key behavior:
+- If the text area is EMPTY, draft a full response or message based on what you see on screen (e.g., reply to an email, respond to a chat message, continue a document).
+- If the text area already has text, continue it naturally.
+
+Rules:
+- Output ONLY the text to be inserted. No quotes, no explanations, no meta-commentary.
+- Be concise but complete — a full thought, not a fragment.
+- Match the tone and formality of the surrounding context.
+- If the screen shows code, write code. If it shows a casual chat, be casual. If it shows a formal email, be formal.
+- Do NOT describe the screenshot or explain your reasoning.
+- If you cannot determine what to write, output nothing."""
+
+# Instruction sent with the screenshot in the user turn.
+VISION_USER_PROMPT = (
+    "Analyze this screenshot. Understand the full context of what the user is "
+    "working on, then generate the text they most likely want to write in the "
+    "active text area."
+)
+
+# Only inline image data URLs are forwarded to the model; any other URL form
+# would ask the LLM provider to fetch an arbitrary resource.
+_IMAGE_DATA_URL_PREFIX = "data:image/"
+
+
+def _chunk_to_text(chunk: object) -> str:
+    """Return the plain text carried by one streamed LLM chunk.
+
+    Chunk content may be a string or a list of content blocks (plain strings
+    or ``{"type": "text", "text": ...}`` dicts). Non-text blocks are ignored
+    and missing content yields an empty string.
+    """
+    content = getattr(chunk, "content", chunk)
+    if content is None:
+        return ""
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        parts = []
+        for block in content:
+            if isinstance(block, str):
+                parts.append(block)
+            elif isinstance(block, dict) and block.get("type") == "text":
+                parts.append(str(block.get("text") or ""))
+        return "".join(parts)
+    return str(content)
+
+
+async def stream_vision_autocomplete(
+    screenshot_data_url: str,
+    search_space_id: int,
+    session: AsyncSession,
+) -> AsyncGenerator[str, None]:
+    """Analyze a screenshot with the vision LLM and stream a text completion.
+
+    Args:
+        screenshot_data_url: Screenshot encoded as a ``data:image/...`` URL.
+        search_space_id: Search space whose vision LLM is looked up.
+        session: Database session passed to the LLM lookup.
+
+    Yields:
+        Stream events formatted by ``VercelStreamingService``. An invalid
+        screenshot, a missing LLM, or an error while streaming is emitted as
+        an error event followed by the done marker.
+    """
+    streaming = VercelStreamingService()
+
+    scheme = screenshot_data_url[: len(_IMAGE_DATA_URL_PREFIX)].lower()
+    if scheme != _IMAGE_DATA_URL_PREFIX:
+        yield streaming.format_message_start()
+        yield streaming.format_error("Screenshot must be an image data URL")
+        yield streaming.format_done()
+        return
+
+    llm = await get_vision_llm(session, search_space_id)
+    if not llm:
+        yield streaming.format_message_start()
+        yield streaming.format_error("No Vision LLM configured for this search space")
+        yield streaming.format_done()
+        return
+
+    messages = [
+        SystemMessage(content=VISION_SYSTEM_PROMPT),
+        HumanMessage(
+            content=[
+                {"type": "text", "text": VISION_USER_PROMPT},
+                {"type": "image_url", "image_url": {"url": screenshot_data_url}},
+            ]
+        ),
+    ]
+
+    try:
+        yield streaming.format_message_start()
+        text_id = streaming.generate_text_id()
+        yield streaming.format_text_start(text_id)
+
+        async for chunk in llm.astream(messages):
+            token = _chunk_to_text(chunk)
+            if token:
+                yield streaming.format_text_delta(text_id, token)
+
+        yield streaming.format_text_end(text_id)
+        yield streaming.format_finish()
+        yield streaming.format_done()
+
+    except Exception as e:
+        # Top-level boundary of the stream: log with traceback and report the
+        # failure as an error event instead of raising.
+        logger.exception("Vision autocomplete streaming error")
+        yield streaming.format_error(str(e))
+        yield streaming.format_done()
diff --git a/surfsense_desktop/src/preload.ts b/surfsense_desktop/src/preload.ts
index 157fe216b..891d9b029 100644
--- a/surfsense_desktop/src/preload.ts
+++ b/surfsense_desktop/src/preload.ts
@@ -26,8 +26,8 @@ contextBridge.exposeInMainWorld('electronAPI', {
   requestAccessibility: () => ipcRenderer.invoke(IPC_CHANNELS.REQUEST_ACCESSIBILITY),
   restartApp: () => ipcRenderer.invoke(IPC_CHANNELS.RESTART_APP),
   // Autocomplete
-  onAutocompleteContext: (callback: (data: { text: string; cursorPosition: number; searchSpaceId?: string }) => void) => {
-    const listener = (_event: unknown, data: { text: string; cursorPosition: number; searchSpaceId?: string }) => callback(data);
+  onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string }) => void) => {
+    const listener = (_event: unknown, data: { screenshot: string; searchSpaceId?: string }) => callback(data);
     ipcRenderer.on(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener);
     return () => {
       ipcRenderer.removeListener(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener);
diff --git a/surfsense_web/app/desktop/suggestion/page.tsx b/surfsense_web/app/desktop/suggestion/page.tsx
index 69a19e3f1..2c147eb25 100644
--- a/surfsense_web/app/desktop/suggestion/page.tsx +++ b/surfsense_web/app/desktop/suggestion/page.tsx @@ -18,7 +18,7 @@ export default function SuggestionPage() { const abortRef = useRef(null); const fetchSuggestion = useCallback( - async (text: string, cursorPosition: number, searchSpaceId: string) => { + async (screenshot: string, searchSpaceId: string) => { abortRef.current?.abort(); const controller = new AbortController(); abortRef.current = controller; @@ -37,21 +37,19 @@ export default function SuggestionPage() { const backendUrl = process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000"; - const params = new URLSearchParams({ - text, - cursor_position: String(cursorPosition), - search_space_id: searchSpaceId, - }); - try { const response = await fetch( - `${backendUrl}/api/v1/autocomplete/stream?${params}`, + `${backendUrl}/api/v1/autocomplete/vision/stream`, { method: "POST", headers: { Authorization: `Bearer ${token}`, "Content-Type": "application/json", }, + body: JSON.stringify({ + screenshot, + search_space_id: parseInt(searchSpaceId, 10), + }), signal: controller.signal, }, ); @@ -119,7 +117,9 @@ export default function SuggestionPage() { const cleanup = window.electronAPI.onAutocompleteContext((data) => { const searchSpaceId = data.searchSpaceId || "1"; - fetchSuggestion(data.text, data.cursorPosition, searchSpaceId); + if (data.screenshot) { + fetchSuggestion(data.screenshot, searchSpaceId); + } }); return cleanup; diff --git a/surfsense_web/types/window.d.ts b/surfsense_web/types/window.d.ts index 0b312b5ec..a5b8566f9 100644 --- a/surfsense_web/types/window.d.ts +++ b/surfsense_web/types/window.d.ts @@ -21,7 +21,7 @@ interface ElectronAPI { requestAccessibility: () => Promise; restartApp: () => Promise; // Autocomplete - onAutocompleteContext: (callback: (data: { text: string; cursorPosition: number; searchSpaceId?: string }) => void) => () => void; + onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: 
string }) => void) => () => void; acceptSuggestion: (text: string) => Promise; dismissSuggestion: () => Promise; updateSuggestionText: (text: string) => Promise;