mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-08 20:25:19 +02:00
add app context and KB grounding to autocomplete suggestions
This commit is contained in:
parent
080acf5e0a
commit
960b8fc012
7 changed files with 150 additions and 9 deletions
|
|
@ -14,6 +14,8 @@ router = APIRouter(prefix="/autocomplete", tags=["autocomplete"])
|
|||
class VisionAutocompleteRequest(BaseModel):
|
||||
screenshot: str
|
||||
search_space_id: int
|
||||
app_name: str = ""
|
||||
window_title: str = ""
|
||||
|
||||
|
||||
@router.post("/vision/stream")
|
||||
|
|
@ -23,7 +25,10 @@ async def vision_autocomplete_stream(
|
|||
session: AsyncSession = Depends(get_async_session),
|
||||
):
|
||||
return StreamingResponse(
|
||||
stream_vision_autocomplete(body.screenshot, body.search_space_id, session),
|
||||
stream_vision_autocomplete(
|
||||
body.screenshot, body.search_space_id, session,
|
||||
app_name=body.app_name, window_title=body.window_title,
|
||||
),
|
||||
media_type="text/event-stream",
|
||||
headers={
|
||||
**VercelStreamingService.get_response_headers(),
|
||||
|
|
|
|||
|
|
@ -4,11 +4,21 @@ from typing import AsyncGenerator
|
|||
from langchain_core.messages import HumanMessage, SystemMessage
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever
|
||||
from app.services.llm_service import get_vision_llm
|
||||
from app.services.new_streaming_service import VercelStreamingService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
KB_TOP_K = 5
|
||||
KB_MAX_CHARS = 4000
|
||||
|
||||
EXTRACT_QUERY_PROMPT = """Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""
|
||||
|
||||
EXTRACT_QUERY_PROMPT_WITH_APP = """The user is currently in the application "{app_name}" with the window titled "{window_title}".
|
||||
|
||||
Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""
|
||||
|
||||
VISION_SYSTEM_PROMPT = """You are a smart writing assistant that analyzes the user's screen to draft or complete text.
|
||||
|
||||
You will receive a screenshot of the user's screen. Your job:
|
||||
|
|
@ -28,13 +38,107 @@ Rules:
|
|||
- Do NOT describe the screenshot or explain your reasoning.
|
||||
- If you cannot determine what to write, output nothing."""
|
||||
|
||||
APP_CONTEXT_BLOCK = """
|
||||
|
||||
The user is currently working in "{app_name}" (window: "{window_title}"). Use this to understand the type of application and adapt your tone and format accordingly."""
|
||||
|
||||
KB_CONTEXT_BLOCK = """
|
||||
|
||||
You also have access to the user's knowledge base documents below. Use them to write more accurate, informed, and contextually relevant text. Do NOT cite or reference the documents explicitly — just let the knowledge inform your writing naturally.
|
||||
|
||||
<knowledge_base>
|
||||
{kb_context}
|
||||
</knowledge_base>"""
|
||||
|
||||
|
||||
def _build_system_prompt(app_name: str, window_title: str, kb_context: str) -> str:
|
||||
"""Assemble the system prompt from optional context blocks."""
|
||||
prompt = VISION_SYSTEM_PROMPT
|
||||
if app_name:
|
||||
prompt += APP_CONTEXT_BLOCK.format(app_name=app_name, window_title=window_title)
|
||||
if kb_context:
|
||||
prompt += KB_CONTEXT_BLOCK.format(kb_context=kb_context)
|
||||
return prompt
|
||||
|
||||
|
||||
async def _extract_query_from_screenshot(
|
||||
llm, screenshot_data_url: str,
|
||||
app_name: str = "", window_title: str = "",
|
||||
) -> str | None:
|
||||
"""Ask the Vision LLM to describe what the user is working on."""
|
||||
if app_name:
|
||||
prompt_text = EXTRACT_QUERY_PROMPT_WITH_APP.format(
|
||||
app_name=app_name, window_title=window_title,
|
||||
)
|
||||
else:
|
||||
prompt_text = EXTRACT_QUERY_PROMPT
|
||||
|
||||
try:
|
||||
response = await llm.ainvoke([
|
||||
HumanMessage(content=[
|
||||
{"type": "text", "text": prompt_text},
|
||||
{"type": "image_url", "image_url": {"url": screenshot_data_url}},
|
||||
]),
|
||||
])
|
||||
query = response.content.strip() if hasattr(response, "content") else ""
|
||||
return query if query else None
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract query from screenshot: {e}")
|
||||
return None
|
||||
|
||||
|
||||
async def _search_knowledge_base(
|
||||
session: AsyncSession, search_space_id: int, query: str
|
||||
) -> str:
|
||||
"""Search the KB and return formatted context string."""
|
||||
try:
|
||||
retriever = ChucksHybridSearchRetriever(session)
|
||||
results = await retriever.hybrid_search(
|
||||
query_text=query,
|
||||
top_k=KB_TOP_K,
|
||||
search_space_id=search_space_id,
|
||||
)
|
||||
|
||||
if not results:
|
||||
return ""
|
||||
|
||||
parts: list[str] = []
|
||||
char_count = 0
|
||||
for doc in results:
|
||||
title = doc.get("document", {}).get("title", "Untitled")
|
||||
for chunk in doc.get("chunks", []):
|
||||
content = chunk.get("content", "").strip()
|
||||
if not content:
|
||||
continue
|
||||
entry = f"[{title}]\n{content}"
|
||||
if char_count + len(entry) > KB_MAX_CHARS:
|
||||
break
|
||||
parts.append(entry)
|
||||
char_count += len(entry)
|
||||
if char_count >= KB_MAX_CHARS:
|
||||
break
|
||||
|
||||
return "\n\n---\n\n".join(parts)
|
||||
except Exception as e:
|
||||
logger.warning(f"KB search failed, proceeding without context: {e}")
|
||||
return ""
|
||||
|
||||
|
||||
async def stream_vision_autocomplete(
|
||||
screenshot_data_url: str,
|
||||
search_space_id: int,
|
||||
session: AsyncSession,
|
||||
*,
|
||||
app_name: str = "",
|
||||
window_title: str = "",
|
||||
) -> AsyncGenerator[str, None]:
|
||||
"""Analyze a screenshot with the vision LLM and stream a text completion."""
|
||||
"""Analyze a screenshot with the vision LLM and stream a text completion.
|
||||
|
||||
Pipeline:
|
||||
1. Extract a search query from the screenshot (non-streaming)
|
||||
2. Search the knowledge base for relevant context
|
||||
3. Stream the final completion with screenshot + KB + app context
|
||||
"""
|
||||
streaming = VercelStreamingService()
|
||||
|
||||
llm = await get_vision_llm(session, search_space_id)
|
||||
|
|
@ -44,8 +148,17 @@ async def stream_vision_autocomplete(
|
|||
yield streaming.format_done()
|
||||
return
|
||||
|
||||
kb_context = ""
|
||||
query = await _extract_query_from_screenshot(
|
||||
llm, screenshot_data_url, app_name=app_name, window_title=window_title,
|
||||
)
|
||||
if query:
|
||||
kb_context = await _search_knowledge_base(session, search_space_id, query)
|
||||
|
||||
system_prompt = _build_system_prompt(app_name, window_title, kb_context)
|
||||
|
||||
messages = [
|
||||
SystemMessage(content=VISION_SYSTEM_PROMPT),
|
||||
SystemMessage(content=system_prompt),
|
||||
HumanMessage(content=[
|
||||
{
|
||||
"type": "text",
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import { clipboard, globalShortcut, ipcMain, screen } from 'electron';
|
||||
import { IPC_CHANNELS } from '../../ipc/channels';
|
||||
import { getFrontmostApp, hasAccessibilityPermission, simulatePaste } from '../platform';
|
||||
import { getFrontmostApp, getWindowTitle, hasAccessibilityPermission, simulatePaste } from '../platform';
|
||||
import { hasScreenRecordingPermission, requestAccessibility, requestScreenRecording } from '../permissions';
|
||||
import { getMainWindow } from '../window';
|
||||
import { captureScreen } from './screenshot';
|
||||
|
|
@ -27,6 +27,7 @@ async function triggerAutocomplete(): Promise<void> {
|
|||
}
|
||||
|
||||
sourceApp = getFrontmostApp();
|
||||
const windowTitle = getWindowTitle();
|
||||
savedClipboard = clipboard.readText();
|
||||
|
||||
const screenshot = await captureScreen();
|
||||
|
|
@ -55,6 +56,8 @@ async function triggerAutocomplete(): Promise<void> {
|
|||
sw.webContents.send(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, {
|
||||
screenshot,
|
||||
searchSpaceId,
|
||||
appName: sourceApp,
|
||||
windowTitle,
|
||||
});
|
||||
}
|
||||
}, 300);
|
||||
|
|
|
|||
|
|
@ -32,6 +32,24 @@ export function checkAccessibilityPermission(): boolean {
|
|||
return systemPreferences.isTrustedAccessibilityClient(true);
|
||||
}
|
||||
|
||||
export function getWindowTitle(): string {
|
||||
try {
|
||||
if (process.platform === 'darwin') {
|
||||
return execSync(
|
||||
'osascript -e \'tell application "System Events" to get title of front window of first application process whose frontmost is true\''
|
||||
).toString().trim();
|
||||
}
|
||||
if (process.platform === 'win32') {
|
||||
return execSync(
|
||||
'powershell -command "(Get-Process | Where-Object { $_.MainWindowHandle -eq (Add-Type -MemberDefinition \'[DllImport(\\\"user32.dll\\\")] public static extern IntPtr GetForegroundWindow();\' -Name W -PassThru)::GetForegroundWindow() }).MainWindowTitle"'
|
||||
).toString().trim();
|
||||
}
|
||||
} catch {
|
||||
return '';
|
||||
}
|
||||
return '';
|
||||
}
|
||||
|
||||
export function hasAccessibilityPermission(): boolean {
|
||||
if (process.platform !== 'darwin') return true;
|
||||
return systemPreferences.isTrustedAccessibilityClient(false);
|
||||
|
|
|
|||
|
|
@ -27,8 +27,8 @@ contextBridge.exposeInMainWorld('electronAPI', {
|
|||
requestScreenRecording: () => ipcRenderer.invoke(IPC_CHANNELS.REQUEST_SCREEN_RECORDING),
|
||||
restartApp: () => ipcRenderer.invoke(IPC_CHANNELS.RESTART_APP),
|
||||
// Autocomplete
|
||||
onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string }) => void) => {
|
||||
const listener = (_event: unknown, data: { screenshot: string; searchSpaceId?: string }) => callback(data);
|
||||
onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => void) => {
|
||||
const listener = (_event: unknown, data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => callback(data);
|
||||
ipcRenderer.on(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener);
|
||||
return () => {
|
||||
ipcRenderer.removeListener(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener);
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ export default function SuggestionPage() {
|
|||
}, [error]);
|
||||
|
||||
const fetchSuggestion = useCallback(
|
||||
async (screenshot: string, searchSpaceId: string) => {
|
||||
async (screenshot: string, searchSpaceId: string, appName?: string, windowTitle?: string) => {
|
||||
abortRef.current?.abort();
|
||||
const controller = new AbortController();
|
||||
abortRef.current = controller;
|
||||
|
|
@ -77,6 +77,8 @@ export default function SuggestionPage() {
|
|||
body: JSON.stringify({
|
||||
screenshot,
|
||||
search_space_id: parseInt(searchSpaceId, 10),
|
||||
app_name: appName || "",
|
||||
window_title: windowTitle || "",
|
||||
}),
|
||||
signal: controller.signal,
|
||||
},
|
||||
|
|
@ -142,7 +144,7 @@ export default function SuggestionPage() {
|
|||
const cleanup = window.electronAPI.onAutocompleteContext((data) => {
|
||||
const searchSpaceId = data.searchSpaceId || "1";
|
||||
if (data.screenshot) {
|
||||
fetchSuggestion(data.screenshot, searchSpaceId);
|
||||
fetchSuggestion(data.screenshot, searchSpaceId, data.appName, data.windowTitle);
|
||||
}
|
||||
});
|
||||
|
||||
|
|
|
|||
2
surfsense_web/types/window.d.ts
vendored
2
surfsense_web/types/window.d.ts
vendored
|
|
@ -23,7 +23,7 @@ interface ElectronAPI {
|
|||
requestScreenRecording: () => Promise<void>;
|
||||
restartApp: () => Promise<void>;
|
||||
// Autocomplete
|
||||
onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string }) => void) => () => void;
|
||||
onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => void) => () => void;
|
||||
acceptSuggestion: (text: string) => Promise<void>;
|
||||
dismissSuggestion: () => Promise<void>;
|
||||
setAutocompleteEnabled: (enabled: boolean) => Promise<void>;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue