add app context and KB grounding to autocomplete suggestions

This commit is contained in:
CREDO23 2026-04-03 21:34:01 +02:00
parent 080acf5e0a
commit 960b8fc012
7 changed files with 150 additions and 9 deletions

View file

@ -14,6 +14,8 @@ router = APIRouter(prefix="/autocomplete", tags=["autocomplete"])
class VisionAutocompleteRequest(BaseModel):
    """Payload for a vision autocomplete request.

    The fields are forwarded to ``stream_vision_autocomplete`` by the
    ``/vision/stream`` handler defined below in this module.
    """

    # Screenshot of the user's screen; passed on as ``screenshot_data_url``
    # (presumably a data URL — confirm against the client).
    screenshot: str
    # Identifier of the search space used to pick the LLM and scope KB search.
    search_space_id: int
    # Optional name of the application the user is working in; empty when unknown.
    app_name: str = ""
    # Optional title of the active window; empty when unknown.
    window_title: str = ""
@router.post("/vision/stream")
@ -23,7 +25,10 @@ async def vision_autocomplete_stream(
session: AsyncSession = Depends(get_async_session),
):
return StreamingResponse(
stream_vision_autocomplete(body.screenshot, body.search_space_id, session),
stream_vision_autocomplete(
body.screenshot, body.search_space_id, session,
app_name=body.app_name, window_title=body.window_title,
),
media_type="text/event-stream",
headers={
**VercelStreamingService.get_response_headers(),

View file

@ -4,11 +4,21 @@ from typing import AsyncGenerator
from langchain_core.messages import HumanMessage, SystemMessage
from sqlalchemy.ext.asyncio import AsyncSession
from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever
from app.services.llm_service import get_vision_llm
from app.services.new_streaming_service import VercelStreamingService
logger = logging.getLogger(__name__)
# Value passed as ``top_k`` to the hybrid retriever for each knowledge-base lookup.
KB_TOP_K = 5
# Character budget for the knowledge-base entries collected for the prompt.
KB_MAX_CHARS = 4000
# Prompt for the non-streaming pre-pass that turns the screenshot into a short
# description, which is then used as the knowledge-base search query. Used when
# no application name is available.
EXTRACT_QUERY_PROMPT = """Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""
# Variant of the query-extraction prompt used when an application name is
# known; ``{app_name}`` and ``{window_title}`` are filled in with ``str.format``.
EXTRACT_QUERY_PROMPT_WITH_APP = """The user is currently in the application "{app_name}" with the window titled "{window_title}".
Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""
VISION_SYSTEM_PROMPT = """You are a smart writing assistant that analyzes the user's screen to draft or complete text.
You will receive a screenshot of the user's screen. Your job:
@ -28,13 +38,107 @@ Rules:
- Do NOT describe the screenshot or explain your reasoning.
- If you cannot determine what to write, output nothing."""
# Optional system-prompt section appended when an application name is known;
# ``{app_name}`` and ``{window_title}`` are filled in with ``str.format``.
APP_CONTEXT_BLOCK = """
The user is currently working in "{app_name}" (window: "{window_title}"). Use this to understand the type of application and adapt your tone and format accordingly."""
# Optional system-prompt section appended when knowledge-base text was found;
# ``{kb_context}`` receives the formatted search results.
KB_CONTEXT_BLOCK = """
You also have access to the user's knowledge base documents below. Use them to write more accurate, informed, and contextually relevant text. Do NOT cite or reference the documents explicitly — just let the knowledge inform your writing naturally.
<knowledge_base>
{kb_context}
</knowledge_base>"""
def _build_system_prompt(app_name: str, window_title: str, kb_context: str) -> str:
    """Compose the system prompt for the final completion call.

    The base vision prompt always comes first. The application block is
    added only when ``app_name`` is non-empty, and the knowledge-base block
    only when ``kb_context`` is non-empty.

    Args:
        app_name: Name of the application the user is working in, or "".
        window_title: Title of the active window; only used together with
            a non-empty ``app_name``.
        kb_context: Pre-formatted knowledge-base text, or "".

    Returns:
        The concatenated system prompt.
    """
    sections = [VISION_SYSTEM_PROMPT]
    if app_name:
        sections.append(
            APP_CONTEXT_BLOCK.format(app_name=app_name, window_title=window_title)
        )
    if kb_context:
        sections.append(KB_CONTEXT_BLOCK.format(kb_context=kb_context))
    return "".join(sections)
async def _extract_query_from_screenshot(
    llm, screenshot_data_url: str,
    app_name: str = "", window_title: str = "",
) -> str | None:
    """Ask the vision LLM to describe what the user is working on.

    The description is used as the knowledge-base search query. This is a
    best-effort step: any failure is logged and reported as ``None`` so the
    caller can continue without knowledge-base context.

    Args:
        llm: Chat model exposing an async ``ainvoke`` method.
        screenshot_data_url: Screenshot sent to the model as an ``image_url``.
        app_name: Optional application name; when non-empty the app-aware
            prompt is used.
        window_title: Optional window title inserted into the app-aware prompt.

    Returns:
        The stripped description, or ``None`` when the model returned no
        text or the call failed.
    """
    if app_name:
        prompt_text = EXTRACT_QUERY_PROMPT_WITH_APP.format(
            app_name=app_name, window_title=window_title,
        )
    else:
        prompt_text = EXTRACT_QUERY_PROMPT

    try:
        response = await llm.ainvoke([
            HumanMessage(content=[
                {"type": "text", "text": prompt_text},
                {"type": "image_url", "image_url": {"url": screenshot_data_url}},
            ]),
        ])
        content = getattr(response, "content", "")
        # Message content is not always a plain string: some providers return
        # a list of parts (strings or {"type": "text", "text": ...} dicts).
        # Flatten the text parts instead of failing on ``list.strip()``.
        if isinstance(content, list):
            pieces: list[str] = []
            for part in content:
                if isinstance(part, str):
                    pieces.append(part)
                elif isinstance(part, dict) and isinstance(part.get("text"), str):
                    pieces.append(part["text"])
            content = "".join(pieces)
        query = content.strip() if isinstance(content, str) else ""
        return query or None
    except Exception as e:
        # Deliberately broad: query extraction must never break autocomplete.
        logger.warning("Failed to extract query from screenshot: %s", e)
        return None
async def _search_knowledge_base(
    session: AsyncSession, search_space_id: int, query: str
) -> str:
    """Search the knowledge base and return a formatted context string.

    Each non-empty chunk becomes an entry of the form ``[title]`` followed by
    the chunk text; entries are joined with a ``---`` separator. The total
    length of the returned string, separators included, never exceeds
    ``KB_MAX_CHARS``.

    This is a best-effort step: any failure is logged and reported as an
    empty string so the caller can continue without knowledge-base context.

    Args:
        session: Database session handed to the hybrid retriever.
        search_space_id: Search space to restrict the lookup to.
        query: Free-text query describing what the user is working on.

    Returns:
        The formatted context, or "" when nothing was found or the search
        failed.
    """
    separator = "\n\n---\n\n"
    try:
        retriever = ChucksHybridSearchRetriever(session)
        results = await retriever.hybrid_search(
            query_text=query,
            top_k=KB_TOP_K,
            search_space_id=search_space_id,
        )

        if not results:
            return ""

        parts: list[str] = []
        char_count = 0

        for doc in results:
            # ``or`` fallbacks tolerate keys that are present but None, which
            # would otherwise raise (or render the title as "None").
            title = (doc.get("document") or {}).get("title") or "Untitled"
            for chunk in doc.get("chunks") or []:
                content = (chunk.get("content") or "").strip()
                if not content:
                    continue
                entry = f"[{title}]\n{content}"
                # Charge the joiner to the budget too, so the joined string
                # really stays within KB_MAX_CHARS.
                cost = len(entry) + (len(separator) if parts else 0)
                if char_count + cost > KB_MAX_CHARS:
                    # Skip the rest of this document; later documents may
                    # still contribute smaller chunks.
                    break
                parts.append(entry)
                char_count += cost
            if char_count >= KB_MAX_CHARS:
                break

        return separator.join(parts)
    except Exception as e:
        # Deliberately broad: KB grounding is optional for autocomplete.
        logger.warning("KB search failed, proceeding without context: %s", e)
        return ""
async def stream_vision_autocomplete(
screenshot_data_url: str,
search_space_id: int,
session: AsyncSession,
*,
app_name: str = "",
window_title: str = "",
) -> AsyncGenerator[str, None]:
"""Analyze a screenshot with the vision LLM and stream a text completion."""
"""Analyze a screenshot with the vision LLM and stream a text completion.
Pipeline:
1. Extract a search query from the screenshot (non-streaming)
2. Search the knowledge base for relevant context
3. Stream the final completion with screenshot + KB + app context
"""
streaming = VercelStreamingService()
llm = await get_vision_llm(session, search_space_id)
@ -44,8 +148,17 @@ async def stream_vision_autocomplete(
yield streaming.format_done()
return
kb_context = ""
query = await _extract_query_from_screenshot(
llm, screenshot_data_url, app_name=app_name, window_title=window_title,
)
if query:
kb_context = await _search_knowledge_base(session, search_space_id, query)
system_prompt = _build_system_prompt(app_name, window_title, kb_context)
messages = [
SystemMessage(content=VISION_SYSTEM_PROMPT),
SystemMessage(content=system_prompt),
HumanMessage(content=[
{
"type": "text",

View file

@ -1,6 +1,6 @@
import { clipboard, globalShortcut, ipcMain, screen } from 'electron';
import { IPC_CHANNELS } from '../../ipc/channels';
import { getFrontmostApp, hasAccessibilityPermission, simulatePaste } from '../platform';
import { getFrontmostApp, getWindowTitle, hasAccessibilityPermission, simulatePaste } from '../platform';
import { hasScreenRecordingPermission, requestAccessibility, requestScreenRecording } from '../permissions';
import { getMainWindow } from '../window';
import { captureScreen } from './screenshot';
@ -27,6 +27,7 @@ async function triggerAutocomplete(): Promise<void> {
}
sourceApp = getFrontmostApp();
const windowTitle = getWindowTitle();
savedClipboard = clipboard.readText();
const screenshot = await captureScreen();
@ -55,6 +56,8 @@ async function triggerAutocomplete(): Promise<void> {
sw.webContents.send(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, {
screenshot,
searchSpaceId,
appName: sourceApp,
windowTitle,
});
}
}, 300);

View file

@ -32,6 +32,24 @@ export function checkAccessibilityPermission(): boolean {
return systemPreferences.isTrustedAccessibilityClient(true);
}
/**
 * Returns the title of the frontmost window, or '' when it cannot be determined.
 *
 * macOS queries System Events through `osascript`; Windows resolves the
 * foreground window handle through PowerShell. Other platforms always get ''.
 * Any failure of the helper command (non-zero exit, missing permission,
 * timeout) is swallowed and reported as ''.
 */
export function getWindowTitle(): string {
	// execSync blocks the calling thread until the child exits. Cap the wait so
	// a stalled helper cannot freeze the caller; on timeout execSync throws and
	// the catch below falls back to ''.
	const options = { timeout: 3000 };
	try {
		if (process.platform === 'darwin') {
			return execSync(
				'osascript -e \'tell application "System Events" to get title of front window of first application process whose frontmost is true\'',
				options
			).toString().trim();
		}
		if (process.platform === 'win32') {
			return execSync(
				'powershell -command "(Get-Process | Where-Object { $_.MainWindowHandle -eq (Add-Type -MemberDefinition \'[DllImport(\\\"user32.dll\\\")] public static extern IntPtr GetForegroundWindow();\' -Name W -PassThru)::GetForegroundWindow() }).MainWindowTitle"',
				options
			).toString().trim();
		}
	} catch {
		return '';
	}
	return '';
}
export function hasAccessibilityPermission(): boolean {
if (process.platform !== 'darwin') return true;
return systemPreferences.isTrustedAccessibilityClient(false);

View file

@ -27,8 +27,8 @@ contextBridge.exposeInMainWorld('electronAPI', {
requestScreenRecording: () => ipcRenderer.invoke(IPC_CHANNELS.REQUEST_SCREEN_RECORDING),
restartApp: () => ipcRenderer.invoke(IPC_CHANNELS.RESTART_APP),
// Autocomplete
onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string }) => void) => {
const listener = (_event: unknown, data: { screenshot: string; searchSpaceId?: string }) => callback(data);
onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => void) => {
const listener = (_event: unknown, data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => callback(data);
ipcRenderer.on(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener);
return () => {
ipcRenderer.removeListener(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener);

View file

@ -46,7 +46,7 @@ export default function SuggestionPage() {
}, [error]);
const fetchSuggestion = useCallback(
async (screenshot: string, searchSpaceId: string) => {
async (screenshot: string, searchSpaceId: string, appName?: string, windowTitle?: string) => {
abortRef.current?.abort();
const controller = new AbortController();
abortRef.current = controller;
@ -77,6 +77,8 @@ export default function SuggestionPage() {
body: JSON.stringify({
screenshot,
search_space_id: parseInt(searchSpaceId, 10),
app_name: appName || "",
window_title: windowTitle || "",
}),
signal: controller.signal,
},
@ -142,7 +144,7 @@ export default function SuggestionPage() {
const cleanup = window.electronAPI.onAutocompleteContext((data) => {
const searchSpaceId = data.searchSpaceId || "1";
if (data.screenshot) {
fetchSuggestion(data.screenshot, searchSpaceId);
fetchSuggestion(data.screenshot, searchSpaceId, data.appName, data.windowTitle);
}
});

View file

@ -23,7 +23,7 @@ interface ElectronAPI {
requestScreenRecording: () => Promise<void>;
restartApp: () => Promise<void>;
// Autocomplete
onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string }) => void) => () => void;
onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => void) => () => void;
acceptSuggestion: (text: string) => Promise<void>;
dismissSuggestion: () => Promise<void>;
setAutocompleteEnabled: (enabled: boolean) => Promise<void>;