feat: enhance vision autocomplete service and UI feedback

- Optimized the vision autocomplete service by starting the SSE stream immediately and deriving KB search queries directly from window titles.
- Refactored the service to run KB filesystem pre-computation and agent graph compilation in parallel, improving performance.
- Updated the SuggestionPage component to handle new agent step data, displaying progress indicators for each step.
- Enhanced the CSS for the suggestion tooltip and agent activity indicators, improving the user interface and experience.
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-04-07 02:49:24 -07:00
parent 49441233e7
commit bb1dcd32b6
6 changed files with 686 additions and 228 deletions

View file

@@ -1,139 +1,40 @@
"""Vision autocomplete service — agent-based with scoped filesystem.
Optimized pipeline:
1. Start the SSE stream immediately so the UI shows progress.
2. Derive a KB search query from window_title (no separate LLM call).
3. Run KB filesystem pre-computation and agent graph compilation in PARALLEL.
4. Inject pre-computed KB files as initial state and stream the agent.
"""
import logging
from typing import AsyncGenerator
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.messages import HumanMessage
from sqlalchemy.ext.asyncio import AsyncSession
from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever
from app.agents.autocomplete import create_autocomplete_agent, stream_autocomplete_agent
from app.services.llm_service import get_vision_llm
from app.services.new_streaming_service import VercelStreamingService
logger = logging.getLogger(__name__)
# Maximum number of KB documents pulled into the prompt per autocomplete request.
KB_TOP_K = 5
# Character budget for the formatted KB context block (see _search_knowledge_base).
KB_MAX_CHARS = 4000
# Vision prompt used when no app/window metadata is available.
EXTRACT_QUERY_PROMPT = """Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""
# Variant of the extraction prompt that injects the active app name and window title.
EXTRACT_QUERY_PROMPT_WITH_APP = """The user is currently in the application "{app_name}" with the window titled "{window_title}".
Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""
# Base system prompt for the final completion pass; optional blocks below are appended.
VISION_SYSTEM_PROMPT = """You are a smart writing assistant that analyzes the user's screen to draft or complete text.
You will receive a screenshot of the user's screen. Your job:
1. Analyze the ENTIRE screenshot to understand what the user is working on (email thread, chat conversation, document, code editor, form, etc.).
2. Identify the text area where the user will type.
3. Based on the full visual context, generate the text the user most likely wants to write.
Key behavior:
- If the text area is EMPTY, draft a full response or message based on what you see on screen (e.g., reply to an email, respond to a chat message, continue a document).
- If the text area already has text, continue it naturally.
Rules:
- Output ONLY the text to be inserted. No quotes, no explanations, no meta-commentary.
- Be concise but complete a full thought, not a fragment.
- Match the tone and formality of the surrounding context.
- If the screen shows code, write code. If it shows a casual chat, be casual. If it shows a formal email, be formal.
- Do NOT describe the screenshot or explain your reasoning.
- If you cannot determine what to write, output nothing."""
# Appended to VISION_SYSTEM_PROMPT when the caller supplies app_name (see _build_system_prompt).
APP_CONTEXT_BLOCK = """
The user is currently working in "{app_name}" (window: "{window_title}"). Use this to understand the type of application and adapt your tone and format accordingly."""
# Appended to VISION_SYSTEM_PROMPT when KB search returned context (see _build_system_prompt).
KB_CONTEXT_BLOCK = """
You also have access to the user's knowledge base documents below. Use them to write more accurate, informed, and contextually relevant text. Do NOT cite or reference the documents explicitly — just let the knowledge inform your writing naturally.
<knowledge_base>
{kb_context}
</knowledge_base>"""
# Stable step id for the "preparation" progress indicator in the SSE stream.
PREP_STEP_ID = "autocomplete-prep"
def _build_system_prompt(app_name: str, window_title: str, kb_context: str) -> str:
    """Compose the vision system prompt, appending optional app and KB blocks.

    The base prompt is always included; the app block is added only when an
    app name is known, and the KB block only when KB context text is present.
    """
    sections = [VISION_SYSTEM_PROMPT]
    if app_name:
        sections.append(
            APP_CONTEXT_BLOCK.format(app_name=app_name, window_title=window_title)
        )
    if kb_context:
        sections.append(KB_CONTEXT_BLOCK.format(kb_context=kb_context))
    return "".join(sections)
def _derive_kb_query(app_name: str, window_title: str) -> str:
parts = [p for p in (window_title, app_name) if p]
return " ".join(parts)
def _is_vision_unsupported_error(e: Exception) -> bool:
"""Check if an exception indicates the model doesn't support vision/images."""
msg = str(e).lower()
return "content must be a string" in msg or "does not support image" in msg
async def _extract_query_from_screenshot(
    llm, screenshot_data_url: str,
    app_name: str = "", window_title: str = "",
) -> str | None:
    """Ask the vision LLM to describe what the user is working on.

    Returns the description text, or None when the call fails or the model
    produced nothing usable. Vision-unsupported errors are re-raised so the
    caller can return a friendly message immediately instead of retrying
    with astream.
    """
    prompt_text = (
        EXTRACT_QUERY_PROMPT_WITH_APP.format(
            app_name=app_name, window_title=window_title,
        )
        if app_name
        else EXTRACT_QUERY_PROMPT
    )
    message = HumanMessage(content=[
        {"type": "text", "text": prompt_text},
        {"type": "image_url", "image_url": {"url": screenshot_data_url}},
    ])
    try:
        response = await llm.ainvoke([message])
        # Non-standard responses without .content fall through to None.
        description = response.content.strip() if hasattr(response, "content") else ""
        return description or None
    except Exception as e:
        if _is_vision_unsupported_error(e):
            raise
        logger.warning(f"Failed to extract query from screenshot: {e}")
        return None
async def _search_knowledge_base(
    session: AsyncSession, search_space_id: int, query: str
) -> str:
    """Run a hybrid KB search and format the hits as a size-capped context block.

    Chunks are rendered as "[title]\\nbody" entries joined by separators, and
    accumulation stops once KB_MAX_CHARS would be exceeded. Returns "" when
    there are no hits or the search fails — KB context is strictly best-effort.
    """
    try:
        retriever = ChucksHybridSearchRetriever(session)
        hits = await retriever.hybrid_search(
            query_text=query,
            top_k=KB_TOP_K,
            search_space_id=search_space_id,
        )
        if not hits:
            return ""
        entries: list[str] = []
        used = 0
        for doc in hits:
            doc_title = doc.get("document", {}).get("title", "Untitled")
            for chunk in doc.get("chunks", []):
                body = chunk.get("content", "").strip()
                if not body:
                    continue
                rendered = f"[{doc_title}]\n{body}"
                # Too big to fit: stop taking chunks from this document.
                if used + len(rendered) > KB_MAX_CHARS:
                    break
                entries.append(rendered)
                used += len(rendered)
            # Budget exhausted: stop scanning further documents.
            if used >= KB_MAX_CHARS:
                break
        return "\n\n---\n\n".join(entries)
    except Exception as e:
        logger.warning(f"KB search failed, proceeding without context: {e}")
        return ""
# ---------------------------------------------------------------------------
# Main entry point
# ---------------------------------------------------------------------------
async def stream_vision_autocomplete(
@@ -144,13 +45,7 @@ async def stream_vision_autocomplete(
app_name: str = "",
window_title: str = "",
) -> AsyncGenerator[str, None]:
"""Analyze a screenshot with the vision LLM and stream a text completion.
Pipeline:
1. Extract a search query from the screenshot (non-streaming)
2. Search the knowledge base for relevant context
3. Stream the final completion with screenshot + KB + app context
"""
"""Analyze a screenshot with a vision-LLM agent and stream a text completion."""
streaming = VercelStreamingService()
vision_error_msg = (
"The selected model does not support vision. "
@@ -164,62 +59,89 @@
yield streaming.format_done()
return
kb_context = ""
# Start SSE stream immediately so the UI has something to show
yield streaming.format_message_start()
kb_query = _derive_kb_query(app_name, window_title)
# Show a preparation step while KB search + agent compile run
yield streaming.format_thinking_step(
step_id=PREP_STEP_ID,
title="Searching knowledge base",
status="in_progress",
items=[kb_query] if kb_query else [],
)
try:
query = await _extract_query_from_screenshot(
llm, screenshot_data_url, app_name=app_name, window_title=window_title,
agent, kb = await create_autocomplete_agent(
llm,
search_space_id=search_space_id,
kb_query=kb_query,
app_name=app_name,
window_title=window_title,
)
except Exception as e:
logger.warning(f"Vision autocomplete: selected model does not support vision: {e}")
yield streaming.format_message_start()
yield streaming.format_error(vision_error_msg)
if _is_vision_unsupported_error(e):
logger.warning("Vision autocomplete: model does not support vision: %s", e)
yield streaming.format_error(vision_error_msg)
yield streaming.format_done()
return
logger.error("Failed to create autocomplete agent: %s", e, exc_info=True)
yield streaming.format_error("Autocomplete failed. Please try again.")
yield streaming.format_done()
return
if query:
kb_context = await _search_knowledge_base(session, search_space_id, query)
has_kb = kb.has_documents
doc_count = len(kb.files) if has_kb else 0 # type: ignore[arg-type]
system_prompt = _build_system_prompt(app_name, window_title, kb_context)
yield streaming.format_thinking_step(
step_id=PREP_STEP_ID,
title="Searching knowledge base",
status="complete",
items=[f"Found {doc_count} document{'s' if doc_count != 1 else ''}"] if kb_query else ["Skipped"],
)
messages = [
SystemMessage(content=system_prompt),
HumanMessage(content=[
{
"type": "text",
"text": "Analyze this screenshot. Understand the full context of what the user is working on, then generate the text they most likely want to write in the active text area.",
},
{
"type": "image_url",
"image_url": {"url": screenshot_data_url},
},
]),
]
# Build agent input with pre-computed KB as initial state
if has_kb:
instruction = (
"Analyze this screenshot, then explore the knowledge base documents "
"listed above — read the chunk index of any document whose title "
"looks relevant and check matched chunks for useful facts. "
"Finally, generate a concise autocomplete for the active text area, "
"enhanced with any relevant KB information you found."
)
else:
instruction = (
"Analyze this screenshot and generate a concise autocomplete "
"for the active text area based on what you see."
)
text_started = False
text_id = ""
user_message = HumanMessage(content=[
{"type": "text", "text": instruction},
{"type": "image_url", "image_url": {"url": screenshot_data_url}},
])
input_data: dict = {"messages": [user_message]}
if has_kb:
input_data["files"] = kb.files
input_data["messages"] = [kb.ls_ai_msg, kb.ls_tool_msg, user_message]
logger.info("Autocomplete: injected %d KB files into agent initial state", doc_count)
else:
logger.info("Autocomplete: no KB documents found, proceeding with screenshot only")
# Stream the agent (message_start already sent above)
try:
yield streaming.format_message_start()
text_id = streaming.generate_text_id()
yield streaming.format_text_start(text_id)
text_started = True
async for chunk in llm.astream(messages):
token = chunk.content if hasattr(chunk, "content") else str(chunk)
if token:
yield streaming.format_text_delta(text_id, token)
yield streaming.format_text_end(text_id)
yield streaming.format_finish()
yield streaming.format_done()
async for sse in stream_autocomplete_agent(
agent, input_data, streaming, emit_message_start=False,
):
yield sse
except Exception as e:
if text_started:
yield streaming.format_text_end(text_id)
if _is_vision_unsupported_error(e):
logger.warning(f"Vision autocomplete: selected model does not support vision: {e}")
logger.warning("Vision autocomplete: model does not support vision: %s", e)
yield streaming.format_error(vision_error_msg)
yield streaming.format_done()
else:
logger.error(f"Vision autocomplete streaming error: {e}", exc_info=True)
logger.error("Vision autocomplete streaming error: %s", e, exc_info=True)
yield streaming.format_error("Autocomplete failed. Please try again.")
yield streaming.format_done()
yield streaming.format_done()