mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 00:36:31 +02:00
158 lines
5.5 KiB
Python
158 lines
5.5 KiB
Python
"""Vision autocomplete service — agent-based with scoped filesystem.
|
|
|
|
Optimized pipeline:
|
|
1. Start the SSE stream immediately so the UI shows progress.
|
|
2. Derive a KB search query from window_title (no separate LLM call).
|
|
3. Run KB filesystem pre-computation and agent graph compilation in PARALLEL.
|
|
4. Inject pre-computed KB files as initial state and stream the agent.
|
|
"""
|
|
|
|
import logging
|
|
from collections.abc import AsyncGenerator
|
|
|
|
from langchain_core.messages import HumanMessage
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.agents.autocomplete import create_autocomplete_agent, stream_autocomplete_agent
|
|
from app.services.llm_service import get_vision_llm
|
|
from app.services.new_streaming_service import VercelStreamingService
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
PREP_STEP_ID = "autocomplete-prep"
|
|
|
|
|
|
def _derive_kb_query(app_name: str, window_title: str) -> str:
|
|
parts = [p for p in (window_title, app_name) if p]
|
|
return " ".join(parts)
|
|
|
|
|
|
def _is_vision_unsupported_error(e: Exception) -> bool:
|
|
msg = str(e).lower()
|
|
return "content must be a string" in msg or "does not support image" in msg
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main entry point
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def stream_vision_autocomplete(
    screenshot_data_url: str,
    search_space_id: int,
    session: AsyncSession,
    *,
    app_name: str = "",
    window_title: str = "",
) -> AsyncGenerator[str, None]:
    """Analyze a screenshot with a vision-LLM agent and stream a text completion.

    Yields Vercel-protocol SSE strings in a fixed order: a message-start
    event first, then thinking-step updates for the KB-search phase, then
    the agent's streamed output. Every failure path emits an error event
    followed by a done event before returning.

    Args:
        screenshot_data_url: Screenshot as a data URL, passed to the LLM
            as an ``image_url`` content part.
        search_space_id: Search space whose vision LLM and knowledge base
            are used.
        session: Async DB session used to resolve the configured LLM.
        app_name: Name of the foreground application (used for the KB query).
        window_title: Title of the active window (used for the KB query).

    Yields:
        SSE-formatted strings produced by ``VercelStreamingService``.
    """
    streaming = VercelStreamingService()
    # Canned message reused by both the agent-creation and streaming
    # error paths when the model rejects image content.
    vision_error_msg = (
        "The selected model does not support vision. "
        "Please set a vision-capable model (e.g. GPT-4o, Gemini) in your search space settings."
    )

    llm = await get_vision_llm(session, search_space_id)
    if not llm:
        # No configured vision model: open and close the stream with an error.
        yield streaming.format_message_start()
        yield streaming.format_error("No Vision LLM configured for this search space")
        yield streaming.format_done()
        return

    # Start SSE stream immediately so the UI has something to show
    yield streaming.format_message_start()

    kb_query = _derive_kb_query(app_name, window_title)

    # Show a preparation step while KB search + agent compile run
    yield streaming.format_thinking_step(
        step_id=PREP_STEP_ID,
        title="Searching knowledge base",
        status="in_progress",
        items=[kb_query] if kb_query else [],
    )

    try:
        # NOTE(review): per the module docstring, this helper runs KB
        # pre-computation and agent compilation in parallel — confirm in
        # app.agents.autocomplete.
        agent, kb = await create_autocomplete_agent(
            llm,
            search_space_id=search_space_id,
            kb_query=kb_query,
            app_name=app_name,
            window_title=window_title,
        )
    except Exception as e:
        if _is_vision_unsupported_error(e):
            # Known failure mode: give the user an actionable message.
            logger.warning("Vision autocomplete: model does not support vision: %s", e)
            yield streaming.format_error(vision_error_msg)
            yield streaming.format_done()
            return
        logger.error("Failed to create autocomplete agent: %s", e, exc_info=True)
        yield streaming.format_error("Autocomplete failed. Please try again.")
        yield streaming.format_done()
        return

    has_kb = kb.has_documents
    # kb.files is only guaranteed non-None when has_kb is True, hence the
    # ignore on len() — TODO confirm against the kb object's declaration.
    doc_count = len(kb.files) if has_kb else 0  # type: ignore[arg-type]

    # Close out the preparation step; "Skipped" when there was no query
    # to search with (empty app_name and window_title).
    yield streaming.format_thinking_step(
        step_id=PREP_STEP_ID,
        title="Searching knowledge base",
        status="complete",
        items=[f"Found {doc_count} document{'s' if doc_count != 1 else ''}"]
        if kb_query
        else ["Skipped"],
    )

    # Build agent input with pre-computed KB as initial state
    if has_kb:
        instruction = (
            "Analyze this screenshot, then explore the knowledge base documents "
            "listed above — read the chunk index of any document whose title "
            "looks relevant and check matched chunks for useful facts. "
            "Finally, generate a concise autocomplete for the active text area, "
            "enhanced with any relevant KB information you found."
        )
    else:
        instruction = (
            "Analyze this screenshot and generate a concise autocomplete "
            "for the active text area based on what you see."
        )

    # Multimodal message: instruction text plus the screenshot image part.
    user_message = HumanMessage(
        content=[
            {"type": "text", "text": instruction},
            {"type": "image_url", "image_url": {"url": screenshot_data_url}},
        ]
    )

    input_data: dict = {"messages": [user_message]}

    if has_kb:
        # Inject the pre-computed KB listing as prior conversation turns
        # (AI tool call + tool result) so the agent sees the documents as
        # already-listed — presumably avoiding a redundant ls tool call;
        # verify against the agent's prompt design.
        input_data["files"] = kb.files
        input_data["messages"] = [kb.ls_ai_msg, kb.ls_tool_msg, user_message]
        logger.info(
            "Autocomplete: injected %d KB files into agent initial state", doc_count
        )
    else:
        logger.info(
            "Autocomplete: no KB documents found, proceeding with screenshot only"
        )

    # Stream the agent (message_start already sent above)
    try:
        async for sse in stream_autocomplete_agent(
            agent,
            input_data,
            streaming,
            emit_message_start=False,
        ):
            yield sse
        # NOTE(review): no format_done() on the success path — assumes
        # stream_autocomplete_agent emits the terminal event itself; confirm.
    except Exception as e:
        if _is_vision_unsupported_error(e):
            logger.warning("Vision autocomplete: model does not support vision: %s", e)
            yield streaming.format_error(vision_error_msg)
            yield streaming.format_done()
        else:
            logger.error("Vision autocomplete streaming error: %s", e, exc_info=True)
            yield streaming.format_error("Autocomplete failed. Please try again.")
            yield streaming.format_done()