# SurfSense/surfsense_backend/app/services/vision_autocomplete_service.py

"""Vision autocomplete service — agent-based with scoped filesystem.

Optimized pipeline:
1. Start the SSE stream immediately so the UI shows progress.
2. Derive a KB search query from window_title (no separate LLM call).
3. Run KB filesystem pre-computation and agent graph compilation in PARALLEL.
4. Inject pre-computed KB files as initial state and stream the agent.
"""

import logging
from collections.abc import AsyncGenerator

from langchain_core.messages import HumanMessage
from sqlalchemy.ext.asyncio import AsyncSession

from app.agents.autocomplete import create_autocomplete_agent, stream_autocomplete_agent
from app.services.llm_service import get_vision_llm
from app.services.new_streaming_service import VercelStreamingService

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Step id shared by the "in_progress" and "complete" thinking-step events that
# stream_vision_autocomplete emits around agent creation; both events carry the
# same id.
PREP_STEP_ID = "autocomplete-prep"


def _derive_kb_query(app_name: str, window_title: str) -> str:
    parts = [p for p in (window_title, app_name) if p]
    return " ".join(parts)


def _is_vision_unsupported_error(e: Exception) -> bool:
    msg = str(e).lower()
    return "content must be a string" in msg or "does not support image" in msg


# ---------------------------------------------------------------------------
# Main entry point
# ---------------------------------------------------------------------------


_VISION_UNSUPPORTED_MSG = (
    "The selected model does not support vision. "
    "Please set a vision-capable model (e.g. GPT-4o, Gemini) in your search space settings."
)


def _failure_events(
    streaming: VercelStreamingService,
    error: Exception,
    log_message: str,
) -> list:
    """Log *error* and return the events that terminate the stream with an error.

    Args:
        streaming: Formatter used to build the error and done events.
        error: The exception that aborted agent creation or streaming.
        log_message: ``%s``-style log format used when the failure is not a
            vision-unsupported error; it receives *error* as its argument.

    Returns:
        A two-item list: the formatted error event followed by the done event.
    """
    if _is_vision_unsupported_error(error):
        logger.warning(
            "Vision autocomplete: model does not support vision: %s", error
        )
        user_facing = _VISION_UNSUPPORTED_MSG
    else:
        # Pass the exception itself so the traceback is attached regardless of
        # whether this helper runs inside an active ``except`` block.
        logger.error(log_message, error, exc_info=error)
        user_facing = "Autocomplete failed. Please try again."
    return [streaming.format_error(user_facing), streaming.format_done()]


async def stream_vision_autocomplete(
    screenshot_data_url: str,
    search_space_id: int,
    session: AsyncSession,
    *,
    app_name: str = "",
    window_title: str = "",
) -> AsyncGenerator[str, None]:
    """Analyze a screenshot with a vision-LLM agent and stream a text completion.

    Event order: message start, a "Searching knowledge base" thinking step
    (in progress, then complete), and then whatever ``stream_autocomplete_agent``
    yields. Failures while creating or streaming the agent are not raised;
    they are reported as an error event followed by a done event.

    Args:
        screenshot_data_url: URL placed in the ``image_url`` part of the user
            message sent to the agent.
        search_space_id: Search space used to look up the vision LLM and
            passed to agent creation.
        session: Database session passed to ``get_vision_llm``.
        app_name: Optional application name; used for the KB query and passed
            to agent creation.
        window_title: Optional window title; used for the KB query and passed
            to agent creation.

    Yields:
        Events produced by ``VercelStreamingService`` formatters and by
        ``stream_autocomplete_agent``.
    """
    streaming = VercelStreamingService()

    # Open the stream before any awaited work. Every path below begins with
    # this event, so emitting it ahead of the LLM lookup keeps the event order
    # unchanged while giving the client output without waiting on the lookup.
    yield streaming.format_message_start()

    llm = await get_vision_llm(session, search_space_id)
    if not llm:
        yield streaming.format_error("No Vision LLM configured for this search space")
        yield streaming.format_done()
        return

    kb_query = _derive_kb_query(app_name, window_title)

    # Show a preparation step while the agent (and its KB data) is being built.
    yield streaming.format_thinking_step(
        step_id=PREP_STEP_ID,
        title="Searching knowledge base",
        status="in_progress",
        items=[kb_query] if kb_query else [],
    )

    try:
        agent, kb = await create_autocomplete_agent(
            llm,
            search_space_id=search_space_id,
            kb_query=kb_query,
            app_name=app_name,
            window_title=window_title,
        )
    except Exception as e:
        for sse in _failure_events(
            streaming, e, "Failed to create autocomplete agent: %s"
        ):
            yield sse
        return

    has_kb = kb.has_documents
    doc_count = len(kb.files) if has_kb else 0  # type: ignore[arg-type]

    # Close out the preparation step under the same step id.
    yield streaming.format_thinking_step(
        step_id=PREP_STEP_ID,
        title="Searching knowledge base",
        status="complete",
        items=[f"Found {doc_count} document{'s' if doc_count != 1 else ''}"]
        if kb_query
        else ["Skipped"],
    )

    # The instruction only mentions the knowledge base when documents exist.
    if has_kb:
        instruction = (
            "Analyze this screenshot, then explore the knowledge base documents "
            "listed above — read the chunk index of any document whose title "
            "looks relevant and check matched chunks for useful facts. "
            "Finally, generate a concise autocomplete for the active text area, "
            "enhanced with any relevant KB information you found."
        )
    else:
        instruction = (
            "Analyze this screenshot and generate a concise autocomplete "
            "for the active text area based on what you see."
        )

    user_message = HumanMessage(
        content=[
            {"type": "text", "text": instruction},
            {"type": "image_url", "image_url": {"url": screenshot_data_url}},
        ]
    )

    input_data: dict = {"messages": [user_message]}

    if has_kb:
        # Seed the agent state with the KB files and put the two KB listing
        # messages ahead of the user message ("listed above" in the
        # instruction refers to them).
        input_data["files"] = kb.files
        input_data["messages"] = [kb.ls_ai_msg, kb.ls_tool_msg, user_message]
        logger.info(
            "Autocomplete: injected %d KB files into agent initial state", doc_count
        )
    else:
        logger.info(
            "Autocomplete: no KB documents found, proceeding with screenshot only"
        )

    # Stream the agent (message_start already sent above).
    try:
        async for sse in stream_autocomplete_agent(
            agent,
            input_data,
            streaming,
            emit_message_start=False,
        ):
            yield sse
    except Exception as e:
        for sse in _failure_events(
            streaming, e, "Vision autocomplete streaming error: %s"
        ):
            yield sse