diff --git a/surfsense_backend/app/agents/autocomplete/__init__.py b/surfsense_backend/app/agents/autocomplete/__init__.py
deleted file mode 100644
index 55d7a692d..000000000
--- a/surfsense_backend/app/agents/autocomplete/__init__.py
+++ /dev/null
@@ -1,11 +0,0 @@
-"""Agent-based vision autocomplete with scoped filesystem exploration."""
-
-from app.agents.autocomplete.autocomplete_agent import (
-    create_autocomplete_agent,
-    stream_autocomplete_agent,
-)
-
-__all__ = [
-    "create_autocomplete_agent",
-    "stream_autocomplete_agent",
-]
diff --git a/surfsense_backend/app/agents/autocomplete/autocomplete_agent.py b/surfsense_backend/app/agents/autocomplete/autocomplete_agent.py
deleted file mode 100644
index 2d8f05fd3..000000000
--- a/surfsense_backend/app/agents/autocomplete/autocomplete_agent.py
+++ /dev/null
@@ -1,495 +0,0 @@
-"""Vision autocomplete agent with scoped filesystem exploration.
-
-Converts the stateless single-shot vision autocomplete into an agent that
-seeds a virtual filesystem from KB search results and lets the vision LLM
-explore documents via ``ls``, ``read_file``, ``glob``, ``grep``, etc.
-before generating the final completion.
-
-Performance: KB search and agent graph compilation run in parallel so
-the only sequential latency is KB-search (or agent compile, whichever is
-slower) + the agent's LLM turns. There is no separate "query extraction"
-LLM call — the window title is used directly as the KB search query.
-"""
-
-from __future__ import annotations
-
-import asyncio
-import json
-import logging
-import re
-import uuid
-from collections.abc import AsyncGenerator
-from typing import Any
-
-from deepagents.graph import BASE_AGENT_PROMPT
-from deepagents.middleware.patch_tool_calls import PatchToolCallsMiddleware
-from langchain.agents import create_agent
-from langchain_anthropic.middleware import AnthropicPromptCachingMiddleware
-from langchain_core.language_models import BaseChatModel
-from langchain_core.messages import AIMessage, ToolMessage
-
-from app.agents.new_chat.middleware.filesystem import SurfSenseFilesystemMiddleware
-from app.agents.new_chat.middleware.knowledge_search import (
-    build_scoped_filesystem,
-    search_knowledge_base,
-)
-from app.services.new_streaming_service import VercelStreamingService
-
-logger = logging.getLogger(__name__)
-
-KB_TOP_K = 10
-
-# ---------------------------------------------------------------------------
-# System prompt
-# ---------------------------------------------------------------------------
-
-AUTOCOMPLETE_SYSTEM_PROMPT = """You are a smart writing assistant that analyzes the user's screen to draft or complete text.
-
-You will receive a screenshot of the user's screen. Your PRIMARY source of truth is the screenshot itself — the visual context determines what to write.
-
-Your job:
-1. Analyze the ENTIRE screenshot to understand what the user is working on (email thread, chat conversation, document, code editor, form, etc.).
-2. Identify the text area where the user will type.
-3. Generate the text the user most likely wants to write based on the visual context.
-
-You also have access to the user's knowledge base documents via filesystem tools. However:
-- ONLY consult the knowledge base if the screenshot clearly involves a topic where your KB documents are DIRECTLY relevant (e.g., the user is writing about a specific project/topic that matches a document title).
-- Do NOT explore documents just because they exist. Most autocomplete requests can be answered purely from the screenshot.
-- If you do read a document, only incorporate information that is 100% relevant to what the user is typing RIGHT NOW. Do not add extra details, background, or tangential information from the KB.
-- Keep your output SHORT — autocomplete should feel like a natural continuation, not an essay.
-
-Key behavior:
-- If the text area is EMPTY, draft a concise response or message based on what you see on screen (e.g., reply to an email, respond to a chat message, continue a document).
-- If the text area already has text, continue it naturally — typically just a sentence or two.
-
-Rules:
-- Be CONCISE. Prefer a single paragraph or a few sentences. Autocomplete is a quick assist, not a full draft.
-- Match the tone and formality of the surrounding context.
-- If the screen shows code, write code. If it shows a casual chat, be casual. If it shows a formal email, be formal.
-- Do NOT describe the screenshot or explain your reasoning.
-- Do NOT cite or reference documents explicitly — just let the knowledge inform your writing naturally.
-- If you cannot determine what to write, output an empty JSON array: []
-
-## Output Format
-
-You MUST provide exactly 3 different suggestion options. Each should be a distinct, plausible completion — vary the tone, detail level, or angle.
-
-Return your suggestions as a JSON array of exactly 3 strings. Output ONLY the JSON array, nothing else — no markdown fences, no explanation, no commentary.
-
-Example format:
-["First suggestion text here.", "Second suggestion — a different take.", "Third option with another approach."]
-
-## Filesystem Tools `ls`, `read_file`, `write_file`, `edit_file`, `glob`, `grep`
-
-All file paths must start with a `/`.
-- ls: list files and directories at a given path.
-- read_file: read a file from the filesystem.
-- write_file: create a temporary file in the session (not persisted).
-- edit_file: edit a file in the session (not persisted for /documents/ files).
-- glob: find files matching a pattern (e.g., "**/*.xml").
-- grep: search for text within files.
-
-## When to Use Filesystem Tools
-
-BEFORE reaching for any tool, ask yourself: "Can I write a good completion purely from the screenshot?" If yes, just write it — do NOT explore the KB.
-
-Only use tools when:
-- The user is clearly writing about a specific topic that likely has detailed information in their KB.
-- You need a specific fact, name, number, or reference that the screenshot doesn't provide.
-
-When you do use tools, be surgical:
-- Check the `ls` output first. If no document title looks relevant, stop — do not read files just to see what's there.
-- If a title looks relevant, read only the `<chunk_index>` (first ~20 lines) and jump to matched chunks. Do not read entire documents.
-- Extract only the specific information you need and move on to generating the completion.
-
-## Reading Documents Efficiently
-
-Documents are formatted as XML. Each document contains:
-- `<metadata>` — title, type, URL, etc.
-- `<chunk_index>` — a table of every chunk with its **line range** and a
-  `matched="true"` flag for chunks that matched the search query.
-- `<chunks>` — the actual chunks in original document order.
-
-**Workflow**: read the first ~20 lines to see the `<chunk_index>`, identify
-chunks marked `matched="true"`, then use `read_file(path, offset=<line>,
-limit=<n>)` to jump directly to those sections."""
-
-APP_CONTEXT_BLOCK = """
-
-The user is currently working in "{app_name}" (window: "{window_title}"). Use this to understand the type of application and adapt your tone and format accordingly."""
-
-
-def _build_autocomplete_system_prompt(app_name: str, window_title: str) -> str:
-    prompt = AUTOCOMPLETE_SYSTEM_PROMPT
-    if app_name:
-        prompt += APP_CONTEXT_BLOCK.format(app_name=app_name, window_title=window_title)
-    return prompt
-
-
-# ---------------------------------------------------------------------------
-# Pre-compute KB filesystem (runs in parallel with agent compilation)
-# ---------------------------------------------------------------------------
-
-
-class _KBResult:
-    """Container for pre-computed KB filesystem results."""
-
-    __slots__ = ("files", "ls_ai_msg", "ls_tool_msg")
-
-    def __init__(
-        self,
-        files: dict[str, Any] | None = None,
-        ls_ai_msg: AIMessage | None = None,
-        ls_tool_msg: ToolMessage | None = None,
-    ) -> None:
-        self.files = files
-        self.ls_ai_msg = ls_ai_msg
-        self.ls_tool_msg = ls_tool_msg
-
-    @property
-    def has_documents(self) -> bool:
-        return bool(self.files)
-
-
-async def precompute_kb_filesystem(
-    search_space_id: int,
-    query: str,
-    top_k: int = KB_TOP_K,
-) -> _KBResult:
-    """Search the KB and build the scoped filesystem outside the agent.
-
-    This is designed to be called via ``asyncio.gather`` alongside agent
-    graph compilation so the two run concurrently.
-    """
-    if not query:
-        return _KBResult()
-
-    try:
-        search_results = await search_knowledge_base(
-            query=query,
-            search_space_id=search_space_id,
-            top_k=top_k,
-        )
-
-        if not search_results:
-            return _KBResult()
-
-        new_files, _ = await build_scoped_filesystem(
-            documents=search_results,
-            search_space_id=search_space_id,
-        )
-
-        if not new_files:
-            return _KBResult()
-
-        doc_paths = [
-            p
-            for p, v in new_files.items()
-            if p.startswith("/documents/") and v is not None
-        ]
-        tool_call_id = f"auto_ls_{uuid.uuid4().hex[:12]}"
-        ai_msg = AIMessage(
-            content="",
-            tool_calls=[
-                {"name": "ls", "args": {"path": "/documents"}, "id": tool_call_id}
-            ],
-        )
-        tool_msg = ToolMessage(
-            content=str(doc_paths) if doc_paths else "No documents found.",
-            tool_call_id=tool_call_id,
-        )
-        return _KBResult(files=new_files, ls_ai_msg=ai_msg, ls_tool_msg=tool_msg)
-
-    except Exception:
-        logger.warning(
-            "KB pre-computation failed, proceeding without KB", exc_info=True
-        )
-        return _KBResult()
-
-
-# ---------------------------------------------------------------------------
-# Filesystem middleware — no save_document, no persistence
-# ---------------------------------------------------------------------------
-
-
-class AutocompleteFilesystemMiddleware(SurfSenseFilesystemMiddleware):
-    """Filesystem middleware for autocomplete — read-only exploration only.
-
-    Strips ``save_document`` (permanent KB persistence) and passes
-    ``search_space_id=None`` so ``write_file`` / ``edit_file`` stay ephemeral.
- """ - - def __init__(self) -> None: - super().__init__(search_space_id=None, created_by_id=None) - self.tools = [t for t in self.tools if t.name != "save_document"] - - -# --------------------------------------------------------------------------- -# Agent factory -# --------------------------------------------------------------------------- - - -async def _compile_agent( - llm: BaseChatModel, - app_name: str, - window_title: str, -) -> Any: - """Compile the agent graph (CPU-bound, runs in a thread).""" - system_prompt = _build_autocomplete_system_prompt(app_name, window_title) - final_system_prompt = system_prompt + "\n\n" + BASE_AGENT_PROMPT - - middleware = [ - AutocompleteFilesystemMiddleware(), - PatchToolCallsMiddleware(), - AnthropicPromptCachingMiddleware(unsupported_model_behavior="ignore"), - ] - - agent = await asyncio.to_thread( - create_agent, - llm, - system_prompt=final_system_prompt, - tools=[], - middleware=middleware, - ) - return agent.with_config({"recursion_limit": 200}) - - -async def create_autocomplete_agent( - llm: BaseChatModel, - *, - search_space_id: int, - kb_query: str, - app_name: str = "", - window_title: str = "", -) -> tuple[Any, _KBResult]: - """Create the autocomplete agent and pre-compute KB in parallel. - - Returns ``(agent, kb_result)`` so the caller can inject the pre-computed - filesystem into the agent's initial state without any middleware delay. - """ - agent, kb = await asyncio.gather( - _compile_agent(llm, app_name, window_title), - precompute_kb_filesystem(search_space_id, kb_query), - ) - return agent, kb - - -# --------------------------------------------------------------------------- -# JSON suggestion parsing (with fallback) -# --------------------------------------------------------------------------- - - -def _parse_suggestions(raw: str) -> list[str]: - """Extract a list of suggestion strings from the agent's output. - - Tries, in order: - 1. Direct ``json.loads`` - 2. Extract content between ```json ... ``` fences - 3. Find the first ``[`` … ``]`` span - Falls back to wrapping the raw text as a single suggestion. - """ - text = raw.strip() - if not text: - return [] - - for candidate in _json_candidates(text): - try: - parsed = json.loads(candidate) - if isinstance(parsed, list) and all(isinstance(s, str) for s in parsed): - return [s for s in parsed if s.strip()] - except (json.JSONDecodeError, ValueError): - continue - - return [text] - - -def _json_candidates(text: str) -> list[str]: - """Yield candidate JSON strings from raw text.""" - candidates = [text] - - fence = re.search(r"```(?:json)?\s*\n?(.*?)```", text, re.DOTALL) - if fence: - candidates.append(fence.group(1).strip()) - - bracket = re.search(r"\[.*]", text, re.DOTALL) - if bracket: - candidates.append(bracket.group(0)) - - return candidates - - -# --------------------------------------------------------------------------- -# Streaming helper -# --------------------------------------------------------------------------- - - -async def stream_autocomplete_agent( - agent: Any, - input_data: dict[str, Any], - streaming_service: VercelStreamingService, - *, - emit_message_start: bool = True, -) -> AsyncGenerator[str, None]: - """Stream agent events as Vercel SSE, with thinking steps for tool calls. - - When ``emit_message_start`` is False the caller has already sent the - ``message_start`` event (e.g. to show preparation steps before the agent - runs). 
- """ - thread_id = uuid.uuid4().hex - config = {"configurable": {"thread_id": thread_id}} - - text_buffer: list[str] = [] - active_tool_depth = 0 - thinking_step_counter = 0 - tool_step_ids: dict[str, str] = {} - step_titles: dict[str, str] = {} - completed_step_ids: set[str] = set() - last_active_step_id: str | None = None - - def next_thinking_step_id() -> str: - nonlocal thinking_step_counter - thinking_step_counter += 1 - return f"autocomplete-step-{thinking_step_counter}" - - def complete_current_step() -> str | None: - nonlocal last_active_step_id - if last_active_step_id and last_active_step_id not in completed_step_ids: - completed_step_ids.add(last_active_step_id) - title = step_titles.get(last_active_step_id, "Done") - event = streaming_service.format_thinking_step( - step_id=last_active_step_id, - title=title, - status="complete", - ) - last_active_step_id = None - return event - return None - - if emit_message_start: - yield streaming_service.format_message_start() - - gen_step_id = next_thinking_step_id() - last_active_step_id = gen_step_id - step_titles[gen_step_id] = "Generating suggestions" - yield streaming_service.format_thinking_step( - step_id=gen_step_id, - title="Generating suggestions", - status="in_progress", - ) - - try: - async for event in agent.astream_events( - input_data, config=config, version="v2" - ): - event_type = event.get("event", "") - if event_type == "on_chat_model_stream": - if active_tool_depth > 0: - continue - if "surfsense:internal" in event.get("tags", []): - continue - chunk = event.get("data", {}).get("chunk") - if chunk and hasattr(chunk, "content"): - content = chunk.content - if content and isinstance(content, str): - text_buffer.append(content) - - elif event_type == "on_chat_model_end": - if active_tool_depth > 0: - continue - if "surfsense:internal" in event.get("tags", []): - continue - output = event.get("data", {}).get("output") - if output and hasattr(output, "content"): - if getattr(output, "tool_calls", None): - continue - content = output.content - if content and isinstance(content, str) and not text_buffer: - text_buffer.append(content) - - elif event_type == "on_tool_start": - active_tool_depth += 1 - tool_name = event.get("name", "unknown_tool") - run_id = event.get("run_id", "") - tool_input = event.get("data", {}).get("input", {}) - - step_event = complete_current_step() - if step_event: - yield step_event - - tool_step_id = next_thinking_step_id() - tool_step_ids[run_id] = tool_step_id - last_active_step_id = tool_step_id - - title, items = _describe_tool_call(tool_name, tool_input) - step_titles[tool_step_id] = title - yield streaming_service.format_thinking_step( - step_id=tool_step_id, - title=title, - status="in_progress", - items=items, - ) - - elif event_type == "on_tool_end": - active_tool_depth = max(0, active_tool_depth - 1) - run_id = event.get("run_id", "") - step_id = tool_step_ids.pop(run_id, None) - if step_id and step_id not in completed_step_ids: - completed_step_ids.add(step_id) - title = step_titles.get(step_id, "Done") - yield streaming_service.format_thinking_step( - step_id=step_id, - title=title, - status="complete", - ) - if last_active_step_id == step_id: - last_active_step_id = None - - step_event = complete_current_step() - if step_event: - yield step_event - - raw_text = "".join(text_buffer) - suggestions = _parse_suggestions(raw_text) - - yield streaming_service.format_data("suggestions", {"options": suggestions}) - - yield streaming_service.format_finish() - yield 
-
-    except Exception as e:
-        logger.error(f"Autocomplete agent streaming error: {e}", exc_info=True)
-        yield streaming_service.format_error("Autocomplete failed. Please try again.")
-        yield streaming_service.format_done()
-
-
-def _describe_tool_call(tool_name: str, tool_input: Any) -> tuple[str, list[str]]:
-    """Return a human-readable (title, items) for a tool call thinking step."""
-    inp = tool_input if isinstance(tool_input, dict) else {}
-    if tool_name == "ls":
-        path = inp.get("path", "/")
-        return "Listing files", [path]
-    if tool_name == "read_file":
-        fp = inp.get("file_path", "")
-        display = fp if len(fp) <= 80 else "…" + fp[-77:]
-        return "Reading file", [display]
-    if tool_name == "write_file":
-        fp = inp.get("file_path", "")
-        display = fp if len(fp) <= 80 else "…" + fp[-77:]
-        return "Writing file", [display]
-    if tool_name == "edit_file":
-        fp = inp.get("file_path", "")
-        display = fp if len(fp) <= 80 else "…" + fp[-77:]
-        return "Editing file", [display]
-    if tool_name == "glob":
-        pat = inp.get("pattern", "")
-        base = inp.get("path", "/")
-        return "Searching files", [f"{pat} in {base}"]
-    if tool_name == "grep":
-        pat = inp.get("pattern", "")
-        path = inp.get("path", "")
-        display_pat = pat[:60] + ("…" if len(pat) > 60 else "")
-        return "Searching content", [
-            f'"{display_pat}"' + (f" in {path}" if path else "")
-        ]
-    return f"Using {tool_name}", []
diff --git a/surfsense_backend/app/routes/__init__.py b/surfsense_backend/app/routes/__init__.py
index 40ca7a7e8..9464a7ded 100644
--- a/surfsense_backend/app/routes/__init__.py
+++ b/surfsense_backend/app/routes/__init__.py
@@ -3,7 +3,6 @@ from fastapi import APIRouter
 from .airtable_add_connector_route import (
     router as airtable_add_connector_router,
 )
-from .autocomplete_routes import router as autocomplete_router
 from .chat_comments_routes import router as chat_comments_router
 from .circleback_webhook_route import router as circleback_webhook_router
 from .clickup_add_connector_route import router as clickup_add_connector_router
@@ -104,4 +103,3 @@ router.include_router(stripe_router)  # Stripe checkout for additional page pack
 router.include_router(youtube_router)  # YouTube playlist resolution
 router.include_router(prompts_router)
 router.include_router(memory_router)  # User personal memory (memory.md style)
-router.include_router(autocomplete_router)  # Lightweight autocomplete with KB context
diff --git a/surfsense_backend/app/routes/autocomplete_routes.py b/surfsense_backend/app/routes/autocomplete_routes.py
deleted file mode 100644
index a11b7dbc1..000000000
--- a/surfsense_backend/app/routes/autocomplete_routes.py
+++ /dev/null
@@ -1,45 +0,0 @@
-from fastapi import APIRouter, Depends
-from fastapi.responses import StreamingResponse
-from pydantic import BaseModel, Field
-from sqlalchemy.ext.asyncio import AsyncSession
-
-from app.db import User, get_async_session
-from app.services.new_streaming_service import VercelStreamingService
-from app.services.vision_autocomplete_service import stream_vision_autocomplete
-from app.users import current_active_user
-from app.utils.rbac import check_search_space_access
-
-router = APIRouter(prefix="/autocomplete", tags=["autocomplete"])
-
-MAX_SCREENSHOT_SIZE = 20 * 1024 * 1024  # 20 MB base64 ceiling
-
-
-class VisionAutocompleteRequest(BaseModel):
-    screenshot: str = Field(..., max_length=MAX_SCREENSHOT_SIZE)
-    search_space_id: int
-    app_name: str = ""
-    window_title: str = ""
-
-
-@router.post("/vision/stream")
-async def vision_autocomplete_stream(
-    body: VisionAutocompleteRequest,
-    user: User = Depends(current_active_user),
-    session: AsyncSession = Depends(get_async_session),
-):
-    await check_search_space_access(session, user, body.search_space_id)
-
-    return StreamingResponse(
-        stream_vision_autocomplete(
-            body.screenshot,
-            body.search_space_id,
-            session,
-            app_name=body.app_name,
-            window_title=body.window_title,
-        ),
-        media_type="text/event-stream",
-        headers={
-            **VercelStreamingService.get_response_headers(),
-            "X-Accel-Buffering": "no",
-        },
-    )
diff --git a/surfsense_backend/app/services/vision_autocomplete_service.py b/surfsense_backend/app/services/vision_autocomplete_service.py
deleted file mode 100644
index c28962b31..000000000
--- a/surfsense_backend/app/services/vision_autocomplete_service.py
+++ /dev/null
@@ -1,158 +0,0 @@
-"""Vision autocomplete service — agent-based with scoped filesystem.
-
-Optimized pipeline:
-1. Start the SSE stream immediately so the UI shows progress.
-2. Derive a KB search query from window_title (no separate LLM call).
-3. Run KB filesystem pre-computation and agent graph compilation in PARALLEL.
-4. Inject pre-computed KB files as initial state and stream the agent.
-"""
-
-import logging
-from collections.abc import AsyncGenerator
-
-from langchain_core.messages import HumanMessage
-from sqlalchemy.ext.asyncio import AsyncSession
-
-from app.agents.autocomplete import create_autocomplete_agent, stream_autocomplete_agent
-from app.services.llm_service import get_vision_llm
-from app.services.new_streaming_service import VercelStreamingService
-
-logger = logging.getLogger(__name__)
-
-PREP_STEP_ID = "autocomplete-prep"
-
-
-def _derive_kb_query(app_name: str, window_title: str) -> str:
-    parts = [p for p in (window_title, app_name) if p]
-    return " ".join(parts)
-
-
-def _is_vision_unsupported_error(e: Exception) -> bool:
-    msg = str(e).lower()
-    return "content must be a string" in msg or "does not support image" in msg
-
-
-# ---------------------------------------------------------------------------
-# Main entry point
-# ---------------------------------------------------------------------------
-
-
-async def stream_vision_autocomplete(
-    screenshot_data_url: str,
-    search_space_id: int,
-    session: AsyncSession,
-    *,
-    app_name: str = "",
-    window_title: str = "",
-) -> AsyncGenerator[str, None]:
-    """Analyze a screenshot with a vision-LLM agent and stream a text completion."""
-    streaming = VercelStreamingService()
-    vision_error_msg = (
-        "The selected model does not support vision. "
-        "Please set a vision-capable model (e.g. GPT-4o, Gemini) in your search space settings."
-    )
-
-    llm = await get_vision_llm(session, search_space_id)
-    if not llm:
-        yield streaming.format_message_start()
-        yield streaming.format_error("No Vision LLM configured for this search space")
-        yield streaming.format_done()
-        return
-
-    # Start SSE stream immediately so the UI has something to show
-    yield streaming.format_message_start()
-
-    kb_query = _derive_kb_query(app_name, window_title)
-
-    # Show a preparation step while KB search + agent compile run
-    yield streaming.format_thinking_step(
-        step_id=PREP_STEP_ID,
-        title="Searching knowledge base",
-        status="in_progress",
-        items=[kb_query] if kb_query else [],
-    )
-
-    try:
-        agent, kb = await create_autocomplete_agent(
-            llm,
-            search_space_id=search_space_id,
-            kb_query=kb_query,
-            app_name=app_name,
-            window_title=window_title,
-        )
-    except Exception as e:
-        if _is_vision_unsupported_error(e):
-            logger.warning("Vision autocomplete: model does not support vision: %s", e)
-            yield streaming.format_error(vision_error_msg)
-            yield streaming.format_done()
-            return
-        logger.error("Failed to create autocomplete agent: %s", e, exc_info=True)
-        yield streaming.format_error("Autocomplete failed. Please try again.")
-        yield streaming.format_done()
-        return
-
-    has_kb = kb.has_documents
-    doc_count = len(kb.files) if has_kb else 0  # type: ignore[arg-type]
-
-    yield streaming.format_thinking_step(
-        step_id=PREP_STEP_ID,
-        title="Searching knowledge base",
-        status="complete",
-        items=[f"Found {doc_count} document{'s' if doc_count != 1 else ''}"]
-        if kb_query
-        else ["Skipped"],
-    )
-
-    # Build agent input with pre-computed KB as initial state
-    if has_kb:
-        instruction = (
-            "Analyze this screenshot, then explore the knowledge base documents "
-            "listed above — read the chunk index of any document whose title "
-            "looks relevant and check matched chunks for useful facts. "
-            "Finally, generate a concise autocomplete for the active text area, "
-            "enhanced with any relevant KB information you found."
-        )
-    else:
-        instruction = (
-            "Analyze this screenshot and generate a concise autocomplete "
-            "for the active text area based on what you see."
-        )
-
-    user_message = HumanMessage(
-        content=[
-            {"type": "text", "text": instruction},
-            {"type": "image_url", "image_url": {"url": screenshot_data_url}},
-        ]
-    )
-
-    input_data: dict = {"messages": [user_message]}
-
-    if has_kb:
-        input_data["files"] = kb.files
-        input_data["messages"] = [kb.ls_ai_msg, kb.ls_tool_msg, user_message]
-        logger.info(
-            "Autocomplete: injected %d KB files into agent initial state", doc_count
-        )
-    else:
-        logger.info(
-            "Autocomplete: no KB documents found, proceeding with screenshot only"
-        )
-
-    # Stream the agent (message_start already sent above)
-    try:
-        async for sse in stream_autocomplete_agent(
-            agent,
-            input_data,
-            streaming,
-            emit_message_start=False,
-        ):
-            yield sse
-    except Exception as e:
-        if _is_vision_unsupported_error(e):
-            logger.warning("Vision autocomplete: model does not support vision: %s", e)
-            yield streaming.format_error(vision_error_msg)
-            yield streaming.format_done()
-        else:
-            logger.error("Vision autocomplete streaming error: %s", e, exc_info=True)
-            yield streaming.format_error("Autocomplete failed. Please try again.")
-            yield streaming.format_done()