diff --git a/surfsense_backend/app/agents/autocomplete/__init__.py b/surfsense_backend/app/agents/autocomplete/__init__.py new file mode 100644 index 000000000..55d7a692d --- /dev/null +++ b/surfsense_backend/app/agents/autocomplete/__init__.py @@ -0,0 +1,11 @@ +"""Agent-based vision autocomplete with scoped filesystem exploration.""" + +from app.agents.autocomplete.autocomplete_agent import ( + create_autocomplete_agent, + stream_autocomplete_agent, +) + +__all__ = [ + "create_autocomplete_agent", + "stream_autocomplete_agent", +] diff --git a/surfsense_backend/app/agents/autocomplete/autocomplete_agent.py b/surfsense_backend/app/agents/autocomplete/autocomplete_agent.py new file mode 100644 index 000000000..928a133cc --- /dev/null +++ b/surfsense_backend/app/agents/autocomplete/autocomplete_agent.py @@ -0,0 +1,429 @@ +"""Vision autocomplete agent with scoped filesystem exploration. + +Converts the stateless single-shot vision autocomplete into an agent that +seeds a virtual filesystem from KB search results and lets the vision LLM +explore documents via ``ls``, ``read_file``, ``glob``, ``grep``, etc. +before generating the final completion. + +Performance: KB search and agent graph compilation run in parallel so +the only sequential latency is KB-search (or agent compile, whichever is +slower) + the agent's LLM turns. There is no separate "query extraction" +LLM call — the window title is used directly as the KB search query. +""" + +from __future__ import annotations + +import asyncio +import logging +import uuid +from typing import Any, AsyncGenerator + +from deepagents.graph import BASE_AGENT_PROMPT +from deepagents.middleware.patch_tool_calls import PatchToolCallsMiddleware +from langchain.agents import create_agent +from langchain_anthropic.middleware import AnthropicPromptCachingMiddleware +from langchain_core.language_models import BaseChatModel +from langchain_core.messages import AIMessage, ToolMessage + +from app.agents.new_chat.middleware.filesystem import SurfSenseFilesystemMiddleware +from app.agents.new_chat.middleware.knowledge_search import ( + build_scoped_filesystem, + search_knowledge_base, +) +from app.services.new_streaming_service import VercelStreamingService + +logger = logging.getLogger(__name__) + +KB_TOP_K = 10 + +# --------------------------------------------------------------------------- +# System prompt +# --------------------------------------------------------------------------- + +AUTOCOMPLETE_SYSTEM_PROMPT = """You are a smart writing assistant that analyzes the user's screen to draft or complete text. + +You will receive a screenshot of the user's screen. Your PRIMARY source of truth is the screenshot itself — the visual context determines what to write. + +Your job: +1. Analyze the ENTIRE screenshot to understand what the user is working on (email thread, chat conversation, document, code editor, form, etc.). +2. Identify the text area where the user will type. +3. Generate the text the user most likely wants to write based on the visual context. + +You also have access to the user's knowledge base documents via filesystem tools. However: +- ONLY consult the knowledge base if the screenshot clearly involves a topic where your KB documents are DIRECTLY relevant (e.g., the user is writing about a specific project/topic that matches a document title). +- Do NOT explore documents just because they exist. Most autocomplete requests can be answered purely from the screenshot. 
+- If you do read a document, only incorporate information that is 100% relevant to what the user is typing RIGHT NOW. Do not add extra details, background, or tangential information from the KB. +- Keep your output SHORT — autocomplete should feel like a natural continuation, not an essay. + +Key behavior: +- If the text area is EMPTY, draft a concise response or message based on what you see on screen (e.g., reply to an email, respond to a chat message, continue a document). +- If the text area already has text, continue it naturally — typically just a sentence or two. + +Rules: +- Output ONLY the text to be inserted. No quotes, no explanations, no meta-commentary. +- Be CONCISE. Prefer a single paragraph or a few sentences. Autocomplete is a quick assist, not a full draft. +- Match the tone and formality of the surrounding context. +- If the screen shows code, write code. If it shows a casual chat, be casual. If it shows a formal email, be formal. +- Do NOT describe the screenshot or explain your reasoning. +- Do NOT cite or reference documents explicitly — just let the knowledge inform your writing naturally. +- If you cannot determine what to write, output nothing. + +## Filesystem Tools `ls`, `read_file`, `write_file`, `edit_file`, `glob`, `grep` + +All file paths must start with a `/`. +- ls: list files and directories at a given path. +- read_file: read a file from the filesystem. +- write_file: create a temporary file in the session (not persisted). +- edit_file: edit a file in the session (not persisted for /documents/ files). +- glob: find files matching a pattern (e.g., "**/*.xml"). +- grep: search for text within files. + +## When to Use Filesystem Tools + +BEFORE reaching for any tool, ask yourself: "Can I write a good completion purely from the screenshot?" If yes, just write it — do NOT explore the KB. + +Only use tools when: +- The user is clearly writing about a specific topic that likely has detailed information in their KB. +- You need a specific fact, name, number, or reference that the screenshot doesn't provide. + +When you do use tools, be surgical: +- Check the `ls` output first. If no document title looks relevant, stop — do not read files just to see what's there. +- If a title looks relevant, read only the `` (first ~20 lines) and jump to matched chunks. Do not read entire documents. +- Extract only the specific information you need and move on to generating the completion. + +## Reading Documents Efficiently + +Documents are formatted as XML. Each document contains: +- `` — title, type, URL, etc. +- `` — a table of every chunk with its **line range** and a + `matched="true"` flag for chunks that matched the search query. +- `` — the actual chunks in original document order. + +**Workflow**: read the first ~20 lines to see the ``, identify +chunks marked `matched="true"`, then use `read_file(path, offset=, +limit=)` to jump directly to those sections.""" + +APP_CONTEXT_BLOCK = """ + +The user is currently working in "{app_name}" (window: "{window_title}"). 
Use this to understand the type of application and adapt your tone and format accordingly.""" + + +def _build_autocomplete_system_prompt(app_name: str, window_title: str) -> str: + prompt = AUTOCOMPLETE_SYSTEM_PROMPT + if app_name: + prompt += APP_CONTEXT_BLOCK.format(app_name=app_name, window_title=window_title) + return prompt + + +# --------------------------------------------------------------------------- +# Pre-compute KB filesystem (runs in parallel with agent compilation) +# --------------------------------------------------------------------------- + + +class _KBResult: + """Container for pre-computed KB filesystem results.""" + __slots__ = ("files", "ls_ai_msg", "ls_tool_msg") + + def __init__( + self, + files: dict[str, Any] | None = None, + ls_ai_msg: AIMessage | None = None, + ls_tool_msg: ToolMessage | None = None, + ) -> None: + self.files = files + self.ls_ai_msg = ls_ai_msg + self.ls_tool_msg = ls_tool_msg + + @property + def has_documents(self) -> bool: + return bool(self.files) + + +async def precompute_kb_filesystem( + search_space_id: int, + query: str, + top_k: int = KB_TOP_K, +) -> _KBResult: + """Search the KB and build the scoped filesystem outside the agent. + + This is designed to be called via ``asyncio.gather`` alongside agent + graph compilation so the two run concurrently. + """ + if not query: + return _KBResult() + + try: + search_results = await search_knowledge_base( + query=query, + search_space_id=search_space_id, + top_k=top_k, + ) + + if not search_results: + return _KBResult() + + new_files, _ = await build_scoped_filesystem( + documents=search_results, + search_space_id=search_space_id, + ) + + if not new_files: + return _KBResult() + + doc_paths = [ + p for p, v in new_files.items() + if p.startswith("/documents/") and v is not None + ] + tool_call_id = f"auto_ls_{uuid.uuid4().hex[:12]}" + ai_msg = AIMessage( + content="", + tool_calls=[{"name": "ls", "args": {"path": "/documents"}, "id": tool_call_id}], + ) + tool_msg = ToolMessage( + content=str(doc_paths) if doc_paths else "No documents found.", + tool_call_id=tool_call_id, + ) + return _KBResult(files=new_files, ls_ai_msg=ai_msg, ls_tool_msg=tool_msg) + + except Exception: + logger.warning("KB pre-computation failed, proceeding without KB", exc_info=True) + return _KBResult() + + +# --------------------------------------------------------------------------- +# Filesystem middleware — no save_document, no persistence +# --------------------------------------------------------------------------- + + +class AutocompleteFilesystemMiddleware(SurfSenseFilesystemMiddleware): + """Filesystem middleware for autocomplete — read-only exploration only. + + Strips ``save_document`` (permanent KB persistence) and passes + ``search_space_id=None`` so ``write_file`` / ``edit_file`` stay ephemeral. 
+ """ + + def __init__(self) -> None: + super().__init__(search_space_id=None, created_by_id=None) + self.tools = [t for t in self.tools if t.name != "save_document"] + + +# --------------------------------------------------------------------------- +# Agent factory +# --------------------------------------------------------------------------- + + +async def _compile_agent( + llm: BaseChatModel, + app_name: str, + window_title: str, +) -> Any: + """Compile the agent graph (CPU-bound, runs in a thread).""" + system_prompt = _build_autocomplete_system_prompt(app_name, window_title) + final_system_prompt = system_prompt + "\n\n" + BASE_AGENT_PROMPT + + middleware = [ + AutocompleteFilesystemMiddleware(), + PatchToolCallsMiddleware(), + AnthropicPromptCachingMiddleware(unsupported_model_behavior="ignore"), + ] + + agent = await asyncio.to_thread( + create_agent, + llm, + system_prompt=final_system_prompt, + tools=[], + middleware=middleware, + ) + return agent.with_config({"recursion_limit": 200}) + + +async def create_autocomplete_agent( + llm: BaseChatModel, + *, + search_space_id: int, + kb_query: str, + app_name: str = "", + window_title: str = "", +) -> tuple[Any, _KBResult]: + """Create the autocomplete agent and pre-compute KB in parallel. + + Returns ``(agent, kb_result)`` so the caller can inject the pre-computed + filesystem into the agent's initial state without any middleware delay. + """ + agent, kb = await asyncio.gather( + _compile_agent(llm, app_name, window_title), + precompute_kb_filesystem(search_space_id, kb_query), + ) + return agent, kb + + +# --------------------------------------------------------------------------- +# Streaming helper +# --------------------------------------------------------------------------- + + +async def stream_autocomplete_agent( + agent: Any, + input_data: dict[str, Any], + streaming_service: VercelStreamingService, + *, + emit_message_start: bool = True, +) -> AsyncGenerator[str, None]: + """Stream agent events as Vercel SSE, with thinking steps for tool calls. + + When ``emit_message_start`` is False the caller has already sent the + ``message_start`` event (e.g. to show preparation steps before the agent + runs). + """ + thread_id = uuid.uuid4().hex + config = {"configurable": {"thread_id": thread_id}} + + current_text_id: str | None = None + active_tool_depth = 0 + thinking_step_counter = 0 + tool_step_ids: dict[str, str] = {} + step_titles: dict[str, str] = {} + completed_step_ids: set[str] = set() + last_active_step_id: str | None = None + + def next_thinking_step_id() -> str: + nonlocal thinking_step_counter + thinking_step_counter += 1 + return f"autocomplete-step-{thinking_step_counter}" + + def complete_current_step() -> str | None: + nonlocal last_active_step_id + if last_active_step_id and last_active_step_id not in completed_step_ids: + completed_step_ids.add(last_active_step_id) + title = step_titles.get(last_active_step_id, "Done") + event = streaming_service.format_thinking_step( + step_id=last_active_step_id, + title=title, + status="complete", + ) + last_active_step_id = None + return event + return None + + if emit_message_start: + yield streaming_service.format_message_start() + + # Emit an initial "Generating completion" step so the UI immediately + # shows activity once the agent starts its first LLM call. 
+ gen_step_id = next_thinking_step_id() + last_active_step_id = gen_step_id + step_titles[gen_step_id] = "Generating completion" + yield streaming_service.format_thinking_step( + step_id=gen_step_id, + title="Generating completion", + status="in_progress", + ) + + try: + async for event in agent.astream_events(input_data, config=config, version="v2"): + event_type = event.get("event", "") + + if event_type == "on_chat_model_stream": + if active_tool_depth > 0: + continue + if "surfsense:internal" in event.get("tags", []): + continue + chunk = event.get("data", {}).get("chunk") + if chunk and hasattr(chunk, "content"): + content = chunk.content + if content and isinstance(content, str): + if current_text_id is None: + step_event = complete_current_step() + if step_event: + yield step_event + current_text_id = streaming_service.generate_text_id() + yield streaming_service.format_text_start(current_text_id) + yield streaming_service.format_text_delta(current_text_id, content) + + elif event_type == "on_tool_start": + active_tool_depth += 1 + tool_name = event.get("name", "unknown_tool") + run_id = event.get("run_id", "") + tool_input = event.get("data", {}).get("input", {}) + + if current_text_id is not None: + yield streaming_service.format_text_end(current_text_id) + current_text_id = None + + step_event = complete_current_step() + if step_event: + yield step_event + + tool_step_id = next_thinking_step_id() + tool_step_ids[run_id] = tool_step_id + last_active_step_id = tool_step_id + + title, items = _describe_tool_call(tool_name, tool_input) + step_titles[tool_step_id] = title + yield streaming_service.format_thinking_step( + step_id=tool_step_id, + title=title, + status="in_progress", + items=items, + ) + + elif event_type == "on_tool_end": + active_tool_depth = max(0, active_tool_depth - 1) + run_id = event.get("run_id", "") + step_id = tool_step_ids.pop(run_id, None) + if step_id and step_id not in completed_step_ids: + completed_step_ids.add(step_id) + title = step_titles.get(step_id, "Done") + yield streaming_service.format_thinking_step( + step_id=step_id, + title=title, + status="complete", + ) + if last_active_step_id == step_id: + last_active_step_id = None + + if current_text_id is not None: + yield streaming_service.format_text_end(current_text_id) + step_event = complete_current_step() + if step_event: + yield step_event + + yield streaming_service.format_finish() + yield streaming_service.format_done() + + except Exception as e: + logger.error(f"Autocomplete agent streaming error: {e}", exc_info=True) + if current_text_id is not None: + yield streaming_service.format_text_end(current_text_id) + yield streaming_service.format_error("Autocomplete failed. 
Please try again.") + yield streaming_service.format_done() + + +def _describe_tool_call(tool_name: str, tool_input: Any) -> tuple[str, list[str]]: + """Return a human-readable (title, items) for a tool call thinking step.""" + inp = tool_input if isinstance(tool_input, dict) else {} + if tool_name == "ls": + path = inp.get("path", "/") + return "Listing files", [path] + if tool_name == "read_file": + fp = inp.get("file_path", "") + display = fp if len(fp) <= 80 else "…" + fp[-77:] + return "Reading file", [display] + if tool_name == "write_file": + fp = inp.get("file_path", "") + display = fp if len(fp) <= 80 else "…" + fp[-77:] + return "Writing file", [display] + if tool_name == "edit_file": + fp = inp.get("file_path", "") + display = fp if len(fp) <= 80 else "…" + fp[-77:] + return "Editing file", [display] + if tool_name == "glob": + pat = inp.get("pattern", "") + base = inp.get("path", "/") + return "Searching files", [f"{pat} in {base}"] + if tool_name == "grep": + pat = inp.get("pattern", "") + path = inp.get("path", "") + display_pat = pat[:60] + ("…" if len(pat) > 60 else "") + return "Searching content", [f'"{display_pat}"' + (f" in {path}" if path else "")] + return f"Using {tool_name}", [] diff --git a/surfsense_backend/app/services/vision_autocomplete_service.py b/surfsense_backend/app/services/vision_autocomplete_service.py index f24a5c848..7d16c5864 100644 --- a/surfsense_backend/app/services/vision_autocomplete_service.py +++ b/surfsense_backend/app/services/vision_autocomplete_service.py @@ -1,139 +1,40 @@ +"""Vision autocomplete service — agent-based with scoped filesystem. + +Optimized pipeline: +1. Start the SSE stream immediately so the UI shows progress. +2. Derive a KB search query from window_title (no separate LLM call). +3. Run KB filesystem pre-computation and agent graph compilation in PARALLEL. +4. Inject pre-computed KB files as initial state and stream the agent. +""" + import logging from typing import AsyncGenerator -from langchain_core.messages import HumanMessage, SystemMessage +from langchain_core.messages import HumanMessage from sqlalchemy.ext.asyncio import AsyncSession -from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever +from app.agents.autocomplete import create_autocomplete_agent, stream_autocomplete_agent from app.services.llm_service import get_vision_llm from app.services.new_streaming_service import VercelStreamingService logger = logging.getLogger(__name__) -KB_TOP_K = 5 -KB_MAX_CHARS = 4000 - -EXTRACT_QUERY_PROMPT = """Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else.""" - -EXTRACT_QUERY_PROMPT_WITH_APP = """The user is currently in the application "{app_name}" with the window titled "{window_title}". - -Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else.""" - -VISION_SYSTEM_PROMPT = """You are a smart writing assistant that analyzes the user's screen to draft or complete text. - -You will receive a screenshot of the user's screen. Your job: -1. Analyze the ENTIRE screenshot to understand what the user is working on (email thread, chat conversation, document, code editor, form, etc.). -2. Identify the text area where the user will type. -3. 
Based on the full visual context, generate the text the user most likely wants to write. - -Key behavior: -- If the text area is EMPTY, draft a full response or message based on what you see on screen (e.g., reply to an email, respond to a chat message, continue a document). -- If the text area already has text, continue it naturally. - -Rules: -- Output ONLY the text to be inserted. No quotes, no explanations, no meta-commentary. -- Be concise but complete — a full thought, not a fragment. -- Match the tone and formality of the surrounding context. -- If the screen shows code, write code. If it shows a casual chat, be casual. If it shows a formal email, be formal. -- Do NOT describe the screenshot or explain your reasoning. -- If you cannot determine what to write, output nothing.""" - -APP_CONTEXT_BLOCK = """ - -The user is currently working in "{app_name}" (window: "{window_title}"). Use this to understand the type of application and adapt your tone and format accordingly.""" - -KB_CONTEXT_BLOCK = """ - -You also have access to the user's knowledge base documents below. Use them to write more accurate, informed, and contextually relevant text. Do NOT cite or reference the documents explicitly — just let the knowledge inform your writing naturally. - - -{kb_context} -""" +PREP_STEP_ID = "autocomplete-prep" -def _build_system_prompt(app_name: str, window_title: str, kb_context: str) -> str: - """Assemble the system prompt from optional context blocks.""" - prompt = VISION_SYSTEM_PROMPT - if app_name: - prompt += APP_CONTEXT_BLOCK.format(app_name=app_name, window_title=window_title) - if kb_context: - prompt += KB_CONTEXT_BLOCK.format(kb_context=kb_context) - return prompt +def _derive_kb_query(app_name: str, window_title: str) -> str: + parts = [p for p in (window_title, app_name) if p] + return " ".join(parts) def _is_vision_unsupported_error(e: Exception) -> bool: - """Check if an exception indicates the model doesn't support vision/images.""" msg = str(e).lower() return "content must be a string" in msg or "does not support image" in msg -async def _extract_query_from_screenshot( - llm, screenshot_data_url: str, - app_name: str = "", window_title: str = "", -) -> str | None: - """Ask the Vision LLM to describe what the user is working on. - - Raises vision-unsupported errors so the caller can return a - friendly message immediately instead of retrying with astream. 
- """ - if app_name: - prompt_text = EXTRACT_QUERY_PROMPT_WITH_APP.format( - app_name=app_name, window_title=window_title, - ) - else: - prompt_text = EXTRACT_QUERY_PROMPT - - try: - response = await llm.ainvoke([ - HumanMessage(content=[ - {"type": "text", "text": prompt_text}, - {"type": "image_url", "image_url": {"url": screenshot_data_url}}, - ]), - ]) - query = response.content.strip() if hasattr(response, "content") else "" - return query if query else None - except Exception as e: - if _is_vision_unsupported_error(e): - raise - logger.warning(f"Failed to extract query from screenshot: {e}") - return None - - -async def _search_knowledge_base( - session: AsyncSession, search_space_id: int, query: str -) -> str: - """Search the KB and return formatted context string.""" - try: - retriever = ChucksHybridSearchRetriever(session) - results = await retriever.hybrid_search( - query_text=query, - top_k=KB_TOP_K, - search_space_id=search_space_id, - ) - - if not results: - return "" - - parts: list[str] = [] - char_count = 0 - for doc in results: - title = doc.get("document", {}).get("title", "Untitled") - for chunk in doc.get("chunks", []): - content = chunk.get("content", "").strip() - if not content: - continue - entry = f"[{title}]\n{content}" - if char_count + len(entry) > KB_MAX_CHARS: - break - parts.append(entry) - char_count += len(entry) - if char_count >= KB_MAX_CHARS: - break - - return "\n\n---\n\n".join(parts) - except Exception as e: - logger.warning(f"KB search failed, proceeding without context: {e}") - return "" +# --------------------------------------------------------------------------- +# Main entry point +# --------------------------------------------------------------------------- async def stream_vision_autocomplete( @@ -144,13 +45,7 @@ async def stream_vision_autocomplete( app_name: str = "", window_title: str = "", ) -> AsyncGenerator[str, None]: - """Analyze a screenshot with the vision LLM and stream a text completion. - - Pipeline: - 1. Extract a search query from the screenshot (non-streaming) - 2. Search the knowledge base for relevant context - 3. Stream the final completion with screenshot + KB + app context - """ + """Analyze a screenshot with a vision-LLM agent and stream a text completion.""" streaming = VercelStreamingService() vision_error_msg = ( "The selected model does not support vision. 
" @@ -164,62 +59,89 @@ async def stream_vision_autocomplete( yield streaming.format_done() return - kb_context = "" + # Start SSE stream immediately so the UI has something to show + yield streaming.format_message_start() + + kb_query = _derive_kb_query(app_name, window_title) + + # Show a preparation step while KB search + agent compile run + yield streaming.format_thinking_step( + step_id=PREP_STEP_ID, + title="Searching knowledge base", + status="in_progress", + items=[kb_query] if kb_query else [], + ) + try: - query = await _extract_query_from_screenshot( - llm, screenshot_data_url, app_name=app_name, window_title=window_title, + agent, kb = await create_autocomplete_agent( + llm, + search_space_id=search_space_id, + kb_query=kb_query, + app_name=app_name, + window_title=window_title, ) except Exception as e: - logger.warning(f"Vision autocomplete: selected model does not support vision: {e}") - yield streaming.format_message_start() - yield streaming.format_error(vision_error_msg) + if _is_vision_unsupported_error(e): + logger.warning("Vision autocomplete: model does not support vision: %s", e) + yield streaming.format_error(vision_error_msg) + yield streaming.format_done() + return + logger.error("Failed to create autocomplete agent: %s", e, exc_info=True) + yield streaming.format_error("Autocomplete failed. Please try again.") yield streaming.format_done() return - if query: - kb_context = await _search_knowledge_base(session, search_space_id, query) + has_kb = kb.has_documents + doc_count = len(kb.files) if has_kb else 0 # type: ignore[arg-type] - system_prompt = _build_system_prompt(app_name, window_title, kb_context) + yield streaming.format_thinking_step( + step_id=PREP_STEP_ID, + title="Searching knowledge base", + status="complete", + items=[f"Found {doc_count} document{'s' if doc_count != 1 else ''}"] if kb_query else ["Skipped"], + ) - messages = [ - SystemMessage(content=system_prompt), - HumanMessage(content=[ - { - "type": "text", - "text": "Analyze this screenshot. Understand the full context of what the user is working on, then generate the text they most likely want to write in the active text area.", - }, - { - "type": "image_url", - "image_url": {"url": screenshot_data_url}, - }, - ]), - ] + # Build agent input with pre-computed KB as initial state + if has_kb: + instruction = ( + "Analyze this screenshot, then explore the knowledge base documents " + "listed above — read the chunk index of any document whose title " + "looks relevant and check matched chunks for useful facts. " + "Finally, generate a concise autocomplete for the active text area, " + "enhanced with any relevant KB information you found." + ) + else: + instruction = ( + "Analyze this screenshot and generate a concise autocomplete " + "for the active text area based on what you see." 
+ ) - text_started = False - text_id = "" + user_message = HumanMessage(content=[ + {"type": "text", "text": instruction}, + {"type": "image_url", "image_url": {"url": screenshot_data_url}}, + ]) + + input_data: dict = {"messages": [user_message]} + + if has_kb: + input_data["files"] = kb.files + input_data["messages"] = [kb.ls_ai_msg, kb.ls_tool_msg, user_message] + logger.info("Autocomplete: injected %d KB files into agent initial state", doc_count) + else: + logger.info("Autocomplete: no KB documents found, proceeding with screenshot only") + + # Stream the agent (message_start already sent above) try: - yield streaming.format_message_start() - text_id = streaming.generate_text_id() - yield streaming.format_text_start(text_id) - text_started = True - - async for chunk in llm.astream(messages): - token = chunk.content if hasattr(chunk, "content") else str(chunk) - if token: - yield streaming.format_text_delta(text_id, token) - - yield streaming.format_text_end(text_id) - yield streaming.format_finish() - yield streaming.format_done() - + async for sse in stream_autocomplete_agent( + agent, input_data, streaming, emit_message_start=False, + ): + yield sse except Exception as e: - if text_started: - yield streaming.format_text_end(text_id) - if _is_vision_unsupported_error(e): - logger.warning(f"Vision autocomplete: selected model does not support vision: {e}") + logger.warning("Vision autocomplete: model does not support vision: %s", e) yield streaming.format_error(vision_error_msg) + yield streaming.format_done() else: - logger.error(f"Vision autocomplete streaming error: {e}", exc_info=True) + logger.error("Vision autocomplete streaming error: %s", e, exc_info=True) yield streaming.format_error("Autocomplete failed. Please try again.") - yield streaming.format_done() + yield streaming.format_done() diff --git a/surfsense_web/app/desktop/suggestion/page.tsx b/surfsense_web/app/desktop/suggestion/page.tsx index fb83e2113..42ce025a8 100644 --- a/surfsense_web/app/desktop/suggestion/page.tsx +++ b/surfsense_web/app/desktop/suggestion/page.tsx @@ -10,7 +10,18 @@ type SSEEvent = | { type: "text-end"; id: string } | { type: "start"; messageId: string } | { type: "finish" } - | { type: "error"; errorText: string }; + | { type: "error"; errorText: string } + | { + type: "data-thinking-step"; + data: { id: string; title: string; status: string; items: string[] }; + }; + +interface AgentStep { + id: string; + title: string; + status: string; + items: string[]; +} function friendlyError(raw: string | number): string { if (typeof raw === "number") { @@ -34,11 +45,24 @@ function friendlyError(raw: string | number): string { const AUTO_DISMISS_MS = 3000; +function StepIcon({ status }: { status: string }) { + if (status === "complete") { + return ( + + + + + ); + } + return ; +} + export default function SuggestionPage() { const api = useElectronAPI(); const [suggestion, setSuggestion] = useState(""); const [isLoading, setIsLoading] = useState(true); const [error, setError] = useState(null); + const [steps, setSteps] = useState([]); const abortRef = useRef(null); const isDesktop = !!api?.onAutocompleteContext; @@ -66,6 +90,7 @@ export default function SuggestionPage() { setIsLoading(true); setSuggestion(""); setError(null); + setSteps([]); let token = getBearerToken(); if (!token) { @@ -137,6 +162,17 @@ export default function SuggestionPage() { setSuggestion((prev) => prev + parsed.delta); } else if (parsed.type === "error") { setError(friendlyError(parsed.errorText)); + } else if (parsed.type === 
"data-thinking-step") { + const { id, title, status, items } = parsed.data; + setSteps((prev) => { + const existing = prev.findIndex((s) => s.id === id); + if (existing >= 0) { + const updated = [...prev]; + updated[existing] = { id, title, status, items }; + return updated; + } + return [...prev, { id, title, status, items }]; + }); } } catch { continue; @@ -185,13 +221,33 @@ export default function SuggestionPage() { ); } - if (isLoading && !suggestion) { + const showLoading = isLoading && !suggestion; + + if (showLoading) { return (
-				<div className="suggestion-loading">
-					<span className="suggestion-dot" />
-					<span className="suggestion-dot" />
-					<span className="suggestion-dot" />
+				<div className="agent-activity">
+					{steps.length === 0 && (
+						<div className="activity-initial">
+							<span className="step-spinner" />
+							<span className="activity-label">Preparing…</span>
+						</div>
+					)}
+					{steps.length > 0 && (
+						<div className="activity-steps">
+							{steps.map((step) => (
+								<div key={step.id} className="activity-step">
+									<StepIcon status={step.status} />
+									<span className="step-label">
+										{step.title}
+										{step.items.length > 0 && (
+											<span className="step-detail"> · {step.items[0]}</span>
+										)}
+									</span>
+								</div>
+							))}
+						</div>
+					)}
); diff --git a/surfsense_web/app/desktop/suggestion/suggestion.css b/surfsense_web/app/desktop/suggestion/suggestion.css index 62f4d2ea7..d2213fefd 100644 --- a/surfsense_web/app/desktop/suggestion/suggestion.css +++ b/surfsense_web/app/desktop/suggestion/suggestion.css @@ -19,13 +19,21 @@ body:has(.suggestion-body) { } .suggestion-tooltip { + box-sizing: border-box; background: #1e1e1e; border: 1px solid #3c3c3c; border-radius: 8px; padding: 8px 12px; margin: 4px; max-width: 400px; + /* MAX_HEIGHT in suggestion-window.ts is 400px. Subtract 8px for margin + (4px * 2) so the tooltip + margin fits within the Electron window. + box-sizing: border-box ensures padding + border are included. */ + max-height: 392px; box-shadow: 0 4px 16px rgba(0, 0, 0, 0.5); + display: flex; + flex-direction: column; + overflow: hidden; } .suggestion-text { @@ -35,6 +43,26 @@ body:has(.suggestion-body) { margin: 0 0 6px 0; word-wrap: break-word; white-space: pre-wrap; + overflow-y: auto; + flex: 1 1 auto; + min-height: 0; +} + +.suggestion-text::-webkit-scrollbar { + width: 5px; +} + +.suggestion-text::-webkit-scrollbar-track { + background: transparent; +} + +.suggestion-text::-webkit-scrollbar-thumb { + background: #555; + border-radius: 3px; +} + +.suggestion-text::-webkit-scrollbar-thumb:hover { + background: #777; } .suggestion-actions { @@ -43,6 +71,7 @@ body:has(.suggestion-body) { gap: 4px; border-top: 1px solid #2a2a2a; padding-top: 6px; + flex-shrink: 0; } .suggestion-btn { @@ -86,36 +115,77 @@ body:has(.suggestion-body) { font-size: 12px; } -.suggestion-loading { +/* --- Agent activity indicator --- */ + +.agent-activity { display: flex; - gap: 5px; + flex-direction: column; + gap: 4px; + overflow-y: auto; + max-height: 340px; +} + +.activity-initial { + display: flex; + align-items: center; + gap: 8px; padding: 2px 0; - justify-content: center; } -.suggestion-dot { - width: 4px; - height: 4px; +.activity-label { + color: #a1a1aa; + font-size: 12px; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; +} + +.activity-steps { + display: flex; + flex-direction: column; + gap: 3px; +} + +.activity-step { + display: flex; + align-items: center; + gap: 6px; + min-height: 18px; +} + +.step-label { + color: #d4d4d4; + font-size: 12px; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; +} + +.step-detail { + color: #71717a; + font-size: 11px; +} + +/* Spinner (in_progress) */ +.step-spinner { + width: 14px; + height: 14px; + flex-shrink: 0; + border: 1.5px solid #3f3f46; + border-top-color: #a78bfa; border-radius: 50%; - background: #666; - animation: suggestion-pulse 1.2s infinite ease-in-out; + animation: step-spin 0.7s linear infinite; } -.suggestion-dot:nth-child(2) { - animation-delay: 0.15s; +/* Checkmark icon (complete) */ +.step-icon { + width: 14px; + height: 14px; + flex-shrink: 0; } -.suggestion-dot:nth-child(3) { - animation-delay: 0.3s; -} - -@keyframes suggestion-pulse { - 0%, 80%, 100% { - opacity: 0.3; - transform: scale(0.8); - } - 40% { - opacity: 1; - transform: scale(1.1); +@keyframes step-spin { + to { + transform: rotate(360deg); } } diff --git a/surfsense_web/components/assistant-ui/thread.tsx b/surfsense_web/components/assistant-ui/thread.tsx index 7d8765399..6c8c619b2 100644 --- a/surfsense_web/components/assistant-ui/thread.tsx +++ b/surfsense_web/components/assistant-ui/thread.tsx @@ -92,15 +92,7 @@ import { useMediaQuery } from "@/hooks/use-media-query"; import { useElectronAPI } from "@/hooks/use-platform"; import { cn } from "@/lib/utils"; -/** 
Placeholder texts that cycle in new chats when input is empty */ -const CYCLING_PLACEHOLDERS = [ - "Ask SurfSense anything or @mention docs", - "Generate a podcast from my vacation ideas in Notion", - "Sum up last week's meeting notes from Drive in a bulleted list", - "Give me a brief overview of the most urgent tickets in Jira and Linear", - "Briefly, what are today's top ten important emails and calendar events?", - "Check if this week's Slack messages reference any GitHub issues", -]; +const COMPOSER_PLACEHOLDER = "Ask anything · Type / for prompts · Type @ to mention docs"; export const Thread: FC = () => { return ; @@ -380,29 +372,7 @@ const Composer: FC = () => { const isThreadEmpty = useAuiState(({ thread }) => thread.isEmpty); const isThreadRunning = useAuiState(({ thread }) => thread.isRunning); - // Cycling placeholder state - only cycles in new chats - const [placeholderIndex, setPlaceholderIndex] = useState(0); - - // Cycle through placeholders every 4 seconds when thread is empty (new chat) - useEffect(() => { - // Only cycle when thread is empty (new chat) - if (!isThreadEmpty) { - // Reset to first placeholder when chat becomes active - setPlaceholderIndex(0); - return; - } - - const intervalId = setInterval(() => { - setPlaceholderIndex((prev) => (prev + 1) % CYCLING_PLACEHOLDERS.length); - }, 6000); - - return () => clearInterval(intervalId); - }, [isThreadEmpty]); - - // Compute current placeholder - only cycle in new chats - const currentPlaceholder = isThreadEmpty - ? CYCLING_PLACEHOLDERS[placeholderIndex] - : CYCLING_PLACEHOLDERS[0]; + const currentPlaceholder = COMPOSER_PLACEHOLDER; // Live collaboration state const { data: currentUser } = useAtomValue(currentUserAtom);
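
Reviewer note: a minimal caller-side sketch of the new pipeline, assembled from the pieces this diff adds. It mirrors what `stream_vision_autocomplete` does; the `llm`, screenshot URL, window title, and search-space values are placeholders, not part of the change, and this is an illustration rather than a prescribed usage.

```python
"""Sketch: driving the agent-based autocomplete end to end (assumed harness)."""

from langchain_core.language_models import BaseChatModel
from langchain_core.messages import HumanMessage

from app.agents.autocomplete import create_autocomplete_agent, stream_autocomplete_agent
from app.services.new_streaming_service import VercelStreamingService


async def demo(llm: BaseChatModel, screenshot_data_url: str, search_space_id: int) -> None:
    streaming = VercelStreamingService()

    # KB search + scoped-filesystem build and agent graph compilation run
    # concurrently inside create_autocomplete_agent (asyncio.gather).
    agent, kb = await create_autocomplete_agent(
        llm,
        search_space_id=search_space_id,
        kb_query="Quarterly report - Google Docs",  # placeholder, derived from window title
        app_name="Google Chrome",                   # placeholder
        window_title="Quarterly report - Google Docs",  # placeholder
    )

    user_message = HumanMessage(content=[
        {"type": "text", "text": "Analyze this screenshot and generate a concise autocomplete."},
        {"type": "image_url", "image_url": {"url": screenshot_data_url}},
    ])

    input_data: dict = {"messages": [user_message]}
    if kb.has_documents:
        # Inject the pre-computed virtual filesystem plus the synthetic
        # `ls /documents` exchange so the agent starts with the listing in context,
        # exactly as stream_vision_autocomplete does in this PR.
        input_data["files"] = kb.files
        input_data["messages"] = [kb.ls_ai_msg, kb.ls_tool_msg, user_message]

    # Yields Vercel-style SSE strings, including data-thinking-step events
    # that the suggestion window renders as agent activity.
    async for sse in stream_autocomplete_agent(agent, input_data, streaming):
        print(sse, end="")
```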