Merge branch 'dev' into google-drive-connector

Merge in dev
2026-06-28 21:49:40 +02:00 · 2025-12-28 19:00:09 +02:00 · 2025-12-28 19:00:09 +02:00 · c5c61a2c6b
commit c5c61a2c6b
parent 0b006de32d 0e1ea9c30f
76 changed files with 3237 additions and 961 deletions
--- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py
+++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
@ -50,6 +50,9 @@ def create_surfsense_deep_agent(
    - display_image: Display images in chat
    - scrape_webpage: Extract content from webpages

+    The agent also includes TodoListMiddleware by default (via create_deep_agent) which provides:
+    - write_todos: Create and update planning/todo lists for complex tasks
+
    The system prompt can be configured via agent_config:
    - Custom system instructions (or use defaults)
    - Citation toggle (enable/disable citation requirements)
@ -138,6 +141,7 @@ def create_surfsense_deep_agent(
        system_prompt = build_surfsense_system_prompt()

    # Create the deep agent with system prompt and checkpointer
+    # Note: TodoListMiddleware (write_todos) is included by default in create_deep_agent
    agent = create_deep_agent(
        model=llm,
        tools=tools,
--- a/surfsense_backend/app/agents/new_chat/system_prompt.py
+++ b/surfsense_backend/app/agents/new_chat/system_prompt.py
@ -64,18 +64,23 @@ You have access to the following tools:
  - The preview card will automatically be displayed in the chat.

 4. display_image: Display an image in the chat with metadata.
-  - Use this tool when you want to show an image from a URL to the user.
+  - Use this tool ONLY when you have a valid public HTTP/HTTPS image URL to show.
  - This displays the image with an optional title, description, and source attribution.
-  - Common use cases:
-    * Showing an image from a URL mentioned in the conversation
-    * Displaying a diagram, chart, or illustration you're referencing
-    * Showing visual examples when explaining concepts
-  - IMPORTANT: Do NOT use this tool for user-uploaded image attachments!
-    * User attachments are already visible in the chat UI - the user can see them
-    * This tool requires a valid HTTP/HTTPS URL, not a local file path
-    * When a user uploads an image, just analyze it and respond - don't try to display it again
+  - Valid use cases:
+    * Showing an image from a URL the user explicitly mentioned in their message
+    * Displaying images found in scraped webpage content (from scrape_webpage tool)
+    * Showing a publicly accessible diagram or chart from a known URL
+  
+  CRITICAL - NEVER USE THIS TOOL FOR USER-UPLOADED ATTACHMENTS:
+  When a user uploads/attaches an image file to their message:
+    * The image is ALREADY VISIBLE in the chat UI as a thumbnail on their message
+    * You do NOT have a URL for their uploaded image - only extracted text/description
+    * Calling display_image will FAIL and show "Image not available" error
+    * Simply analyze the image content and respond with your analysis - DO NOT try to display it
+    * The user can already see their own uploaded image - they don't need you to show it again
+  
  - Args:
-    - src: The URL of the image to display (must be a valid HTTP/HTTPS image URL, not a local path)
+    - src: The URL of the image (MUST be a valid public HTTP/HTTPS URL that you know exists)
    - alt: Alternative text describing the image (for accessibility)
    - title: Optional title to display below the image
    - description: Optional description providing context about the image
@ -104,6 +109,20 @@ You have access to the following tools:
    * This makes your response more visual and engaging.
    * Prioritize showing: diagrams, charts, infographics, key illustrations, or images that help explain the content.
    * Don't show every image - just the most relevant 1-3 images that enhance understanding.
+
+6. write_todos: Create and update a planning/todo list to break down complex tasks.
+  - IMPORTANT: Use this tool when the user asks you to create a plan, break down a task, or explain something in structured steps.
+  - This tool creates a visual plan with progress tracking that the user can see in the UI.
+  - When to use:
+    * User asks to "create a plan" or "break down" a task
+    * User asks for "steps" to do something
+    * User asks you to "explain" something in sections
+    * Any multi-step task that would benefit from structured planning
+  - Args:
+    - todos: List of todo items, each with:
+      * content: Description of the task (required)
+      * status: "pending", "in_progress", or "completed" (required)
+  - The tool automatically adds IDs and formats the output for the UI.
 </tools>
 <tool_call_examples>
 - User: "Fetch all my notes and what's in them?"
@ -134,8 +153,15 @@ You have access to the following tools:
 - User: "Show me this image: https://example.com/image.png"
  - Call: `display_image(src="https://example.com/image.png", alt="User shared image")`

- User: "Can you display a diagram of a neural network?"
-  - Call: `display_image(src="https://example.com/neural-network.png", alt="Neural network diagram", title="Neural Network Architecture", description="A visual representation of a neural network with input, hidden, and output layers")`
+- User uploads an image file and asks: "What is this image about?"
+  - DO NOT call display_image! The user's uploaded image is already visible in the chat.
+  - Simply analyze the image content (which you receive as extracted text/description) and respond.
+  - WRONG: `display_image(src="...", ...)` - This will fail with "Image not available"
+  - CORRECT: Just provide your analysis directly: "Based on the image you shared, this appears to be..."
+
+- User uploads a screenshot and asks: "Can you explain what's in this image?"
+  - DO NOT call display_image! Just analyze and respond directly.
+  - The user can already see their screenshot - they don't need you to display it again.

 - User: "Read this article and summarize it for me: https://example.com/blog/ai-trends"
  - Call: `scrape_webpage(url="https://example.com/blog/ai-trends")`
@ -154,6 +180,34 @@ You have access to the following tools:
  - Then, if the content contains useful diagrams/images like `![Neural Network Diagram](https://example.com/nn-diagram.png)`:
    - Call: `display_image(src="https://example.com/nn-diagram.png", alt="Neural Network Diagram", title="Neural Network Architecture")`
  - Then provide your explanation, referencing the displayed image
+
+- User: "Create a plan for building a user authentication system"
+  - Call: `write_todos(todos=[{"content": "Design database schema for users and sessions", "status": "in_progress"}, {"content": "Implement registration and login endpoints", "status": "pending"}, {"content": "Add password reset functionality", "status": "pending"}])`
+  - Then explain each step in detail as you work through them
+
+- User: "Break down how to build a REST API into steps"
+  - Call: `write_todos(todos=[{"content": "Design API endpoints and data models", "status": "in_progress"}, {"content": "Set up server framework and routing", "status": "pending"}, {"content": "Implement CRUD operations", "status": "pending"}, {"content": "Add authentication and error handling", "status": "pending"}])`
+  - Then provide detailed explanations for each step
+
+- User: "Help me plan my trip to Japan"
+  - Call: `write_todos(todos=[{"content": "Research best time to visit and book flights", "status": "in_progress"}, {"content": "Plan itinerary for cities to visit", "status": "pending"}, {"content": "Book accommodations", "status": "pending"}, {"content": "Prepare travel documents and currency", "status": "pending"}])`
+  - Then provide travel preparation guidance
+
+- User: "Break down how to learn guitar"
+  - Call: `write_todos(todos=[{"content": "Learn basic chords and finger positioning", "status": "in_progress"}, {"content": "Practice strumming patterns", "status": "pending"}, {"content": "Learn to read tabs and sheet music", "status": "pending"}, {"content": "Master simple songs", "status": "pending"}])`
+  - Then provide learning milestones and tips
+
+- User: "Plan my workout routine for the week"
+  - Call: `write_todos(todos=[{"content": "Monday: Upper body strength training", "status": "in_progress"}, {"content": "Tuesday: Cardio and core workout", "status": "pending"}, {"content": "Wednesday: Rest or light stretching", "status": "pending"}, {"content": "Thursday: Lower body strength training", "status": "pending"}, {"content": "Friday: Full body HIIT session", "status": "pending"}])`
+  - Then provide exercise details and tips
+
+- User: "Help me organize my home renovation project"
+  - Call: `write_todos(todos=[{"content": "Define scope and create budget", "status": "in_progress"}, {"content": "Research and hire contractors", "status": "pending"}, {"content": "Obtain necessary permits", "status": "pending"}, {"content": "Order materials and fixtures", "status": "pending"}, {"content": "Execute renovation phases", "status": "pending"}])`
+  - Then provide detailed renovation guidance
+
+- User: "What steps should I take to start a podcast?"
+  - Call: `write_todos(todos=[{"content": "Define podcast concept and target audience", "status": "in_progress"}, {"content": "Set up recording equipment and software", "status": "pending"}, {"content": "Plan episode structure and content", "status": "pending"}, {"content": "Record and edit first episodes", "status": "pending"}, {"content": "Choose hosting platform and publish", "status": "pending"}])`
+  - Then provide podcast launch guidance
 </tool_call_examples>
 """

--- a/surfsense_backend/app/agents/new_chat/tools/link_preview.py
+++ b/surfsense_backend/app/agents/new_chat/tools/link_preview.py
@ -14,8 +14,8 @@ from urllib.parse import urlparse
 import httpx
 import trafilatura
 from fake_useragent import UserAgent
-from langchain_community.document_loaders import AsyncChromiumLoader
 from langchain_core.tools import tool
+from playwright.async_api import async_playwright

 logger = logging.getLogger(__name__)

@ -170,7 +170,7 @@ def _make_absolute_url(image_url: str, base_url: str) -> str:

 async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
    """
-    Fetch page content using headless Chromium browser.
+    Fetch page content using headless Chromium browser via Playwright.
    Used as a fallback when simple HTTP requests are blocked (403, etc.).

    Args:
@ -186,18 +186,17 @@ async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
        ua = UserAgent()
        user_agent = ua.random

-        # Use AsyncChromiumLoader to fetch the page
-        crawl_loader = AsyncChromiumLoader(
-            urls=[url], headless=True, user_agent=user_agent
-        )
-        documents = await crawl_loader.aload()
+        # Use Playwright to fetch the page
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=True)
+            context = await browser.new_context(user_agent=user_agent)
+            page = await context.new_page()

-        if not documents:
-            logger.warning(f"[link_preview] Chromium returned no documents for {url}")
-            return None
-
-        doc = documents[0]
-        raw_html = doc.page_content
+            try:
+                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
+                raw_html = await page.content()
+            finally:
+                await browser.close()

        if not raw_html or len(raw_html.strip()) == 0:
            logger.warning(f"[link_preview] Chromium returned empty content for {url}")
@ -280,15 +279,18 @@ def create_link_preview_tool():
            url = f"https://{url}"

        try:
+            # Generate a random User-Agent to avoid bot detection
+            ua = UserAgent()
+            user_agent = ua.random
+
            # Use a browser-like User-Agent to fetch Open Graph metadata.
-            # This is the same approach used by Slack, Discord, Twitter, etc. for link previews.
            # We're only fetching publicly available metadata (title, description, thumbnail)
            # that websites intentionally expose via OG tags for link preview purposes.
            async with httpx.AsyncClient(
                timeout=10.0,
                follow_redirects=True,
                headers={
-                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                    "User-Agent": user_agent,
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.9",
                    "Accept-Encoding": "gzip, deflate, br",
--- a/surfsense_backend/app/agents/new_chat/tools/registry.py
+++ b/surfsense_backend/app/agents/new_chat/tools/registry.py
@ -125,6 +125,7 @@ BUILTIN_TOOLS: list[ToolDefinition] = [
        ),
        requires=[],  # firecrawl_api_key is optional
    ),
+    # Note: write_todos is now provided by TodoListMiddleware from deepagents
    # =========================================================================
    # ADD YOUR CUSTOM TOOLS BELOW
    # =========================================================================
--- a/surfsense_backend/app/connectors/webcrawler_connector.py
+++ b/surfsense_backend/app/connectors/webcrawler_connector.py
@ -1,7 +1,7 @@
 """
 WebCrawler Connector Module

-A module for crawling web pages and extracting content using Firecrawl or AsyncChromiumLoader.
+A module for crawling web pages and extracting content using Firecrawl or Playwright.
 Provides a unified interface for web scraping.
 """

@ -12,7 +12,7 @@ import trafilatura
 import validators
 from fake_useragent import UserAgent
 from firecrawl import AsyncFirecrawlApp
-from langchain_community.document_loaders import AsyncChromiumLoader
+from playwright.async_api import async_playwright

 logger = logging.getLogger(__name__)

@ -25,7 +25,9 @@ class WebCrawlerConnector:
        Initialize the WebCrawlerConnector class.

        Args:
-            firecrawl_api_key: Firecrawl API key (optional, will use AsyncChromiumLoader if not provided)
+            firecrawl_api_key: Firecrawl API key (optional). If provided, Firecrawl will be tried first
+                             and Chromium will be used as fallback if Firecrawl fails. If not provided,
+                             Chromium will be used directly.
        """
        self.firecrawl_api_key = firecrawl_api_key
        self.use_firecrawl = bool(firecrawl_api_key)
@ -46,6 +48,9 @@ class WebCrawlerConnector:
        """
        Crawl a single URL and extract its content.

+        If Firecrawl API key is provided, tries Firecrawl first and falls back to Chromium
+        if Firecrawl fails. If no Firecrawl API key is provided, uses Chromium directly.
+
        Args:
            url: URL to crawl
            formats: List of formats to extract (e.g., ["markdown", "html"]) - only for Firecrawl
@ -56,19 +61,37 @@ class WebCrawlerConnector:
                - content: Extracted content (markdown or HTML)
                - metadata: Page metadata (title, description, etc.)
                - source: Original URL
-                - crawler_type: Type of crawler used
+                - crawler_type: Type of crawler used ("firecrawl" or "chromium")
        """
        try:
            # Validate URL
            if not validators.url(url):
                return None, f"Invalid URL: {url}"

+            # Try Firecrawl first if API key is provided
            if self.use_firecrawl:
-                result = await self._crawl_with_firecrawl(url, formats)
+                try:
+                    logger.info(f"[webcrawler] Using Firecrawl for: {url}")
+                    result = await self._crawl_with_firecrawl(url, formats)
+                    return result, None
+                except Exception as firecrawl_error:
+                    # Firecrawl failed, fallback to Chromium
+                    logger.warning(
+                        f"[webcrawler] Firecrawl failed, falling back to Chromium+Trafilatura for: {url}"
+                    )
+                    try:
+                        result = await self._crawl_with_chromium(url)
+                        return result, None
+                    except Exception as chromium_error:
+                        return (
+                            None,
+                            f"Both Firecrawl and Chromium failed. Firecrawl error: {firecrawl_error!s}, Chromium error: {chromium_error!s}",
+                        )
            else:
+                # No Firecrawl API key, use Chromium directly
+                logger.info(f"[webcrawler] Using Chromium+Trafilatura for: {url}")
                result = await self._crawl_with_chromium(url)
-
-            return result, None
+                return result, None

        except Exception as e:
            return None, f"Error crawling URL {url}: {e!s}"
@ -126,7 +149,7 @@ class WebCrawlerConnector:

    async def _crawl_with_chromium(self, url: str) -> dict[str, Any]:
        """
-        Crawl URL using AsyncChromiumLoader with Trafilatura for content extraction.
+        Crawl URL using Playwright with Trafilatura for content extraction.
        Falls back to raw HTML if Trafilatura extraction fails.

        Args:
@ -142,30 +165,30 @@ class WebCrawlerConnector:
        ua = UserAgent()
        user_agent = ua.random

-        # Pass User-Agent to AsyncChromiumLoader
-        crawl_loader = AsyncChromiumLoader(
-            urls=[url], headless=True, user_agent=user_agent
-        )
-        documents = await crawl_loader.aload()
+        # Use Playwright to fetch the page
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=True)
+            context = await browser.new_context(user_agent=user_agent)
+            page = await context.new_page()

-        if not documents:
+            try:
+                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
+                raw_html = await page.content()
+                page_title = await page.title()
+            finally:
+                await browser.close()
+
+        if not raw_html:
            raise ValueError(f"Failed to load content from {url}")

-        doc = documents[0]
-        raw_html = doc.page_content
-
-        # Extract basic metadata from the document
-        base_metadata = doc.metadata if doc.metadata else {}
+        # Extract basic metadata from the page
+        base_metadata = {"title": page_title} if page_title else {}

        # Try to extract main content using Trafilatura
        extracted_content = None
        trafilatura_metadata = None

        try:
-            logger.info(
-                f"Attempting to extract main content from {url} using Trafilatura"
-            )
-
            # Extract main content as markdown
            extracted_content = trafilatura.extract(
                raw_html,
@ -179,23 +202,10 @@ class WebCrawlerConnector:
            # Extract metadata using Trafilatura
            trafilatura_metadata = trafilatura.extract_metadata(raw_html)

-            if extracted_content and len(extracted_content.strip()) > 0:
-                logger.info(
-                    f"Successfully extracted main content from {url} using Trafilatura "
-                    f"({len(extracted_content)} chars vs {len(raw_html)} chars raw HTML)"
-                )
-            else:
-                logger.warning(
-                    f"Trafilatura extraction returned empty content for {url}, "
-                    "falling back to raw HTML"
-                )
+            if not extracted_content or len(extracted_content.strip()) == 0:
                extracted_content = None

-        except Exception as e:
-            logger.warning(
-                f"Trafilatura extraction failed for {url}: {e}. "
-                "Falling back to raw HTML"
-            )
+        except Exception:
            extracted_content = None

        # Build metadata, preferring Trafilatura metadata when available
--- a/surfsense_backend/app/routes/logs_routes.py
+++ b/surfsense_backend/app/routes/logs_routes.py
@ -319,6 +319,9 @@ async def get_logs_summary(
                    if log.log_metadata
                    else "Unknown"
                )
+                document_id = (
+                    log.log_metadata.get("document_id") if log.log_metadata else None
+                )
                summary["active_tasks"].append(
                    {
                        "id": log.id,
@ -326,6 +329,7 @@ async def get_logs_summary(
                        "message": log.message,
                        "started_at": log.created_at,
                        "source": log.source,
+                        "document_id": document_id,
                    }
                )

--- a/surfsense_backend/app/tasks/chat/stream_new_chat.py
+++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py
@ -69,6 +69,30 @@ def format_mentioned_documents_as_context(documents: list[Document]) -> str:
    return "\n".join(context_parts)


+def extract_todos_from_deepagents(command_output) -> dict:
+    """
+    Extract todos from deepagents' TodoListMiddleware Command output.
+
+    Two input shapes are handled:
+    - a Command-like object whose ``update`` attribute is a dict holding
+      ``todos`` (``Command.update['todos'] = [{'content': ..., 'status': ...}]``)
+    - a plain dict with ``todos`` at the top level or nested under ``update``
+
+    Args:
+        command_output: Raw output of the write_todos tool call.
+
+    Returns:
+        ``{"todos": [...]}``. The todos are passed through without any
+        transformation; the list is empty when none can be found.
+    """
+    if isinstance(command_output, dict):
+        # Dicts must be checked first: every dict exposes a bound ``update``
+        # *method*, so probing with hasattr(..., "update") would match a dict
+        # and then fail when calling ``.get`` on that method.
+        if "todos" in command_output:
+            return {"todos": command_output.get("todos", [])}
+        nested_update = command_output.get("update")
+    else:
+        # Command-like object: ``update`` carries the state changes.
+        nested_update = getattr(command_output, "update", None)
+
+    # Only a dict-shaped update can hold a "todos" entry; anything else
+    # (missing attribute, None, other container) yields an empty list.
+    if isinstance(nested_update, dict):
+        return {"todos": nested_update.get("todos", [])}
+    return {"todos": []}
+
+
 async def stream_new_chat(
    user_query: str,
    search_space_id: int,
@ -146,6 +170,16 @@ async def stream_new_chat(
        # Create connector service
        connector_service = ConnectorService(session, search_space_id=search_space_id)

+        # Get Firecrawl API key from webcrawler connector if configured
+        from app.db import SearchSourceConnectorType
+
+        firecrawl_api_key = None
+        webcrawler_connector = await connector_service.get_connector_by_type(
+            SearchSourceConnectorType.WEBCRAWLER_CONNECTOR, search_space_id
+        )
+        if webcrawler_connector and webcrawler_connector.config:
+            firecrawl_api_key = webcrawler_connector.config.get("FIRECRAWL_API_KEY")
+
        # Get the PostgreSQL checkpointer for persistent conversation memory
        checkpointer = await get_checkpointer()

@ -157,6 +191,7 @@ async def stream_new_chat(
            connector_service=connector_service,
            checkpointer=checkpointer,
            agent_config=agent_config,  # Pass prompt configuration
+            firecrawl_api_key=firecrawl_api_key,  # Pass Firecrawl API key if configured
        )

        # Build input with message history from frontend
@ -211,7 +246,8 @@ async def stream_new_chat(
        config = {
            "configurable": {
                "thread_id": str(chat_id),
-            }
+            },
+            "recursion_limit": 80,  # Increase from default 25 to allow more tool iterations
        }

        # Start the message stream
@ -233,6 +269,8 @@ async def stream_new_chat(
        completed_step_ids: set[str] = set()
        # Track if we just finished a tool (text flows silently after tools)
        just_finished_tool: bool = False
+        # Track write_todos calls to show "Creating plan" vs "Updating progress"
+        write_todos_call_count: int = 0

        def next_thinking_step_id() -> str:
            nonlocal thinking_step_counter
@ -441,6 +479,60 @@ async def stream_new_chat(
                        status="in_progress",
                        items=last_active_step_items,
                    )
+                elif tool_name == "write_todos":
+                    # Track write_todos calls for better messaging
+                    write_todos_call_count += 1
+                    todos = (
+                        tool_input.get("todos", [])
+                        if isinstance(tool_input, dict)
+                        else []
+                    )
+                    todo_count = len(todos) if isinstance(todos, list) else 0
+
+                    if write_todos_call_count == 1:
+                        # First call - creating the plan
+                        last_active_step_title = "Creating plan"
+                        last_active_step_items = [f"Defining {todo_count} tasks..."]
+                    else:
+                        # Subsequent calls - updating the plan
+                        # Try to provide context about what's being updated
+                        in_progress_count = (
+                            sum(
+                                1
+                                for t in todos
+                                if isinstance(t, dict)
+                                and t.get("status") == "in_progress"
+                            )
+                            if isinstance(todos, list)
+                            else 0
+                        )
+                        completed_count = (
+                            sum(
+                                1
+                                for t in todos
+                                if isinstance(t, dict)
+                                and t.get("status") == "completed"
+                            )
+                            if isinstance(todos, list)
+                            else 0
+                        )
+
+                        last_active_step_title = "Updating progress"
+                        last_active_step_items = (
+                            [
+                                f"Progress: {completed_count}/{todo_count} completed",
+                                f"In progress: {in_progress_count} tasks",
+                            ]
+                            if completed_count > 0
+                            else [f"Working on {todo_count} tasks"]
+                        )
+
+                    yield streaming_service.format_thinking_step(
+                        step_id=tool_step_id,
+                        title=last_active_step_title,
+                        status="in_progress",
+                        items=last_active_step_items,
+                    )
                elif tool_name == "generate_podcast":
                    podcast_title = (
                        tool_input.get("podcast_title", "SurfSense Podcast")
@ -465,6 +557,15 @@ async def stream_new_chat(
                        status="in_progress",
                        items=last_active_step_items,
                    )
+                # elif tool_name == "ls":
+                #     last_active_step_title = "Exploring files"
+                #     last_active_step_items = []
+                #     yield streaming_service.format_thinking_step(
+                #         step_id=tool_step_id,
+                #         title="Exploring files",
+                #         status="in_progress",
+                #         items=None,
+                #     )
                else:
                    last_active_step_title = f"Using {tool_name.replace('_', ' ')}"
                    last_active_step_items = []
@ -546,9 +647,11 @@ async def stream_new_chat(
                tool_name = event.get("name", "unknown_tool")
                raw_output = event.get("data", {}).get("output", "")

-                # Extract content from ToolMessage if needed
-                # LangGraph may return a ToolMessage object instead of raw dict
-                if hasattr(raw_output, "content"):
+                # Handle deepagents' write_todos Command object specially
+                if tool_name == "write_todos" and hasattr(raw_output, "update"):
+                    # deepagents returns a Command object - extract todos directly
+                    tool_output = extract_todos_from_deepagents(raw_output)
+                elif hasattr(raw_output, "content"):
                    # It's a ToolMessage object - extract the content
                    content = raw_output.content
                    # If content is a string that looks like JSON, try to parse it
@ -707,6 +810,104 @@ async def stream_new_chat(
                        status="completed",
                        items=completed_items,
                    )
+                elif tool_name == "write_todos":
+                    # Build completion items for planning/updating
+                    if isinstance(tool_output, dict):
+                        todos = tool_output.get("todos", [])
+                        todo_count = len(todos) if isinstance(todos, list) else 0
+                        completed_count = (
+                            sum(
+                                1
+                                for t in todos
+                                if isinstance(t, dict)
+                                and t.get("status") == "completed"
+                            )
+                            if isinstance(todos, list)
+                            else 0
+                        )
+                        in_progress_count = (
+                            sum(
+                                1
+                                for t in todos
+                                if isinstance(t, dict)
+                                and t.get("status") == "in_progress"
+                            )
+                            if isinstance(todos, list)
+                            else 0
+                        )
+
+                        # Use context-aware completion message
+                        if last_active_step_title == "Creating plan":
+                            completed_items = [f"Created {todo_count} tasks"]
+                        else:
+                            # Updating progress - show stats
+                            completed_items = [
+                                f"Progress: {completed_count}/{todo_count} completed",
+                            ]
+                            if in_progress_count > 0:
+                                # Find the currently in-progress task name
+                                in_progress_task = next(
+                                    (
+                                        t.get("content", "")[:40]
+                                        for t in todos
+                                        if isinstance(t, dict)
+                                        and t.get("status") == "in_progress"
+                                    ),
+                                    None,
+                                )
+                                if in_progress_task:
+                                    completed_items.append(
+                                        f"Current: {in_progress_task}..."
+                                    )
+                    else:
+                        completed_items = ["Plan updated"]
+                    yield streaming_service.format_thinking_step(
+                        step_id=original_step_id,
+                        title=last_active_step_title,
+                        status="completed",
+                        items=completed_items,
+                    )
+                elif tool_name == "ls":
+                    # Build completion items showing file names found
+                    if isinstance(tool_output, dict):
+                        result = tool_output.get("result", "")
+                    elif isinstance(tool_output, str):
+                        result = tool_output
+                    else:
+                        result = str(tool_output) if tool_output else ""
+
+                    # Parse file paths and extract just the file names
+                    file_names = []
+                    if result:
+                        # The ls tool returns paths, extract just the file/folder names
+                        for line in result.strip().split("\n"):
+                            line = line.strip()
+                            if line:
+                                # Get just the filename from the path
+                                name = line.rstrip("/").split("/")[-1]
+                                if name and len(name) <= 40:
+                                    file_names.append(name)
+                                elif name:
+                                    file_names.append(name[:37] + "...")
+
+                    # Build display items - wrap file names in brackets for icon rendering
+                    if file_names:
+                        if len(file_names) <= 5:
+                            # Wrap each file name in brackets for styled tile rendering
+                            completed_items = [f"[{name}]" for name in file_names]
+                        else:
+                            # Show first few with brackets and count
+                            completed_items = [f"[{name}]" for name in file_names[:4]]
+                            completed_items.append(f"(+{len(file_names) - 4} more)")
+                    else:
+                        completed_items = ["No files found"]
+
+                    yield streaming_service.format_thinking_step(
+                        step_id=original_step_id,
+                        title="Exploring files",
+                        status="completed",
+                        items=completed_items,
+                    )
                else:
                    yield streaming_service.format_thinking_step(
                        step_id=original_step_id,
@ -843,6 +1044,27 @@ async def stream_new_chat(
                    yield streaming_service.format_terminal_info(
                        "Knowledge base search completed", "success"
                    )
+                elif tool_name == "write_todos":
+                    # Stream the full write_todos result so frontend can render the Plan component
+                    yield streaming_service.format_tool_output_available(
+                        tool_call_id,
+                        tool_output
+                        if isinstance(tool_output, dict)
+                        else {"result": tool_output},
+                    )
+                    # Send terminal message with plan info
+                    if isinstance(tool_output, dict):
+                        todos = tool_output.get("todos", [])
+                        todo_count = len(todos) if isinstance(todos, list) else 0
+                        yield streaming_service.format_terminal_info(
+                            f"Plan created ({todo_count} tasks)",
+                            "success",
+                        )
+                    else:
+                        yield streaming_service.format_terminal_info(
+                            "Plan created",
+                            "success",
+                        )
                else:
                    # Default handling for other tools
                    yield streaming_service.format_tool_output_available(
--- a/surfsense_backend/pyproject.toml
+++ b/surfsense_backend/pyproject.toml
@ -7,6 +7,8 @@ requires-python = ">=3.12"
 dependencies = [
    "alembic>=1.13.0",
    "asyncpg>=0.30.0",
+    "datasets>=2.21.0",
+    "pyarrow>=15.0.0,<19.0.0",
    "discord-py>=2.5.2",
    "docling>=2.15.0",
    "fastapi>=0.115.8",
--- a/surfsense_backend/uv.lock
+++ b/surfsense_backend/uv.lock
@ -1148,6 +1148,30 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/c3/be/d0d44e092656fe7a06b55e6103cbce807cdbdee17884a5367c68c9860853/dataclasses_json-0.6.7-py3-none-any.whl", hash = "sha256:0dbf33f26c8d5305befd61b39d2b3414e8a407bedc2834dea9b8d642666fb40a", size = 28686 },
 ]

+[[package]]
+name = "datasets"
+version = "4.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "dill" },
+    { name = "filelock" },
+    { name = "fsspec", extra = ["http"] },
+    { name = "huggingface-hub" },
+    { name = "multiprocess" },
+    { name = "numpy" },
+    { name = "packaging" },
+    { name = "pandas" },
+    { name = "pyarrow" },
+    { name = "pyyaml" },
+    { name = "requests" },
+    { name = "tqdm" },
+    { name = "xxhash" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/e3/9d/348ed92110ba5f9b70b51ca1078d4809767a835aa2b7ce7e74ad2b98323d/datasets-4.0.0.tar.gz", hash = "sha256:9657e7140a9050db13443ba21cb5de185af8af944479b00e7ff1e00a61c8dbf1", size = 569566 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/eb/62/eb8157afb21bd229c864521c1ab4fa8e9b4f1b06bafdd8c4668a7a31b5dd/datasets-4.0.0-py3-none-any.whl", hash = "sha256:7ef95e62025fd122882dbce6cb904c8cd3fbc829de6669a5eb939c77d50e203d", size = 494825 },
+]
+
 [[package]]
 name = "dateparser"
 version = "1.2.2"
@ -1213,11 +1237,11 @@ wheels = [

 [[package]]
 name = "dill"
-version = "0.4.0"
+version = "0.3.8"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/12/80/630b4b88364e9a8c8c5797f4602d0f76ef820909ee32f0bacb9f90654042/dill-0.4.0.tar.gz", hash = "sha256:0633f1d2df477324f53a895b02c901fb961bdbf65a17122586ea7019292cbcf0", size = 186976 }
+sdist = { url = "https://files.pythonhosted.org/packages/17/4d/ac7ffa80c69ea1df30a8aa11b3578692a5118e7cd1aa157e3ef73b092d15/dill-0.3.8.tar.gz", hash = "sha256:3ebe3c479ad625c4553aca177444d89b486b1d84982eeacded644afc0cf797ca", size = 184847 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/50/3d/9373ad9c56321fdab5b41197068e1d8c25883b3fea29dd361f9b55116869/dill-0.4.0-py3-none-any.whl", hash = "sha256:44f54bf6412c2c8464c14e8243eb163690a9800dbe2c367330883b19c7561049", size = 119668 },
+    { url = "https://files.pythonhosted.org/packages/c9/7a/cef76fd8438a42f96db64ddaa85280485a9c395e7df3db8158cfec1eee34/dill-0.3.8-py3-none-any.whl", hash = "sha256:c36ca9ffb54365bdd2f8eb3eff7d2a21237f8452b57ace88b1ac615b7e815bd7", size = 116252 },
 ]

 [[package]]
@ -1875,11 +1899,16 @@ wheels = [

 [[package]]
 name = "fsspec"
-version = "2025.5.1"
+version = "2025.3.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/00/f7/27f15d41f0ed38e8fcc488584b57e902b331da7f7c6dcda53721b15838fc/fsspec-2025.5.1.tar.gz", hash = "sha256:2e55e47a540b91843b755e83ded97c6e897fa0942b11490113f09e9c443c2475", size = 303033 }
+sdist = { url = "https://files.pythonhosted.org/packages/34/f4/5721faf47b8c499e776bc34c6a8fc17efdf7fdef0b00f398128bc5dcb4ac/fsspec-2025.3.0.tar.gz", hash = "sha256:a935fd1ea872591f2b5148907d103488fc523295e6c64b835cfad8c3eca44972", size = 298491 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/bb/61/78c7b3851add1481b048b5fdc29067397a1784e2910592bc81bb3f608635/fsspec-2025.5.1-py3-none-any.whl", hash = "sha256:24d3a2e663d5fc735ab256263c4075f374a174c3410c0b25e5bd1970bceaa462", size = 199052 },
+    { url = "https://files.pythonhosted.org/packages/56/53/eb690efa8513166adef3e0669afd31e95ffde69fb3c52ec2ac7223ed6018/fsspec-2025.3.0-py3-none-any.whl", hash = "sha256:efb87af3efa9103f94ca91a7f8cb7a4df91af9f74fc106c9c7ea0efd7277c1b3", size = 193615 },
+]
+
+[package.optional-dependencies]
+http = [
+    { name = "aiohttp" },
 ]

 [[package]]
@ -3711,19 +3740,18 @@ wheels = [

 [[package]]
 name = "multiprocess"
-version = "0.70.18"
+version = "0.70.16"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "dill" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/72/fd/2ae3826f5be24c6ed87266bc4e59c46ea5b059a103f3d7e7eb76a52aeecb/multiprocess-0.70.18.tar.gz", hash = "sha256:f9597128e6b3e67b23956da07cf3d2e5cba79e2f4e0fba8d7903636663ec6d0d", size = 1798503 }
+sdist = { url = "https://files.pythonhosted.org/packages/b5/ae/04f39c5d0d0def03247c2893d6f2b83c136bf3320a2154d7b8858f2ba72d/multiprocess-0.70.16.tar.gz", hash = "sha256:161af703d4652a0e1410be6abccecde4a7ddffd19341be0a7011b94aeb171ac1", size = 1772603 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ba/d8/0cba6cf51a1a31f20471fbc823a716170c73012ddc4fb85d706630ed6e8f/multiprocess-0.70.18-py310-none-any.whl", hash = "sha256:60c194974c31784019c1f459d984e8f33ee48f10fcf42c309ba97b30d9bd53ea", size = 134948 },
-    { url = "https://files.pythonhosted.org/packages/4b/88/9039f2fed1012ef584751d4ceff9ab4a51e5ae264898f0b7cbf44340a859/multiprocess-0.70.18-py311-none-any.whl", hash = "sha256:5aa6eef98e691281b3ad923be2832bf1c55dd2c859acd73e5ec53a66aae06a1d", size = 144462 },
-    { url = "https://files.pythonhosted.org/packages/bf/b6/5f922792be93b82ec6b5f270bbb1ef031fd0622847070bbcf9da816502cc/multiprocess-0.70.18-py312-none-any.whl", hash = "sha256:9b78f8e5024b573730bfb654783a13800c2c0f2dfc0c25e70b40d184d64adaa2", size = 150287 },
-    { url = "https://files.pythonhosted.org/packages/ee/25/7d7e78e750bc1aecfaf0efbf826c69a791d2eeaf29cf20cba93ff4cced78/multiprocess-0.70.18-py313-none-any.whl", hash = "sha256:871743755f43ef57d7910a38433cfe41319e72be1bbd90b79c7a5ac523eb9334", size = 151917 },
-    { url = "https://files.pythonhosted.org/packages/3b/c3/ca84c19bd14cdfc21c388fdcebf08b86a7a470ebc9f5c3c084fc2dbc50f7/multiprocess-0.70.18-py38-none-any.whl", hash = "sha256:dbf705e52a154fe5e90fb17b38f02556169557c2dd8bb084f2e06c2784d8279b", size = 132636 },
-    { url = "https://files.pythonhosted.org/packages/6c/28/dd72947e59a6a8c856448a5e74da6201cb5502ddff644fbc790e4bd40b9a/multiprocess-0.70.18-py39-none-any.whl", hash = "sha256:e78ca805a72b1b810c690b6b4cc32579eba34f403094bbbae962b7b5bf9dfcb8", size = 133478 },
+    { url = "https://files.pythonhosted.org/packages/bc/f7/7ec7fddc92e50714ea3745631f79bd9c96424cb2702632521028e57d3a36/multiprocess-0.70.16-py310-none-any.whl", hash = "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02", size = 134824 },
+    { url = "https://files.pythonhosted.org/packages/50/15/b56e50e8debaf439f44befec5b2af11db85f6e0f344c3113ae0be0593a91/multiprocess-0.70.16-py311-none-any.whl", hash = "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a", size = 143519 },
+    { url = "https://files.pythonhosted.org/packages/0a/7d/a988f258104dcd2ccf1ed40fdc97e26c4ac351eeaf81d76e266c52d84e2f/multiprocess-0.70.16-py312-none-any.whl", hash = "sha256:fc0544c531920dde3b00c29863377f87e1632601092ea2daca74e4beb40faa2e", size = 146741 },
+    { url = "https://files.pythonhosted.org/packages/ea/89/38df130f2c799090c978b366cfdf5b96d08de5b29a4a293df7f7429fa50b/multiprocess-0.70.16-py38-none-any.whl", hash = "sha256:a71d82033454891091a226dfc319d0cfa8019a4e888ef9ca910372a446de4435", size = 132628 },
+    { url = "https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl", hash = "sha256:a0bafd3ae1b732eac64be2e72038231c1ba97724b60b09400d68f229fcc2fbf3", size = 133351 },
 ]

 [[package]]
@ -4931,6 +4959,34 @@ bcrypt = [
    { name = "bcrypt" },
 ]

+[[package]]
+name = "pyarrow"
+version = "18.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/7f/7b/640785a9062bb00314caa8a387abce547d2a420cf09bd6c715fe659ccffb/pyarrow-18.1.0.tar.gz", hash = "sha256:9386d3ca9c145b5539a1cfc75df07757dff870168c959b473a0bccbc3abc8c73", size = 1118671 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/6a/50/12829e7111b932581e51dda51d5cb39207a056c30fe31ef43f14c63c4d7e/pyarrow-18.1.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:9f3a76670b263dc41d0ae877f09124ab96ce10e4e48f3e3e4257273cee61ad0d", size = 29514620 },
+    { url = "https://files.pythonhosted.org/packages/d1/41/468c944eab157702e96abab3d07b48b8424927d4933541ab43788bb6964d/pyarrow-18.1.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:da31fbca07c435be88a0c321402c4e31a2ba61593ec7473630769de8346b54ee", size = 30856494 },
+    { url = "https://files.pythonhosted.org/packages/68/f9/29fb659b390312a7345aeb858a9d9c157552a8852522f2c8bad437c29c0a/pyarrow-18.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:543ad8459bc438efc46d29a759e1079436290bd583141384c6f7a1068ed6f992", size = 39203624 },
+    { url = "https://files.pythonhosted.org/packages/6e/f6/19360dae44200e35753c5c2889dc478154cd78e61b1f738514c9f131734d/pyarrow-18.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0743e503c55be0fdb5c08e7d44853da27f19dc854531c0570f9f394ec9671d54", size = 40139341 },
+    { url = "https://files.pythonhosted.org/packages/bb/e6/9b3afbbcf10cc724312e824af94a2e993d8ace22994d823f5c35324cebf5/pyarrow-18.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:d4b3d2a34780645bed6414e22dda55a92e0fcd1b8a637fba86800ad737057e33", size = 38618629 },
+    { url = "https://files.pythonhosted.org/packages/3a/2e/3b99f8a3d9e0ccae0e961978a0d0089b25fb46ebbcfb5ebae3cca179a5b3/pyarrow-18.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:c52f81aa6f6575058d8e2c782bf79d4f9fdc89887f16825ec3a66607a5dd8e30", size = 40078661 },
+    { url = "https://files.pythonhosted.org/packages/76/52/f8da04195000099d394012b8d42c503d7041b79f778d854f410e5f05049a/pyarrow-18.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:0ad4892617e1a6c7a551cfc827e072a633eaff758fa09f21c4ee548c30bcaf99", size = 25092330 },
+    { url = "https://files.pythonhosted.org/packages/cb/87/aa4d249732edef6ad88899399047d7e49311a55749d3c373007d034ee471/pyarrow-18.1.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:84e314d22231357d473eabec709d0ba285fa706a72377f9cc8e1cb3c8013813b", size = 29497406 },
+    { url = "https://files.pythonhosted.org/packages/3c/c7/ed6adb46d93a3177540e228b5ca30d99fc8ea3b13bdb88b6f8b6467e2cb7/pyarrow-18.1.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:f591704ac05dfd0477bb8f8e0bd4b5dc52c1cadf50503858dce3a15db6e46ff2", size = 30835095 },
+    { url = "https://files.pythonhosted.org/packages/41/d7/ed85001edfb96200ff606943cff71d64f91926ab42828676c0fc0db98963/pyarrow-18.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:acb7564204d3c40babf93a05624fc6a8ec1ab1def295c363afc40b0c9e66c191", size = 39194527 },
+    { url = "https://files.pythonhosted.org/packages/59/16/35e28eab126342fa391593415d79477e89582de411bb95232f28b131a769/pyarrow-18.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:74de649d1d2ccb778f7c3afff6085bd5092aed4c23df9feeb45dd6b16f3811aa", size = 40131443 },
+    { url = "https://files.pythonhosted.org/packages/0c/95/e855880614c8da20f4cd74fa85d7268c725cf0013dc754048593a38896a0/pyarrow-18.1.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:f96bd502cb11abb08efea6dab09c003305161cb6c9eafd432e35e76e7fa9b90c", size = 38608750 },
+    { url = "https://files.pythonhosted.org/packages/54/9d/f253554b1457d4fdb3831b7bd5f8f00f1795585a606eabf6fec0a58a9c38/pyarrow-18.1.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:36ac22d7782554754a3b50201b607d553a8d71b78cdf03b33c1125be4b52397c", size = 40066690 },
+    { url = "https://files.pythonhosted.org/packages/2f/58/8912a2563e6b8273e8aa7b605a345bba5a06204549826f6493065575ebc0/pyarrow-18.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:25dbacab8c5952df0ca6ca0af28f50d45bd31c1ff6fcf79e2d120b4a65ee7181", size = 25081054 },
+    { url = "https://files.pythonhosted.org/packages/82/f9/d06ddc06cab1ada0c2f2fd205ac8c25c2701182de1b9c4bf7a0a44844431/pyarrow-18.1.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:6a276190309aba7bc9d5bd2933230458b3521a4317acfefe69a354f2fe59f2bc", size = 29525542 },
+    { url = "https://files.pythonhosted.org/packages/ab/94/8917e3b961810587ecbdaa417f8ebac0abb25105ae667b7aa11c05876976/pyarrow-18.1.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:ad514dbfcffe30124ce655d72771ae070f30bf850b48bc4d9d3b25993ee0e386", size = 30829412 },
+    { url = "https://files.pythonhosted.org/packages/5e/e3/3b16c3190f3d71d3b10f6758d2d5f7779ef008c4fd367cedab3ed178a9f7/pyarrow-18.1.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aebc13a11ed3032d8dd6e7171eb6e86d40d67a5639d96c35142bd568b9299324", size = 39119106 },
+    { url = "https://files.pythonhosted.org/packages/1d/d6/5d704b0d25c3c79532f8c0639f253ec2803b897100f64bcb3f53ced236e5/pyarrow-18.1.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6cf5c05f3cee251d80e98726b5c7cc9f21bab9e9783673bac58e6dfab57ecc8", size = 40090940 },
+    { url = "https://files.pythonhosted.org/packages/37/29/366bc7e588220d74ec00e497ac6710c2833c9176f0372fe0286929b2d64c/pyarrow-18.1.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:11b676cd410cf162d3f6a70b43fb9e1e40affbc542a1e9ed3681895f2962d3d9", size = 38548177 },
+    { url = "https://files.pythonhosted.org/packages/c8/11/fabf6ecabb1fe5b7d96889228ca2a9158c4c3bb732e3b8ee3f7f6d40b703/pyarrow-18.1.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:b76130d835261b38f14fc41fdfb39ad8d672afb84c447126b84d5472244cfaba", size = 40043567 },
+]
+
 [[package]]
 name = "pyasn1"
 version = "0.6.1"
@ -6353,7 +6409,7 @@ wheels = [

 [[package]]
 name = "surf-new-backend"
-version = "0.0.8"
+version = "0.0.9"
 source = { virtual = "." }
 dependencies = [
    { name = "alembic" },
@ -6361,6 +6417,7 @@ dependencies = [
    { name = "boto3" },
    { name = "celery", extra = ["redis"] },
    { name = "chonkie", extra = ["all"] },
+    { name = "datasets" },
    { name = "deepagents" },
    { name = "discord-py" },
    { name = "docling" },
@ -6391,6 +6448,7 @@ dependencies = [
    { name = "pgvector" },
    { name = "playwright" },
    { name = "psycopg", extra = ["binary", "pool"] },
+    { name = "pyarrow" },
    { name = "pypdf" },
    { name = "python-ffmpeg" },
    { name = "redis" },
@ -6421,6 +6479,7 @@ requires-dist = [
    { name = "boto3", specifier = ">=1.35.0" },
    { name = "celery", extras = ["redis"], specifier = ">=5.5.3" },
    { name = "chonkie", extras = ["all"], specifier = ">=1.5.0" },
+    { name = "datasets", specifier = ">=2.21.0" },
    { name = "deepagents", specifier = ">=0.3.0" },
    { name = "discord-py", specifier = ">=2.5.2" },
    { name = "docling", specifier = ">=2.15.0" },
@ -6451,6 +6510,7 @@ requires-dist = [
    { name = "pgvector", specifier = ">=0.3.6" },
    { name = "playwright", specifier = ">=1.50.0" },
    { name = "psycopg", extras = ["binary", "pool"], specifier = ">=3.3.2" },
+    { name = "pyarrow", specifier = ">=15.0.0,<19.0.0" },
    { name = "pypdf", specifier = ">=5.1.0" },
    { name = "python-ffmpeg", specifier = ">=2.0.12" },
    { name = "redis", specifier = ">=5.2.1" },