feat: add new FastAPI debug configurations and enhance web crawling capabilities with real-time web query support

2026-06-30 21:59:46 +02:00 · 2026-02-20 17:28:20 -08:00 · 2026-02-20 17:28:20 -08:00 · ed497909fa
commit ed497909fa
parent 81dfc7102f
5 changed files with 256 additions and 104 deletions
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -24,6 +24,16 @@
            "cwd": "${workspaceFolder}/surfsense_backend",
            "python": "${command:python.interpreterPath}"
        },
        {
            "name": "Backend: FastAPI (No Reload)",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/surfsense_backend/main.py",
            "console": "integratedTerminal",
            "justMyCode": false,
            "cwd": "${workspaceFolder}/surfsense_backend",
            "python": "${command:python.interpreterPath}"
        },
        {
            "name": "Backend: FastAPI (main.py)",
            "type": "debugpy",
@ -124,6 +134,34 @@
                "group": "Full Stack",
                "order": 2
            }
        },
        {
            "name": "Full Stack: Backend (No Reload) + Frontend + Celery",
            "configurations": [
                "Backend: FastAPI (No Reload)",
                "Frontend: Next.js",
                "Celery: Worker",
                "Celery: Beat Scheduler"
            ],
            "stopAll": true,
            "presentation": {
                "hidden": false,
                "group": "Full Stack",
                "order": 3
            }
        },
        {
            "name": "Full Stack: Backend (No Reload) + Frontend",
            "configurations": [
                "Backend: FastAPI (No Reload)",
                "Frontend: Next.js"
            ],
            "stopAll": true,
            "presentation": {
                "hidden": false,
                "group": "Full Stack",
                "order": 4
            }
        }
    ]
 }
--- a/surfsense_backend/app/agents/new_chat/system_prompt.py
+++ b/surfsense_backend/app/agents/new_chat/system_prompt.py
@ -74,6 +74,14 @@ You have access to the following tools:
  - IMPORTANT: When searching for information (meetings, schedules, notes, tasks, etc.), ALWAYS search broadly 
    across ALL sources first by omitting connectors_to_search. The user may store information in various places
    including calendar apps, note-taking apps (Obsidian, Notion), chat apps (Slack, Discord), and more.
  - IMPORTANT (REAL-TIME / PUBLIC WEB QUERIES): For questions that require current public web data
    (e.g., live exchange rates, stock prices, breaking news, weather, current events), you MUST call
    `search_knowledge_base` using live web connectors via `connectors_to_search`:
    ["LINKUP_API", "TAVILY_API", "SEARXNG_API", "BAIDU_SEARCH_API"].
  - For these real-time/public web queries, DO NOT answer from memory and DO NOT say you lack internet
    access before attempting a live connector search.
  - If the live connectors return no relevant results, explain that live web sources did not return enough
    data and ask the user if they want you to retry with a refined query.
  - Only narrow to specific connectors if the user explicitly asks (e.g., "check my Slack" or "in my calendar").
  - Personal notes in Obsidian, Notion, or NOTE often contain schedules, meeting times, reminders, and other 
    important information that may not be in calendars.
@ -358,6 +366,14 @@ _TOOLS_INSTRUCTIONS_EXAMPLES_COMMON = """
 - User: "What's in my Obsidian vault about project ideas?"
  - Call: `search_knowledge_base(query="project ideas", connectors_to_search=["OBSIDIAN_CONNECTOR"])`
 - User: "search me current usd to inr rate"
  - Call: `search_knowledge_base(query="current USD to INR exchange rate", connectors_to_search=["LINKUP_API", "TAVILY_API", "SEARXNG_API", "BAIDU_SEARCH_API"])`
  - Then answer using the returned live web results with citations.
 - User: "cant you search using linkup?"
  - Call: `search_knowledge_base(query="<refined user request>", connectors_to_search=["LINKUP_API"])`
  - Then answer from retrieved results (or clearly state that Linkup returned no data).
 - User: "Give me a podcast about AI trends based on what we discussed"
  - First search for relevant content, then call: `generate_podcast(source_content="Based on our conversation and search results: [detailed summary of chat + search findings]", podcast_title="AI Trends Podcast")`
--- a/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py
+++ b/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py
@ -593,6 +593,9 @@ IMPORTANT:
 - If the user requests a specific source type (e.g. "my notes", "Slack messages"), pass `connectors_to_search=[...]` using the enums below.
 - If `connectors_to_search` is omitted/empty, the system will search broadly.
 - Only connectors that are enabled/configured for this search space are available.{doc_types_info}
 - For real-time/public web queries (e.g., current exchange rates, stock prices, breaking news, weather),
  explicitly include live web connectors in `connectors_to_search`, prioritizing:
  ["LINKUP_API", "TAVILY_API", "SEARXNG_API", "BAIDU_SEARCH_API"].
 ## Available connector enums for `connectors_to_search`
--- a/surfsense_backend/app/agents/new_chat/tools/link_preview.py
+++ b/surfsense_backend/app/agents/new_chat/tools/link_preview.py
@ -5,6 +5,7 @@ This module provides a tool for fetching URL metadata (title, description,
 Open Graph image, etc.) to display rich link previews in the chat UI.
 """
 import asyncio
 import hashlib
 import logging
 import re
@ -15,7 +16,7 @@ import httpx
 import trafilatura
 from fake_useragent import UserAgent
 from langchain_core.tools import tool
-from playwright.async_api import async_playwright
+from playwright.sync_api import sync_playwright
 from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url
@ -175,6 +176,9 @@ async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
    Fetch page content using headless Chromium browser via Playwright.
    Used as a fallback when simple HTTP requests are blocked (403, etc.).
    Runs the sync Playwright API in a thread so it works on any event
    loop, including Windows ``SelectorEventLoop``.
    Args:
        url: URL to fetch
@ -182,65 +186,63 @@ async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
        Dict with title, description, image, and raw_html, or None if failed
    """
    try:
-        logger.info(f"[link_preview] Falling back to Chromium for {url}")
+        return await asyncio.to_thread(_fetch_with_chromium_sync, url)
        # Generate a realistic User-Agent to avoid bot detection
        ua = UserAgent()
        user_agent = ua.random
        # Use residential proxy if configured
        playwright_proxy = get_playwright_proxy()
        # Use Playwright to fetch the page
        async with async_playwright() as p:
            launch_kwargs: dict = {"headless": True}
            if playwright_proxy:
                launch_kwargs["proxy"] = playwright_proxy
            browser = await p.chromium.launch(**launch_kwargs)
            context = await browser.new_context(user_agent=user_agent)
            page = await context.new_page()
            try:
                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
                raw_html = await page.content()
            finally:
                await browser.close()
        if not raw_html or len(raw_html.strip()) == 0:
            logger.warning(f"[link_preview] Chromium returned empty content for {url}")
            return None
        # Extract metadata using Trafilatura
        trafilatura_metadata = trafilatura.extract_metadata(raw_html)
        # Extract OG image from raw HTML (trafilatura doesn't extract this)
        image = extract_image(raw_html)
        result = {
            "title": None,
            "description": None,
            "image": image,
            "raw_html": raw_html,
        }
        if trafilatura_metadata:
            result["title"] = trafilatura_metadata.title
            result["description"] = trafilatura_metadata.description
        # If trafilatura didn't get the title/description, try OG tags
        if not result["title"]:
            result["title"] = extract_title(raw_html)
        if not result["description"]:
            result["description"] = extract_description(raw_html)
        logger.info(f"[link_preview] Successfully fetched {url} via Chromium")
        return result
    except Exception as e:
        logger.error(f"[link_preview] Chromium fallback failed for {url}: {e}")
        return None
 def _fetch_with_chromium_sync(url: str) -> dict[str, Any] | None:
    """
    Blocking Chromium fetch, intended to be executed in a worker thread.

    Loads ``url`` in headless Chromium through Playwright's sync API with a
    random User-Agent (and the configured Playwright proxy, if any), then
    derives link-preview metadata from the page HTML.

    Args:
        url: URL to fetch

    Returns:
        Dict with title, description, image, and raw_html, or None when the
        page returned no content. Errors raised while launching the browser
        or navigating are not caught here and propagate to the caller.
    """
    logger.info(f"[link_preview] Falling back to Chromium for {url}")

    random_agent = UserAgent().random
    proxy_settings = get_playwright_proxy()

    browser_options: dict = {"headless": True}
    if proxy_settings:
        browser_options["proxy"] = proxy_settings

    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(**browser_options)
        tab = browser.new_context(user_agent=random_agent).new_page()
        try:
            tab.goto(url, wait_until="domcontentloaded", timeout=30000)
            html = tab.content()
        finally:
            # Always release the browser, even when navigation fails.
            browser.close()

    if not html or not html.strip():
        logger.warning(f"[link_preview] Chromium returned empty content for {url}")
        return None

    page_meta = trafilatura.extract_metadata(html)
    # Trafilatura metadata is not used for the image; it comes from the raw HTML.
    og_image = extract_image(html)

    # Prefer Trafilatura's values; fall back to the HTML helpers when missing.
    title = (page_meta.title if page_meta else None) or extract_title(html)
    description = (
        page_meta.description if page_meta else None
    ) or extract_description(html)

    preview: dict[str, Any] = {
        "title": title,
        "description": description,
        "image": og_image,
        "raw_html": html,
    }

    logger.info(f"[link_preview] Successfully fetched {url} via Chromium")
    return preview
 def create_link_preview_tool():
    """
    Factory function to create the link_preview tool.
--- a/surfsense_backend/app/connectors/webcrawler_connector.py
+++ b/surfsense_backend/app/connectors/webcrawler_connector.py
@ -1,20 +1,28 @@
 """
 WebCrawler Connector Module
-A module for crawling web pages and extracting content using Firecrawl or Playwright.
+A module for crawling web pages and extracting content using Firecrawl,
-Provides a unified interface for web scraping.
+plain HTTP+Trafilatura, or Playwright.  Provides a unified interface for
 web scraping.
 Fallback order:
  1. Firecrawl  (if API key is configured)
  2. HTTP + Trafilatura  (lightweight, works on any event loop)
  3. Playwright / Chromium  (runs in a thread to avoid event-loop limitations)
 """
 import asyncio
 import logging
 from typing import Any
 import httpx
 import trafilatura
 import validators
 from fake_useragent import UserAgent
 from firecrawl import AsyncFirecrawlApp
-from playwright.async_api import async_playwright
+from playwright.sync_api import sync_playwright
-from app.utils.proxy_config import get_playwright_proxy
+from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url
 logger = logging.getLogger(__name__)
@ -50,8 +58,10 @@ class WebCrawlerConnector:
        """
        Crawl a single URL and extract its content.
-        If Firecrawl API key is provided, tries Firecrawl first and falls back to Chromium
+        Fallback order:
-        if Firecrawl fails. If no Firecrawl API key is provided, uses Chromium directly.
+          1. Firecrawl (if API key configured)
          2. Plain HTTP + Trafilatura (lightweight, no subprocess)
          3. Playwright / Chromium (sync API run in a worker thread, last resort)
        Args:
            url: URL to crawl
@ -63,37 +73,57 @@ class WebCrawlerConnector:
                - content: Extracted content (markdown or HTML)
                - metadata: Page metadata (title, description, etc.)
                - source: Original URL
-                - crawler_type: Type of crawler used ("firecrawl" or "chromium")
+                - crawler_type: Type of crawler used
            # Validate URL
        """
        try:
            # Validate URL
            if not validators.url(url):
                return None, f"Invalid URL: {url}"
-            # Try Firecrawl first if API key is provided
+            errors: list[str] = []
            # --- 1. Firecrawl (premium, if configured) ---
            if self.use_firecrawl:
                try:
                    logger.info(f"[webcrawler] Using Firecrawl for: {url}")
-                    result = await self._crawl_with_firecrawl(url, formats)
+                    return await self._crawl_with_firecrawl(url, formats), None
-                    return result, None
+                except Exception as exc:
-                except Exception as firecrawl_error:
+                    errors.append(f"Firecrawl: {exc!s}")
                    # Firecrawl failed; fall through to HTTP+Trafilatura, then Chromium
                    logger.warning(
-                        f"[webcrawler] Firecrawl failed, falling back to Chromium+Trafilatura for: {url}"
+                        f"[webcrawler] Firecrawl failed for {url}: {exc!s}"
                    )
-                    try:
+
-                        result = await self._crawl_with_chromium(url)
+            # --- 2. HTTP + Trafilatura (no subprocess required) ---
-                        return result, None
+            try:
-                    except Exception as chromium_error:
+                logger.info(f"[webcrawler] Using HTTP+Trafilatura for: {url}")
-                        return (
+                result = await self._crawl_with_http(url)
-                            None,
+                if result:
-                            f"Both Firecrawl and Chromium failed. Firecrawl error: {firecrawl_error!s}, Chromium error: {chromium_error!s}",
+                    return result, None
-                        )
+                errors.append("HTTP+Trafilatura: empty extraction")
-            else:
+            except Exception as exc:
-                # No Firecrawl API key, use Chromium directly
+                errors.append(f"HTTP+Trafilatura: {exc!s}")
                logger.warning(
                    f"[webcrawler] HTTP+Trafilatura failed for {url}: {exc!s}"
                )
            # --- 3. Playwright / Chromium (full browser, last resort) ---
            try:
                logger.info(f"[webcrawler] Using Chromium+Trafilatura for: {url}")
-                result = await self._crawl_with_chromium(url)
+                return await self._crawl_with_chromium(url), None
-                return result, None
+            except NotImplementedError:
                errors.append(
                    "Chromium: event loop does not support subprocesses "
                    "(common on Windows with uvicorn --reload)"
                )
                logger.warning(
                    f"[webcrawler] Chromium unavailable for {url}: "
                    "current event loop does not support subprocesses"
                )
            except Exception as exc:
                errors.append(f"Chromium: {exc!s}")
                logger.warning(f"[webcrawler] Chromium failed for {url}: {exc!s}")
            return None, f"All crawl methods failed for {url}. {'; '.join(errors)}"
        except Exception as e:
            return None, f"Error crawling URL {url}: {e!s}"
@ -149,11 +179,80 @@ class WebCrawlerConnector:
            "crawler_type": "firecrawl",
        }
    async def _crawl_with_http(self, url: str) -> dict[str, Any] | None:
        """
        Crawl URL using a plain HTTP request + Trafilatura content extraction.

        This method avoids launching a browser subprocess, making it safe to
        call from any asyncio event loop (including Windows SelectorEventLoop
        which does not support ``create_subprocess_exec``).

        Args:
            url: URL to crawl

        Returns:
            Dict with ``content`` (markdown), ``metadata`` and
            ``crawler_type`` set to ``"http"``, or ``None`` when the response
            body is empty or Trafilatura cannot extract meaningful content
            (e.g. JS-rendered SPAs) so the caller can fall through to Chromium.

        Raises:
            httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
            httpx.HTTPError: On transport-level failures (timeouts, DNS, ...).
        """
        ua = UserAgent()
        user_agent = ua.random
        proxy_url = get_residential_proxy_url()

        async with httpx.AsyncClient(
            timeout=20.0,
            follow_redirects=True,
            proxy=proxy_url,
            headers={
                "User-Agent": user_agent,
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
                # Accept-Encoding is intentionally NOT overridden: httpx
                # advertises only the encodings it can actually decode.
                # Hard-coding "br" without a brotli package installed makes
                # servers send Brotli bodies that arrive undecoded, so
                # Trafilatura would be fed compressed garbage.
            },
        ) as client:
            response = await client.get(url)
            response.raise_for_status()
            raw_html = response.text

        if not raw_html or not raw_html.strip():
            return None

        extracted_content = trafilatura.extract(
            raw_html,
            output_format="markdown",
            include_comments=False,
            include_tables=True,
            include_images=True,
            include_links=True,
        )
        if not extracted_content or not extracted_content.strip():
            return None

        trafilatura_metadata = trafilatura.extract_metadata(raw_html)
        metadata: dict[str, str] = {"source": url}
        if trafilatura_metadata:
            # Copy only the fields Trafilatura actually populated.
            for field_name in ("title", "description", "author", "date"):
                field_value = getattr(trafilatura_metadata, field_name)
                if field_value:
                    metadata[field_name] = field_value
        # Guarantee a title so downstream consumers always have a label.
        metadata.setdefault("title", url)

        return {
            "content": extracted_content,
            "metadata": metadata,
            "crawler_type": "http",
        }
    async def _crawl_with_chromium(self, url: str) -> dict[str, Any]:
        """
        Crawl URL using Playwright with Trafilatura for content extraction.
        Falls back to raw HTML if Trafilatura extraction fails.
        Runs the sync Playwright API in a thread so it works on any event
        loop, including Windows ``SelectorEventLoop`` which cannot spawn
        subprocesses.
        Args:
            url: URL to crawl
@ -163,51 +262,48 @@ class WebCrawlerConnector:
        Raises:
            Exception: If crawling fails
        """
-        # Generate a realistic User-Agent to avoid bot detection
+        return await asyncio.to_thread(self._crawl_with_chromium_sync, url)
    def _crawl_with_chromium_sync(self, url: str) -> dict[str, Any]:
        """Synchronous Playwright crawl executed in a worker thread."""
        ua = UserAgent()
        user_agent = ua.random
        # Use residential proxy if configured
        playwright_proxy = get_playwright_proxy()
-        # Use Playwright to fetch the page
+        with sync_playwright() as p:
        async with async_playwright() as p:
            launch_kwargs: dict = {"headless": True}
            if playwright_proxy:
                launch_kwargs["proxy"] = playwright_proxy
-            browser = await p.chromium.launch(**launch_kwargs)
+            browser = p.chromium.launch(**launch_kwargs)
-            context = await browser.new_context(user_agent=user_agent)
+            context = browser.new_context(user_agent=user_agent)
-            page = await context.new_page()
+            page = context.new_page()
            try:
-                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
+                page.goto(url, wait_until="domcontentloaded", timeout=30000)
-                raw_html = await page.content()
+                raw_html = page.content()
-                page_title = await page.title()
+                page_title = page.title()
            finally:
-                await browser.close()
+                browser.close()
        if not raw_html:
            raise ValueError(f"Failed to load content from {url}")
        # Extract basic metadata from the page
        base_metadata = {"title": page_title} if page_title else {}
        # Try to extract main content using Trafilatura
        extracted_content = None
        trafilatura_metadata = None
        try:
            # Extract main content as markdown
            extracted_content = trafilatura.extract(
                raw_html,
-                output_format="markdown",  # Get clean markdown
+                output_format="markdown",
-                include_comments=False,  # Exclude comments
+                include_comments=False,
-                include_tables=True,  # Keep tables
+                include_tables=True,
-                include_images=True,  # Keep image references
+                include_images=True,
-                include_links=True,  # Keep links
+                include_links=True,
            )
            # Extract metadata using Trafilatura
            trafilatura_metadata = trafilatura.extract_metadata(raw_html)
            if not extracted_content or len(extracted_content.strip()) == 0:
@ -216,7 +312,6 @@ class WebCrawlerConnector:
        except Exception:
            extracted_content = None
        # Build metadata, preferring Trafilatura metadata when available
        metadata = {
            "source": url,
            "title": (
@ -226,7 +321,6 @@ class WebCrawlerConnector:
            ),
        }
        # Add additional metadata from Trafilatura if available
        if trafilatura_metadata:
            if trafilatura_metadata.description:
                metadata["description"] = trafilatura_metadata.description
@ -235,7 +329,6 @@ class WebCrawlerConnector:
            if trafilatura_metadata.date:
                metadata["date"] = trafilatura_metadata.date
        # Add any remaining base metadata
        metadata.update(base_metadata)
        return {