feat: add new FastAPI debug configurations and enhance web crawling capabilities with real-time web query support

2026-07-24 23:41:10 +02:00 · 2026-02-20 17:28:20 -08:00 · 2026-02-20 17:28:20 -08:00 · ed497909fa
commit ed497909fa
parent 81dfc7102f
5 changed files with 256 additions and 104 deletions
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -24,6 +24,16 @@
            "cwd": "${workspaceFolder}/surfsense_backend",
            "python": "${command:python.interpreterPath}"
        },
+        {
+            "name": "Backend: FastAPI (No Reload)",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${workspaceFolder}/surfsense_backend/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "cwd": "${workspaceFolder}/surfsense_backend",
+            "python": "${command:python.interpreterPath}"
+        },
        {
            "name": "Backend: FastAPI (main.py)",
            "type": "debugpy",
@ -124,6 +134,34 @@
                "group": "Full Stack",
                "order": 2
            }
+        },
+        {
+            "name": "Full Stack: Backend (No Reload) + Frontend + Celery",
+            "configurations": [
+                "Backend: FastAPI (No Reload)",
+                "Frontend: Next.js",
+                "Celery: Worker",
+                "Celery: Beat Scheduler"
+            ],
+            "stopAll": true,
+            "presentation": {
+                "hidden": false,
+                "group": "Full Stack",
+                "order": 3
+            }
+        },
+        {
+            "name": "Full Stack: Backend (No Reload) + Frontend",
+            "configurations": [
+                "Backend: FastAPI (No Reload)",
+                "Frontend: Next.js"
+            ],
+            "stopAll": true,
+            "presentation": {
+                "hidden": false,
+                "group": "Full Stack",
+                "order": 4
+            }
        }
    ]
 }
--- a/surfsense_backend/app/agents/new_chat/system_prompt.py
+++ b/surfsense_backend/app/agents/new_chat/system_prompt.py
@ -74,6 +74,14 @@ You have access to the following tools:
  - IMPORTANT: When searching for information (meetings, schedules, notes, tasks, etc.), ALWAYS search broadly 
    across ALL sources first by omitting connectors_to_search. The user may store information in various places
    including calendar apps, note-taking apps (Obsidian, Notion), chat apps (Slack, Discord), and more.
+  - IMPORTANT (REAL-TIME / PUBLIC WEB QUERIES): For questions that require current public web data
+    (e.g., live exchange rates, stock prices, breaking news, weather, current events), you MUST call
+    `search_knowledge_base` using live web connectors via `connectors_to_search`:
+    ["LINKUP_API", "TAVILY_API", "SEARXNG_API", "BAIDU_SEARCH_API"].
+  - For these real-time/public web queries, DO NOT answer from memory and DO NOT say you lack internet
+    access before attempting a live connector search.
+  - If the live connectors return no relevant results, explain that live web sources did not return enough
+    data and ask the user if they want you to retry with a refined query.
  - Only narrow to specific connectors if the user explicitly asks (e.g., "check my Slack" or "in my calendar").
  - Personal notes in Obsidian, Notion, or NOTE often contain schedules, meeting times, reminders, and other 
    important information that may not be in calendars.
@ -358,6 +366,14 @@ _TOOLS_INSTRUCTIONS_EXAMPLES_COMMON = """
 - User: "What's in my Obsidian vault about project ideas?"
  - Call: `search_knowledge_base(query="project ideas", connectors_to_search=["OBSIDIAN_CONNECTOR"])`

+- User: "search me current usd to inr rate"
+  - Call: `search_knowledge_base(query="current USD to INR exchange rate", connectors_to_search=["LINKUP_API", "TAVILY_API", "SEARXNG_API", "BAIDU_SEARCH_API"])`
+  - Then answer using the returned live web results with citations.
+
+- User: "cant you search using linkup?"
+  - Call: `search_knowledge_base(query="<refined user request>", connectors_to_search=["LINKUP_API"])`
+  - Then answer from retrieved results (or clearly state that Linkup returned no data).
+
 - User: "Give me a podcast about AI trends based on what we discussed"
  - First search for relevant content, then call: `generate_podcast(source_content="Based on our conversation and search results: [detailed summary of chat + search findings]", podcast_title="AI Trends Podcast")`

--- a/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py
+++ b/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py
@ -593,6 +593,9 @@ IMPORTANT:
 - If the user requests a specific source type (e.g. "my notes", "Slack messages"), pass `connectors_to_search=[...]` using the enums below.
 - If `connectors_to_search` is omitted/empty, the system will search broadly.
 - Only connectors that are enabled/configured for this search space are available.{doc_types_info}
+- For real-time/public web queries (e.g., current exchange rates, stock prices, breaking news, weather),
+  explicitly include live web connectors in `connectors_to_search`, prioritizing:
+  ["LINKUP_API", "TAVILY_API", "SEARXNG_API", "BAIDU_SEARCH_API"].

 ## Available connector enums for `connectors_to_search`

--- a/surfsense_backend/app/agents/new_chat/tools/link_preview.py
+++ b/surfsense_backend/app/agents/new_chat/tools/link_preview.py
@ -5,6 +5,7 @@ This module provides a tool for fetching URL metadata (title, description,
 Open Graph image, etc.) to display rich link previews in the chat UI.
 """

+import asyncio
 import hashlib
 import logging
 import re
@ -15,7 +16,7 @@ import httpx
 import trafilatura
 from fake_useragent import UserAgent
 from langchain_core.tools import tool
-from playwright.async_api import async_playwright
+from playwright.sync_api import sync_playwright

 from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url

@ -175,6 +176,9 @@ async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
    Fetch page content using headless Chromium browser via Playwright.
    Used as a fallback when simple HTTP requests are blocked (403, etc.).

+    Runs the sync Playwright API in a thread so it works on any event
+    loop, including Windows ``SelectorEventLoop``.
+
    Args:
        url: URL to fetch

@ -182,65 +186,63 @@ async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
        Dict with title, description, image, and raw_html, or None if failed
    """
    try:
-        logger.info(f"[link_preview] Falling back to Chromium for {url}")
-
-        # Generate a realistic User-Agent to avoid bot detection
-        ua = UserAgent()
-        user_agent = ua.random
-
-        # Use residential proxy if configured
-        playwright_proxy = get_playwright_proxy()
-
-        # Use Playwright to fetch the page
-        async with async_playwright() as p:
-            launch_kwargs: dict = {"headless": True}
-            if playwright_proxy:
-                launch_kwargs["proxy"] = playwright_proxy
-            browser = await p.chromium.launch(**launch_kwargs)
-            context = await browser.new_context(user_agent=user_agent)
-            page = await context.new_page()
-
-            try:
-                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
-                raw_html = await page.content()
-            finally:
-                await browser.close()
-
-        if not raw_html or len(raw_html.strip()) == 0:
-            logger.warning(f"[link_preview] Chromium returned empty content for {url}")
-            return None
-
-        # Extract metadata using Trafilatura
-        trafilatura_metadata = trafilatura.extract_metadata(raw_html)
-
-        # Extract OG image from raw HTML (trafilatura doesn't extract this)
-        image = extract_image(raw_html)
-
-        result = {
-            "title": None,
-            "description": None,
-            "image": image,
-            "raw_html": raw_html,
-        }
-
-        if trafilatura_metadata:
-            result["title"] = trafilatura_metadata.title
-            result["description"] = trafilatura_metadata.description
-
-        # If trafilatura didn't get the title/description, try OG tags
-        if not result["title"]:
-            result["title"] = extract_title(raw_html)
-        if not result["description"]:
-            result["description"] = extract_description(raw_html)
-
-        logger.info(f"[link_preview] Successfully fetched {url} via Chromium")
-        return result
-
+        return await asyncio.to_thread(_fetch_with_chromium_sync, url)
    except Exception as e:
        logger.error(f"[link_preview] Chromium fallback failed for {url}: {e}")
        return None


+def _fetch_with_chromium_sync(url: str) -> dict[str, Any] | None:
+    """
+    Load ``url`` in headless Chromium through Playwright's sync API.
+
+    Blocking call; ``fetch_with_chromium`` runs it in a worker thread.
+
+    Args:
+        url: URL to fetch
+
+    Returns:
+        Dict with title, description, image, and raw_html, or None if the
+        page came back empty
+    """
+    logger.info(f"[link_preview] Falling back to Chromium for {url}")
+
+    # Random User-Agent string plus the optional Playwright proxy settings
+    random_agent = UserAgent().random
+    proxy_settings = get_playwright_proxy()
+
+    launch_options: dict = {"headless": True}
+    if proxy_settings:
+        launch_options["proxy"] = proxy_settings
+
+    with sync_playwright() as playwright:
+        chromium = playwright.chromium.launch(**launch_options)
+        tab = chromium.new_context(user_agent=random_agent).new_page()
+
+        try:
+            tab.goto(url, wait_until="domcontentloaded", timeout=30000)
+            html = tab.content()
+        finally:
+            # Close the browser whether or not navigation succeeded
+            chromium.close()
+
+    if not (html and html.strip()):
+        logger.warning(f"[link_preview] Chromium returned empty content for {url}")
+        return None
+
+    page_meta = trafilatura.extract_metadata(html)
+    og_image = extract_image(html)
+
+    # Prefer Trafilatura's values; the extract_* helpers are the fallback
+    title = page_meta.title if page_meta else None
+    description = page_meta.description if page_meta else None
+
+    preview: dict[str, Any] = {
+        "title": title or extract_title(html),
+        "description": description or extract_description(html),
+        "image": og_image,
+        "raw_html": html,
+    }
+
+    logger.info(f"[link_preview] Successfully fetched {url} via Chromium")
+    return preview
+
+
 def create_link_preview_tool():
    """
    Factory function to create the link_preview tool.
--- a/surfsense_backend/app/connectors/webcrawler_connector.py
+++ b/surfsense_backend/app/connectors/webcrawler_connector.py
@ -1,20 +1,28 @@
 """
 WebCrawler Connector Module

-A module for crawling web pages and extracting content using Firecrawl or Playwright.
-Provides a unified interface for web scraping.
+A module for crawling web pages and extracting content using Firecrawl,
+plain HTTP+Trafilatura, or Playwright.  Provides a unified interface for
+web scraping.
+
+Fallback order:
+  1. Firecrawl  (if API key is configured)
+  2. HTTP + Trafilatura  (lightweight, works on any event loop)
+  3. Playwright / Chromium  (runs in a thread to avoid event-loop limitations)
 """

+import asyncio
 import logging
 from typing import Any

+import httpx
 import trafilatura
 import validators
 from fake_useragent import UserAgent
 from firecrawl import AsyncFirecrawlApp
-from playwright.async_api import async_playwright
+from playwright.sync_api import sync_playwright

-from app.utils.proxy_config import get_playwright_proxy
+from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url

 logger = logging.getLogger(__name__)

@ -50,8 +58,10 @@ class WebCrawlerConnector:
        """
        Crawl a single URL and extract its content.

-        If Firecrawl API key is provided, tries Firecrawl first and falls back to Chromium
-        if Firecrawl fails. If no Firecrawl API key is provided, uses Chromium directly.
+        Fallback order:
+          1. Firecrawl (if API key configured)
+          2. Plain HTTP + Trafilatura (lightweight, no subprocess)
+          3. Playwright / Chromium (full browser, run in a worker thread; last resort)

        Args:
            url: URL to crawl
@ -63,37 +73,57 @@ class WebCrawlerConnector:
                - content: Extracted content (markdown or HTML)
                - metadata: Page metadata (title, description, etc.)
                - source: Original URL
-                - crawler_type: Type of crawler used ("firecrawl" or "chromium")
+                - crawler_type: Type of crawler used
        """
        try:
            # Validate URL
            if not validators.url(url):
                return None, f"Invalid URL: {url}"

-            # Try Firecrawl first if API key is provided
+            errors: list[str] = []
+
+            # --- 1. Firecrawl (premium, if configured) ---
            if self.use_firecrawl:
                try:
                    logger.info(f"[webcrawler] Using Firecrawl for: {url}")
-                    result = await self._crawl_with_firecrawl(url, formats)
-                    return result, None
-                except Exception as firecrawl_error:
-                    # Firecrawl failed, fallback to Chromium
+                    return await self._crawl_with_firecrawl(url, formats), None
+                except Exception as exc:
+                    errors.append(f"Firecrawl: {exc!s}")
                    logger.warning(
-                        f"[webcrawler] Firecrawl failed, falling back to Chromium+Trafilatura for: {url}"
+                        f"[webcrawler] Firecrawl failed for {url}: {exc!s}"
                    )
-                    try:
-                        result = await self._crawl_with_chromium(url)
-                        return result, None
-                    except Exception as chromium_error:
-                        return (
-                            None,
-                            f"Both Firecrawl and Chromium failed. Firecrawl error: {firecrawl_error!s}, Chromium error: {chromium_error!s}",
-                        )
-            else:
-                # No Firecrawl API key, use Chromium directly
+
+            # --- 2. HTTP + Trafilatura (no subprocess required) ---
+            try:
+                logger.info(f"[webcrawler] Using HTTP+Trafilatura for: {url}")
+                result = await self._crawl_with_http(url)
+                if result:
+                    return result, None
+                errors.append("HTTP+Trafilatura: empty extraction")
+            except Exception as exc:
+                errors.append(f"HTTP+Trafilatura: {exc!s}")
+                logger.warning(
+                    f"[webcrawler] HTTP+Trafilatura failed for {url}: {exc!s}"
+                )
+
+            # --- 3. Playwright / Chromium (full browser, last resort) ---
+            try:
                logger.info(f"[webcrawler] Using Chromium+Trafilatura for: {url}")
-                result = await self._crawl_with_chromium(url)
-                return result, None
+                return await self._crawl_with_chromium(url), None
+            except NotImplementedError:
+                errors.append(
+                    "Chromium: event loop does not support subprocesses "
+                    "(common on Windows with uvicorn --reload)"
+                )
+                logger.warning(
+                    f"[webcrawler] Chromium unavailable for {url}: "
+                    "current event loop does not support subprocesses"
+                )
+            except Exception as exc:
+                errors.append(f"Chromium: {exc!s}")
+                logger.warning(f"[webcrawler] Chromium failed for {url}: {exc!s}")
+
+            return None, f"All crawl methods failed for {url}. {'; '.join(errors)}"

        except Exception as e:
            return None, f"Error crawling URL {url}: {e!s}"
@ -149,11 +179,80 @@ class WebCrawlerConnector:
            "crawler_type": "firecrawl",
        }

+    async def _crawl_with_http(self, url: str) -> dict[str, Any] | None:
+        """
+        Crawl URL using a plain HTTP request + Trafilatura content extraction.
+
+        This method avoids launching a browser subprocess, making it safe to
+        call from any asyncio event loop (including Windows SelectorEventLoop
+        which does not support ``create_subprocess_exec``).
+
+        Args:
+            url: URL to crawl
+
+        Returns:
+            Dict with ``content`` (markdown), ``metadata`` and
+            ``crawler_type`` ("http"), or ``None`` when the response body is
+            empty or Trafilatura cannot extract meaningful content (e.g.
+            JS-rendered SPAs) so the caller can fall through to Chromium.
+
+        Raises:
+            httpx.HTTPError: If the request fails or the server answers with
+                an error status (raised by ``raise_for_status``).
+        """
+        user_agent = UserAgent().random
+        proxy_url = get_residential_proxy_url()
+
+        # Accept-Encoding is deliberately left to httpx: it advertises only
+        # the encodings it can actually decode.  Hard-coding "br" would let
+        # servers reply with Brotli even when the optional brotli package is
+        # missing, leaving ``response.text`` as undecoded garbage.
+        async with httpx.AsyncClient(
+            timeout=20.0,
+            follow_redirects=True,
+            proxy=proxy_url,
+            headers={
+                "User-Agent": user_agent,
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.9",
+            },
+        ) as client:
+            response = await client.get(url)
+            response.raise_for_status()
+            raw_html = response.text
+
+        if not raw_html or not raw_html.strip():
+            return None
+
+        extracted_content = trafilatura.extract(
+            raw_html,
+            output_format="markdown",
+            include_comments=False,
+            include_tables=True,
+            include_images=True,
+            include_links=True,
+        )
+
+        # Nothing usable extracted: signal the caller to try the next crawler
+        if not extracted_content or not extracted_content.strip():
+            return None
+
+        trafilatura_metadata = trafilatura.extract_metadata(raw_html)
+
+        metadata: dict[str, str] = {"source": url}
+        if trafilatura_metadata:
+            if trafilatura_metadata.title:
+                metadata["title"] = trafilatura_metadata.title
+            if trafilatura_metadata.description:
+                metadata["description"] = trafilatura_metadata.description
+            if trafilatura_metadata.author:
+                metadata["author"] = trafilatura_metadata.author
+            if trafilatura_metadata.date:
+                metadata["date"] = trafilatura_metadata.date
+        # Fall back to the URL when the page exposes no title
+        metadata.setdefault("title", url)
+
+        return {
+            "content": extracted_content,
+            "metadata": metadata,
+            "crawler_type": "http",
+        }
+
    async def _crawl_with_chromium(self, url: str) -> dict[str, Any]:
        """
        Crawl URL using Playwright with Trafilatura for content extraction.
        Falls back to raw HTML if Trafilatura extraction fails.

+        Runs the sync Playwright API in a thread so it works on any event
+        loop, including Windows ``SelectorEventLoop`` which cannot spawn
+        subprocesses.
+
        Args:
            url: URL to crawl

@ -163,51 +262,48 @@ class WebCrawlerConnector:
        Raises:
            Exception: If crawling fails
        """
-        # Generate a realistic User-Agent to avoid bot detection
+        return await asyncio.to_thread(self._crawl_with_chromium_sync, url)
+
+    def _crawl_with_chromium_sync(self, url: str) -> dict[str, Any]:
+        """Synchronous Playwright crawl executed in a worker thread."""
        ua = UserAgent()
        user_agent = ua.random

-        # Use residential proxy if configured
        playwright_proxy = get_playwright_proxy()

-        # Use Playwright to fetch the page
-        async with async_playwright() as p:
+        with sync_playwright() as p:
            launch_kwargs: dict = {"headless": True}
            if playwright_proxy:
                launch_kwargs["proxy"] = playwright_proxy
-            browser = await p.chromium.launch(**launch_kwargs)
-            context = await browser.new_context(user_agent=user_agent)
-            page = await context.new_page()
+            browser = p.chromium.launch(**launch_kwargs)
+            context = browser.new_context(user_agent=user_agent)
+            page = context.new_page()

            try:
-                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
-                raw_html = await page.content()
-                page_title = await page.title()
+                page.goto(url, wait_until="domcontentloaded", timeout=30000)
+                raw_html = page.content()
+                page_title = page.title()
            finally:
-                await browser.close()
+                browser.close()

        if not raw_html:
            raise ValueError(f"Failed to load content from {url}")

-        # Extract basic metadata from the page
        base_metadata = {"title": page_title} if page_title else {}

-        # Try to extract main content using Trafilatura
        extracted_content = None
        trafilatura_metadata = None

        try:
-            # Extract main content as markdown
            extracted_content = trafilatura.extract(
                raw_html,
-                output_format="markdown",  # Get clean markdown
-                include_comments=False,  # Exclude comments
-                include_tables=True,  # Keep tables
-                include_images=True,  # Keep image references
-                include_links=True,  # Keep links
+                output_format="markdown",
+                include_comments=False,
+                include_tables=True,
+                include_images=True,
+                include_links=True,
            )

-            # Extract metadata using Trafilatura
            trafilatura_metadata = trafilatura.extract_metadata(raw_html)

            if not extracted_content or len(extracted_content.strip()) == 0:
@ -216,7 +312,6 @@ class WebCrawlerConnector:
        except Exception:
            extracted_content = None

-        # Build metadata, preferring Trafilatura metadata when available
        metadata = {
            "source": url,
            "title": (
@ -226,7 +321,6 @@ class WebCrawlerConnector:
            ),
        }

-        # Add additional metadata from Trafilatura if available
        if trafilatura_metadata:
            if trafilatura_metadata.description:
                metadata["description"] = trafilatura_metadata.description
@ -235,7 +329,6 @@ class WebCrawlerConnector:
            if trafilatura_metadata.date:
                metadata["date"] = trafilatura_metadata.date

-        # Add any remaining base metadata
        metadata.update(base_metadata)

        return {