diff --git a/.vscode/launch.json b/.vscode/launch.json
index 4988cc8f3..2c4784c0e 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -24,6 +24,16 @@
             "cwd": "${workspaceFolder}/surfsense_backend",
             "python": "${command:python.interpreterPath}"
         },
+        {
+            "name": "Backend: FastAPI (No Reload)",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${workspaceFolder}/surfsense_backend/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "cwd": "${workspaceFolder}/surfsense_backend",
+            "python": "${command:python.interpreterPath}"
+        },
         {
             "name": "Backend: FastAPI (main.py)",
             "type": "debugpy",
             "request": "launch",
@@ -124,6 +134,34 @@
                 "group": "Full Stack",
                 "order": 2
             }
+        },
+        {
+            "name": "Full Stack: Backend (No Reload) + Frontend + Celery",
+            "configurations": [
+                "Backend: FastAPI (No Reload)",
+                "Frontend: Next.js",
+                "Celery: Worker",
+                "Celery: Beat Scheduler"
+            ],
+            "stopAll": true,
+            "presentation": {
+                "hidden": false,
+                "group": "Full Stack",
+                "order": 3
+            }
+        },
+        {
+            "name": "Full Stack: Backend (No Reload) + Frontend",
+            "configurations": [
+                "Backend: FastAPI (No Reload)",
+                "Frontend: Next.js"
+            ],
+            "stopAll": true,
+            "presentation": {
+                "hidden": false,
+                "group": "Full Stack",
+                "order": 4
+            }
         }
     ]
 }
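Why a dedicated "No Reload" configuration: with `--reload` (or multiple workers) on Windows, uvicorn switches to `WindowsSelectorEventLoopPolicy`, and `SelectorEventLoop` cannot spawn the subprocesses that Playwright's async API needs (see the webcrawler changes below). A minimal sketch of the kind of entrypoint this config launches; the module path, host, and port here are placeholders, not the project's actual values:

```python
# main.py (sketch) -- module path, host, and port are placeholders.
import uvicorn

if __name__ == "__main__":
    # reload=False keeps the Windows default ProactorEventLoop, which supports
    # asyncio subprocesses; with reload=True uvicorn falls back to the
    # SelectorEventLoop policy, which does not.
    uvicorn.run("app:app", host="127.0.0.1", port=8000, reload=False)
```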
diff --git a/surfsense_backend/app/agents/new_chat/system_prompt.py b/surfsense_backend/app/agents/new_chat/system_prompt.py
index 3fd83b716..a5be211b4 100644
--- a/surfsense_backend/app/agents/new_chat/system_prompt.py
+++ b/surfsense_backend/app/agents/new_chat/system_prompt.py
@@ -74,6 +74,14 @@ You have access to the following tools:
   - IMPORTANT: When searching for information (meetings, schedules, notes, tasks, etc.), ALWAYS search
     broadly across ALL sources first by omitting connectors_to_search. The user may store information
     in various places including calendar apps, note-taking apps (Obsidian, Notion), chat apps (Slack, Discord), and more.
+  - IMPORTANT (REAL-TIME / PUBLIC WEB QUERIES): For questions that require current public web data
+    (e.g., live exchange rates, stock prices, breaking news, weather, current events), you MUST call
+    `search_knowledge_base` using live web connectors via `connectors_to_search`:
+    ["LINKUP_API", "TAVILY_API", "SEARXNG_API", "BAIDU_SEARCH_API"].
+  - For these real-time/public web queries, DO NOT answer from memory and DO NOT say you lack internet
+    access before attempting a live connector search.
+  - If the live connectors return no relevant results, explain that live web sources did not return enough
+    data and ask the user if they want you to retry with a refined query.
   - Only narrow to specific connectors if the user explicitly asks (e.g., "check my Slack" or "in my calendar").
   - Personal notes in Obsidian, Notion, or NOTE often contain schedules, meeting times, reminders,
     and other important information that may not be in calendars.
@@ -358,6 +366,14 @@ _TOOLS_INSTRUCTIONS_EXAMPLES_COMMON = """
 - User: "What's in my Obsidian vault about project ideas?"
   - Call: `search_knowledge_base(query="project ideas", connectors_to_search=["OBSIDIAN_CONNECTOR"])`
 
+- User: "search me current usd to inr rate"
+  - Call: `search_knowledge_base(query="current USD to INR exchange rate", connectors_to_search=["LINKUP_API", "TAVILY_API", "SEARXNG_API", "BAIDU_SEARCH_API"])`
+  - Then answer using the returned live web results with citations.
+
+- User: "cant you search using linkup?"
+  - Call: `search_knowledge_base(query="current USD to INR exchange rate", connectors_to_search=["LINKUP_API"])`
+  - Then answer from retrieved results (or clearly state that Linkup returned no data).
+
 - User: "Give me a podcast about AI trends based on what we discussed"
   - First search for relevant content, then call: `generate_podcast(source_content="Based on our conversation and search results: [detailed summary of chat + search findings]", podcast_title="AI Trends Podcast")`
diff --git a/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py b/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py
index 931ea1657..bdd368a32 100644
--- a/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py
+++ b/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py
@@ -593,6 +593,9 @@ IMPORTANT:
 - If the user requests a specific source type (e.g. "my notes", "Slack messages"), pass `connectors_to_search=[...]` using the enums below.
 - If `connectors_to_search` is omitted/empty, the system will search broadly.
 - Only connectors that are enabled/configured for this search space are available.{doc_types_info}
+- For real-time/public web queries (e.g., current exchange rates, stock prices, breaking news, weather),
+  explicitly include live web connectors in `connectors_to_search`, prioritizing:
+  ["LINKUP_API", "TAVILY_API", "SEARXNG_API", "BAIDU_SEARCH_API"].
 
 ## Available connector enums for `connectors_to_search`
diff --git a/surfsense_backend/app/agents/new_chat/tools/link_preview.py b/surfsense_backend/app/agents/new_chat/tools/link_preview.py
index bf7b4af38..81d91d54c 100644
--- a/surfsense_backend/app/agents/new_chat/tools/link_preview.py
+++ b/surfsense_backend/app/agents/new_chat/tools/link_preview.py
@@ -5,6 +5,7 @@ This module provides a tool for fetching URL metadata (title, description,
 Open Graph image, etc.) to display rich link previews in the chat UI.
 """
 
+import asyncio
 import hashlib
 import logging
 import re
@@ -15,7 +16,7 @@ import httpx
 import trafilatura
 from fake_useragent import UserAgent
 from langchain_core.tools import tool
-from playwright.async_api import async_playwright
+from playwright.sync_api import sync_playwright
 
 from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url
 
@@ -175,6 +176,9 @@ async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
     Fetch page content using headless Chromium browser via Playwright.
     Used as a fallback when simple HTTP requests are blocked (403, etc.).
 
+    Runs the sync Playwright API in a thread so it works on any event
+    loop, including Windows ``SelectorEventLoop``.
+
     Args:
         url: URL to fetch
 
@@ -182,65 +186,63 @@
         Dict with title, description, image, and raw_html, or None if failed
     """
     try:
-        logger.info(f"[link_preview] Falling back to Chromium for {url}")
-
-        # Generate a realistic User-Agent to avoid bot detection
-        ua = UserAgent()
-        user_agent = ua.random
-
-        # Use residential proxy if configured
-        playwright_proxy = get_playwright_proxy()
-
-        # Use Playwright to fetch the page
-        async with async_playwright() as p:
-            launch_kwargs: dict = {"headless": True}
-            if playwright_proxy:
-                launch_kwargs["proxy"] = playwright_proxy
-            browser = await p.chromium.launch(**launch_kwargs)
-            context = await browser.new_context(user_agent=user_agent)
-            page = await context.new_page()
-
-            try:
-                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
-                raw_html = await page.content()
-            finally:
-                await browser.close()
-
-        if not raw_html or len(raw_html.strip()) == 0:
-            logger.warning(f"[link_preview] Chromium returned empty content for {url}")
-            return None
-
-        # Extract metadata using Trafilatura
-        trafilatura_metadata = trafilatura.extract_metadata(raw_html)
-
-        # Extract OG image from raw HTML (trafilatura doesn't extract this)
-        image = extract_image(raw_html)
-
-        result = {
-            "title": None,
-            "description": None,
-            "image": image,
-            "raw_html": raw_html,
-        }
-
-        if trafilatura_metadata:
-            result["title"] = trafilatura_metadata.title
-            result["description"] = trafilatura_metadata.description
-
-        # If trafilatura didn't get the title/description, try OG tags
-        if not result["title"]:
-            result["title"] = extract_title(raw_html)
-        if not result["description"]:
-            result["description"] = extract_description(raw_html)
-
-        logger.info(f"[link_preview] Successfully fetched {url} via Chromium")
-        return result
-
+        return await asyncio.to_thread(_fetch_with_chromium_sync, url)
     except Exception as e:
         logger.error(f"[link_preview] Chromium fallback failed for {url}: {e}")
         return None
 
 
+def _fetch_with_chromium_sync(url: str) -> dict[str, Any] | None:
+    """Synchronous Playwright fetch executed in a worker thread."""
+    logger.info(f"[link_preview] Falling back to Chromium for {url}")
+
+    ua = UserAgent()
+    user_agent = ua.random
+
+    playwright_proxy = get_playwright_proxy()
+
+    with sync_playwright() as p:
+        launch_kwargs: dict = {"headless": True}
+        if playwright_proxy:
+            launch_kwargs["proxy"] = playwright_proxy
+        browser = p.chromium.launch(**launch_kwargs)
+        context = browser.new_context(user_agent=user_agent)
+        page = context.new_page()
+
+        try:
+            page.goto(url, wait_until="domcontentloaded", timeout=30000)
+            raw_html = page.content()
+        finally:
+            browser.close()
+
+    if not raw_html or len(raw_html.strip()) == 0:
+        logger.warning(f"[link_preview] Chromium returned empty content for {url}")
+        return None
+
+    trafilatura_metadata = trafilatura.extract_metadata(raw_html)
+
+    image = extract_image(raw_html)
+
+    result: dict[str, Any] = {
+        "title": None,
+        "description": None,
+        "image": image,
+        "raw_html": raw_html,
+    }
+
+    if trafilatura_metadata:
+        result["title"] = trafilatura_metadata.title
+        result["description"] = trafilatura_metadata.description
+
+    if not result["title"]:
+        result["title"] = extract_title(raw_html)
+    if not result["description"]:
+        result["description"] = extract_description(raw_html)
+
+    logger.info(f"[link_preview] Successfully fetched {url} via Chromium")
+    return result
+
+
 def create_link_preview_tool():
     """
     Factory function to create the link_preview tool.
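The change above replaces `async_playwright` with the sync API offloaded via `asyncio.to_thread`; this is the core technique of the patch. A self-contained sketch of the same pattern, with an illustrative URL and title-only extraction:

```python
import asyncio

from playwright.sync_api import sync_playwright


def _fetch_title_sync(url: str) -> str:
    # The sync API runs entirely inside this worker thread, so the caller's
    # asyncio event loop never has to spawn the browser subprocess itself.
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url, wait_until="domcontentloaded", timeout=30000)
            return page.title()
        finally:
            browser.close()


async def fetch_title(url: str) -> str:
    # to_thread keeps the event loop responsive and works even on
    # Windows SelectorEventLoop, where async Playwright fails.
    return await asyncio.to_thread(_fetch_title_sync, url)


if __name__ == "__main__":
    print(asyncio.run(fetch_title("https://example.com")))
```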
diff --git a/surfsense_backend/app/connectors/webcrawler_connector.py b/surfsense_backend/app/connectors/webcrawler_connector.py
index a5fb33e1a..70272976e 100644
--- a/surfsense_backend/app/connectors/webcrawler_connector.py
+++ b/surfsense_backend/app/connectors/webcrawler_connector.py
@@ -1,20 +1,28 @@
 """
 WebCrawler Connector Module
 
-A module for crawling web pages and extracting content using Firecrawl or Playwright.
-Provides a unified interface for web scraping.
+A module for crawling web pages and extracting content using Firecrawl,
+plain HTTP+Trafilatura, or Playwright. Provides a unified interface for
+web scraping.
+
+Fallback order:
+    1. Firecrawl (if API key is configured)
+    2. HTTP + Trafilatura (lightweight, works on any event loop)
+    3. Playwright / Chromium (runs in a thread to avoid event-loop limitations)
 """
 
+import asyncio
 import logging
 from typing import Any
 
+import httpx
 import trafilatura
 import validators
 from fake_useragent import UserAgent
 from firecrawl import AsyncFirecrawlApp
-from playwright.async_api import async_playwright
+from playwright.sync_api import sync_playwright
 
-from app.utils.proxy_config import get_playwright_proxy
+from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url
 
 logger = logging.getLogger(__name__)
 
@@ -50,8 +58,10 @@ class WebCrawlerConnector:
         """
         Crawl a single URL and extract its content.
 
-        If Firecrawl API key is provided, tries Firecrawl first and falls back to Chromium
-        if Firecrawl fails. If no Firecrawl API key is provided, uses Chromium directly.
+        Fallback order:
+            1. Firecrawl (if API key configured)
+            2. Plain HTTP + Trafilatura (lightweight, no subprocess)
+            3. Playwright / Chromium (needs subprocess-capable event loop)
 
         Args:
             url: URL to crawl
@@ -63,37 +73,57 @@
                 - content: Extracted content (markdown or HTML)
                 - metadata: Page metadata (title, description, etc.)
                 - source: Original URL
-                - crawler_type: Type of crawler used ("firecrawl" or "chromium")
+                - crawler_type: Type of crawler used
         """
         try:
-            # Validate URL
             if not validators.url(url):
                 return None, f"Invalid URL: {url}"
 
-            # Try Firecrawl first if API key is provided
+            errors: list[str] = []
+
+            # --- 1. Firecrawl (premium, if configured) ---
             if self.use_firecrawl:
                 try:
                     logger.info(f"[webcrawler] Using Firecrawl for: {url}")
-                    result = await self._crawl_with_firecrawl(url, formats)
-                    return result, None
-                except Exception as firecrawl_error:
-                    # Firecrawl failed, fallback to Chromium
+                    return await self._crawl_with_firecrawl(url, formats), None
+                except Exception as exc:
+                    errors.append(f"Firecrawl: {exc!s}")
                     logger.warning(
-                        f"[webcrawler] Firecrawl failed, falling back to Chromium+Trafilatura for: {url}"
+                        f"[webcrawler] Firecrawl failed for {url}: {exc!s}"
                     )
-                    try:
-                        result = await self._crawl_with_chromium(url)
-                        return result, None
-                    except Exception as chromium_error:
-                        return (
-                            None,
-                            f"Both Firecrawl and Chromium failed. Firecrawl error: {firecrawl_error!s}, Chromium error: {chromium_error!s}",
-                        )
-            else:
-                # No Firecrawl API key, use Chromium directly
+
+            # --- 2. HTTP + Trafilatura (no subprocess required) ---
+            try:
+                logger.info(f"[webcrawler] Using HTTP+Trafilatura for: {url}")
+                result = await self._crawl_with_http(url)
+                if result:
+                    return result, None
+                errors.append("HTTP+Trafilatura: empty extraction")
+            except Exception as exc:
+                errors.append(f"HTTP+Trafilatura: {exc!s}")
+                logger.warning(
+                    f"[webcrawler] HTTP+Trafilatura failed for {url}: {exc!s}"
+                )
+
+            # --- 3. Playwright / Chromium (full browser, last resort) ---
+            try:
                 logger.info(f"[webcrawler] Using Chromium+Trafilatura for: {url}")
-                result = await self._crawl_with_chromium(url)
-                return result, None
+                return await self._crawl_with_chromium(url), None
+            except NotImplementedError:
+                errors.append(
+                    "Chromium: event loop does not support subprocesses "
+                    "(common on Windows with uvicorn --reload)"
+                )
+                logger.warning(
+                    f"[webcrawler] Chromium unavailable for {url}: "
+                    "current event loop does not support subprocesses"
+                )
+            except Exception as exc:
+                errors.append(f"Chromium: {exc!s}")
+                logger.warning(f"[webcrawler] Chromium failed for {url}: {exc!s}")
+
+            return None, f"All crawl methods failed for {url}. {'; '.join(errors)}"
 
         except Exception as e:
            return None, f"Error crawling URL {url}: {e!s}"
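The `except NotImplementedError` branch corresponds to `asyncio`'s behavior when the running loop lacks subprocess support. A small probe for reproducing the condition; the function name is illustrative:

```python
import asyncio
import sys


async def loop_supports_subprocesses() -> bool:
    """Probe whether the running event loop can spawn subprocesses."""
    # Windows SelectorEventLoop raises NotImplementedError here;
    # the Windows default ProactorEventLoop (and Unix loops) succeed.
    try:
        proc = await asyncio.create_subprocess_exec(
            sys.executable,
            "-c",
            "pass",
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.DEVNULL,
        )
        await proc.wait()
        return True
    except NotImplementedError:
        return False
```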
@@ -149,11 +179,80 @@ class WebCrawlerConnector:
                 "crawler_type": "firecrawl",
             }
 
+    async def _crawl_with_http(self, url: str) -> dict[str, Any] | None:
+        """
+        Crawl URL using a plain HTTP request + Trafilatura content extraction.
+
+        This method avoids launching a browser subprocess, making it safe to
+        call from any asyncio event loop (including Windows SelectorEventLoop
+        which does not support ``create_subprocess_exec``).
+
+        Returns ``None`` when Trafilatura cannot extract meaningful content
+        (e.g. JS-rendered SPAs) so the caller can fall through to Chromium.
+        """
+        ua = UserAgent()
+        user_agent = ua.random
+        proxy_url = get_residential_proxy_url()
+
+        async with httpx.AsyncClient(
+            timeout=20.0,
+            follow_redirects=True,
+            proxy=proxy_url,
+            headers={
+                "User-Agent": user_agent,
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.9",
+                "Accept-Encoding": "gzip, deflate, br",
+            },
+        ) as client:
+            response = await client.get(url)
+            response.raise_for_status()
+            raw_html = response.text
+
+        if not raw_html or len(raw_html.strip()) == 0:
+            return None
+
+        extracted_content = trafilatura.extract(
+            raw_html,
+            output_format="markdown",
+            include_comments=False,
+            include_tables=True,
+            include_images=True,
+            include_links=True,
+        )
+
+        if not extracted_content or len(extracted_content.strip()) == 0:
+            return None
+
+        trafilatura_metadata = trafilatura.extract_metadata(raw_html)
+
+        metadata: dict[str, str] = {"source": url}
+        if trafilatura_metadata:
+            if trafilatura_metadata.title:
+                metadata["title"] = trafilatura_metadata.title
+            if trafilatura_metadata.description:
+                metadata["description"] = trafilatura_metadata.description
+            if trafilatura_metadata.author:
+                metadata["author"] = trafilatura_metadata.author
+            if trafilatura_metadata.date:
+                metadata["date"] = trafilatura_metadata.date
+        metadata.setdefault("title", url)
+
+        return {
+            "content": extracted_content,
+            "metadata": metadata,
+            "crawler_type": "http",
+        }
+
     async def _crawl_with_chromium(self, url: str) -> dict[str, Any]:
         """
         Crawl URL using Playwright with Trafilatura for content extraction.
         Falls back to raw HTML if Trafilatura extraction fails.
 
+        Runs the sync Playwright API in a thread so it works on any event
+        loop, including Windows ``SelectorEventLoop`` which cannot spawn
+        subprocesses.
+
        Args:
            url: URL to crawl
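Note that `_crawl_with_http` returns `None` instead of raising when extraction comes up empty, so `crawl_url` can fall through to Chromium. The common trigger is a JS-rendered SPA shell: the GET succeeds but there is no article text for Trafilatura to extract. A minimal illustration with a fabricated HTML string:

```python
import trafilatura

# A typical SPA shell: valid HTML, but no extractable article text.
spa_shell = """
<html>
  <head><title>My App</title></head>
  <body><div id="root"></div><script src="/bundle.js"></script></body>
</html>
"""

# Trafilatura should return None here, which crawl_url records as
# "HTTP+Trafilatura: empty extraction" before trying Chromium.
print(trafilatura.extract(spa_shell, output_format="markdown"))
```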
@@ -163,51 +262,48 @@
         Raises:
             Exception: If crawling fails
         """
-        # Generate a realistic User-Agent to avoid bot detection
+        return await asyncio.to_thread(self._crawl_with_chromium_sync, url)
+
+    def _crawl_with_chromium_sync(self, url: str) -> dict[str, Any]:
+        """Synchronous Playwright crawl executed in a worker thread."""
         ua = UserAgent()
         user_agent = ua.random
 
-        # Use residential proxy if configured
         playwright_proxy = get_playwright_proxy()
 
-        # Use Playwright to fetch the page
-        async with async_playwright() as p:
+        with sync_playwright() as p:
             launch_kwargs: dict = {"headless": True}
             if playwright_proxy:
                 launch_kwargs["proxy"] = playwright_proxy
-            browser = await p.chromium.launch(**launch_kwargs)
-            context = await browser.new_context(user_agent=user_agent)
-            page = await context.new_page()
+            browser = p.chromium.launch(**launch_kwargs)
+            context = browser.new_context(user_agent=user_agent)
+            page = context.new_page()
 
             try:
-                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
-                raw_html = await page.content()
-                page_title = await page.title()
+                page.goto(url, wait_until="domcontentloaded", timeout=30000)
+                raw_html = page.content()
+                page_title = page.title()
             finally:
-                await browser.close()
+                browser.close()
 
         if not raw_html:
             raise ValueError(f"Failed to load content from {url}")
 
-        # Extract basic metadata from the page
         base_metadata = {"title": page_title} if page_title else {}
 
-        # Try to extract main content using Trafilatura
         extracted_content = None
         trafilatura_metadata = None
 
         try:
-            # Extract main content as markdown
             extracted_content = trafilatura.extract(
                 raw_html,
-                output_format="markdown",  # Get clean markdown
-                include_comments=False,  # Exclude comments
-                include_tables=True,  # Keep tables
-                include_images=True,  # Keep image references
-                include_links=True,  # Keep links
+                output_format="markdown",
+                include_comments=False,
+                include_tables=True,
+                include_images=True,
+                include_links=True,
             )
 
-            # Extract metadata using Trafilatura
             trafilatura_metadata = trafilatura.extract_metadata(raw_html)
 
             if not extracted_content or len(extracted_content.strip()) == 0:
@@ -216,7 +312,6 @@
         except Exception:
             extracted_content = None
 
-        # Build metadata, preferring Trafilatura metadata when available
         metadata = {
             "source": url,
             "title": (
@@ -226,7 +321,6 @@
             ),
         }
 
-        # Add additional metadata from Trafilatura if available
         if trafilatura_metadata:
             if trafilatura_metadata.description:
                 metadata["description"] = trafilatura_metadata.description
@@ -235,7 +329,6 @@
             if trafilatura_metadata.date:
                 metadata["date"] = trafilatura_metadata.date
 
-        # Add any remaining base metadata
         metadata.update(base_metadata)
 
         return {
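For reference, a sketch of the caller-side contract after these changes: `crawl_url` returns a `(result, error)` tuple with exactly one side set, and `crawler_type` now distinguishes all three paths. The constructor call and the omitted `formats` argument are assumptions, since neither appears in this diff:

```python
import asyncio

from app.connectors.webcrawler_connector import WebCrawlerConnector


async def main() -> None:
    # Hypothetical: constructor arguments are not shown in this diff.
    connector = WebCrawlerConnector()

    # formats omitted; assumed optional (it is only passed to Firecrawl).
    result, error = await connector.crawl_url("https://example.com")
    if error:
        # Aggregated failure message, e.g.
        # "All crawl methods failed for <url>. Firecrawl: ...; HTTP+Trafilatura: ...; Chromium: ..."
        print(error)
        return

    # crawler_type is one of "firecrawl", "http", or "chromium".
    print(result["crawler_type"], result["metadata"].get("title"))
    print(result["content"][:200])


asyncio.run(main())
```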