feat: add new FastAPI debug configurations and enhance web crawling capabilities with real-time web query support

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-02-20 17:28:20 -08:00
parent 81dfc7102f
commit ed497909fa
5 changed files with 256 additions and 104 deletions

38
.vscode/launch.json vendored
View file

@ -24,6 +24,16 @@
"cwd": "${workspaceFolder}/surfsense_backend", "cwd": "${workspaceFolder}/surfsense_backend",
"python": "${command:python.interpreterPath}" "python": "${command:python.interpreterPath}"
}, },
{
"name": "Backend: FastAPI (No Reload)",
"type": "debugpy",
"request": "launch",
"program": "${workspaceFolder}/surfsense_backend/main.py",
"console": "integratedTerminal",
"justMyCode": false,
"cwd": "${workspaceFolder}/surfsense_backend",
"python": "${command:python.interpreterPath}"
},
{ {
"name": "Backend: FastAPI (main.py)", "name": "Backend: FastAPI (main.py)",
"type": "debugpy", "type": "debugpy",
@ -124,6 +134,34 @@
"group": "Full Stack", "group": "Full Stack",
"order": 2 "order": 2
} }
},
{
"name": "Full Stack: Backend (No Reload) + Frontend + Celery",
"configurations": [
"Backend: FastAPI (No Reload)",
"Frontend: Next.js",
"Celery: Worker",
"Celery: Beat Scheduler"
],
"stopAll": true,
"presentation": {
"hidden": false,
"group": "Full Stack",
"order": 3
}
},
{
"name": "Full Stack: Backend (No Reload) + Frontend",
"configurations": [
"Backend: FastAPI (No Reload)",
"Frontend: Next.js"
],
"stopAll": true,
"presentation": {
"hidden": false,
"group": "Full Stack",
"order": 4
}
} }
] ]
} }

View file

@ -74,6 +74,14 @@ You have access to the following tools:
- IMPORTANT: When searching for information (meetings, schedules, notes, tasks, etc.), ALWAYS search broadly - IMPORTANT: When searching for information (meetings, schedules, notes, tasks, etc.), ALWAYS search broadly
across ALL sources first by omitting connectors_to_search. The user may store information in various places across ALL sources first by omitting connectors_to_search. The user may store information in various places
including calendar apps, note-taking apps (Obsidian, Notion), chat apps (Slack, Discord), and more. including calendar apps, note-taking apps (Obsidian, Notion), chat apps (Slack, Discord), and more.
- IMPORTANT (REAL-TIME / PUBLIC WEB QUERIES): For questions that require current public web data
(e.g., live exchange rates, stock prices, breaking news, weather, current events), you MUST call
`search_knowledge_base` using live web connectors via `connectors_to_search`:
["LINKUP_API", "TAVILY_API", "SEARXNG_API", "BAIDU_SEARCH_API"].
- For these real-time/public web queries, DO NOT answer from memory and DO NOT say you lack internet
access before attempting a live connector search.
- If the live connectors return no relevant results, explain that live web sources did not return enough
data and ask the user if they want you to retry with a refined query.
- Only narrow to specific connectors if the user explicitly asks (e.g., "check my Slack" or "in my calendar"). - Only narrow to specific connectors if the user explicitly asks (e.g., "check my Slack" or "in my calendar").
- Personal notes in Obsidian, Notion, or NOTE often contain schedules, meeting times, reminders, and other - Personal notes in Obsidian, Notion, or NOTE often contain schedules, meeting times, reminders, and other
important information that may not be in calendars. important information that may not be in calendars.
@ -358,6 +366,14 @@ _TOOLS_INSTRUCTIONS_EXAMPLES_COMMON = """
- User: "What's in my Obsidian vault about project ideas?" - User: "What's in my Obsidian vault about project ideas?"
- Call: `search_knowledge_base(query="project ideas", connectors_to_search=["OBSIDIAN_CONNECTOR"])` - Call: `search_knowledge_base(query="project ideas", connectors_to_search=["OBSIDIAN_CONNECTOR"])`
- User: "search me current usd to inr rate"
- Call: `search_knowledge_base(query="current USD to INR exchange rate", connectors_to_search=["LINKUP_API", "TAVILY_API", "SEARXNG_API", "BAIDU_SEARCH_API"])`
- Then answer using the returned live web results with citations.
- User: "cant you search using linkup?"
- Call: `search_knowledge_base(query="<refined user request>", connectors_to_search=["LINKUP_API"])`
- Then answer from retrieved results (or clearly state that Linkup returned no data).
- User: "Give me a podcast about AI trends based on what we discussed" - User: "Give me a podcast about AI trends based on what we discussed"
- First search for relevant content, then call: `generate_podcast(source_content="Based on our conversation and search results: [detailed summary of chat + search findings]", podcast_title="AI Trends Podcast")` - First search for relevant content, then call: `generate_podcast(source_content="Based on our conversation and search results: [detailed summary of chat + search findings]", podcast_title="AI Trends Podcast")`

View file

@ -593,6 +593,9 @@ IMPORTANT:
- If the user requests a specific source type (e.g. "my notes", "Slack messages"), pass `connectors_to_search=[...]` using the enums below. - If the user requests a specific source type (e.g. "my notes", "Slack messages"), pass `connectors_to_search=[...]` using the enums below.
- If `connectors_to_search` is omitted/empty, the system will search broadly. - If `connectors_to_search` is omitted/empty, the system will search broadly.
- Only connectors that are enabled/configured for this search space are available.{doc_types_info} - Only connectors that are enabled/configured for this search space are available.{doc_types_info}
- For real-time/public web queries (e.g., current exchange rates, stock prices, breaking news, weather),
explicitly include live web connectors in `connectors_to_search`, prioritizing:
["LINKUP_API", "TAVILY_API", "SEARXNG_API", "BAIDU_SEARCH_API"].
## Available connector enums for `connectors_to_search` ## Available connector enums for `connectors_to_search`

View file

@ -5,6 +5,7 @@ This module provides a tool for fetching URL metadata (title, description,
Open Graph image, etc.) to display rich link previews in the chat UI. Open Graph image, etc.) to display rich link previews in the chat UI.
""" """
import asyncio
import hashlib import hashlib
import logging import logging
import re import re
@ -15,7 +16,7 @@ import httpx
import trafilatura import trafilatura
from fake_useragent import UserAgent from fake_useragent import UserAgent
from langchain_core.tools import tool from langchain_core.tools import tool
from playwright.async_api import async_playwright from playwright.sync_api import sync_playwright
from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url
@ -175,6 +176,9 @@ async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
Fetch page content using headless Chromium browser via Playwright. Fetch page content using headless Chromium browser via Playwright.
Used as a fallback when simple HTTP requests are blocked (403, etc.). Used as a fallback when simple HTTP requests are blocked (403, etc.).
Runs the sync Playwright API in a thread so it works on any event
loop, including Windows ``SelectorEventLoop``.
Args: Args:
url: URL to fetch url: URL to fetch
@ -182,65 +186,63 @@ async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
Dict with title, description, image, and raw_html, or None if failed Dict with title, description, image, and raw_html, or None if failed
""" """
try: try:
logger.info(f"[link_preview] Falling back to Chromium for {url}") return await asyncio.to_thread(_fetch_with_chromium_sync, url)
# Generate a realistic User-Agent to avoid bot detection
ua = UserAgent()
user_agent = ua.random
# Use residential proxy if configured
playwright_proxy = get_playwright_proxy()
# Use Playwright to fetch the page
async with async_playwright() as p:
launch_kwargs: dict = {"headless": True}
if playwright_proxy:
launch_kwargs["proxy"] = playwright_proxy
browser = await p.chromium.launch(**launch_kwargs)
context = await browser.new_context(user_agent=user_agent)
page = await context.new_page()
try:
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
raw_html = await page.content()
finally:
await browser.close()
if not raw_html or len(raw_html.strip()) == 0:
logger.warning(f"[link_preview] Chromium returned empty content for {url}")
return None
# Extract metadata using Trafilatura
trafilatura_metadata = trafilatura.extract_metadata(raw_html)
# Extract OG image from raw HTML (trafilatura doesn't extract this)
image = extract_image(raw_html)
result = {
"title": None,
"description": None,
"image": image,
"raw_html": raw_html,
}
if trafilatura_metadata:
result["title"] = trafilatura_metadata.title
result["description"] = trafilatura_metadata.description
# If trafilatura didn't get the title/description, try OG tags
if not result["title"]:
result["title"] = extract_title(raw_html)
if not result["description"]:
result["description"] = extract_description(raw_html)
logger.info(f"[link_preview] Successfully fetched {url} via Chromium")
return result
except Exception as e: except Exception as e:
logger.error(f"[link_preview] Chromium fallback failed for {url}: {e}") logger.error(f"[link_preview] Chromium fallback failed for {url}: {e}")
return None return None
def _fetch_with_chromium_sync(url: str) -> dict[str, Any] | None:
    """
    Blocking Playwright fetch of a page's link-preview metadata.

    Intended to be run in a worker thread (the async wrapper dispatches it
    via ``asyncio.to_thread``), which is why it uses the sync Playwright API.

    Args:
        url: URL to fetch

    Returns:
        Dict with title, description, image, and raw_html, or None when the
        browser returned empty/whitespace-only content. Exceptions raised by
        Playwright are not caught here; they propagate to the caller.
    """
    logger.info(f"[link_preview] Falling back to Chromium for {url}")

    # Randomised User-Agent plus the optional proxy settings for the launch.
    random_user_agent = UserAgent().random
    proxy_settings = get_playwright_proxy()

    chromium_options: dict = {"headless": True}
    if proxy_settings:
        chromium_options["proxy"] = proxy_settings

    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(**chromium_options)
        tab = browser.new_context(user_agent=random_user_agent).new_page()
        try:
            tab.goto(url, wait_until="domcontentloaded", timeout=30000)
            html = tab.content()
        finally:
            # Close the browser whether or not navigation succeeded.
            browser.close()

    if not (html and html.strip()):
        logger.warning(f"[link_preview] Chromium returned empty content for {url}")
        return None

    page_meta = trafilatura.extract_metadata(html)
    og_image = extract_image(html)

    meta_title = page_meta.title if page_meta else None
    meta_description = page_meta.description if page_meta else None

    # Prefer Trafilatura's values; fall back to the tag-based extractors
    # only for fields Trafilatura left empty.
    preview: dict[str, Any] = {
        "title": meta_title or extract_title(html),
        "description": meta_description or extract_description(html),
        "image": og_image,
        "raw_html": html,
    }

    logger.info(f"[link_preview] Successfully fetched {url} via Chromium")
    return preview
def create_link_preview_tool(): def create_link_preview_tool():
""" """
Factory function to create the link_preview tool. Factory function to create the link_preview tool.

View file

@ -1,20 +1,28 @@
""" """
WebCrawler Connector Module WebCrawler Connector Module
A module for crawling web pages and extracting content using Firecrawl or Playwright. A module for crawling web pages and extracting content using Firecrawl,
Provides a unified interface for web scraping. plain HTTP+Trafilatura, or Playwright. Provides a unified interface for
web scraping.
Fallback order:
1. Firecrawl (if API key is configured)
2. HTTP + Trafilatura (lightweight, works on any event loop)
3. Playwright / Chromium (runs in a thread to avoid event-loop limitations)
""" """
import asyncio
import logging import logging
from typing import Any from typing import Any
import httpx
import trafilatura import trafilatura
import validators import validators
from fake_useragent import UserAgent from fake_useragent import UserAgent
from firecrawl import AsyncFirecrawlApp from firecrawl import AsyncFirecrawlApp
from playwright.async_api import async_playwright from playwright.sync_api import sync_playwright
from app.utils.proxy_config import get_playwright_proxy from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -50,8 +58,10 @@ class WebCrawlerConnector:
""" """
Crawl a single URL and extract its content. Crawl a single URL and extract its content.
If Firecrawl API key is provided, tries Firecrawl first and falls back to Chromium Fallback order:
if Firecrawl fails. If no Firecrawl API key is provided, uses Chromium directly. 1. Firecrawl (if API key configured)
2. Plain HTTP + Trafilatura (lightweight, no subprocess)
3. Playwright / Chromium (full browser, run in a worker thread; last resort)
Args: Args:
url: URL to crawl url: URL to crawl
@ -63,37 +73,57 @@ class WebCrawlerConnector:
- content: Extracted content (markdown or HTML) - content: Extracted content (markdown or HTML)
- metadata: Page metadata (title, description, etc.) - metadata: Page metadata (title, description, etc.)
- source: Original URL - source: Original URL
- crawler_type: Type of crawler used ("firecrawl" or "chromium") - crawler_type: Type of crawler used
# Validate URL
""" """
try: try:
# Validate URL
if not validators.url(url): if not validators.url(url):
return None, f"Invalid URL: {url}" return None, f"Invalid URL: {url}"
# Try Firecrawl first if API key is provided errors: list[str] = []
# --- 1. Firecrawl (premium, if configured) ---
if self.use_firecrawl: if self.use_firecrawl:
try: try:
logger.info(f"[webcrawler] Using Firecrawl for: {url}") logger.info(f"[webcrawler] Using Firecrawl for: {url}")
result = await self._crawl_with_firecrawl(url, formats) return await self._crawl_with_firecrawl(url, formats), None
return result, None except Exception as exc:
except Exception as firecrawl_error: errors.append(f"Firecrawl: {exc!s}")
# Firecrawl failed, fallback to Chromium
logger.warning( logger.warning(
f"[webcrawler] Firecrawl failed, falling back to Chromium+Trafilatura for: {url}" f"[webcrawler] Firecrawl failed for {url}: {exc!s}"
) )
try:
result = await self._crawl_with_chromium(url) # --- 2. HTTP + Trafilatura (no subprocess required) ---
return result, None try:
except Exception as chromium_error: logger.info(f"[webcrawler] Using HTTP+Trafilatura for: {url}")
return ( result = await self._crawl_with_http(url)
None, if result:
f"Both Firecrawl and Chromium failed. Firecrawl error: {firecrawl_error!s}, Chromium error: {chromium_error!s}", return result, None
) errors.append("HTTP+Trafilatura: empty extraction")
else: except Exception as exc:
# No Firecrawl API key, use Chromium directly errors.append(f"HTTP+Trafilatura: {exc!s}")
logger.warning(
f"[webcrawler] HTTP+Trafilatura failed for {url}: {exc!s}"
)
# --- 3. Playwright / Chromium (full browser, last resort) ---
try:
logger.info(f"[webcrawler] Using Chromium+Trafilatura for: {url}") logger.info(f"[webcrawler] Using Chromium+Trafilatura for: {url}")
result = await self._crawl_with_chromium(url) return await self._crawl_with_chromium(url), None
return result, None except NotImplementedError:
errors.append(
"Chromium: event loop does not support subprocesses "
"(common on Windows with uvicorn --reload)"
)
logger.warning(
f"[webcrawler] Chromium unavailable for {url}: "
"current event loop does not support subprocesses"
)
except Exception as exc:
errors.append(f"Chromium: {exc!s}")
logger.warning(f"[webcrawler] Chromium failed for {url}: {exc!s}")
return None, f"All crawl methods failed for {url}. {'; '.join(errors)}"
except Exception as e: except Exception as e:
return None, f"Error crawling URL {url}: {e!s}" return None, f"Error crawling URL {url}: {e!s}"
@ -149,11 +179,80 @@ class WebCrawlerConnector:
"crawler_type": "firecrawl", "crawler_type": "firecrawl",
} }
async def _crawl_with_http(self, url: str) -> dict[str, Any] | None:
    """
    Crawl URL using a plain HTTP request + Trafilatura content extraction.

    This method avoids launching a browser subprocess, making it safe to
    call from any asyncio event loop (including Windows SelectorEventLoop
    which does not support ``create_subprocess_exec``).

    Args:
        url: URL to crawl

    Returns:
        Dict with ``content`` (markdown), ``metadata`` and
        ``crawler_type`` ("http"), or ``None`` when the response body is
        empty or Trafilatura cannot extract meaningful content (e.g.
        JS-rendered SPAs) so the caller can fall through to Chromium.

    Raises:
        httpx.HTTPStatusError: On a 4xx/5xx response (``raise_for_status``).
        Other exceptions raised by the HTTP client (presumably
        timeouts/connection errors) propagate unchanged to the caller.
    """
    user_agent = UserAgent().random
    proxy_url = get_residential_proxy_url()

    async with httpx.AsyncClient(
        timeout=20.0,
        follow_redirects=True,
        proxy=proxy_url,
        headers={
            "User-Agent": user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            # Accept-Encoding is deliberately NOT set here: httpx advertises
            # only the encodings it can actually decode. Hard-coding "br"
            # invites brotli responses that cannot be decoded when no
            # brotli package is installed.
        },
    ) as client:
        response = await client.get(url)
        response.raise_for_status()
        raw_html = response.text

    if not raw_html or not raw_html.strip():
        return None

    extracted_content = trafilatura.extract(
        raw_html,
        output_format="markdown",
        include_comments=False,
        include_tables=True,
        include_images=True,
        include_links=True,
    )

    if not extracted_content or not extracted_content.strip():
        return None

    trafilatura_metadata = trafilatura.extract_metadata(raw_html)

    metadata: dict[str, str] = {"source": url}
    if trafilatura_metadata:
        # Copy over only the fields Trafilatura actually populated.
        for field in ("title", "description", "author", "date"):
            value = getattr(trafilatura_metadata, field)
            if value:
                metadata[field] = value

    # Fall back to the URL itself when no title could be extracted.
    metadata.setdefault("title", url)

    return {
        "content": extracted_content,
        "metadata": metadata,
        "crawler_type": "http",
    }
async def _crawl_with_chromium(self, url: str) -> dict[str, Any]: async def _crawl_with_chromium(self, url: str) -> dict[str, Any]:
""" """
Crawl URL using Playwright with Trafilatura for content extraction. Crawl URL using Playwright with Trafilatura for content extraction.
Falls back to raw HTML if Trafilatura extraction fails. Falls back to raw HTML if Trafilatura extraction fails.
Runs the sync Playwright API in a thread so it works on any event
loop, including Windows ``SelectorEventLoop`` which cannot spawn
subprocesses.
Args: Args:
url: URL to crawl url: URL to crawl
@ -163,51 +262,48 @@ class WebCrawlerConnector:
Raises: Raises:
Exception: If crawling fails Exception: If crawling fails
""" """
# Generate a realistic User-Agent to avoid bot detection return await asyncio.to_thread(self._crawl_with_chromium_sync, url)
def _crawl_with_chromium_sync(self, url: str) -> dict[str, Any]:
"""Synchronous Playwright crawl executed in a worker thread."""
ua = UserAgent() ua = UserAgent()
user_agent = ua.random user_agent = ua.random
# Use residential proxy if configured
playwright_proxy = get_playwright_proxy() playwright_proxy = get_playwright_proxy()
# Use Playwright to fetch the page with sync_playwright() as p:
async with async_playwright() as p:
launch_kwargs: dict = {"headless": True} launch_kwargs: dict = {"headless": True}
if playwright_proxy: if playwright_proxy:
launch_kwargs["proxy"] = playwright_proxy launch_kwargs["proxy"] = playwright_proxy
browser = await p.chromium.launch(**launch_kwargs) browser = p.chromium.launch(**launch_kwargs)
context = await browser.new_context(user_agent=user_agent) context = browser.new_context(user_agent=user_agent)
page = await context.new_page() page = context.new_page()
try: try:
await page.goto(url, wait_until="domcontentloaded", timeout=30000) page.goto(url, wait_until="domcontentloaded", timeout=30000)
raw_html = await page.content() raw_html = page.content()
page_title = await page.title() page_title = page.title()
finally: finally:
await browser.close() browser.close()
if not raw_html: if not raw_html:
raise ValueError(f"Failed to load content from {url}") raise ValueError(f"Failed to load content from {url}")
# Extract basic metadata from the page
base_metadata = {"title": page_title} if page_title else {} base_metadata = {"title": page_title} if page_title else {}
# Try to extract main content using Trafilatura
extracted_content = None extracted_content = None
trafilatura_metadata = None trafilatura_metadata = None
try: try:
# Extract main content as markdown
extracted_content = trafilatura.extract( extracted_content = trafilatura.extract(
raw_html, raw_html,
output_format="markdown", # Get clean markdown output_format="markdown",
include_comments=False, # Exclude comments include_comments=False,
include_tables=True, # Keep tables include_tables=True,
include_images=True, # Keep image references include_images=True,
include_links=True, # Keep links include_links=True,
) )
# Extract metadata using Trafilatura
trafilatura_metadata = trafilatura.extract_metadata(raw_html) trafilatura_metadata = trafilatura.extract_metadata(raw_html)
if not extracted_content or len(extracted_content.strip()) == 0: if not extracted_content or len(extracted_content.strip()) == 0:
@ -216,7 +312,6 @@ class WebCrawlerConnector:
except Exception: except Exception:
extracted_content = None extracted_content = None
# Build metadata, preferring Trafilatura metadata when available
metadata = { metadata = {
"source": url, "source": url,
"title": ( "title": (
@ -226,7 +321,6 @@ class WebCrawlerConnector:
), ),
} }
# Add additional metadata from Trafilatura if available
if trafilatura_metadata: if trafilatura_metadata:
if trafilatura_metadata.description: if trafilatura_metadata.description:
metadata["description"] = trafilatura_metadata.description metadata["description"] = trafilatura_metadata.description
@ -235,7 +329,6 @@ class WebCrawlerConnector:
if trafilatura_metadata.date: if trafilatura_metadata.date:
metadata["date"] = trafilatura_metadata.date metadata["date"] = trafilatura_metadata.date
# Add any remaining base metadata
metadata.update(base_metadata) metadata.update(base_metadata)
return { return {