mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-08 20:25:19 +02:00
feat: add new FastAPI debug configurations and enhance web crawling capabilities with real-time web query support
This commit is contained in:
parent
81dfc7102f
commit
ed497909fa
5 changed files with 256 additions and 104 deletions
38
.vscode/launch.json
vendored
38
.vscode/launch.json
vendored
|
|
@ -24,6 +24,16 @@
|
|||
"cwd": "${workspaceFolder}/surfsense_backend",
|
||||
"python": "${command:python.interpreterPath}"
|
||||
},
|
||||
{
|
||||
"name": "Backend: FastAPI (No Reload)",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "${workspaceFolder}/surfsense_backend/main.py",
|
||||
"console": "integratedTerminal",
|
||||
"justMyCode": false,
|
||||
"cwd": "${workspaceFolder}/surfsense_backend",
|
||||
"python": "${command:python.interpreterPath}"
|
||||
},
|
||||
{
|
||||
"name": "Backend: FastAPI (main.py)",
|
||||
"type": "debugpy",
|
||||
|
|
@ -124,6 +134,34 @@
|
|||
"group": "Full Stack",
|
||||
"order": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Full Stack: Backend (No Reload) + Frontend + Celery",
|
||||
"configurations": [
|
||||
"Backend: FastAPI (No Reload)",
|
||||
"Frontend: Next.js",
|
||||
"Celery: Worker",
|
||||
"Celery: Beat Scheduler"
|
||||
],
|
||||
"stopAll": true,
|
||||
"presentation": {
|
||||
"hidden": false,
|
||||
"group": "Full Stack",
|
||||
"order": 3
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Full Stack: Backend (No Reload) + Frontend",
|
||||
"configurations": [
|
||||
"Backend: FastAPI (No Reload)",
|
||||
"Frontend: Next.js"
|
||||
],
|
||||
"stopAll": true,
|
||||
"presentation": {
|
||||
"hidden": false,
|
||||
"group": "Full Stack",
|
||||
"order": 4
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
|
|||
|
|
@ -74,6 +74,14 @@ You have access to the following tools:
|
|||
- IMPORTANT: When searching for information (meetings, schedules, notes, tasks, etc.), ALWAYS search broadly
|
||||
across ALL sources first by omitting connectors_to_search. The user may store information in various places
|
||||
including calendar apps, note-taking apps (Obsidian, Notion), chat apps (Slack, Discord), and more.
|
||||
- IMPORTANT (REAL-TIME / PUBLIC WEB QUERIES): For questions that require current public web data
|
||||
(e.g., live exchange rates, stock prices, breaking news, weather, current events), you MUST call
|
||||
`search_knowledge_base` using live web connectors via `connectors_to_search`:
|
||||
["LINKUP_API", "TAVILY_API", "SEARXNG_API", "BAIDU_SEARCH_API"].
|
||||
- For these real-time/public web queries, DO NOT answer from memory and DO NOT say you lack internet
|
||||
access before attempting a live connector search.
|
||||
- If the live connectors return no relevant results, explain that live web sources did not return enough
|
||||
data and ask the user if they want you to retry with a refined query.
|
||||
- Only narrow to specific connectors if the user explicitly asks (e.g., "check my Slack" or "in my calendar").
|
||||
- Personal notes in Obsidian, Notion, or NOTE often contain schedules, meeting times, reminders, and other
|
||||
important information that may not be in calendars.
|
||||
|
|
@ -358,6 +366,14 @@ _TOOLS_INSTRUCTIONS_EXAMPLES_COMMON = """
|
|||
- User: "What's in my Obsidian vault about project ideas?"
|
||||
- Call: `search_knowledge_base(query="project ideas", connectors_to_search=["OBSIDIAN_CONNECTOR"])`
|
||||
|
||||
- User: "search me current usd to inr rate"
|
||||
- Call: `search_knowledge_base(query="current USD to INR exchange rate", connectors_to_search=["LINKUP_API", "TAVILY_API", "SEARXNG_API", "BAIDU_SEARCH_API"])`
|
||||
- Then answer using the returned live web results with citations.
|
||||
|
||||
- User: "cant you search using linkup?"
|
||||
- Call: `search_knowledge_base(query="<refined user request>", connectors_to_search=["LINKUP_API"])`
|
||||
- Then answer from retrieved results (or clearly state that Linkup returned no data).
|
||||
|
||||
- User: "Give me a podcast about AI trends based on what we discussed"
|
||||
- First search for relevant content, then call: `generate_podcast(source_content="Based on our conversation and search results: [detailed summary of chat + search findings]", podcast_title="AI Trends Podcast")`
|
||||
|
||||
|
|
|
|||
|
|
@ -593,6 +593,9 @@ IMPORTANT:
|
|||
- If the user requests a specific source type (e.g. "my notes", "Slack messages"), pass `connectors_to_search=[...]` using the enums below.
|
||||
- If `connectors_to_search` is omitted/empty, the system will search broadly.
|
||||
- Only connectors that are enabled/configured for this search space are available.{doc_types_info}
|
||||
- For real-time/public web queries (e.g., current exchange rates, stock prices, breaking news, weather),
|
||||
explicitly include live web connectors in `connectors_to_search`, prioritizing:
|
||||
["LINKUP_API", "TAVILY_API", "SEARXNG_API", "BAIDU_SEARCH_API"].
|
||||
|
||||
## Available connector enums for `connectors_to_search`
|
||||
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ This module provides a tool for fetching URL metadata (title, description,
|
|||
Open Graph image, etc.) to display rich link previews in the chat UI.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import logging
|
||||
import re
|
||||
|
|
@ -15,7 +16,7 @@ import httpx
|
|||
import trafilatura
|
||||
from fake_useragent import UserAgent
|
||||
from langchain_core.tools import tool
|
||||
from playwright.async_api import async_playwright
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url
|
||||
|
||||
|
|
@ -175,6 +176,9 @@ async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
|
|||
Fetch page content using headless Chromium browser via Playwright.
|
||||
Used as a fallback when simple HTTP requests are blocked (403, etc.).
|
||||
|
||||
Runs the sync Playwright API in a thread so it works on any event
|
||||
loop, including Windows ``SelectorEventLoop``.
|
||||
|
||||
Args:
|
||||
url: URL to fetch
|
||||
|
||||
|
|
@ -182,65 +186,63 @@ async def fetch_with_chromium(url: str) -> dict[str, Any] | None:
|
|||
Dict with title, description, image, and raw_html, or None if failed
|
||||
"""
|
||||
try:
|
||||
logger.info(f"[link_preview] Falling back to Chromium for {url}")
|
||||
|
||||
# Generate a realistic User-Agent to avoid bot detection
|
||||
ua = UserAgent()
|
||||
user_agent = ua.random
|
||||
|
||||
# Use residential proxy if configured
|
||||
playwright_proxy = get_playwright_proxy()
|
||||
|
||||
# Use Playwright to fetch the page
|
||||
async with async_playwright() as p:
|
||||
launch_kwargs: dict = {"headless": True}
|
||||
if playwright_proxy:
|
||||
launch_kwargs["proxy"] = playwright_proxy
|
||||
browser = await p.chromium.launch(**launch_kwargs)
|
||||
context = await browser.new_context(user_agent=user_agent)
|
||||
page = await context.new_page()
|
||||
|
||||
try:
|
||||
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
||||
raw_html = await page.content()
|
||||
finally:
|
||||
await browser.close()
|
||||
|
||||
if not raw_html or len(raw_html.strip()) == 0:
|
||||
logger.warning(f"[link_preview] Chromium returned empty content for {url}")
|
||||
return None
|
||||
|
||||
# Extract metadata using Trafilatura
|
||||
trafilatura_metadata = trafilatura.extract_metadata(raw_html)
|
||||
|
||||
# Extract OG image from raw HTML (trafilatura doesn't extract this)
|
||||
image = extract_image(raw_html)
|
||||
|
||||
result = {
|
||||
"title": None,
|
||||
"description": None,
|
||||
"image": image,
|
||||
"raw_html": raw_html,
|
||||
}
|
||||
|
||||
if trafilatura_metadata:
|
||||
result["title"] = trafilatura_metadata.title
|
||||
result["description"] = trafilatura_metadata.description
|
||||
|
||||
# If trafilatura didn't get the title/description, try OG tags
|
||||
if not result["title"]:
|
||||
result["title"] = extract_title(raw_html)
|
||||
if not result["description"]:
|
||||
result["description"] = extract_description(raw_html)
|
||||
|
||||
logger.info(f"[link_preview] Successfully fetched {url} via Chromium")
|
||||
return result
|
||||
|
||||
return await asyncio.to_thread(_fetch_with_chromium_sync, url)
|
||||
except Exception as e:
|
||||
logger.error(f"[link_preview] Chromium fallback failed for {url}: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def _fetch_with_chromium_sync(url: str) -> dict[str, Any] | None:
    """
    Load ``url`` in headless Chromium and collect link-preview metadata.

    Blocking counterpart of ``fetch_with_chromium``: it drives the sync
    Playwright API and is intended to be executed in a worker thread.

    Args:
        url: URL to fetch

    Returns:
        Dict with title, description, image, and raw_html, or None when the
        page content comes back empty. Exceptions raised while launching the
        browser or loading the page are not caught here.
    """
    logger.info(f"[link_preview] Falling back to Chromium for {url}")

    # A randomly chosen User-Agent string is used for the browser context.
    random_agent = UserAgent().random

    # Optional proxy settings; only passed to Chromium when present.
    proxy_settings = get_playwright_proxy()

    with sync_playwright() as playwright:
        chromium_options: dict = {"headless": True}
        if proxy_settings:
            chromium_options["proxy"] = proxy_settings

        browser = playwright.chromium.launch(**chromium_options)
        browser_context = browser.new_context(user_agent=random_agent)
        tab = browser_context.new_page()

        try:
            tab.goto(url, wait_until="domcontentloaded", timeout=30000)
            html = tab.content()
        finally:
            # Close the browser whether or not navigation succeeded.
            browser.close()

    if not html or not html.strip():
        logger.warning(f"[link_preview] Chromium returned empty content for {url}")
        return None

    page_meta = trafilatura.extract_metadata(html)
    og_image = extract_image(html)

    title = None
    description = None
    if page_meta:
        title = page_meta.title
        description = page_meta.description

    # Fall back to the module's own HTML extractors for anything still missing.
    if not title:
        title = extract_title(html)
    if not description:
        description = extract_description(html)

    logger.info(f"[link_preview] Successfully fetched {url} via Chromium")
    return {
        "title": title,
        "description": description,
        "image": og_image,
        "raw_html": html,
    }
|
||||
|
||||
|
||||
def create_link_preview_tool():
|
||||
"""
|
||||
Factory function to create the link_preview tool.
|
||||
|
|
|
|||
|
|
@ -1,20 +1,28 @@
|
|||
"""
|
||||
WebCrawler Connector Module
|
||||
|
||||
A module for crawling web pages and extracting content using Firecrawl or Playwright.
|
||||
Provides a unified interface for web scraping.
|
||||
A module for crawling web pages and extracting content using Firecrawl,
|
||||
plain HTTP+Trafilatura, or Playwright. Provides a unified interface for
|
||||
web scraping.
|
||||
|
||||
Fallback order:
|
||||
1. Firecrawl (if API key is configured)
|
||||
2. HTTP + Trafilatura (lightweight, works on any event loop)
|
||||
3. Playwright / Chromium (runs in a thread to avoid event-loop limitations)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
import trafilatura
|
||||
import validators
|
||||
from fake_useragent import UserAgent
|
||||
from firecrawl import AsyncFirecrawlApp
|
||||
from playwright.async_api import async_playwright
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
from app.utils.proxy_config import get_playwright_proxy
|
||||
from app.utils.proxy_config import get_playwright_proxy, get_residential_proxy_url
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -50,8 +58,10 @@ class WebCrawlerConnector:
|
|||
"""
|
||||
Crawl a single URL and extract its content.
|
||||
|
||||
If Firecrawl API key is provided, tries Firecrawl first and falls back to Chromium
|
||||
if Firecrawl fails. If no Firecrawl API key is provided, uses Chromium directly.
|
||||
Fallback order:
|
||||
1. Firecrawl (if API key configured)
|
||||
2. Plain HTTP + Trafilatura (lightweight, no subprocess)
|
||||
3. Playwright / Chromium (sync API run in a worker thread, so any event loop works)
|
||||
|
||||
Args:
|
||||
url: URL to crawl
|
||||
|
|
@ -63,37 +73,57 @@ class WebCrawlerConnector:
|
|||
- content: Extracted content (markdown or HTML)
|
||||
- metadata: Page metadata (title, description, etc.)
|
||||
- source: Original URL
|
||||
- crawler_type: Type of crawler used ("firecrawl" or "chromium")
|
||||
- crawler_type: Type of crawler used
|
||||
# Validate URL
|
||||
"""
|
||||
try:
|
||||
# Validate URL
|
||||
if not validators.url(url):
|
||||
return None, f"Invalid URL: {url}"
|
||||
|
||||
# Try Firecrawl first if API key is provided
|
||||
errors: list[str] = []
|
||||
|
||||
# --- 1. Firecrawl (premium, if configured) ---
|
||||
if self.use_firecrawl:
|
||||
try:
|
||||
logger.info(f"[webcrawler] Using Firecrawl for: {url}")
|
||||
result = await self._crawl_with_firecrawl(url, formats)
|
||||
return result, None
|
||||
except Exception as firecrawl_error:
|
||||
# Firecrawl failed, fallback to Chromium
|
||||
return await self._crawl_with_firecrawl(url, formats), None
|
||||
except Exception as exc:
|
||||
errors.append(f"Firecrawl: {exc!s}")
|
||||
logger.warning(
|
||||
f"[webcrawler] Firecrawl failed, falling back to Chromium+Trafilatura for: {url}"
|
||||
f"[webcrawler] Firecrawl failed for {url}: {exc!s}"
|
||||
)
|
||||
try:
|
||||
result = await self._crawl_with_chromium(url)
|
||||
return result, None
|
||||
except Exception as chromium_error:
|
||||
return (
|
||||
None,
|
||||
f"Both Firecrawl and Chromium failed. Firecrawl error: {firecrawl_error!s}, Chromium error: {chromium_error!s}",
|
||||
)
|
||||
else:
|
||||
# No Firecrawl API key, use Chromium directly
|
||||
|
||||
# --- 2. HTTP + Trafilatura (no subprocess required) ---
|
||||
try:
|
||||
logger.info(f"[webcrawler] Using HTTP+Trafilatura for: {url}")
|
||||
result = await self._crawl_with_http(url)
|
||||
if result:
|
||||
return result, None
|
||||
errors.append("HTTP+Trafilatura: empty extraction")
|
||||
except Exception as exc:
|
||||
errors.append(f"HTTP+Trafilatura: {exc!s}")
|
||||
logger.warning(
|
||||
f"[webcrawler] HTTP+Trafilatura failed for {url}: {exc!s}"
|
||||
)
|
||||
|
||||
# --- 3. Playwright / Chromium (full browser, last resort) ---
|
||||
try:
|
||||
logger.info(f"[webcrawler] Using Chromium+Trafilatura for: {url}")
|
||||
result = await self._crawl_with_chromium(url)
|
||||
return result, None
|
||||
return await self._crawl_with_chromium(url), None
|
||||
except NotImplementedError:
|
||||
errors.append(
|
||||
"Chromium: event loop does not support subprocesses "
|
||||
"(common on Windows with uvicorn --reload)"
|
||||
)
|
||||
logger.warning(
|
||||
f"[webcrawler] Chromium unavailable for {url}: "
|
||||
"current event loop does not support subprocesses"
|
||||
)
|
||||
except Exception as exc:
|
||||
errors.append(f"Chromium: {exc!s}")
|
||||
logger.warning(f"[webcrawler] Chromium failed for {url}: {exc!s}")
|
||||
|
||||
return None, f"All crawl methods failed for {url}. {'; '.join(errors)}"
|
||||
|
||||
except Exception as e:
|
||||
return None, f"Error crawling URL {url}: {e!s}"
|
||||
|
|
@ -149,11 +179,80 @@ class WebCrawlerConnector:
|
|||
"crawler_type": "firecrawl",
|
||||
}
|
||||
|
||||
async def _crawl_with_http(self, url: str) -> dict[str, Any] | None:
    """
    Crawl URL using a plain HTTP request + Trafilatura content extraction.

    This method avoids launching a browser subprocess, making it safe to
    call from any asyncio event loop (including Windows SelectorEventLoop
    which does not support ``create_subprocess_exec``).

    Args:
        url: URL to crawl

    Returns:
        Dict with ``content`` (markdown), ``metadata`` and ``crawler_type``
        (``"http"``), or ``None`` when the response body is empty or
        Trafilatura cannot extract meaningful content (e.g. JS-rendered
        SPAs) so the caller can fall through to Chromium.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
        httpx.HTTPError: For other request failures raised by the client.
    """
    ua = UserAgent()
    user_agent = ua.random
    proxy_url = get_residential_proxy_url()

    # NOTE: no explicit "Accept-Encoding" header. httpx advertises only the
    # encodings it can actually decode (gzip/deflate, plus br/zstd when the
    # optional decoders are installed). Hard-coding "br" without a Brotli
    # decoder makes servers send a body httpx cannot decode, which then
    # looks like an empty extraction.
    async with httpx.AsyncClient(
        timeout=20.0,
        follow_redirects=True,
        proxy=proxy_url,
        headers={
            "User-Agent": user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
        },
    ) as client:
        response = await client.get(url)
        response.raise_for_status()
        raw_html = response.text

    if not raw_html or not raw_html.strip():
        return None

    extracted_content = trafilatura.extract(
        raw_html,
        output_format="markdown",
        include_comments=False,
        include_tables=True,
        include_images=True,
        include_links=True,
    )

    if not extracted_content or not extracted_content.strip():
        return None

    trafilatura_metadata = trafilatura.extract_metadata(raw_html)

    metadata: dict[str, str] = {"source": url}
    if trafilatura_metadata:
        # Copy only the fields Trafilatura actually populated.
        for field_name in ("title", "description", "author", "date"):
            field_value = getattr(trafilatura_metadata, field_name)
            if field_value:
                metadata[field_name] = field_value
    # Use the URL as the title when no title was extracted.
    metadata.setdefault("title", url)

    return {
        "content": extracted_content,
        "metadata": metadata,
        "crawler_type": "http",
    }
|
||||
|
||||
async def _crawl_with_chromium(self, url: str) -> dict[str, Any]:
|
||||
"""
|
||||
Crawl URL using Playwright with Trafilatura for content extraction.
|
||||
Falls back to raw HTML if Trafilatura extraction fails.
|
||||
|
||||
Runs the sync Playwright API in a thread so it works on any event
|
||||
loop, including Windows ``SelectorEventLoop`` which cannot spawn
|
||||
subprocesses.
|
||||
|
||||
Args:
|
||||
url: URL to crawl
|
||||
|
||||
|
|
@ -163,51 +262,48 @@ class WebCrawlerConnector:
|
|||
Raises:
|
||||
Exception: If crawling fails
|
||||
"""
|
||||
# Generate a realistic User-Agent to avoid bot detection
|
||||
return await asyncio.to_thread(self._crawl_with_chromium_sync, url)
|
||||
|
||||
def _crawl_with_chromium_sync(self, url: str) -> dict[str, Any]:
|
||||
"""Synchronous Playwright crawl executed in a worker thread."""
|
||||
ua = UserAgent()
|
||||
user_agent = ua.random
|
||||
|
||||
# Use residential proxy if configured
|
||||
playwright_proxy = get_playwright_proxy()
|
||||
|
||||
# Use Playwright to fetch the page
|
||||
async with async_playwright() as p:
|
||||
with sync_playwright() as p:
|
||||
launch_kwargs: dict = {"headless": True}
|
||||
if playwright_proxy:
|
||||
launch_kwargs["proxy"] = playwright_proxy
|
||||
browser = await p.chromium.launch(**launch_kwargs)
|
||||
context = await browser.new_context(user_agent=user_agent)
|
||||
page = await context.new_page()
|
||||
browser = p.chromium.launch(**launch_kwargs)
|
||||
context = browser.new_context(user_agent=user_agent)
|
||||
page = context.new_page()
|
||||
|
||||
try:
|
||||
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
||||
raw_html = await page.content()
|
||||
page_title = await page.title()
|
||||
page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
||||
raw_html = page.content()
|
||||
page_title = page.title()
|
||||
finally:
|
||||
await browser.close()
|
||||
browser.close()
|
||||
|
||||
if not raw_html:
|
||||
raise ValueError(f"Failed to load content from {url}")
|
||||
|
||||
# Extract basic metadata from the page
|
||||
base_metadata = {"title": page_title} if page_title else {}
|
||||
|
||||
# Try to extract main content using Trafilatura
|
||||
extracted_content = None
|
||||
trafilatura_metadata = None
|
||||
|
||||
try:
|
||||
# Extract main content as markdown
|
||||
extracted_content = trafilatura.extract(
|
||||
raw_html,
|
||||
output_format="markdown", # Get clean markdown
|
||||
include_comments=False, # Exclude comments
|
||||
include_tables=True, # Keep tables
|
||||
include_images=True, # Keep image references
|
||||
include_links=True, # Keep links
|
||||
output_format="markdown",
|
||||
include_comments=False,
|
||||
include_tables=True,
|
||||
include_images=True,
|
||||
include_links=True,
|
||||
)
|
||||
|
||||
# Extract metadata using Trafilatura
|
||||
trafilatura_metadata = trafilatura.extract_metadata(raw_html)
|
||||
|
||||
if not extracted_content or len(extracted_content.strip()) == 0:
|
||||
|
|
@ -216,7 +312,6 @@ class WebCrawlerConnector:
|
|||
except Exception:
|
||||
extracted_content = None
|
||||
|
||||
# Build metadata, preferring Trafilatura metadata when available
|
||||
metadata = {
|
||||
"source": url,
|
||||
"title": (
|
||||
|
|
@ -226,7 +321,6 @@ class WebCrawlerConnector:
|
|||
),
|
||||
}
|
||||
|
||||
# Add additional metadata from Trafilatura if available
|
||||
if trafilatura_metadata:
|
||||
if trafilatura_metadata.description:
|
||||
metadata["description"] = trafilatura_metadata.description
|
||||
|
|
@ -235,7 +329,6 @@ class WebCrawlerConnector:
|
|||
if trafilatura_metadata.date:
|
||||
metadata["date"] = trafilatura_metadata.date
|
||||
|
||||
# Add any remaining base metadata
|
||||
metadata.update(base_metadata)
|
||||
|
||||
return {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue