SurfSense/surfsense_backend/app/services/web_search_service.py

291 lines
8.7 KiB
Python
Raw Normal View History

"""
Platform-level web search service backed by SearXNG.
Redis is used only for result caching (graceful degradation if unavailable).
The circuit breaker is fully in-process: no external dependency, zero
latency overhead.
"""
from __future__ import annotations
2026-03-17 04:40:46 +05:30
import contextlib
import hashlib
import json
import logging
import threading
import time
from typing import Any
from urllib.parse import urljoin
import httpx
import redis
from app.config import config
logger = logging.getLogger(__name__)
# Template payload that ``search`` copies whenever it has no sources to
# report (host not configured, circuit open, request failed, or zero hits).
_EMPTY_RESULT: dict[str, Any] = {
    "id": 11,
    "name": "Web Search",
    "type": "SEARXNG_API",
    "sources": [],
}
# ---------------------------------------------------------------------------
# Redis — used only for result caching
# ---------------------------------------------------------------------------
# Lazily created, process-wide Redis client (see ``_get_redis``).
_redis_client: redis.Redis | None = None

# Upper bound, in seconds, on any single Redis socket operation issued by the
# cache helpers.
_REDIS_TIMEOUT_SECONDS = 2.0


def _get_redis() -> redis.Redis:
    """Return the shared Redis client, creating it on first use.

    The client is built from ``config.REDIS_APP_URL`` with
    ``decode_responses=True`` and stored in the module-level
    ``_redis_client`` so that later calls reuse it.

    A socket timeout is set because the cache helpers call this client
    synchronously from ``async def search``: without a bound, an
    unresponsive Redis would stall the caller instead of degrading to
    "no cache". Timeouts surface as ``redis.RedisError`` subclasses, which
    the cache helpers already swallow.
    """
    global _redis_client
    if _redis_client is None:
        # NOTE(review): redis-py is expected to apply ``socket_timeout`` to
        # the connect phase too when ``socket_connect_timeout`` is unset —
        # confirm for the pinned redis-py version.
        _redis_client = redis.from_url(
            config.REDIS_APP_URL,
            decode_responses=True,
            socket_timeout=_REDIS_TIMEOUT_SECONDS,
        )
    return _redis_client
# ---------------------------------------------------------------------------
# In-process Circuit Breaker (no Redis dependency)
# ---------------------------------------------------------------------------
# Failure count at which ``_record_failure`` opens the breaker.
_CB_FAILURE_THRESHOLD = 5
# A gap longer than this since the previous failure resets the failure count.
_CB_FAILURE_WINDOW_SECONDS = 60
# How long the breaker stays open once it has tripped.
_CB_COOLDOWN_SECONDS = 30
# Serialises the state updates in ``_record_failure`` / ``_record_success``.
_cb_lock = threading.Lock()
# Failures counted so far in the current window.
_cb_failure_count: int = 0
# ``time.monotonic()`` reading taken at the most recent failure.
_cb_last_failure_time: float = 0.0
# ``time.monotonic()`` value until which the breaker is open (0.0 = closed).
_cb_open_until: float = 0.0
def _circuit_is_open() -> bool:
    """Report whether the breaker is currently open.

    The breaker counts as open while the monotonic clock has not yet
    reached ``_cb_open_until``.
    """
    current = time.monotonic()
    return current < _cb_open_until
def _record_failure() -> None:
    """Register one failed SearXNG call and open the breaker if warranted.

    The failure counter restarts when more than
    ``_CB_FAILURE_WINDOW_SECONDS`` have passed since the previous failure.
    Once the counter reaches ``_CB_FAILURE_THRESHOLD`` the breaker is opened
    for ``_CB_COOLDOWN_SECONDS`` and a warning is logged.
    """
    global _cb_failure_count, _cb_last_failure_time, _cb_open_until

    moment = time.monotonic()
    with _cb_lock:
        window_expired = (
            moment - _cb_last_failure_time > _CB_FAILURE_WINDOW_SECONDS
        )
        # A stale counter starts over at 1; otherwise keep accumulating.
        _cb_failure_count = 1 if window_expired else _cb_failure_count + 1
        _cb_last_failure_time = moment

        if _cb_failure_count < _CB_FAILURE_THRESHOLD:
            return

        _cb_open_until = moment + _CB_COOLDOWN_SECONDS
        logger.warning(
            "Circuit breaker OPENED after %d failures — "
            "SearXNG calls paused for %ds",
            _cb_failure_count,
            _CB_COOLDOWN_SECONDS,
        )
def _record_success() -> None:
    """Reset the breaker after a successful SearXNG call.

    Clears the failure counter and closes the breaker immediately.
    """
    global _cb_failure_count, _cb_open_until

    with _cb_lock:
        _cb_failure_count, _cb_open_until = 0, 0.0
# ---------------------------------------------------------------------------
# Result Caching (Redis, graceful degradation)
# ---------------------------------------------------------------------------
# Expiry that ``_cache_set`` passes to Redis ``SETEX`` for each cached payload.
_CACHE_TTL_SECONDS = 300  # 5 minutes
# Namespace that ``_cache_key`` prepends to every cache key.
_CACHE_PREFIX = "websearch:cache:"
def _cache_key(query: str, engines: str | None, language: str | None) -> str:
    """Derive the Redis key for a (query, engines, language) combination.

    The three values are joined with ``|`` (missing ones become empty
    strings), hashed with SHA-256, and the first 24 hex characters of the
    digest are appended to ``_CACHE_PREFIX``.
    """
    fingerprint = "{}|{}|{}".format(query, engines or "", language or "")
    hex_digest = hashlib.sha256(fingerprint.encode()).hexdigest()
    return _CACHE_PREFIX + hex_digest[:24]
def _cache_get(key: str) -> dict | None:
    """Fetch and decode the cached payload stored under *key*.

    Returns the decoded ``dict`` on a hit. Returns ``None`` on a miss, when
    Redis cannot be used, or when the stored value is not a JSON object.
    This function never raises for those conditions: caching is best-effort
    and must not break a search.
    """
    try:
        raw = _get_redis().get(key)
        if not raw:
            return None
        payload = json.loads(raw)
    except (redis.RedisError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError (its subclass).
        # NOTE(review): it is also expected to cover a malformed Redis URL
        # rejected while building the client — confirm against redis-py.
        logger.debug("Web search cache read failed for %s: %s", key, exc)
        return None
    # Callers index the payload by key, so anything but a dict is unusable.
    return payload if isinstance(payload, dict) else None
def _cache_set(key: str, value: dict) -> None:
    """Store *value* as JSON under *key* for ``_CACHE_TTL_SECONDS`` seconds.

    Best-effort: Redis failures and serialization problems are logged at
    debug level and otherwise ignored, so a cache problem can never discard
    a search result that was already obtained.
    """
    try:
        _get_redis().setex(key, _CACHE_TTL_SECONDS, json.dumps(value))
    except (redis.RedisError, TypeError, ValueError) as exc:
        # TypeError/ValueError: json.dumps rejected the payload.
        # NOTE(review): ValueError is also expected for a malformed Redis
        # URL — confirm against redis-py.
        logger.debug("Web search cache write failed for %s: %s", key, exc)
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def is_available() -> bool:
    """Tell whether a platform SearXNG host has been configured.

    Only the truthiness of ``config.SEARXNG_DEFAULT_HOST`` is inspected; no
    network request is made.
    """
    configured_host = config.SEARXNG_DEFAULT_HOST
    return True if configured_host else False
async def health_check() -> dict[str, Any]:
    """Probe the SearXNG ``/healthz`` endpoint and summarise the outcome.

    Returns a dict whose ``status`` is one of:

    * ``"unavailable"`` - ``config.SEARXNG_DEFAULT_HOST`` is not set
      (an ``error`` message is included);
    * ``"healthy"`` - the endpoint answered without an HTTP error status;
    * ``"unhealthy"`` - the request raised (``error`` holds ``str(exc)``).

    The healthy/unhealthy variants also carry ``response_time_ms`` and the
    current ``circuit_breaker`` state (``"open"`` or ``"closed"``).
    """
    host = config.SEARXNG_DEFAULT_HOST
    if not host:
        return {"status": "unavailable", "error": "SEARXNG_DEFAULT_HOST not set"}

    # urljoin only appends to the base when the base ends with a slash.
    base_url = host if host.endswith("/") else f"{host}/"
    healthz_url = urljoin(base_url, "healthz")
    started = time.perf_counter()

    def _elapsed_ms() -> int:
        return round((time.perf_counter() - started) * 1000)

    report: dict[str, Any]
    try:
        # NOTE(review): verify=False disables TLS certificate verification
        # for this request — confirm that is intended for the deployment.
        async with httpx.AsyncClient(timeout=5.0, verify=False) as client:
            resp = await client.get(healthz_url)
            resp.raise_for_status()
            report = {"status": "healthy", "response_time_ms": _elapsed_ms()}
    except Exception as exc:
        report = {
            "status": "unhealthy",
            "error": str(exc),
            "response_time_ms": _elapsed_ms(),
        }

    report["circuit_breaker"] = "open" if _circuit_is_open() else "closed"
    return report
def _fresh_empty_result() -> dict[str, Any]:
    """Return a copy of ``_EMPTY_RESULT`` that owns its ``sources`` list."""
    # dict(_EMPTY_RESULT) would be a shallow copy whose "sources" value is
    # the template's own list object; a caller appending to it would leak
    # entries into every later "empty" result.
    return {**_EMPTY_RESULT, "sources": []}


async def _fetch_searxng_json(
    endpoint: str,
    params: dict[str, Any],
) -> tuple[dict[str, Any] | None, Exception | None]:
    """GET *endpoint* and return ``(payload, error)``.

    At most two attempts are made; the second one happens only when the
    first ended in a timeout or an HTTP 5xx status. ``payload`` is the
    decoded JSON object on success, otherwise ``None`` with ``error`` set to
    the last exception seen.
    """
    headers = {"Accept": "application/json"}
    last_error: Exception | None = None

    # One client serves both attempts so a retry can reuse the connection.
    # NOTE(review): verify=False disables TLS certificate verification —
    # confirm that is intended for the deployment.
    async with httpx.AsyncClient(timeout=15.0, verify=False) as client:
        for _attempt in range(2):
            retryable = False
            try:
                response = await client.get(
                    endpoint,
                    params=params,
                    headers=headers,
                )
                response.raise_for_status()
                payload = response.json()
            except httpx.TimeoutException as exc:
                last_error = exc
                retryable = True
            except httpx.HTTPStatusError as exc:
                last_error = exc
                retryable = exc.response.status_code >= 500
            except (httpx.HTTPError, ValueError) as exc:
                # Other transport errors, or a body that is not valid JSON.
                last_error = exc
            else:
                if isinstance(payload, dict):
                    return payload, None
                # Valid JSON but not an object: the caller cannot use it.
                last_error = ValueError(
                    f"unexpected SearXNG payload type: {type(payload).__name__}"
                )
            if not retryable:
                break

    return None, last_error


def _to_sources_and_documents(
    searx_results: list[dict[str, Any]],
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Convert raw SearXNG result dicts into ``(sources, documents)``."""
    sources: list[dict[str, Any]] = []
    documents: list[dict[str, Any]] = []

    for idx, item in enumerate(searx_results):
        source_id = 200_000 + idx
        # ``or`` (rather than a .get default) also covers explicit nulls and
        # empty strings in the upstream payload.
        title = item.get("title") or "Web Search Result"
        description = item.get("content") or item.get("snippet") or ""
        url = item.get("url", "")

        sources.append(
            {
                "id": source_id,
                "title": title,
                "description": description,
                "url": url,
            }
        )
        documents.append(
            {
                "chunk_id": source_id,
                "content": description,
                "score": item.get("score", 0.0),
                "document": {
                    "id": source_id,
                    "title": title,
                    "document_type": "SEARXNG_API",
                    "metadata": {
                        "url": url,
                        "engines": item.get("engines", []),
                        "category": item.get("category"),
                        "source": "SEARXNG_API",
                    },
                },
            }
        )

    return sources, documents


async def search(
    query: str,
    top_k: int = 20,
    *,
    engines: str | None = None,
    language: str | None = None,
    safesearch: int | None = None,
) -> tuple[dict[str, Any], list[dict[str, Any]]]:
    """Execute a web search against the platform SearXNG instance.

    Args:
        query: Search terms. A blank query returns an empty result without
            contacting SearXNG.
        top_k: Maximum number of results to return; clamped to ``1..50``.
        engines: Optional value forwarded as the ``engines`` parameter.
        language: Optional value forwarded as the ``language`` parameter.
        safesearch: Optional safe-search level; forwarded only when it is
            within ``0..2``.

    Returns:
        The standard ``(result_object, documents)`` tuple expected by
        ``ConnectorService.search_searxng``. When the host is not
        configured, the request fails, or nothing is found, the result
        object has an empty ``sources`` list and ``documents`` is empty.
        While the circuit breaker is open the result object additionally
        carries ``error`` and ``status`` keys.
    """
    host = config.SEARXNG_DEFAULT_HOST
    if not host:
        return _fresh_empty_result(), []

    # A blank query cannot produce results; sending it upstream would only
    # risk an error response that counts toward opening the breaker.
    if not query or not query.strip():
        return _fresh_empty_result(), []

    if _circuit_is_open():
        logger.info("Web search skipped — circuit breaker is open")
        result = _fresh_empty_result()
        result["error"] = "Web search temporarily unavailable (circuit open)"
        result["status"] = "degraded"
        return result, []

    limit = max(1, min(top_k, 50))
    safe_level = (
        safesearch if safesearch is not None and 0 <= safesearch <= 2 else None
    )

    # The result cap and safe-search level change what is returned, so they
    # must be part of the cache identity; otherwise a cached answer for one
    # setting would be served for another.
    safe_tag = "" if safe_level is None else str(safe_level)
    ck = f"{_cache_key(query, engines, language)}:{limit}:{safe_tag}"

    # NOTE(review): _cache_get/_cache_set are plain (non-awaited) calls, so
    # Redis I/O runs inline in this coroutine.
    cached = _cache_get(ck)
    if isinstance(cached, dict) and "result" in cached and "documents" in cached:
        logger.debug("Web search cache HIT for query=%r", query[:60])
        return cached["result"], cached["documents"]

    # NOTE(review): ``limit`` does not appear among the documented SearXNG
    # search parameters, so the cap is also enforced client-side below.
    params: dict[str, Any] = {
        "q": query,
        "format": "json",
        "limit": limit,
    }
    if engines:
        params["engines"] = engines
    if language:
        params["language"] = language
    if safe_level is not None:
        params["safesearch"] = safe_level

    # urljoin only appends to the base when the base ends with a slash.
    searx_endpoint = urljoin(host if host.endswith("/") else f"{host}/", "search")

    data, last_error = await _fetch_searxng_json(searx_endpoint, params)
    if data is None:
        _record_failure()
        logger.warning("Web search failed after retries: %s", last_error)
        return _fresh_empty_result(), []

    _record_success()

    searx_results = data.get("results")
    if not isinstance(searx_results, list) or not searx_results:
        return _fresh_empty_result(), []

    sources_list, documents = _to_sources_and_documents(searx_results[:limit])

    result_object: dict[str, Any] = {
        "id": 11,
        "name": "Web Search",
        "type": "SEARXNG_API",
        "sources": sources_list,
    }

    _cache_set(ck, {"result": result_object, "documents": documents})
    return result_object, documents