mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-20 21:18:13 +02:00
Merge pull request #392 from Aki-07/feature/add-searxng-connector
Add Searxng connector
This commit is contained in:
commit
08661e686c
15 changed files with 838 additions and 7 deletions
|
|
@ -0,0 +1,42 @@
|
|||
"""Add SearxNG connector enum value
|
||||
|
||||
Revision ID: 26
|
||||
Revises: 25
|
||||
Create Date: 2025-01-18 00:00:00.000000
|
||||
|
||||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
# Identifier of this migration.
revision: str = "26"
# Identifier of the migration this one revises (its predecessor).
down_revision: str | None = "25"
# No branch labels are declared for this revision.
branch_labels: str | Sequence[str] | None = None
# No dependencies on other revisions are declared.
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add the SEARXNG_API label to the searchsourceconnectortype enum.

    The ALTER TYPE statement is wrapped in a DO block that first looks the
    label up in pg_enum, so the statement is only issued when the label is
    not already present.
    """
    add_enum_value_sql = """
        DO $$
        BEGIN
            IF NOT EXISTS (
                SELECT 1 FROM pg_type t
                JOIN pg_enum e ON t.oid = e.enumtypid
                WHERE t.typname = 'searchsourceconnectortype' AND e.enumlabel = 'SEARXNG_API'
            ) THEN
                ALTER TYPE searchsourceconnectortype ADD VALUE 'SEARXNG_API';
            END IF;
        END
        $$;
        """
    op.execute(add_enum_value_sql)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Do nothing: downgrade is not supported for enum edits."""
    # Intentionally a no-op; the enum label added by upgrade() is left in place.
    return None
|
||||
|
||||
|
|
@ -1033,6 +1033,30 @@ async def fetch_relevant_documents(
|
|||
}
|
||||
)
|
||||
|
||||
elif connector == "SEARXNG_API":
|
||||
(
|
||||
source_object,
|
||||
searx_chunks,
|
||||
) = await connector_service.search_searxng(
|
||||
user_query=reformulated_query,
|
||||
user_id=user_id,
|
||||
search_space_id=search_space_id,
|
||||
top_k=top_k,
|
||||
)
|
||||
|
||||
if source_object:
|
||||
all_sources.append(source_object)
|
||||
all_raw_documents.extend(searx_chunks)
|
||||
|
||||
if streaming_service and writer:
|
||||
writer(
|
||||
{
|
||||
"yield_value": streaming_service.format_terminal_info_delta(
|
||||
f"🌐 Found {len(searx_chunks)} SearxNG results related to your query"
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
elif connector == "LINKUP_API":
|
||||
linkup_mode = "standard"
|
||||
|
||||
|
|
|
|||
|
|
@ -55,6 +55,7 @@ class DocumentType(str, Enum):
|
|||
class SearchSourceConnectorType(str, Enum):
|
||||
SERPER_API = "SERPER_API" # NOT IMPLEMENTED YET : DON'T REMEMBER WHY : MOST PROBABLY BECAUSE WE NEED TO CRAWL THE RESULTS RETURNED BY IT
|
||||
TAVILY_API = "TAVILY_API"
|
||||
SEARXNG_API = "SEARXNG_API"
|
||||
LINKUP_API = "LINKUP_API"
|
||||
SLACK_CONNECTOR = "SLACK_CONNECTOR"
|
||||
NOTION_CONNECTOR = "NOTION_CONNECTOR"
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
import asyncio
|
||||
from typing import Any
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import httpx
|
||||
from linkup import LinkupClient
|
||||
from sqlalchemy import func
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
|
@ -372,6 +374,192 @@ class ConnectorService:
|
|||
"sources": [],
|
||||
}, []
|
||||
|
||||
async def search_searxng(
    self,
    user_query: str,
    user_id: str,
    search_space_id: int,
    top_k: int = 20,
) -> tuple:
    """
    Search using a configured SearxNG instance and return both sources and documents.

    Args:
        user_query: Query text sent to SearxNG as the ``q`` parameter.
        user_id: Forwarded to ``get_connector_by_type`` to find the connector.
        search_space_id: Forwarded to ``get_connector_by_type`` with ``user_id``.
        top_k: Maximum number of results to return, clamped to the range 1-50.

    Returns:
        tuple: ``(result_object, documents)``. ``result_object`` is a dict with
        the fixed keys ``id`` (11), ``name``, ``type`` and ``sources``;
        ``documents`` is a list with one dict per returned result. When the
        connector is missing, has no ``SEARXNG_HOST``, the HTTP request fails,
        the body is not usable JSON, or there are no results, ``sources`` and
        ``documents`` are both empty.
    """

    def _no_results() -> tuple:
        # Built fresh on every call so no two callers share the same objects.
        return {
            "id": 11,
            "name": "SearxNG Search",
            "type": "SEARXNG_API",
            "sources": [],
        }, []

    def _parse_bool(value: Any, default: bool = True) -> bool:
        # Accepts real booleans and common textual spellings; anything else
        # falls back to ``default``.
        if isinstance(value, bool):
            return value
        if isinstance(value, str):
            lowered = value.strip().lower()
            if lowered in {"true", "1", "yes", "on"}:
                return True
            if lowered in {"false", "0", "no", "off"}:
                return False
        return default

    def _format_list(value: Any) -> str | None:
        # Normalises a string or a collection into the comma-separated form
        # placed in the query string; returns None when nothing is left.
        if value is None:
            return None
        if isinstance(value, str):
            value = value.strip()
            return value or None
        if isinstance(value, (list, tuple, set)):
            cleaned = [str(item).strip() for item in value if str(item).strip()]
            return ",".join(cleaned) if cleaned else None
        return str(value)

    searx_connector = await self.get_connector_by_type(
        user_id, SearchSourceConnectorType.SEARXNG_API, search_space_id
    )
    if not searx_connector:
        return _no_results()

    config = searx_connector.config or {}
    host = config.get("SEARXNG_HOST")
    if not host:
        print("SearxNG connector is missing SEARXNG_HOST configuration")
        return _no_results()

    api_key = config.get("SEARXNG_API_KEY")
    language = config.get("SEARXNG_LANGUAGE")
    safesearch = config.get("SEARXNG_SAFESEARCH")
    verify_ssl = _parse_bool(config.get("SEARXNG_VERIFY_SSL", True))

    # Only the levels 0, 1 and 2 are forwarded; any other value is dropped.
    safesearch_value: int | None = None
    if isinstance(safesearch, str):
        safesearch_clean = safesearch.strip()
        if safesearch_clean.isdigit():
            safesearch_value = int(safesearch_clean)
    elif isinstance(safesearch, (int, float)):
        safesearch_value = int(safesearch)
    if safesearch_value is not None and not (0 <= safesearch_value <= 2):
        safesearch_value = None

    max_results = max(1, min(top_k, 50))

    params: dict[str, Any] = {"q": user_query, "format": "json"}
    if language:
        params["language"] = language
    # NOTE(review): "limit" is not among the parameters listed in SearxNG's
    # documented Search API, so instances presumably ignore it. It is still
    # sent for backward compatibility, and the result list is additionally
    # truncated locally below so that top_k is actually honoured.
    params["limit"] = max_results

    engines_param = _format_list(config.get("SEARXNG_ENGINES"))
    if engines_param:
        params["engines"] = engines_param

    categories_param = _format_list(config.get("SEARXNG_CATEGORIES"))
    if categories_param:
        params["categories"] = categories_param

    if safesearch_value is not None:
        params["safesearch"] = safesearch_value

    headers = {"Accept": "application/json"}
    if api_key:
        headers["X-API-KEY"] = api_key

    # A trailing slash is ensured first so urljoin appends "search" to the
    # configured path instead of replacing its last segment.
    searx_endpoint = urljoin(host if host.endswith("/") else f"{host}/", "search")

    try:
        async with httpx.AsyncClient(timeout=20.0, verify=verify_ssl) as client:
            response = await client.get(
                searx_endpoint,
                params=params,
                headers=headers,
            )
            response.raise_for_status()
    except httpx.HTTPError as exc:
        print(f"Error searching with SearxNG: {exc!s}")
        return _no_results()

    try:
        data = response.json()
    except ValueError:
        print("Failed to decode JSON response from SearxNG")
        return _no_results()

    # Guard against a body that is valid JSON but not the expected object
    # with a "results" list, and against entries that are not objects.
    raw_results = data.get("results") if isinstance(data, dict) else None
    if not isinstance(raw_results, list):
        return _no_results()
    searx_results = [item for item in raw_results if isinstance(item, dict)]
    searx_results = searx_results[:max_results]
    if not searx_results:
        return _no_results()

    sources_list: list[dict[str, Any]] = []
    documents: list[dict[str, Any]] = []

    # The shared id counter is read and advanced under the lock so ids stay
    # unique across concurrent searches.
    async with self.counter_lock:
        for result in searx_results:
            full_text = result.get("content") or result.get("snippet") or ""
            # The source entry carries a short preview (at most 160 chars);
            # the document keeps the untruncated text.
            if len(full_text) > 160:
                description = f"{full_text[:157]}..."
            else:
                description = full_text

            title = result.get("title", "SearxNG Result")
            url = result.get("url", "")

            sources_list.append(
                {
                    "id": self.source_id_counter,
                    "title": title,
                    "description": description,
                    "url": url,
                }
            )

            documents.append(
                {
                    "chunk_id": self.source_id_counter,
                    "content": full_text,
                    "score": result.get("score", 0.0),
                    "document": {
                        "id": self.source_id_counter,
                        "title": title,
                        "document_type": "SEARXNG_API",
                        "metadata": {
                            "url": url,
                            "engines": result.get("engines", []),
                            "category": result.get("category"),
                            "source": "SEARXNG_API",
                        },
                    },
                }
            )
            self.source_id_counter += 1

    result_object = {
        "id": 11,
        "name": "SearxNG Search",
        "type": "SEARXNG_API",
        "sources": sources_list,
    }

    return result_object, documents
|
||||
|
||||
async def search_slack(
|
||||
self,
|
||||
user_query: str,
|
||||
|
|
|
|||
|
|
@ -424,6 +424,22 @@ def validate_connector_config(
|
|||
connector_rules = {
|
||||
"SERPER_API": {"required": ["SERPER_API_KEY"], "validators": {}},
|
||||
"TAVILY_API": {"required": ["TAVILY_API_KEY"], "validators": {}},
|
||||
"SEARXNG_API": {
|
||||
"required": ["SEARXNG_HOST"],
|
||||
"optional": [
|
||||
"SEARXNG_API_KEY",
|
||||
"SEARXNG_ENGINES",
|
||||
"SEARXNG_CATEGORIES",
|
||||
"SEARXNG_LANGUAGE",
|
||||
"SEARXNG_SAFESEARCH",
|
||||
"SEARXNG_VERIFY_SSL",
|
||||
],
|
||||
"validators": {
|
||||
"SEARXNG_HOST": lambda: validate_url_field(
|
||||
"SEARXNG_HOST", "SearxNG"
|
||||
)
|
||||
},
|
||||
},
|
||||
"LINKUP_API": {"required": ["LINKUP_API_KEY"], "validators": {}},
|
||||
"SLACK_CONNECTOR": {"required": ["SLACK_BOT_TOKEN"], "validators": {}},
|
||||
"NOTION_CONNECTOR": {
|
||||
|
|
@ -484,10 +500,21 @@ def validate_connector_config(
|
|||
if not rules:
|
||||
return config # Unknown connector type, pass through
|
||||
|
||||
# Validate required keys match exactly
|
||||
if set(config.keys()) != set(rules["required"]):
|
||||
required_keys = set(rules["required"])
|
||||
optional_keys = set(rules.get("optional", []))
|
||||
config_keys = set(config.keys())
|
||||
|
||||
# Validate that no unexpected keys are present
|
||||
if not config_keys.issubset(required_keys | optional_keys):
|
||||
allowed_keys = list(required_keys | optional_keys)
|
||||
raise ValueError(
|
||||
f"For {connector_type_str} connector type, config must only contain these keys: {rules['required']}"
|
||||
f"For {connector_type_str} connector type, config may only contain these keys: {allowed_keys}"
|
||||
)
|
||||
|
||||
# Validate that all required keys are present
|
||||
if not required_keys.issubset(config_keys):
|
||||
raise ValueError(
|
||||
f"For {connector_type_str} connector type, config must include these keys: {sorted(required_keys)}"
|
||||
)
|
||||
|
||||
# Apply custom validators first (these check format before emptiness)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue