mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-07-04 22:02:16 +02:00
feat: add Baidu AI Search integration
- Add BAIDU_SEARCH_API connector type to support Chinese web search
- Implement search_baidu() method in connector_service.py
- Add frontend configuration page for Baidu Search API
- Create Alembic migration for new enum values
- Add validation rules and agent integration
- Support configurable model, search source, and deep search options
- Update .gitignore to exclude .env.local and other env files

Addresses integration with the Chinese search ecosystem for better local market support. Baidu AI Search provides intelligent search with automatic summarization.
This commit is contained in:
parent
fa39176b82
commit
beaf8f89a6
11 changed files with 669 additions and 0 deletions
|
|
@ -0,0 +1,73 @@
|
|||
"""Add BAIDU_SEARCH_API to searchsourceconnectortype enum
|
||||
|
||||
Revision ID: 30
|
||||
Revises: 29
|
||||
|
||||
Changes:
|
||||
1. Add BAIDU_SEARCH_API value to searchsourceconnectortype enum
|
||||
2. Add BAIDU_SEARCH_API value to documenttype enum for consistency
|
||||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "30"
|
||||
down_revision: str | None = "29"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Register the BAIDU_SEARCH_API label on both PostgreSQL enum types.

    Each statement is a PL/pgSQL ``DO`` block that first looks the label up
    in ``pg_enum`` (joined to ``pg_type`` by type name) and only issues
    ``ALTER TYPE ... ADD VALUE`` when the label is absent, so re-running the
    migration against a database that already has the value is harmless.
    """
    # Statement for the connector-type enum.
    connector_enum_sql = (
        """
        DO $$
        BEGIN
            IF NOT EXISTS (
                SELECT 1 FROM pg_type t
                JOIN pg_enum e ON t.oid = e.enumtypid
                WHERE t.typname = 'searchsourceconnectortype' AND e.enumlabel = 'BAIDU_SEARCH_API'
            ) THEN
                ALTER TYPE searchsourceconnectortype ADD VALUE 'BAIDU_SEARCH_API';
            END IF;
        END
        $$;
        """
    )

    # Same guard for the document-type enum, added for consistency.
    document_enum_sql = (
        """
        DO $$
        BEGIN
            IF NOT EXISTS (
                SELECT 1 FROM pg_type t
                JOIN pg_enum e ON t.oid = e.enumtypid
                WHERE t.typname = 'documenttype' AND e.enumlabel = 'BAIDU_SEARCH_API'
            ) THEN
                ALTER TYPE documenttype ADD VALUE 'BAIDU_SEARCH_API';
            END IF;
        END
        $$;
        """
    )

    # Order matches the original migration: connector enum first.
    for statement in (connector_enum_sql, document_enum_sql):
        op.execute(statement)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Intentionally do nothing: enum labels are not removed on downgrade.

    Dropping a label from a PostgreSQL enum is not safe in general because
    existing rows may still reference it. Removing BAIDU_SEARCH_API by hand
    would require:

    1. Deleting or rewriting every row that references BAIDU_SEARCH_API.
    2. Recreating the enum type without BAIDU_SEARCH_API.
    3. Re-adding all remaining enum values.

    The migration therefore leaves the database untouched.
    """
    return None
|
||||
|
||||
|
|
@ -1037,6 +1037,32 @@ async def fetch_relevant_documents(
|
|||
}
|
||||
)
|
||||
|
||||
elif connector == "BAIDU_SEARCH_API":
|
||||
(
|
||||
source_object,
|
||||
baidu_chunks,
|
||||
) = await connector_service.search_baidu(
|
||||
user_query=reformulated_query,
|
||||
user_id=user_id,
|
||||
search_space_id=search_space_id,
|
||||
top_k=top_k,
|
||||
)
|
||||
|
||||
# Add to sources and raw documents
|
||||
if source_object:
|
||||
all_sources.append(source_object)
|
||||
all_raw_documents.extend(baidu_chunks)
|
||||
|
||||
# Stream found document count
|
||||
if streaming_service and writer:
|
||||
writer(
|
||||
{
|
||||
"yield_value": streaming_service.format_terminal_info_delta(
|
||||
f"🇨🇳 Found {len(baidu_chunks)} Baidu Search results related to your query"
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
elif connector == "DISCORD_CONNECTOR":
|
||||
(
|
||||
source_object,
|
||||
|
|
|
|||
|
|
@ -48,6 +48,7 @@ def get_connector_emoji(connector_name: str) -> str:
|
|||
"DISCORD_CONNECTOR": "🗨️",
|
||||
"TAVILY_API": "🔍",
|
||||
"LINKUP_API": "🔗",
|
||||
"BAIDU_SEARCH_API": "🇨🇳",
|
||||
"GOOGLE_CALENDAR_CONNECTOR": "📅",
|
||||
"AIRTABLE_CONNECTOR": "🗃️",
|
||||
"LUMA_CONNECTOR": "✨",
|
||||
|
|
@ -72,6 +73,7 @@ def get_connector_friendly_name(connector_name: str) -> str:
|
|||
"DISCORD_CONNECTOR": "Discord",
|
||||
"TAVILY_API": "Tavily Search",
|
||||
"LINKUP_API": "Linkup Search",
|
||||
"BAIDU_SEARCH_API": "Baidu Search",
|
||||
"AIRTABLE_CONNECTOR": "Airtable",
|
||||
"LUMA_CONNECTOR": "Luma",
|
||||
}
|
||||
|
|
|
|||
|
|
@ -57,6 +57,7 @@ class SearchSourceConnectorType(str, Enum):
|
|||
TAVILY_API = "TAVILY_API"
|
||||
SEARXNG_API = "SEARXNG_API"
|
||||
LINKUP_API = "LINKUP_API"
|
||||
BAIDU_SEARCH_API = "BAIDU_SEARCH_API" # Baidu AI Search API for Chinese web search
|
||||
SLACK_CONNECTOR = "SLACK_CONNECTOR"
|
||||
NOTION_CONNECTOR = "NOTION_CONNECTOR"
|
||||
GITHUB_CONNECTOR = "GITHUB_CONNECTOR"
|
||||
|
|
|
|||
|
|
@ -560,6 +560,230 @@ class ConnectorService:
|
|||
|
||||
return result_object, documents
|
||||
|
||||
async def search_baidu(
    self,
    user_query: str,
    user_id: str,
    search_space_id: int,
    top_k: int = 20,
) -> tuple:
    """
    Search using Baidu AI Search API and return both sources and documents.

    The Baidu AI Search endpoint answers with a summarised completion plus
    a ``references`` list; only the references (raw search hits) are used
    here. Every failure path (no connector, no API key, transport/HTTP
    error, undecodable or unexpected body, no references) is reported on
    stdout and yields an empty result instead of raising.

    Args:
        user_query: User's search query, sent as the single user message.
        user_id: User ID used to look up the Baidu connector.
        search_space_id: Search space ID used to look up the connector.
        top_k: Requested number of web results, capped at 20 before being
            sent. A quarter of that (at least 1) is additionally requested
            for videos, so the response may hold more than ``top_k``
            references in total.

    Returns:
        tuple: ``(sources_info_dict, documents_list)`` where
        ``sources_info_dict`` has the keys ``id``, ``name``, ``type`` and
        ``sources`` and ``documents_list`` holds one chunk-like dict per
        reference.
    """

    def _empty_result() -> tuple:
        # Build fresh objects on every call so that callers which mutate
        # the returned dict/list never share state between invocations.
        return {
            "id": 12,
            "name": "Baidu Search",
            "type": "BAIDU_SEARCH_API",
            "sources": [],
        }, []

    # Get Baidu connector configuration
    baidu_connector = await self.get_connector_by_type(
        user_id, SearchSourceConnectorType.BAIDU_SEARCH_API, search_space_id
    )
    if not baidu_connector:
        return _empty_result()

    config = baidu_connector.config or {}
    api_key = config.get("BAIDU_API_KEY")
    if not api_key:
        print("ERROR: Baidu connector is missing BAIDU_API_KEY configuration")
        # Report key names only: config values may contain credentials and
        # must never be written to logs.
        print(f"Connector config keys: {sorted(config)}")
        return _empty_result()

    # NOTE: the API key (or any prefix of it) is deliberately not printed.

    # Optional configuration parameters
    model = config.get("BAIDU_MODEL", "ernie-3.5-8k")
    search_source = config.get("BAIDU_SEARCH_SOURCE", "baidu_search_v2")
    enable_deep_search = config.get("BAIDU_ENABLE_DEEP_SEARCH", False)

    # Baidu AI Search API endpoint
    baidu_endpoint = "https://qianfan.baidubce.com/v2/ai_search/chat/completions"

    # Baidu uses X-Appbuilder-Authorization instead of the standard
    # Authorization header.
    headers = {
        "X-Appbuilder-Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    # Baidu v2 supports max 20 per type
    max_per_type = min(top_k, 20)

    payload = {
        "messages": [{"role": "user", "content": user_query}],
        "model": model,
        "search_source": search_source,
        "resource_type_filter": [
            {"type": "web", "top_k": max_per_type},
            {"type": "video", "top_k": max(1, max_per_type // 4)},  # Fewer videos
        ],
        "stream": False,  # Non-streaming for simpler processing
        "enable_deep_search": enable_deep_search,
        "enable_corner_markers": True,  # Enable reference markers
    }

    try:
        # Baidu AI Search performs search + summarisation, so allow a
        # generous 90 second timeout.
        async with httpx.AsyncClient(timeout=90.0) as client:
            response = await client.post(
                baidu_endpoint,
                headers=headers,
                json=payload,
            )
            response.raise_for_status()
    except httpx.TimeoutException as exc:
        print(f"ERROR: Baidu API request timeout after 90s: {exc!r}")
        print(f"Endpoint: {baidu_endpoint}")
        return _empty_result()
    except httpx.HTTPStatusError as exc:
        print(f"ERROR: Baidu API HTTP Status Error: {exc.response.status_code}")
        print(f"Response text: {exc.response.text[:500]}")
        print(f"Request URL: {exc.request.url}")
        return _empty_result()
    except httpx.RequestError as exc:
        print(f"ERROR: Baidu API Request Error: {type(exc).__name__}: {exc!r}")
        print(f"Endpoint: {baidu_endpoint}")
        return _empty_result()
    except Exception as exc:
        # Last-resort boundary: a search failure must not abort the caller.
        print(f"ERROR: Unexpected error calling Baidu API: {type(exc).__name__}: {exc!r}")
        print(f"Endpoint: {baidu_endpoint}")
        print(f"Payload: {payload}")
        return _empty_result()

    try:
        data = response.json()
    except ValueError as e:
        print(f"ERROR: Failed to decode JSON response from Baidu AI Search: {e}")
        print(f"Response status: {response.status_code}")
        print(f"Response text: {response.text[:500]}")  # First 500 chars
        return _empty_result()

    # A JSON body that is not an object (e.g. a list or string) has no
    # ``references`` to read; treat it like an empty answer.
    if not isinstance(data, dict):
        print(f"ERROR: Unexpected Baidu API response type: {type(data).__name__}")
        return _empty_result()

    # ``or []`` also covers an explicit JSON null for "references".
    baidu_references = data.get("references") or []

    print(f"DEBUG: Baidu API returned {len(baidu_references)} references")

    if "code" in data or "message" in data:
        print(f"WARNING: Baidu API returned error - Code: {data.get('code')}, Message: {data.get('message')}")

    if not baidu_references:
        print("WARNING: No references found in Baidu API response")
        print(f"Response keys: {list(data.keys())}")
        return _empty_result()

    sources_list: list[dict[str, Any]] = []
    documents: list[dict[str, Any]] = []

    # The lock serialises access to the shared source_id_counter.
    async with self.counter_lock:
        for reference in baidu_references:
            # Skip malformed entries rather than failing the whole search.
            if not isinstance(reference, dict):
                continue

            # ``or`` fallbacks normalise JSON nulls as well as missing keys.
            title = reference.get("title") or "Baidu Search Result"
            url = reference.get("url") or ""
            content = reference.get("content") or ""
            date = reference.get("date") or ""
            ref_type = reference.get("type") or "web"  # web, image, video

            sources_list.append(
                {
                    "id": self.source_id_counter,
                    "title": title,
                    "description": content[:300],  # Limit description length
                    "url": url,
                }
            )

            metadata = {
                "url": url,
                "date": date,
                "type": ref_type,
                "source": "BAIDU_SEARCH_API",
                "web_anchor": reference.get("web_anchor", ""),
                "website": reference.get("website", ""),
            }

            # Add type-specific metadata
            if ref_type == "image" and reference.get("image"):
                metadata["image"] = reference["image"]
            elif ref_type == "video" and reference.get("video"):
                metadata["video"] = reference["video"]

            documents.append(
                {
                    "chunk_id": self.source_id_counter,
                    "content": content,
                    "score": 1.0,  # Baidu doesn't provide relevance scores
                    "document": {
                        "id": self.source_id_counter,
                        "title": title,
                        "document_type": "BAIDU_SEARCH_API",
                        "metadata": metadata,
                    },
                }
            )
            self.source_id_counter += 1

    result_object = {
        "id": 12,
        "name": "Baidu Search",
        "type": "BAIDU_SEARCH_API",
        "sources": sources_list,
    }

    return result_object, documents
|
||||
|
||||
async def search_slack(
|
||||
self,
|
||||
user_query: str,
|
||||
|
|
|
|||
|
|
@ -434,6 +434,15 @@ def validate_connector_config(
|
|||
},
|
||||
},
|
||||
"LINKUP_API": {"required": ["LINKUP_API_KEY"], "validators": {}},
|
||||
"BAIDU_SEARCH_API": {
|
||||
"required": ["BAIDU_API_KEY"],
|
||||
"optional": [
|
||||
"BAIDU_MODEL",
|
||||
"BAIDU_SEARCH_SOURCE",
|
||||
"BAIDU_ENABLE_DEEP_SEARCH",
|
||||
],
|
||||
"validators": {},
|
||||
},
|
||||
"SLACK_CONNECTOR": {"required": ["SLACK_BOT_TOKEN"], "validators": {}},
|
||||
"NOTION_CONNECTOR": {
|
||||
"required": ["NOTION_INTEGRATION_TOKEN"],
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue