mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-07-04 22:02:16 +02:00
feat: added elasticsearch connector
This commit is contained in:
parent
402039f02f
commit
55d752e3c8
27 changed files with 4331 additions and 2499 deletions
|
|
@ -0,0 +1,56 @@
|
|||
"""Add ElasticSearch connector enums
|
||||
|
||||
Revision ID: 26
|
||||
Revises: 25
|
||||
Create Date: 2025-10-12 12:00:00.000000
|
||||
|
||||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers
|
||||
revision: str = "26"
|
||||
down_revision: str | None = "25"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add the 'ELASTICSEARCH_CONNECTOR' label to both connector enum types.

    Each ALTER TYPE is wrapped in a DO block that first looks the label up in
    pg_enum, so the statement is skipped when the label is already present.
    """
    # Same order as before: connector-type enum first, then document-type enum.
    for enum_name in ("searchsourceconnectortype", "documenttype"):
        op.execute(
            f"""
        DO $$
        BEGIN
            IF NOT EXISTS (
                SELECT 1 FROM pg_type t
                JOIN pg_enum e ON t.oid = e.enumtypid
                WHERE t.typname = '{enum_name}' AND e.enumlabel = 'ELASTICSEARCH_CONNECTOR'
            ) THEN
                ALTER TYPE {enum_name} ADD VALUE 'ELASTICSEARCH_CONNECTOR';
            END IF;
        END
        $$;
        """
        )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Intentionally a no-op: 'ELASTICSEARCH_CONNECTOR' stays in both enum types.

    NOTE(review): presumably left empty because PostgreSQL offers no
    ALTER TYPE ... DROP VALUE, so removing the label would require recreating
    the enum types — confirm this is the intended downgrade behaviour.
    """
    pass
|
||||
|
|
@ -488,6 +488,25 @@ async def fetch_documents_by_ids(
|
|||
)
|
||||
url = metadata.get("url", "")
|
||||
|
||||
elif doc_type == "ELASTICSEARCH_CONNECTOR":
|
||||
# Prefer explicit title in metadata/source, otherwise fallback to doc.title
|
||||
es_title = (
|
||||
metadata.get("title")
|
||||
or metadata.get("es_title")
|
||||
or doc.title
|
||||
or f"Elasticsearch: {metadata.get('elasticsearch_index', '')}"
|
||||
)
|
||||
title = es_title
|
||||
description = metadata.get("description") or (
|
||||
doc.content[:100] + "..."
|
||||
if len(doc.content) > 100
|
||||
else doc.content
|
||||
)
|
||||
# If a link or index info is stored, surface it
|
||||
url = metadata.get("url", "") or metadata.get(
|
||||
"elasticsearch_index", ""
|
||||
)
|
||||
|
||||
else: # FILE and other types
|
||||
title = doc.title
|
||||
description = (
|
||||
|
|
@ -512,6 +531,7 @@ async def fetch_documents_by_ids(
|
|||
"SLACK_CONNECTOR": "Slack (Selected)",
|
||||
"NOTION_CONNECTOR": "Notion (Selected)",
|
||||
"GITHUB_CONNECTOR": "GitHub (Selected)",
|
||||
"ELASTICSEARCH_CONNECTOR": "Elasticsearch (Selected)",
|
||||
"YOUTUBE_VIDEO": "YouTube Videos (Selected)",
|
||||
"DISCORD_CONNECTOR": "Discord (Selected)",
|
||||
"JIRA_CONNECTOR": "Jira Issues (Selected)",
|
||||
|
|
@ -1266,6 +1286,33 @@ async def fetch_relevant_documents(
|
|||
}
|
||||
)
|
||||
|
||||
elif connector == "ELASTICSEARCH_CONNECTOR":
|
||||
(
|
||||
source_object,
|
||||
elasticsearch_chunks,
|
||||
) = await connector_service.search_elasticsearch(
|
||||
user_query=reformulated_query,
|
||||
user_id=user_id,
|
||||
search_space_id=search_space_id,
|
||||
top_k=top_k,
|
||||
search_mode=search_mode,
|
||||
)
|
||||
|
||||
# Add to sources and raw documents
|
||||
if source_object:
|
||||
all_sources.append(source_object)
|
||||
all_raw_documents.extend(elasticsearch_chunks)
|
||||
|
||||
# Stream found document count
|
||||
if streaming_service and writer:
|
||||
writer(
|
||||
{
|
||||
"yield_value": streaming_service.format_terminal_info_delta(
|
||||
f"🔎 Found {len(elasticsearch_chunks)} Elasticsearch chunks related to your query"
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logging.error("Error in search_airtable: %s", traceback.format_exc())
|
||||
error_message = f"Error searching connector {connector}: {e!s}"
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ def get_connector_emoji(connector_name: str) -> str:
|
|||
"GOOGLE_CALENDAR_CONNECTOR": "📅",
|
||||
"AIRTABLE_CONNECTOR": "🗃️",
|
||||
"LUMA_CONNECTOR": "✨",
|
||||
"ELASTICSEARCH_CONNECTOR": "🔎",
|
||||
}
|
||||
return connector_emojis.get(connector_name, "🔎")
|
||||
|
||||
|
|
@ -74,6 +75,7 @@ def get_connector_friendly_name(connector_name: str) -> str:
|
|||
"LINKUP_API": "Linkup Search",
|
||||
"AIRTABLE_CONNECTOR": "Airtable",
|
||||
"LUMA_CONNECTOR": "Luma",
|
||||
"ELASTICSEARCH_CONNECTOR": "Elasticsearch",
|
||||
}
|
||||
return connector_friendly_names.get(connector_name, connector_name)
|
||||
|
||||
|
|
|
|||
254
surfsense_backend/app/connectors/elasticsearch_connector.py
Normal file
254
surfsense_backend/app/connectors/elasticsearch_connector.py
Normal file
|
|
@ -0,0 +1,254 @@
|
|||
"""
|
||||
Elasticsearch connector for SurfSense
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from elasticsearch import AsyncElasticsearch
|
||||
from elasticsearch.exceptions import (
|
||||
AuthenticationException,
|
||||
ConnectionError,
|
||||
NotFoundError,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ElasticsearchConnector:
|
||||
"""
|
||||
Connector for Elasticsearch instances
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
url: str,
|
||||
api_key: str | None = None,
|
||||
username: str | None = None,
|
||||
password: str | None = None,
|
||||
verify_certs: bool = True,
|
||||
ca_certs: str | None = None,
|
||||
):
|
||||
"""
|
||||
Initialize Elasticsearch connector
|
||||
|
||||
Args:
|
||||
url: Full Elasticsearch URL (e.g., https://host:port or cloud endpoint)
|
||||
api_key: API key for authentication (preferred method)
|
||||
username: Username for basic authentication
|
||||
password: Password for basic authentication
|
||||
verify_certs: Whether to verify SSL certificates
|
||||
ca_certs: Path to CA certificates file
|
||||
"""
|
||||
self.url = url
|
||||
self.api_key = api_key
|
||||
self.username = username
|
||||
self.password = password
|
||||
self.verify_certs = verify_certs
|
||||
self.ca_certs = ca_certs
|
||||
|
||||
# Build connection configuration
|
||||
self.es_config = self._build_config()
|
||||
|
||||
# Initialize Elasticsearch client
|
||||
try:
|
||||
self.client = AsyncElasticsearch(**self.es_config)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to initialize Elasticsearch client: {e}")
|
||||
raise
|
||||
|
||||
def _build_config(self) -> dict[str, Any]:
|
||||
"""Build Elasticsearch client configuration"""
|
||||
config = {
|
||||
"hosts": [self.url],
|
||||
"verify_certs": self.verify_certs,
|
||||
"request_timeout": 30,
|
||||
"max_retries": 3,
|
||||
"retry_on_timeout": True,
|
||||
}
|
||||
|
||||
# Authentication - API key takes precedence
|
||||
if self.api_key:
|
||||
config["api_key"] = self.api_key
|
||||
elif self.username and self.password:
|
||||
config["basic_auth"] = (self.username, self.password)
|
||||
|
||||
# SSL configuration
|
||||
if self.ca_certs:
|
||||
config["ca_certs"] = self.ca_certs
|
||||
|
||||
return config
|
||||
|
||||
async def search(
|
||||
self,
|
||||
index: str | list[str],
|
||||
query: dict[str, Any],
|
||||
size: int = 100,
|
||||
from_: int = 0,
|
||||
fields: list[str] | None = None,
|
||||
sort: list[dict[str, Any]] | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Search documents in Elasticsearch
|
||||
|
||||
Args:
|
||||
index: Elasticsearch index name or list of indices
|
||||
query: Elasticsearch query DSL
|
||||
size: Number of results to return
|
||||
from_: Starting offset for pagination
|
||||
fields: List of fields to include in response
|
||||
sort: Sort configuration
|
||||
|
||||
Returns:
|
||||
Elasticsearch search response
|
||||
"""
|
||||
try:
|
||||
search_body: dict[str, Any] = {
|
||||
"query": query,
|
||||
"size": size,
|
||||
"from": from_,
|
||||
}
|
||||
|
||||
if fields:
|
||||
search_body["_source"] = fields
|
||||
|
||||
if sort:
|
||||
search_body["sort"] = sort
|
||||
|
||||
response = await self.client.search(index=index, body=search_body)
|
||||
|
||||
logger.info(
|
||||
f"Successfully searched index '{index}', found {response['hits']['total']['value']} results"
|
||||
)
|
||||
return response
|
||||
|
||||
except NotFoundError:
|
||||
logger.error(f"Index '{index}' not found")
|
||||
raise
|
||||
except AuthenticationException:
|
||||
logger.error("Authentication failed")
|
||||
raise
|
||||
except ConnectionError:
|
||||
logger.error("Failed to connect to Elasticsearch")
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Search failed: {e}")
|
||||
raise
|
||||
|
||||
async def get_indices(self) -> list[str]:
|
||||
"""
|
||||
Get list of available indices
|
||||
|
||||
Returns:
|
||||
List of index names
|
||||
"""
|
||||
try:
|
||||
indices = await self.client.indices.get_alias(index="*")
|
||||
return list(indices.keys())
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get indices: {e}")
|
||||
raise
|
||||
|
||||
async def get_mapping(self, index: str) -> dict[str, Any]:
|
||||
"""
|
||||
Get mapping for an index
|
||||
|
||||
Args:
|
||||
index: Index name
|
||||
|
||||
Returns:
|
||||
Index mapping
|
||||
"""
|
||||
try:
|
||||
mapping = await self.client.indices.get_mapping(index=index)
|
||||
return mapping[index]["mappings"] if index in mapping else {}
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get mapping for index '{index}': {e}")
|
||||
raise
|
||||
|
||||
async def scroll_search(
|
||||
self,
|
||||
index: str | list[str],
|
||||
query: dict[str, Any],
|
||||
size: int = 1000,
|
||||
scroll_timeout: str = "5m",
|
||||
fields: list[str] | None = None,
|
||||
):
|
||||
"""
|
||||
Perform a scroll search for large result sets
|
||||
|
||||
Args:
|
||||
index: Elasticsearch index name or list of indices
|
||||
query: Elasticsearch query DSL
|
||||
size: Number of results per scroll
|
||||
scroll_timeout: Scroll timeout
|
||||
fields: List of fields to include in response
|
||||
|
||||
Yields:
|
||||
Document hits from Elasticsearch
|
||||
"""
|
||||
try:
|
||||
search_body: dict[str, Any] = {
|
||||
"query": query,
|
||||
"size": size,
|
||||
}
|
||||
|
||||
if fields:
|
||||
search_body["_source"] = fields
|
||||
|
||||
# Initial search
|
||||
response = await self.client.search(
|
||||
index=index, body=search_body, scroll=scroll_timeout
|
||||
)
|
||||
|
||||
scroll_id = response.get("_scroll_id")
|
||||
hits = response["hits"]["hits"]
|
||||
|
||||
while hits:
|
||||
for hit in hits:
|
||||
yield hit
|
||||
|
||||
# Continue scrolling
|
||||
if scroll_id:
|
||||
response = await self.client.scroll(
|
||||
scroll_id=scroll_id, scroll=scroll_timeout
|
||||
)
|
||||
scroll_id = response.get("_scroll_id")
|
||||
hits = response["hits"]["hits"]
|
||||
|
||||
# Clear scroll
|
||||
if scroll_id:
|
||||
await self.client.clear_scroll(scroll_id=scroll_id)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Scroll search failed: {e}")
|
||||
raise
|
||||
|
||||
async def count_documents(
|
||||
self, index: str | list[str], query: dict[str, Any] | None = None
|
||||
) -> int:
|
||||
"""
|
||||
Count documents in an index
|
||||
|
||||
Args:
|
||||
index: Index name or list of indices
|
||||
query: Optional query to filter documents
|
||||
|
||||
Returns:
|
||||
Number of documents
|
||||
"""
|
||||
try:
|
||||
if query:
|
||||
response = await self.client.count(index=index, body={"query": query})
|
||||
else:
|
||||
response = await self.client.count(index=index)
|
||||
|
||||
return response["count"]
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to count documents in index '{index}': {e}")
|
||||
raise
|
||||
|
||||
async def close(self):
|
||||
"""Close the Elasticsearch client connection"""
|
||||
if hasattr(self, "client"):
|
||||
await self.client.close()
|
||||
|
|
@ -50,6 +50,7 @@ class DocumentType(str, Enum):
|
|||
GOOGLE_GMAIL_CONNECTOR = "GOOGLE_GMAIL_CONNECTOR"
|
||||
AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR"
|
||||
LUMA_CONNECTOR = "LUMA_CONNECTOR"
|
||||
ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR"
|
||||
|
||||
|
||||
class SearchSourceConnectorType(str, Enum):
|
||||
|
|
@ -68,6 +69,7 @@ class SearchSourceConnectorType(str, Enum):
|
|||
GOOGLE_GMAIL_CONNECTOR = "GOOGLE_GMAIL_CONNECTOR"
|
||||
AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR"
|
||||
LUMA_CONNECTOR = "LUMA_CONNECTOR"
|
||||
ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR"
|
||||
|
||||
|
||||
class ChatType(str, Enum):
|
||||
|
|
|
|||
|
|
@ -40,6 +40,7 @@ from app.tasks.connector_indexers import (
|
|||
index_clickup_tasks,
|
||||
index_confluence_pages,
|
||||
index_discord_messages,
|
||||
index_elasticsearch_documents,
|
||||
index_github_repos,
|
||||
index_google_calendar_events,
|
||||
index_google_gmail_messages,
|
||||
|
|
@ -363,6 +364,7 @@ async def index_connector_content(
|
|||
- JIRA_CONNECTOR: Indexes issues and comments from Jira
|
||||
- DISCORD_CONNECTOR: Indexes messages from all accessible Discord channels
|
||||
- LUMA_CONNECTOR: Indexes events from Luma
|
||||
- ELASTICSEARCH_CONNECTOR: Indexes documents from Elasticsearch
|
||||
|
||||
Args:
|
||||
connector_id: ID of the connector to use
|
||||
|
|
@ -589,6 +591,24 @@ async def index_connector_content(
|
|||
)
|
||||
response_message = "Luma indexing started in the background."
|
||||
|
||||
elif (
|
||||
connector.connector_type
|
||||
== SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR
|
||||
):
|
||||
# Run indexing in background
|
||||
logger.info(
|
||||
f"Triggering Elasticsearch indexing for connector {connector_id} into search space {search_space_id}"
|
||||
)
|
||||
background_tasks.add_task(
|
||||
run_elasticsearch_indexing_with_new_session,
|
||||
connector_id,
|
||||
search_space_id,
|
||||
str(user.id),
|
||||
indexing_from,
|
||||
indexing_to,
|
||||
)
|
||||
response_message = "Elasticsearch indexing started in the background."
|
||||
|
||||
else:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
|
|
@ -1358,3 +1378,61 @@ async def run_luma_indexing(
|
|||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in background Luma indexing task: {e!s}")
|
||||
|
||||
|
||||
async def run_elasticsearch_indexing_with_new_session(
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """Open a dedicated database session and run Elasticsearch indexing in it.

    Logs when the background task starts and when it finishes; the session
    from ``async_session_maker`` is handed to ``run_elasticsearch_indexing``.
    """
    logger.info(
        f"Background task started: Indexing Elasticsearch connector {connector_id} into space {search_space_id}"
    )

    async with async_session_maker() as db_session:
        await run_elasticsearch_indexing(
            session=db_session,
            connector_id=connector_id,
            search_space_id=search_space_id,
            user_id=user_id,
            start_date=start_date,
            end_date=end_date,
        )

    logger.info(
        f"Background task finished: Indexing Elasticsearch connector {connector_id}"
    )
|
||||
|
||||
|
||||
async def run_elasticsearch_indexing(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """Run the Elasticsearch indexer and stamp the connector on success.

    The indexer is called with ``update_last_indexed=False``; the timestamp is
    written here, and only when the indexer reported no error. Any exception
    rolls the session back and is logged instead of propagated.
    """
    try:
        doc_count, failure = await index_elasticsearch_documents(
            session,
            connector_id,
            search_space_id,
            user_id,
            start_date,
            end_date,
            update_last_indexed=False,
        )

        # Reported failure: log it and leave the timestamp untouched.
        if failure:
            logger.error(
                f"Elasticsearch indexing failed for connector {connector_id}: {failure}"
            )
            return

        logger.info(
            f"Elasticsearch indexing successful for connector {connector_id}. Indexed {doc_count} documents."
        )
        # Update the last indexed timestamp only on success
        await update_connector_last_indexed(session, connector_id)
        await session.commit()
    except Exception as exc:
        await session.rollback()
        logger.error(
            f"Critical error in run_elasticsearch_indexing for connector {connector_id}: {exc}",
            exc_info=True,
        )
|
||||
|
|
|
|||
|
|
@ -2028,3 +2028,117 @@ class ConnectorService:
|
|||
}
|
||||
|
||||
return result_object, luma_chunks
|
||||
|
||||
async def search_elasticsearch(
    self,
    user_query: str,
    user_id: str,
    search_space_id: int,
    top_k: int = 20,
    search_mode: SearchMode = SearchMode.CHUNKS,
) -> tuple:
    """
    Run a hybrid search restricted to Elasticsearch-sourced content.

    Args:
        user_query: The user's query
        user_id: The user's ID
        search_space_id: The search space ID to search in
        top_k: Maximum number of results to return
        search_mode: Search mode (CHUNKS or DOCUMENTS)

    Returns:
        tuple: (sources_info, langchain_documents), where sources_info is a
        dict with "id", "name", "type" and a "sources" list holding one entry
        per returned chunk (no deduplication).
    """
    retriever_kwargs = {
        "query_text": user_query,
        "top_k": top_k,
        "user_id": user_id,
        "search_space_id": search_space_id,
        "document_type": "ELASTICSEARCH_CONNECTOR",
    }

    if search_mode == SearchMode.CHUNKS:
        elasticsearch_chunks = await self.chunk_retriever.hybrid_search(
            **retriever_kwargs
        )
    elif search_mode == SearchMode.DOCUMENTS:
        raw_documents = await self.document_retriever.hybrid_search(
            **retriever_kwargs
        )
        # Reshape document-level results into the chunk-style format
        elasticsearch_chunks = self._transform_document_results(raw_documents)

    # Nothing found: return an empty source group
    if not elasticsearch_chunks:
        return {
            "id": 34,
            "name": "Elasticsearch",
            "type": "ELASTICSEARCH_CONNECTOR",
            "sources": [],
        }, []

    sources_list = []
    async with self.counter_lock:
        for chunk in elasticsearch_chunks:
            document = chunk.get("document", {})
            metadata = document.get("metadata", {})

            # Elasticsearch-specific metadata
            es_id = metadata.get("elasticsearch_id", "")
            es_index = metadata.get("elasticsearch_index", "")
            es_score = metadata.get("elasticsearch_score", "")

            # Title, suffixed with the index name when it is known
            title = document.get("title", "Elasticsearch Document")
            if es_index:
                title = f"{title} (Index: {es_index})"

            # Content preview: first 150 characters plus an ellipsis marker
            description = chunk.get("content", "")[:150]
            if len(description) == 150:
                description += "..."

            # Append the id / score details to the preview when available
            details = []
            if es_id:
                details.append(f"ID: {es_id}")
            if es_score:
                details.append(f"Score: {es_score}")
            if details:
                segments = [description, *details] if description else details
                description = " | ".join(segments)

            # No viewer URL is built; could point at Kibana or another UI
            # if one were configured.
            url = ""

            sources_list.append(
                {
                    "id": chunk.get("chunk_id", self.source_id_counter),
                    "title": title,
                    "description": description,
                    "url": url,
                    "elasticsearch_id": es_id,
                    "elasticsearch_index": es_index,
                    "elasticsearch_score": es_score,
                }
            )
            self.source_id_counter += 1

    # Group id 34 identifies the Elasticsearch connector
    result_object = {
        "id": 34,
        "name": "Elasticsearch",
        "type": "ELASTICSEARCH_CONNECTOR",
        "sources": sources_list,
    }

    return result_object, elasticsearch_chunks
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ Available indexers:
|
|||
- Google Gmail: Index messages from Google Gmail
|
||||
- Google Calendar: Index events from Google Calendar
|
||||
- Luma: Index events from Luma
|
||||
- Elasticsearch: Index documents from Elasticsearch instances
|
||||
"""
|
||||
|
||||
# Communication platforms
|
||||
|
|
@ -27,6 +28,7 @@ from .confluence_indexer import index_confluence_pages
|
|||
from .discord_indexer import index_discord_messages
|
||||
|
||||
# Development platforms
|
||||
from .elasticsearch_indexer import index_elasticsearch_documents
|
||||
from .github_indexer import index_github_repos
|
||||
from .google_calendar_indexer import index_google_calendar_events
|
||||
from .google_gmail_indexer import index_google_gmail_messages
|
||||
|
|
@ -46,6 +48,7 @@ __all__ = [ # noqa: RUF022
|
|||
"index_confluence_pages",
|
||||
"index_discord_messages",
|
||||
# Development platforms
|
||||
"index_elasticsearch_documents",
|
||||
"index_github_repos",
|
||||
# Calendar and scheduling
|
||||
"index_google_calendar_events",
|
||||
|
|
|
|||
|
|
@ -0,0 +1,354 @@
|
|||
"""
|
||||
Elasticsearch indexer for SurfSense
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
from datetime import UTC, datetime
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.future import select
|
||||
|
||||
from app.connectors.elasticsearch_connector import ElasticsearchConnector
|
||||
from app.db import Document, DocumentType, SearchSourceConnector
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class _ChunkingService:
|
||||
def __init__(self, chunk_size: int = 1000, overlap: int = 200) -> None:
|
||||
self.chunk_size = max(100, chunk_size)
|
||||
self.overlap = max(0, min(overlap, self.chunk_size - 1))
|
||||
|
||||
def chunk_text(self, text: str) -> list[str]:
|
||||
if not text:
|
||||
return []
|
||||
text = text.strip()
|
||||
if len(text) <= self.chunk_size:
|
||||
return [text]
|
||||
chunks: list[str] = []
|
||||
step = self.chunk_size - self.overlap
|
||||
pos = 0
|
||||
while pos < len(text):
|
||||
end = pos + self.chunk_size
|
||||
chunks.append(text[pos:end].strip())
|
||||
pos += step
|
||||
return chunks
|
||||
|
||||
|
||||
class _DocumentService:
    """Small persistence helper bound to one database session."""

    def __init__(self, session):
        self.session = session

    async def get_document_by_hash(self, content_hash: str):
        """Return the first Document with this content hash, or None.

        A falsy ``content_hash`` short-circuits to None without querying.
        """
        from sqlalchemy.future import select

        from app.db import Document

        if not content_hash:
            return None

        stmt = select(Document).where(Document.content_hash == content_hash)
        rows = await self.session.execute(stmt)
        return rows.scalars().first()

    async def create_chunks_for_document(self, document_id: int, chunks: list[str]):
        """Add one Chunk row per text piece for the document, then flush."""
        from app.db import Chunk

        self.session.add_all(
            [Chunk(content=piece, document_id=document_id) for piece in chunks]
        )
        await self.session.flush()
|
||||
|
||||
|
||||
async def index_elasticsearch_documents(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
    update_last_indexed: bool = True,
) -> tuple[int, str | None]:
    """
    Index documents from Elasticsearch into SurfSense

    Loads the connector row, validates its config, scrolls through the
    configured index and stores each hit as a Document plus text chunks.
    Hits whose rendered content is empty, or whose content hash already
    exists, are skipped and not counted.

    Args:
        session: Database session
        connector_id: Elasticsearch connector ID
        search_space_id: Search space ID
        user_id: User ID (not referenced in the body)
        start_date: Start date for indexing (not used for Elasticsearch, kept for compatibility)
        end_date: End date for indexing (not used for Elasticsearch, kept for compatibility)
        update_last_indexed: Whether to update the last indexed timestamp

    Returns:
        Tuple of (number of newly stored documents, None) on success, or
        (0, error message) when validation fails or an exception escapes.
    """
    es_connector = None
    try:
        # Get the connector configuration
        result = await session.execute(
            select(SearchSourceConnector).filter(
                SearchSourceConnector.id == connector_id
            )
        )
        connector = result.scalars().first()

        if not connector:
            error_msg = f"Elasticsearch connector with ID {connector_id} not found"
            logger.error(error_msg)
            return 0, error_msg

        # Get connector configuration
        config = connector.config

        # Validate required fields - now only URL and INDEX are required
        # Authentication can be either API key OR username/password
        if "ELASTICSEARCH_URL" not in config:
            error_msg = "Missing required field in connector config: ELASTICSEARCH_URL"
            logger.error(error_msg)
            return 0, error_msg

        if "ELASTICSEARCH_INDEX" not in config:
            error_msg = (
                "Missing required field in connector config: ELASTICSEARCH_INDEX"
            )
            logger.error(error_msg)
            return 0, error_msg

        # Check authentication - must have either API key or username+password
        has_api_key = (
            "ELASTICSEARCH_API_KEY" in config and config["ELASTICSEARCH_API_KEY"]
        )
        has_basic_auth = (
            "ELASTICSEARCH_USERNAME" in config
            and config["ELASTICSEARCH_USERNAME"]
            and "ELASTICSEARCH_PASSWORD" in config
            and config["ELASTICSEARCH_PASSWORD"]
        )

        if not has_api_key and not has_basic_auth:
            error_msg = "Missing authentication: provide either ELASTICSEARCH_API_KEY or ELASTICSEARCH_USERNAME + ELASTICSEARCH_PASSWORD"
            logger.error(error_msg)
            return 0, error_msg

        # Initialize document service
        document_service = _DocumentService(session)
        chunking_service = _ChunkingService()

        # Initialize Elasticsearch connector
        es_connector = ElasticsearchConnector(
            url=config["ELASTICSEARCH_URL"],
            api_key=config.get("ELASTICSEARCH_API_KEY"),
            username=config.get("ELASTICSEARCH_USERNAME"),
            password=config.get("ELASTICSEARCH_PASSWORD"),
            verify_certs=config.get("ELASTICSEARCH_VERIFY_CERTS", True),
            ca_certs=config.get("ELASTICSEARCH_CA_CERTS"),
        )

        # Build query based on configuration
        query = _build_elasticsearch_query(config)

        # Get the index name(s) - can be a string or list
        index_name = config["ELASTICSEARCH_INDEX"]

        # Get max documents to index
        max_documents = config.get("ELASTICSEARCH_MAX_DOCUMENTS", 1000)

        logger.info(
            f"Starting Elasticsearch indexing for index '{index_name}' with max {max_documents} documents"
        )

        # Counts only documents actually stored; skipped hits do not count
        # towards max_documents.
        documents_processed = 0

        try:
            # Use scroll search for large result sets
            async for hit in es_connector.scroll_search(
                index=index_name,
                query=query,
                size=min(max_documents, 100),  # Scroll in batches
                fields=config.get("ELASTICSEARCH_FIELDS"),
            ):
                if documents_processed >= max_documents:
                    break

                try:
                    # Extract document data
                    doc_id = hit["_id"]
                    source = hit.get("_source", {})

                    # Build document title: configured field first, otherwise
                    # the first of title/name/subject present in the source,
                    # otherwise the Elasticsearch document id.
                    title_field = config.get("ELASTICSEARCH_TITLE_FIELD")
                    if not title_field:
                        for candidate in ("title", "name", "subject"):
                            if candidate in source:
                                title_field = candidate
                                break
                    title = (
                        str(source.get(title_field, doc_id))
                        if title_field is not None
                        else str(doc_id)
                    )

                    # Build document content
                    content = _build_document_content(source, config)

                    if not content.strip():
                        logger.warning(f"Skipping document {doc_id} - no content found")
                        continue

                    # Create content hash (SHA-256 of the rendered content)
                    content_hash = hashlib.sha256(content.encode()).hexdigest()

                    # Build metadata
                    metadata = {
                        "elasticsearch_id": doc_id,
                        "elasticsearch_index": hit.get("_index", index_name),
                        "elasticsearch_score": hit.get("_score"),
                        # naive local time, unlike the UTC value used further down
                        "indexed_at": datetime.now().isoformat(),
                        "source": "ELASTICSEARCH_CONNECTOR",
                    }

                    # Add any additional metadata fields specified in config
                    if "ELASTICSEARCH_METADATA_FIELDS" in config:
                        for field in config["ELASTICSEARCH_METADATA_FIELDS"]:
                            if field in source:
                                metadata[f"es_{field}"] = source[field]

                    # Check if document already exists (lookup is by hash only,
                    # not scoped to this search space)
                    existing_doc = await document_service.get_document_by_hash(
                        content_hash
                    )

                    if existing_doc:
                        logger.debug(f"Document {doc_id} already exists, skipping")
                        continue

                    # Create document
                    document = Document(
                        title=title,
                        content=content,
                        content_hash=content_hash,
                        document_type=DocumentType.ELASTICSEARCH_CONNECTOR,
                        document_metadata=metadata,
                        search_space_id=search_space_id,
                    )

                    # Add document to session
                    session.add(document)
                    await session.flush()  # Get the document ID

                    # Create chunks
                    chunks = chunking_service.chunk_text(content)
                    await document_service.create_chunks_for_document(
                        document.id, chunks
                    )

                    documents_processed += 1

                    # Commit in batches of 10 stored documents
                    if documents_processed % 10 == 0:
                        logger.info(
                            f"Processed {documents_processed} Elasticsearch documents"
                        )
                        await session.commit()

                except Exception as e:
                    # NOTE(review): the failure is logged and the loop moves
                    # on without a rollback; if the error came from
                    # flush/commit the session may need one before further
                    # use — confirm.
                    logger.error(
                        f"Error processing Elasticsearch document {hit.get('_id', 'unknown')}: {e}"
                    )
                    continue

            # Final commit
            await session.commit()

            logger.info(
                f"Successfully indexed {documents_processed} documents from Elasticsearch"
            )

            # Update last indexed timestamp if requested
            # NOTE(review): the timestamp is written twice under the same
            # condition; the naive datetime set here is overwritten by the ISO
            # string below. Looks like a leftover — confirm which value the
            # column expects and drop the other.
            if update_last_indexed and documents_processed > 0:
                connector.last_indexed_at = datetime.now()
                await session.commit()
            if update_last_indexed and documents_processed > 0:
                # store ISO-8601 UTC timestamp with 'Z' suffix, e.g. 2025-10-09T22:04:53.599658Z
                connector.last_indexed_at = (
                    datetime.now(UTC).isoformat().replace("+00:00", "Z")
                )
                await session.commit()

            return documents_processed, None

        finally:
            # Clean up Elasticsearch connection
            if es_connector:
                await es_connector.close()

    except Exception as e:
        error_msg = f"Error indexing Elasticsearch documents: {e}"
        logger.error(error_msg, exc_info=True)
        await session.rollback()
        # Covers failures after the connector was created but before the inner
        # try; when the inner try raised, the finally above has already closed
        # it, so this is a second close() call.
        if es_connector:
            await es_connector.close()
        return 0, error_msg
|
||||
|
||||
|
||||
def _build_elasticsearch_query(config: dict[str, Any]) -> dict[str, Any]:
|
||||
"""
|
||||
Build Elasticsearch query from connector configuration
|
||||
|
||||
Args:
|
||||
config: Connector configuration
|
||||
|
||||
Returns:
|
||||
Elasticsearch query DSL
|
||||
"""
|
||||
# Check if custom query is provided
|
||||
if config.get("ELASTICSEARCH_QUERY"):
|
||||
try:
|
||||
if isinstance(config["ELASTICSEARCH_QUERY"], str):
|
||||
return json.loads(config["ELASTICSEARCH_QUERY"])
|
||||
else:
|
||||
return config["ELASTICSEARCH_QUERY"]
|
||||
except (json.JSONDecodeError, TypeError) as e:
|
||||
logger.warning(f"Invalid custom query, using match_all: {e}")
|
||||
|
||||
# Default to match all documents
|
||||
return {"match_all": {}}
|
||||
|
||||
|
||||
def _build_document_content(source: dict[str, Any], config: dict[str, Any]) -> str:
|
||||
"""
|
||||
Build document content from Elasticsearch document source
|
||||
|
||||
Args:
|
||||
source: Elasticsearch document source
|
||||
config: Connector configuration
|
||||
|
||||
Returns:
|
||||
Formatted document content
|
||||
"""
|
||||
content_parts = []
|
||||
|
||||
# Get content fields from config
|
||||
content_fields = config.get("ELASTICSEARCH_CONTENT_FIELDS", [])
|
||||
|
||||
if content_fields:
|
||||
# Use specified content fields
|
||||
for field in content_fields:
|
||||
if field in source:
|
||||
field_value = source[field]
|
||||
if isinstance(field_value, str | int | float):
|
||||
content_parts.append(f"{field}: {field_value}")
|
||||
if isinstance(field_value, str | int | float):
|
||||
content_parts.append(f"{field}: {json.dumps(field_value)}")
|
||||
else:
|
||||
# Use all fields if no specific content fields specified
|
||||
for key, value in source.items():
|
||||
if isinstance(value, str | int | float):
|
||||
content_parts.append(f"{key}: {value}")
|
||||
elif isinstance(value, list | dict):
|
||||
content_parts.append(f"{key}: {json.dumps(value)}")
|
||||
|
||||
return "\n".join(content_parts)
|
||||
|
|
@ -43,6 +43,7 @@ dependencies = [
|
|||
"youtube-transcript-api>=1.0.3",
|
||||
"litellm>=1.77.5",
|
||||
"langchain-litellm>=0.2.3",
|
||||
"elasticsearch>=9.1.1",
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
|
|
|
|||
5003
surfsense_backend/uv.lock
generated
5003
surfsense_backend/uv.lock
generated
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue