feat: added elasticsearch connector

This commit is contained in:
Anish Sarkar 2025-10-12 09:39:04 +05:30
parent 402039f02f
commit 55d752e3c8
27 changed files with 4331 additions and 2499 deletions

View file

@ -0,0 +1,56 @@
"""Add ElasticSearch connector enums
Revision ID: 26
Revises: 25
Create Date: 2025-10-12 12:00:00.000000
"""
from collections.abc import Sequence
from alembic import op
# Revision identifiers; the values mirror the "Revision ID" / "Revises" lines
# of the module docstring above.
revision: str = "26"
# Revision this migration is applied on top of.
down_revision: str | None = "25"
# No branch labels or cross-revision dependencies are declared here.
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Add 'ELASTICSEARCH_CONNECTOR' to both enum types if it is missing.

    Each statement is a PL/pgSQL ``DO`` block that first looks the label up in
    ``pg_enum`` and only issues ``ALTER TYPE ... ADD VALUE`` when the lookup
    finds nothing, so re-running the migration does not fail on an existing
    label.
    """
    # Guarded addition for the connector-type enum.
    connector_enum_sql = (
        "\n"
        "DO $$\n"
        "BEGIN\n"
        "IF NOT EXISTS (\n"
        "SELECT 1 FROM pg_type t\n"
        "JOIN pg_enum e ON t.oid = e.enumtypid\n"
        "WHERE t.typname = 'searchsourceconnectortype' AND e.enumlabel = 'ELASTICSEARCH_CONNECTOR'\n"
        ") THEN\n"
        "ALTER TYPE searchsourceconnectortype ADD VALUE 'ELASTICSEARCH_CONNECTOR';\n"
        "END IF;\n"
        "END\n"
        "$$;\n"
    )
    # Guarded addition for the document-type enum.
    document_enum_sql = (
        "\n"
        "DO $$\n"
        "BEGIN\n"
        "IF NOT EXISTS (\n"
        "SELECT 1 FROM pg_type t\n"
        "JOIN pg_enum e ON t.oid = e.enumtypid\n"
        "WHERE t.typname = 'documenttype' AND e.enumlabel = 'ELASTICSEARCH_CONNECTOR'\n"
        ") THEN\n"
        "ALTER TYPE documenttype ADD VALUE 'ELASTICSEARCH_CONNECTOR';\n"
        "END IF;\n"
        "END\n"
        "$$;\n"
    )

    # Same execution order as before: connector type first, document type second.
    for statement in (connector_enum_sql, document_enum_sql):
        op.execute(statement)
def downgrade() -> None:
    """Intentionally a no-op: the 'ELASTICSEARCH_CONNECTOR' enum values stay in place.

    NOTE(review): presumably left empty because PostgreSQL offers no direct way
    to drop a single enum label - confirm before relying on a real downgrade.
    """
    return None

View file

@ -488,6 +488,25 @@ async def fetch_documents_by_ids(
)
url = metadata.get("url", "")
elif doc_type == "ELASTICSEARCH_CONNECTOR":
# Prefer explicit title in metadata/source, otherwise fallback to doc.title
es_title = (
metadata.get("title")
or metadata.get("es_title")
or doc.title
or f"Elasticsearch: {metadata.get('elasticsearch_index', '')}"
)
title = es_title
description = metadata.get("description") or (
doc.content[:100] + "..."
if len(doc.content) > 100
else doc.content
)
# If a link or index info is stored, surface it
url = metadata.get("url", "") or metadata.get(
"elasticsearch_index", ""
)
else: # FILE and other types
title = doc.title
description = (
@ -512,6 +531,7 @@ async def fetch_documents_by_ids(
"SLACK_CONNECTOR": "Slack (Selected)",
"NOTION_CONNECTOR": "Notion (Selected)",
"GITHUB_CONNECTOR": "GitHub (Selected)",
"ELASTICSEARCH_CONNECTOR": "Elasticsearch (Selected)",
"YOUTUBE_VIDEO": "YouTube Videos (Selected)",
"DISCORD_CONNECTOR": "Discord (Selected)",
"JIRA_CONNECTOR": "Jira Issues (Selected)",
@ -1266,6 +1286,33 @@ async def fetch_relevant_documents(
}
)
elif connector == "ELASTICSEARCH_CONNECTOR":
(
source_object,
elasticsearch_chunks,
) = await connector_service.search_elasticsearch(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
)
# Add to sources and raw documents
if source_object:
all_sources.append(source_object)
all_raw_documents.extend(elasticsearch_chunks)
# Stream found document count
if streaming_service and writer:
writer(
{
"yield_value": streaming_service.format_terminal_info_delta(
f"🔎 Found {len(elasticsearch_chunks)} Elasticsearch chunks related to your query"
)
}
)
except Exception as e:
logging.error("Error in search_airtable: %s", traceback.format_exc())
error_message = f"Error searching connector {connector}: {e!s}"

View file

@ -51,6 +51,7 @@ def get_connector_emoji(connector_name: str) -> str:
"GOOGLE_CALENDAR_CONNECTOR": "📅",
"AIRTABLE_CONNECTOR": "🗃️",
"LUMA_CONNECTOR": "",
"ELASTICSEARCH_CONNECTOR": "🔎",
}
return connector_emojis.get(connector_name, "🔎")
@ -74,6 +75,7 @@ def get_connector_friendly_name(connector_name: str) -> str:
"LINKUP_API": "Linkup Search",
"AIRTABLE_CONNECTOR": "Airtable",
"LUMA_CONNECTOR": "Luma",
"ELASTICSEARCH_CONNECTOR": "Elasticsearch",
}
return connector_friendly_names.get(connector_name, connector_name)

View file

@ -0,0 +1,254 @@
"""
Elasticsearch connector for SurfSense
"""
import logging
from typing import Any
from elasticsearch import AsyncElasticsearch
from elasticsearch.exceptions import (
AuthenticationException,
ConnectionError,
NotFoundError,
)
logger = logging.getLogger(__name__)
class ElasticsearchConnector:
"""
Connector for Elasticsearch instances
"""
def __init__(
self,
url: str,
api_key: str | None = None,
username: str | None = None,
password: str | None = None,
verify_certs: bool = True,
ca_certs: str | None = None,
):
"""
Initialize Elasticsearch connector
Args:
url: Full Elasticsearch URL (e.g., https://host:port or cloud endpoint)
api_key: API key for authentication (preferred method)
username: Username for basic authentication
password: Password for basic authentication
verify_certs: Whether to verify SSL certificates
ca_certs: Path to CA certificates file
"""
self.url = url
self.api_key = api_key
self.username = username
self.password = password
self.verify_certs = verify_certs
self.ca_certs = ca_certs
# Build connection configuration
self.es_config = self._build_config()
# Initialize Elasticsearch client
try:
self.client = AsyncElasticsearch(**self.es_config)
except Exception as e:
logger.error(f"Failed to initialize Elasticsearch client: {e}")
raise
def _build_config(self) -> dict[str, Any]:
"""Build Elasticsearch client configuration"""
config = {
"hosts": [self.url],
"verify_certs": self.verify_certs,
"request_timeout": 30,
"max_retries": 3,
"retry_on_timeout": True,
}
# Authentication - API key takes precedence
if self.api_key:
config["api_key"] = self.api_key
elif self.username and self.password:
config["basic_auth"] = (self.username, self.password)
# SSL configuration
if self.ca_certs:
config["ca_certs"] = self.ca_certs
return config
async def search(
self,
index: str | list[str],
query: dict[str, Any],
size: int = 100,
from_: int = 0,
fields: list[str] | None = None,
sort: list[dict[str, Any]] | None = None,
) -> dict[str, Any]:
"""
Search documents in Elasticsearch
Args:
index: Elasticsearch index name or list of indices
query: Elasticsearch query DSL
size: Number of results to return
from_: Starting offset for pagination
fields: List of fields to include in response
sort: Sort configuration
Returns:
Elasticsearch search response
"""
try:
search_body: dict[str, Any] = {
"query": query,
"size": size,
"from": from_,
}
if fields:
search_body["_source"] = fields
if sort:
search_body["sort"] = sort
response = await self.client.search(index=index, body=search_body)
logger.info(
f"Successfully searched index '{index}', found {response['hits']['total']['value']} results"
)
return response
except NotFoundError:
logger.error(f"Index '{index}' not found")
raise
except AuthenticationException:
logger.error("Authentication failed")
raise
except ConnectionError:
logger.error("Failed to connect to Elasticsearch")
raise
except Exception as e:
logger.error(f"Search failed: {e}")
raise
async def get_indices(self) -> list[str]:
"""
Get list of available indices
Returns:
List of index names
"""
try:
indices = await self.client.indices.get_alias(index="*")
return list(indices.keys())
except Exception as e:
logger.error(f"Failed to get indices: {e}")
raise
async def get_mapping(self, index: str) -> dict[str, Any]:
"""
Get mapping for an index
Args:
index: Index name
Returns:
Index mapping
"""
try:
mapping = await self.client.indices.get_mapping(index=index)
return mapping[index]["mappings"] if index in mapping else {}
except Exception as e:
logger.error(f"Failed to get mapping for index '{index}': {e}")
raise
async def scroll_search(
self,
index: str | list[str],
query: dict[str, Any],
size: int = 1000,
scroll_timeout: str = "5m",
fields: list[str] | None = None,
):
"""
Perform a scroll search for large result sets
Args:
index: Elasticsearch index name or list of indices
query: Elasticsearch query DSL
size: Number of results per scroll
scroll_timeout: Scroll timeout
fields: List of fields to include in response
Yields:
Document hits from Elasticsearch
"""
try:
search_body: dict[str, Any] = {
"query": query,
"size": size,
}
if fields:
search_body["_source"] = fields
# Initial search
response = await self.client.search(
index=index, body=search_body, scroll=scroll_timeout
)
scroll_id = response.get("_scroll_id")
hits = response["hits"]["hits"]
while hits:
for hit in hits:
yield hit
# Continue scrolling
if scroll_id:
response = await self.client.scroll(
scroll_id=scroll_id, scroll=scroll_timeout
)
scroll_id = response.get("_scroll_id")
hits = response["hits"]["hits"]
# Clear scroll
if scroll_id:
await self.client.clear_scroll(scroll_id=scroll_id)
except Exception as e:
logger.error(f"Scroll search failed: {e}")
raise
async def count_documents(
self, index: str | list[str], query: dict[str, Any] | None = None
) -> int:
"""
Count documents in an index
Args:
index: Index name or list of indices
query: Optional query to filter documents
Returns:
Number of documents
"""
try:
if query:
response = await self.client.count(index=index, body={"query": query})
else:
response = await self.client.count(index=index)
return response["count"]
except Exception as e:
logger.error(f"Failed to count documents in index '{index}': {e}")
raise
async def close(self):
"""Close the Elasticsearch client connection"""
if hasattr(self, "client"):
await self.client.close()

View file

@ -50,6 +50,7 @@ class DocumentType(str, Enum):
GOOGLE_GMAIL_CONNECTOR = "GOOGLE_GMAIL_CONNECTOR"
AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR"
LUMA_CONNECTOR = "LUMA_CONNECTOR"
ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR"
class SearchSourceConnectorType(str, Enum):
@ -68,6 +69,7 @@ class SearchSourceConnectorType(str, Enum):
GOOGLE_GMAIL_CONNECTOR = "GOOGLE_GMAIL_CONNECTOR"
AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR"
LUMA_CONNECTOR = "LUMA_CONNECTOR"
ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR"
class ChatType(str, Enum):

View file

@ -40,6 +40,7 @@ from app.tasks.connector_indexers import (
index_clickup_tasks,
index_confluence_pages,
index_discord_messages,
index_elasticsearch_documents,
index_github_repos,
index_google_calendar_events,
index_google_gmail_messages,
@ -363,6 +364,7 @@ async def index_connector_content(
- JIRA_CONNECTOR: Indexes issues and comments from Jira
- DISCORD_CONNECTOR: Indexes messages from all accessible Discord channels
- LUMA_CONNECTOR: Indexes events from Luma
- ELASTICSEARCH_CONNECTOR: Indexes documents from Elasticsearch
Args:
connector_id: ID of the connector to use
@ -589,6 +591,24 @@ async def index_connector_content(
)
response_message = "Luma indexing started in the background."
elif (
connector.connector_type
== SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR
):
# Run indexing in background
logger.info(
f"Triggering Elasticsearch indexing for connector {connector_id} into search space {search_space_id}"
)
background_tasks.add_task(
run_elasticsearch_indexing_with_new_session,
connector_id,
search_space_id,
str(user.id),
indexing_from,
indexing_to,
)
response_message = "Elasticsearch indexing started in the background."
else:
raise HTTPException(
status_code=400,
@ -1358,3 +1378,61 @@ async def run_luma_indexing(
)
except Exception as e:
logger.error(f"Error in background Luma indexing task: {e!s}")
async def run_elasticsearch_indexing_with_new_session(
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """Run Elasticsearch indexing inside a session opened just for this task.

    Opens a session from ``async_session_maker``, delegates to
    ``run_elasticsearch_indexing`` with the given arguments, and logs when the
    task starts and finishes.
    """
    logger.info(
        f"Background task started: Indexing Elasticsearch connector {connector_id} into space {search_space_id}"
    )

    async with async_session_maker() as db_session:
        await run_elasticsearch_indexing(
            db_session,
            connector_id,
            search_space_id,
            user_id,
            start_date,
            end_date,
        )

    logger.info(
        f"Background task finished: Indexing Elasticsearch connector {connector_id}"
    )
async def run_elasticsearch_indexing(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
):
    """Index an Elasticsearch connector and record the run on success.

    Calls ``index_elasticsearch_documents`` with ``update_last_indexed=False``.
    When it reports an error the error is only logged. Otherwise
    ``update_connector_last_indexed`` is called and the session is committed.
    Any exception is logged with its traceback after the session is rolled
    back; nothing is re-raised.
    """
    try:
        outcome = await index_elasticsearch_documents(
            session,
            connector_id,
            search_space_id,
            user_id,
            start_date,
            end_date,
            update_last_indexed=False,
        )
        indexed_count, error_message = outcome

        # Failure reported by the indexer: log it and leave the timestamp alone.
        if error_message:
            logger.error(
                f"Elasticsearch indexing failed for connector {connector_id}: {error_message}"
            )
            return

        logger.info(
            f"Elasticsearch indexing successful for connector {connector_id}. Indexed {indexed_count} documents."
        )
        # The last-indexed timestamp is touched only on the success path.
        await update_connector_last_indexed(session, connector_id)
        await session.commit()
    except Exception as exc:
        await session.rollback()
        logger.error(
            f"Critical error in run_elasticsearch_indexing for connector {connector_id}: {exc}",
            exc_info=True,
        )

View file

@ -2028,3 +2028,117 @@ class ConnectorService:
}
return result_object, luma_chunks
async def search_elasticsearch(
    self,
    user_query: str,
    user_id: str,
    search_space_id: int,
    top_k: int = 20,
    search_mode: SearchMode = SearchMode.CHUNKS,
) -> tuple:
    """
    Run a hybrid search restricted to Elasticsearch documents.

    Args:
        user_query: The user's query
        user_id: The user's ID
        search_space_id: The search space ID to search in
        top_k: Maximum number of results to return
        search_mode: Search mode (CHUNKS or DOCUMENTS)

    Returns:
        tuple: (sources_info, langchain_documents) - a source descriptor dict
        with one entry per result, and the result list itself
    """
    retriever_args = {
        "query_text": user_query,
        "top_k": top_k,
        "user_id": user_id,
        "search_space_id": search_space_id,
        "document_type": "ELASTICSEARCH_CONNECTOR",
    }

    if search_mode == SearchMode.CHUNKS:
        elasticsearch_chunks = await self.chunk_retriever.hybrid_search(
            **retriever_args
        )
    elif search_mode == SearchMode.DOCUMENTS:
        document_hits = await self.document_retriever.hybrid_search(**retriever_args)
        # Document-level hits are reshaped into the chunk-style result format.
        elasticsearch_chunks = self._transform_document_results(document_hits)

    # Nothing found: return the connector descriptor with no sources.
    if not elasticsearch_chunks:
        return {
            "id": 34,
            "name": "Elasticsearch",
            "type": "ELASTICSEARCH_CONNECTOR",
            "sources": [],
        }, []

    # One source entry per result, no deduplication.
    sources_list = []
    async with self.counter_lock:
        for chunk in elasticsearch_chunks:
            document = chunk.get("document", {})
            metadata = document.get("metadata", {})

            es_id = metadata.get("elasticsearch_id", "")
            es_index = metadata.get("elasticsearch_index", "")
            es_score = metadata.get("elasticsearch_score", "")

            # Title, annotated with the originating index when it is known.
            title = document.get("title", "Elasticsearch Document")
            if es_index:
                title = f"{title} (Index: {es_index})"

            # First 150 characters of the content, with an ellipsis marker
            # when the preview is exactly 150 characters long.
            description = chunk.get("content", "")[:150]
            if len(description) == 150:
                description += "..."

            # Append the Elasticsearch id / score, separated by " | ".
            info_parts = []
            if es_id:
                info_parts.append(f"ID: {es_id}")
            if es_score:
                info_parts.append(f"Score: {es_score}")
            leading = [description] if description else []
            description = " | ".join(leading + info_parts)

            sources_list.append(
                {
                    "id": chunk.get("chunk_id", self.source_id_counter),
                    "title": title,
                    "description": description,
                    # No viewer URL is built here; the field stays empty.
                    "url": "",
                    "elasticsearch_id": es_id,
                    "elasticsearch_index": es_index,
                    "elasticsearch_score": es_score,
                }
            )
            self.source_id_counter += 1

    result_object = {
        "id": 34,  # ID used for the Elasticsearch connector's source group
        "name": "Elasticsearch",
        "type": "ELASTICSEARCH_CONNECTOR",
        "sources": sources_list,
    }
    return result_object, elasticsearch_chunks

View file

@ -17,6 +17,7 @@ Available indexers:
- Google Gmail: Index messages from Google Gmail
- Google Calendar: Index events from Google Calendar
- Luma: Index events from Luma
- Elasticsearch: Index documents from Elasticsearch instances
"""
# Communication platforms
@ -27,6 +28,7 @@ from .confluence_indexer import index_confluence_pages
from .discord_indexer import index_discord_messages
# Development platforms
from .elasticsearch_indexer import index_elasticsearch_documents
from .github_indexer import index_github_repos
from .google_calendar_indexer import index_google_calendar_events
from .google_gmail_indexer import index_google_gmail_messages
@ -46,6 +48,7 @@ __all__ = [ # noqa: RUF022
"index_confluence_pages",
"index_discord_messages",
# Development platforms
"index_elasticsearch_documents",
"index_github_repos",
# Calendar and scheduling
"index_google_calendar_events",

View file

@ -0,0 +1,354 @@
"""
Elasticsearch indexer for SurfSense
"""
import hashlib
import json
import logging
from datetime import UTC, datetime
from typing import Any
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from app.connectors.elasticsearch_connector import ElasticsearchConnector
from app.db import Document, DocumentType, SearchSourceConnector
logger = logging.getLogger(__name__)
class _ChunkingService:
def __init__(self, chunk_size: int = 1000, overlap: int = 200) -> None:
self.chunk_size = max(100, chunk_size)
self.overlap = max(0, min(overlap, self.chunk_size - 1))
def chunk_text(self, text: str) -> list[str]:
if not text:
return []
text = text.strip()
if len(text) <= self.chunk_size:
return [text]
chunks: list[str] = []
step = self.chunk_size - self.overlap
pos = 0
while pos < len(text):
end = pos + self.chunk_size
chunks.append(text[pos:end].strip())
pos += step
return chunks
class _DocumentService:
    """Small persistence helper bound to a single database session."""

    def __init__(self, session):
        self.session = session

    async def get_document_by_hash(self, content_hash: str):
        """Return the first ``Document`` whose ``content_hash`` matches, else ``None``.

        A falsy ``content_hash`` short-circuits to ``None`` without querying.
        """
        from sqlalchemy.future import select

        from app.db import Document

        if not content_hash:
            return None

        lookup = select(Document).where(Document.content_hash == content_hash)
        rows = await self.session.execute(lookup)
        return rows.scalars().first()

    async def create_chunks_for_document(self, document_id: int, chunks: list[str]):
        """Add one ``Chunk`` row per text piece for ``document_id`` and flush.

        Only ``content`` and ``document_id`` are set on each row.
        """
        from app.db import Chunk

        self.session.add_all(
            [Chunk(content=piece, document_id=document_id) for piece in chunks]
        )
        await self.session.flush()
async def index_elasticsearch_documents(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str,
    end_date: str,
    update_last_indexed: bool = True,
) -> tuple[int, str | None]:
    """
    Index documents from Elasticsearch into SurfSense

    Loads the connector row, validates its config, scrolls through the
    configured index and stores one ``Document`` plus its text chunks for every
    hit whose content hash is not already present. Work is committed every 10
    documents and once more at the end.

    Args:
        session: Database session
        connector_id: Elasticsearch connector ID
        search_space_id: Search space ID
        user_id: User ID (not used by this function, kept for compatibility)
        start_date: Start date for indexing (not used for Elasticsearch, kept for compatibility)
        end_date: End date for indexing (not used for Elasticsearch, kept for compatibility)
        update_last_indexed: Whether to update the last indexed timestamp

    Returns:
        Tuple of (number of documents processed, error message if any). On any
        unexpected exception the session is rolled back and ``(0, message)``
        is returned instead of raising.
    """
    # Function-scope import keeps this block self-contained.
    from contextlib import aclosing

    es_connector = None
    try:
        # Get the connector configuration
        result = await session.execute(
            select(SearchSourceConnector).filter(
                SearchSourceConnector.id == connector_id
            )
        )
        connector = result.scalars().first()
        if not connector:
            error_msg = f"Elasticsearch connector with ID {connector_id} not found"
            logger.error(error_msg)
            return 0, error_msg

        config = connector.config or {}

        error_msg = _validate_elasticsearch_config(config)
        if error_msg:
            logger.error(error_msg)
            return 0, error_msg

        document_service = _DocumentService(session)
        chunking_service = _ChunkingService()

        es_connector = ElasticsearchConnector(
            url=config["ELASTICSEARCH_URL"],
            api_key=config.get("ELASTICSEARCH_API_KEY"),
            username=config.get("ELASTICSEARCH_USERNAME"),
            password=config.get("ELASTICSEARCH_PASSWORD"),
            verify_certs=config.get("ELASTICSEARCH_VERIFY_CERTS", True),
            ca_certs=config.get("ELASTICSEARCH_CA_CERTS"),
        )

        query = _build_elasticsearch_query(config)
        # Index name(s) - can be a string or a list
        index_name = config["ELASTICSEARCH_INDEX"]
        max_documents = config.get("ELASTICSEARCH_MAX_DOCUMENTS", 1000)

        logger.info(
            f"Starting Elasticsearch indexing for index '{index_name}' with max {max_documents} documents"
        )

        documents_processed = 0

        # aclosing() closes the scroll generator even when the loop stops
        # early at max_documents, instead of leaving it suspended.
        async with aclosing(
            es_connector.scroll_search(
                index=index_name,
                query=query,
                size=min(max_documents, 100),  # Scroll in batches
                fields=config.get("ELASTICSEARCH_FIELDS"),
            )
        ) as hits:
            async for hit in hits:
                if documents_processed >= max_documents:
                    break

                try:
                    doc_id = hit["_id"]
                    source = hit.get("_source", {})

                    title = _resolve_elasticsearch_title(source, doc_id, config)

                    content = _build_document_content(source, config)
                    if not content.strip():
                        logger.warning(
                            f"Skipping document {doc_id} - no content found"
                        )
                        continue

                    content_hash = hashlib.sha256(content.encode()).hexdigest()

                    metadata = {
                        "elasticsearch_id": doc_id,
                        "elasticsearch_index": hit.get("_index", index_name),
                        "elasticsearch_score": hit.get("_score"),
                        # Timezone-aware so the stored value is unambiguous.
                        "indexed_at": datetime.now(UTC).isoformat(),
                        "source": "ELASTICSEARCH_CONNECTOR",
                    }
                    # Copy any extra source fields requested in the config.
                    for field in config.get("ELASTICSEARCH_METADATA_FIELDS") or []:
                        if field in source:
                            metadata[f"es_{field}"] = source[field]

                    # Skip content that is already stored.
                    existing_doc = await document_service.get_document_by_hash(
                        content_hash
                    )
                    if existing_doc:
                        logger.debug(f"Document {doc_id} already exists, skipping")
                        continue

                    document = Document(
                        title=title,
                        content=content,
                        content_hash=content_hash,
                        document_type=DocumentType.ELASTICSEARCH_CONNECTOR,
                        document_metadata=metadata,
                        search_space_id=search_space_id,
                    )

                    # A savepoint per document: if the flush or chunk insert
                    # fails, only this document is rolled back and the session
                    # stays usable for the following hits.
                    async with session.begin_nested():
                        session.add(document)
                        await session.flush()  # Get the document ID
                        chunks = chunking_service.chunk_text(content)
                        await document_service.create_chunks_for_document(
                            document.id, chunks
                        )

                    documents_processed += 1

                    if documents_processed % 10 == 0:
                        logger.info(
                            f"Processed {documents_processed} Elasticsearch documents"
                        )
                        await session.commit()

                except Exception as e:
                    # One bad document must not abort the whole run.
                    logger.error(
                        f"Error processing Elasticsearch document {hit.get('_id', 'unknown')}: {e}"
                    )

        # Final commit
        await session.commit()

        logger.info(
            f"Successfully indexed {documents_processed} documents from Elasticsearch"
        )

        # Stamp the connector once, with a timezone-aware datetime.
        if update_last_indexed and documents_processed > 0:
            connector.last_indexed_at = datetime.now(UTC)
            await session.commit()

        return documents_processed, None

    except Exception as e:
        error_msg = f"Error indexing Elasticsearch documents: {e}"
        logger.error(error_msg, exc_info=True)
        await session.rollback()
        return 0, error_msg
    finally:
        # Single cleanup point: runs on success, early return and failure.
        if es_connector:
            await es_connector.close()


def _validate_elasticsearch_config(config: dict[str, Any]) -> str | None:
    """Return an error message for a missing required setting, else ``None``."""
    if not config.get("ELASTICSEARCH_URL"):
        return "Missing required field in connector config: ELASTICSEARCH_URL"
    if not config.get("ELASTICSEARCH_INDEX"):
        return "Missing required field in connector config: ELASTICSEARCH_INDEX"

    # Authentication can be either an API key OR username + password.
    has_api_key = bool(config.get("ELASTICSEARCH_API_KEY"))
    has_basic_auth = bool(
        config.get("ELASTICSEARCH_USERNAME") and config.get("ELASTICSEARCH_PASSWORD")
    )
    if not has_api_key and not has_basic_auth:
        return "Missing authentication: provide either ELASTICSEARCH_API_KEY or ELASTICSEARCH_USERNAME + ELASTICSEARCH_PASSWORD"
    return None


def _resolve_elasticsearch_title(
    source: dict[str, Any], doc_id: Any, config: dict[str, Any]
) -> str:
    """Pick a title: the configured field, else title/name/subject, else the document id."""
    title_field = config.get("ELASTICSEARCH_TITLE_FIELD") or next(
        (name for name in ("title", "name", "subject") if name in source),
        None,
    )
    if title_field is None:
        return str(doc_id)
    return str(source.get(title_field, doc_id))
def _build_elasticsearch_query(config: dict[str, Any]) -> dict[str, Any]:
"""
Build Elasticsearch query from connector configuration
Args:
config: Connector configuration
Returns:
Elasticsearch query DSL
"""
# Check if custom query is provided
if config.get("ELASTICSEARCH_QUERY"):
try:
if isinstance(config["ELASTICSEARCH_QUERY"], str):
return json.loads(config["ELASTICSEARCH_QUERY"])
else:
return config["ELASTICSEARCH_QUERY"]
except (json.JSONDecodeError, TypeError) as e:
logger.warning(f"Invalid custom query, using match_all: {e}")
# Default to match all documents
return {"match_all": {}}
def _build_document_content(source: dict[str, Any], config: dict[str, Any]) -> str:
"""
Build document content from Elasticsearch document source
Args:
source: Elasticsearch document source
config: Connector configuration
Returns:
Formatted document content
"""
content_parts = []
# Get content fields from config
content_fields = config.get("ELASTICSEARCH_CONTENT_FIELDS", [])
if content_fields:
# Use specified content fields
for field in content_fields:
if field in source:
field_value = source[field]
if isinstance(field_value, str | int | float):
content_parts.append(f"{field}: {field_value}")
if isinstance(field_value, str | int | float):
content_parts.append(f"{field}: {json.dumps(field_value)}")
else:
# Use all fields if no specific content fields specified
for key, value in source.items():
if isinstance(value, str | int | float):
content_parts.append(f"{key}: {value}")
elif isinstance(value, list | dict):
content_parts.append(f"{key}: {json.dumps(value)}")
return "\n".join(content_parts)

View file

@ -43,6 +43,7 @@ dependencies = [
"youtube-transcript-api>=1.0.3",
"litellm>=1.77.5",
"langchain-litellm>=0.2.3",
"elasticsearch>=9.1.1",
]
[dependency-groups]

5003
surfsense_backend/uv.lock generated

File diff suppressed because it is too large Load diff