feat: added elasticsearch connector

2026-07-04 22:02:16 +02:00 · 2025-10-12 09:39:04 +05:30 · 2025-10-12 09:39:04 +05:30 · 55d752e3c8
commit 55d752e3c8
parent 402039f02f
27 changed files with 4331 additions and 2499 deletions
--- a/surfsense_backend/alembic/versions/26_add_elasticsearch_connector_enums.py
+++ b/surfsense_backend/alembic/versions/26_add_elasticsearch_connector_enums.py
@ -0,0 +1,56 @@
+"""Add ElasticSearch connector enums
+
+Revision ID: 26
+Revises: 25
+Create Date: 2025-10-12 12:00:00.000000
+
+"""
+
+from collections.abc import Sequence
+
+from alembic import op
+
+# Revision identifiers: this migration is revision "26" and applies on top of
+# revision "25" (see the "Revises" line in the module docstring).
+revision: str = "26"
+down_revision: str | None = "25"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    """Add the 'ELASTICSEARCH_CONNECTOR' label to both connector enum types.
+
+    For each enum type the statement first looks the label up in pg_enum and
+    only issues ALTER TYPE ... ADD VALUE when it is not there yet, so running
+    the migration against a database that already has the label is harmless.
+    """
+    # Same statement for both types; order matches the original migration.
+    for enum_type in ("searchsourceconnectortype", "documenttype"):
+        op.execute(
+            f"""
+    DO $$
+    BEGIN
+        IF NOT EXISTS (
+            SELECT 1 FROM pg_type t
+            JOIN pg_enum e ON t.oid = e.enumtypid
+            WHERE t.typname = '{enum_type}' AND e.enumlabel = 'ELASTICSEARCH_CONNECTOR'
+        ) THEN
+            ALTER TYPE {enum_type} ADD VALUE 'ELASTICSEARCH_CONNECTOR';
+        END IF;
+    END
+    $$;
+    """
+        )
+
+
+def downgrade() -> None:
+    """Do nothing: the 'ELASTICSEARCH_CONNECTOR' enum labels are left in place."""
--- a/surfsense_backend/app/agents/researcher/nodes.py
+++ b/surfsense_backend/app/agents/researcher/nodes.py
@ -488,6 +488,25 @@ async def fetch_documents_by_ids(
                    )
                    url = metadata.get("url", "")

+                elif doc_type == "ELASTICSEARCH_CONNECTOR":
+                    # Prefer explicit title in metadata/source, otherwise fallback to doc.title
+                    es_title = (
+                        metadata.get("title")
+                        or metadata.get("es_title")
+                        or doc.title
+                        or f"Elasticsearch: {metadata.get('elasticsearch_index', '')}"
+                    )
+                    title = es_title
+                    description = metadata.get("description") or (
+                        doc.content[:100] + "..."
+                        if len(doc.content) > 100
+                        else doc.content
+                    )
+                    # If a link or index info is stored, surface it
+                    url = metadata.get("url", "") or metadata.get(
+                        "elasticsearch_index", ""
+                    )
+
                else:  # FILE and other types
                    title = doc.title
                    description = (
@ -512,6 +531,7 @@ async def fetch_documents_by_ids(
                "SLACK_CONNECTOR": "Slack (Selected)",
                "NOTION_CONNECTOR": "Notion (Selected)",
                "GITHUB_CONNECTOR": "GitHub (Selected)",
+                "ELASTICSEARCH_CONNECTOR": "Elasticsearch (Selected)",
                "YOUTUBE_VIDEO": "YouTube Videos (Selected)",
                "DISCORD_CONNECTOR": "Discord (Selected)",
                "JIRA_CONNECTOR": "Jira Issues (Selected)",
@ -1266,6 +1286,33 @@ async def fetch_relevant_documents(
                            }
                        )

+                elif connector == "ELASTICSEARCH_CONNECTOR":
+                    (
+                        source_object,
+                        elasticsearch_chunks,
+                    ) = await connector_service.search_elasticsearch(
+                        user_query=reformulated_query,
+                        user_id=user_id,
+                        search_space_id=search_space_id,
+                        top_k=top_k,
+                        search_mode=search_mode,
+                    )
+
+                    # Add to sources and raw documents
+                    if source_object:
+                        all_sources.append(source_object)
+                    all_raw_documents.extend(elasticsearch_chunks)
+
+                    # Stream found document count
+                    if streaming_service and writer:
+                        writer(
+                            {
+                                "yield_value": streaming_service.format_terminal_info_delta(
+                                    f"🔎 Found {len(elasticsearch_chunks)} Elasticsearch chunks related to your query"
+                                )
+                            }
+                        )
+
            except Exception as e:
                logging.error("Error in search_airtable: %s", traceback.format_exc())
                error_message = f"Error searching connector {connector}: {e!s}"
--- a/surfsense_backend/app/agents/researcher/utils.py
+++ b/surfsense_backend/app/agents/researcher/utils.py
@ -51,6 +51,7 @@ def get_connector_emoji(connector_name: str) -> str:
        "GOOGLE_CALENDAR_CONNECTOR": "📅",
        "AIRTABLE_CONNECTOR": "🗃️",
        "LUMA_CONNECTOR": "✨",
+        "ELASTICSEARCH_CONNECTOR": "🔎",
    }
    return connector_emojis.get(connector_name, "🔎")

@ -74,6 +75,7 @@ def get_connector_friendly_name(connector_name: str) -> str:
        "LINKUP_API": "Linkup Search",
        "AIRTABLE_CONNECTOR": "Airtable",
        "LUMA_CONNECTOR": "Luma",
+        "ELASTICSEARCH_CONNECTOR": "Elasticsearch",
    }
    return connector_friendly_names.get(connector_name, connector_name)

--- a/surfsense_backend/app/connectors/elasticsearch_connector.py
+++ b/surfsense_backend/app/connectors/elasticsearch_connector.py
@ -0,0 +1,254 @@
+"""
+Elasticsearch connector for SurfSense
+"""
+
+import logging
+from typing import Any
+
+from elasticsearch import AsyncElasticsearch
+from elasticsearch.exceptions import (
+    AuthenticationException,
+    ConnectionError,
+    NotFoundError,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class ElasticsearchConnector:
+    """
+    Async connector for Elasticsearch instances
+
+    Keeps the connection settings, builds the ``AsyncElasticsearch`` client
+    configuration from them and exposes search, scroll, count and
+    index-introspection helpers on top of that client.
+    """
+
+    def __init__(
+        self,
+        url: str,
+        api_key: str | None = None,
+        username: str | None = None,
+        password: str | None = None,
+        verify_certs: bool = True,
+        ca_certs: str | None = None,
+    ):
+        """
+        Initialize Elasticsearch connector
+
+        Args:
+            url: Full Elasticsearch URL (e.g., https://host:port or cloud endpoint)
+            api_key: API key for authentication (preferred method)
+            username: Username for basic authentication
+            password: Password for basic authentication
+            verify_certs: Whether to verify SSL certificates
+            ca_certs: Path to CA certificates file
+
+        Raises:
+            Exception: Whatever the client constructor raises; it is logged
+                and re-raised unchanged.
+        """
+        self.url = url
+        self.api_key = api_key
+        self.username = username
+        self.password = password
+        self.verify_certs = verify_certs
+        self.ca_certs = ca_certs
+
+        # Build connection configuration
+        self.es_config = self._build_config()
+
+        # Initialize Elasticsearch client
+        try:
+            self.client = AsyncElasticsearch(**self.es_config)
+        except Exception as e:
+            logger.error(f"Failed to initialize Elasticsearch client: {e}")
+            raise
+
+    def _build_config(self) -> dict[str, Any]:
+        """
+        Build Elasticsearch client configuration
+
+        Returns:
+            Keyword arguments for the client: hosts, certificate verification,
+            timeout/retry settings, plus the authentication and CA entries
+            that apply to the stored settings.
+        """
+        config: dict[str, Any] = {
+            "hosts": [self.url],
+            "verify_certs": self.verify_certs,
+            "request_timeout": 30,
+            "max_retries": 3,
+            "retry_on_timeout": True,
+        }
+
+        # Authentication - API key takes precedence over basic auth; basic
+        # auth is only used when both username and password are set
+        if self.api_key:
+            config["api_key"] = self.api_key
+        elif self.username and self.password:
+            config["basic_auth"] = (self.username, self.password)
+
+        # SSL configuration
+        if self.ca_certs:
+            config["ca_certs"] = self.ca_certs
+
+        return config
+
+    async def search(
+        self,
+        index: str | list[str],
+        query: dict[str, Any],
+        size: int = 100,
+        from_: int = 0,
+        fields: list[str] | None = None,
+        sort: list[dict[str, Any]] | None = None,
+    ) -> dict[str, Any]:
+        """
+        Search documents in Elasticsearch
+
+        Args:
+            index: Elasticsearch index name or list of indices
+            query: Elasticsearch query DSL
+            size: Number of results to return
+            from_: Starting offset for pagination
+            fields: List of fields to include in response
+            sort: Sort configuration
+
+        Returns:
+            Elasticsearch search response
+
+        Raises:
+            NotFoundError: The index does not exist.
+            AuthenticationException: The credentials were rejected.
+            ConnectionError: The cluster could not be reached.
+            Exception: Any other failure; every error is logged and re-raised.
+        """
+        try:
+            search_body: dict[str, Any] = {
+                "query": query,
+                "size": size,
+                "from": from_,
+            }
+
+            # Optional parts are only sent when the caller supplied them
+            if fields:
+                search_body["_source"] = fields
+
+            if sort:
+                search_body["sort"] = sort
+
+            response = await self.client.search(index=index, body=search_body)
+
+            logger.info(
+                f"Successfully searched index '{index}', found {response['hits']['total']['value']} results"
+            )
+            return response
+
+        except NotFoundError:
+            logger.error(f"Index '{index}' not found")
+            raise
+        except AuthenticationException:
+            logger.error("Authentication failed")
+            raise
+        except ConnectionError:
+            logger.error("Failed to connect to Elasticsearch")
+            raise
+        except Exception as e:
+            logger.error(f"Search failed: {e}")
+            raise
+
+    async def get_indices(self) -> list[str]:
+        """
+        Get list of available indices
+
+        Returns:
+            List of index names (the keys of the ``get_alias`` response for "*")
+
+        Raises:
+            Exception: Any client failure, logged and re-raised.
+        """
+        try:
+            indices = await self.client.indices.get_alias(index="*")
+            return list(indices.keys())
+        except Exception as e:
+            logger.error(f"Failed to get indices: {e}")
+            raise
+
+    async def get_mapping(self, index: str) -> dict[str, Any]:
+        """
+        Get mapping for an index
+
+        Args:
+            index: Index name
+
+        Returns:
+            Index mapping, or an empty dict when the response has no entry
+            keyed by exactly this index name
+
+        Raises:
+            Exception: Any client failure, logged and re-raised.
+        """
+        try:
+            mapping = await self.client.indices.get_mapping(index=index)
+            return mapping[index]["mappings"] if index in mapping else {}
+        except Exception as e:
+            logger.error(f"Failed to get mapping for index '{index}': {e}")
+            raise
+
+    async def scroll_search(
+        self,
+        index: str | list[str],
+        query: dict[str, Any],
+        size: int = 1000,
+        scroll_timeout: str = "5m",
+        fields: list[str] | None = None,
+    ):
+        """
+        Perform a scroll search for large result sets
+
+        The scroll context is released in a ``finally`` block, so it is also
+        cleared when the consumer stops iterating early (the generator is
+        closed) or when a request fails part-way through. A failure while
+        clearing the scroll is logged as a warning and never replaces the
+        outcome of the search itself.
+
+        Args:
+            index: Elasticsearch index name or list of indices
+            query: Elasticsearch query DSL
+            size: Number of results per scroll
+            scroll_timeout: Scroll timeout
+            fields: List of fields to include in response
+
+        Yields:
+            Document hits from Elasticsearch
+
+        Raises:
+            Exception: Any failure of the search/scroll requests, logged and
+                re-raised.
+        """
+        scroll_id = None
+        try:
+            search_body: dict[str, Any] = {
+                "query": query,
+                "size": size,
+            }
+
+            if fields:
+                search_body["_source"] = fields
+
+            # Initial search
+            response = await self.client.search(
+                index=index, body=search_body, scroll=scroll_timeout
+            )
+
+            scroll_id = response.get("_scroll_id")
+            hits = response["hits"]["hits"]
+
+            while hits:
+                for hit in hits:
+                    yield hit
+
+                # Without a scroll id there is no next page to ask for;
+                # looping again would re-yield the same hits forever
+                if not scroll_id:
+                    break
+
+                # Continue scrolling
+                response = await self.client.scroll(
+                    scroll_id=scroll_id, scroll=scroll_timeout
+                )
+                # Keep the previous id if the response omits it so the
+                # context can still be cleared afterwards
+                scroll_id = response.get("_scroll_id") or scroll_id
+                hits = response["hits"]["hits"]
+
+        except Exception as e:
+            logger.error(f"Scroll search failed: {e}")
+            raise
+        finally:
+            # Clear scroll (best effort) on every exit path
+            if scroll_id:
+                try:
+                    await self.client.clear_scroll(scroll_id=scroll_id)
+                except Exception as e:
+                    logger.warning(f"Failed to clear scroll context: {e}")
+
+    async def count_documents(
+        self, index: str | list[str], query: dict[str, Any] | None = None
+    ) -> int:
+        """
+        Count documents in an index
+
+        Args:
+            index: Index name or list of indices
+            query: Optional query to filter documents
+
+        Returns:
+            Number of documents
+
+        Raises:
+            Exception: Any client failure, logged and re-raised.
+        """
+        try:
+            # Only send a request body when there is a query to filter by
+            if query:
+                response = await self.client.count(index=index, body={"query": query})
+            else:
+                response = await self.client.count(index=index)
+
+            return response["count"]
+        except Exception as e:
+            logger.error(f"Failed to count documents in index '{index}': {e}")
+            raise
+
+    async def close(self):
+        """Close the Elasticsearch client connection"""
+        # The attribute is missing when __init__ failed before creating it
+        if hasattr(self, "client"):
+            await self.client.close()
--- a/surfsense_backend/app/db.py
+++ b/surfsense_backend/app/db.py
@ -50,6 +50,7 @@ class DocumentType(str, Enum):
    GOOGLE_GMAIL_CONNECTOR = "GOOGLE_GMAIL_CONNECTOR"
    AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR"
    LUMA_CONNECTOR = "LUMA_CONNECTOR"
+    ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR"


 class SearchSourceConnectorType(str, Enum):
@ -68,6 +69,7 @@ class SearchSourceConnectorType(str, Enum):
    GOOGLE_GMAIL_CONNECTOR = "GOOGLE_GMAIL_CONNECTOR"
    AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR"
    LUMA_CONNECTOR = "LUMA_CONNECTOR"
+    ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR"


 class ChatType(str, Enum):
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@ -40,6 +40,7 @@ from app.tasks.connector_indexers import (
    index_clickup_tasks,
    index_confluence_pages,
    index_discord_messages,
+    index_elasticsearch_documents,
    index_github_repos,
    index_google_calendar_events,
    index_google_gmail_messages,
@ -363,6 +364,7 @@ async def index_connector_content(
    - JIRA_CONNECTOR: Indexes issues and comments from Jira
    - DISCORD_CONNECTOR: Indexes messages from all accessible Discord channels
    - LUMA_CONNECTOR: Indexes events from Luma
+    - ELASTICSEARCH_CONNECTOR: Indexes documents from Elasticsearch

    Args:
        connector_id: ID of the connector to use
@ -589,6 +591,24 @@ async def index_connector_content(
            )
            response_message = "Luma indexing started in the background."

+        elif (
+            connector.connector_type
+            == SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR
+        ):
+            # Run indexing in background
+            logger.info(
+                f"Triggering Elasticsearch indexing for connector {connector_id} into search space {search_space_id}"
+            )
+            background_tasks.add_task(
+                run_elasticsearch_indexing_with_new_session,
+                connector_id,
+                search_space_id,
+                str(user.id),
+                indexing_from,
+                indexing_to,
+            )
+            response_message = "Elasticsearch indexing started in the background."
+
        else:
            raise HTTPException(
                status_code=400,
@ -1358,3 +1378,61 @@ async def run_luma_indexing(
            )
    except Exception as e:
        logger.error(f"Error in background Luma indexing task: {e!s}")
+
+
+async def run_elasticsearch_indexing_with_new_session(
+    connector_id: int,
+    search_space_id: int,
+    user_id: str,
+    start_date: str,
+    end_date: str,
+):
+    """Open a dedicated database session and run Elasticsearch indexing in it.
+
+    Logs when the background task starts and when it finishes; the session is
+    closed by the context manager once indexing returns.
+    """
+    logger.info(
+        f"Background task started: Indexing Elasticsearch connector {connector_id} into space {search_space_id}"
+    )
+    async with async_session_maker() as db_session:
+        await run_elasticsearch_indexing(
+            session=db_session,
+            connector_id=connector_id,
+            search_space_id=search_space_id,
+            user_id=user_id,
+            start_date=start_date,
+            end_date=end_date,
+        )
+    logger.info(
+        f"Background task finished: Indexing Elasticsearch connector {connector_id}"
+    )
+
+
+async def run_elasticsearch_indexing(
+    session: AsyncSession,
+    connector_id: int,
+    search_space_id: int,
+    user_id: str,
+    start_date: str,
+    end_date: str,
+):
+    """Run Elasticsearch indexing and stamp the connector on success.
+
+    The indexer is called with ``update_last_indexed=False``; the last-indexed
+    timestamp is updated and committed here, and only when the indexer reports
+    no error message. Any exception is logged with its traceback after the
+    session has been rolled back; nothing is re-raised.
+    """
+    try:
+        doc_count, failure = await index_elasticsearch_documents(
+            session,
+            connector_id,
+            search_space_id,
+            user_id,
+            start_date,
+            end_date,
+            update_last_indexed=False,
+        )
+
+        # Indexer reported a problem: log it and leave the timestamp untouched
+        if failure:
+            logger.error(
+                f"Elasticsearch indexing failed for connector {connector_id}: {failure}"
+            )
+            return
+
+        logger.info(
+            f"Elasticsearch indexing successful for connector {connector_id}. Indexed {doc_count} documents."
+        )
+        # Update the last indexed timestamp only on success
+        await update_connector_last_indexed(session, connector_id)
+        await session.commit()
+    except Exception as exc:
+        await session.rollback()
+        logger.error(
+            f"Critical error in run_elasticsearch_indexing for connector {connector_id}: {exc}",
+            exc_info=True,
+        )
--- a/surfsense_backend/app/services/connector_service.py
+++ b/surfsense_backend/app/services/connector_service.py
@ -2028,3 +2028,117 @@ class ConnectorService:
        }

        return result_object, luma_chunks
+
+    async def search_elasticsearch(
+        self,
+        user_query: str,
+        user_id: str,
+        search_space_id: int,
+        top_k: int = 20,
+        search_mode: SearchMode = SearchMode.CHUNKS,
+    ) -> tuple:
+        """
+        Search for Elasticsearch documents and return both the source information and langchain documents
+
+        Args:
+            user_query: The user's query
+            user_id: The user's ID
+            search_space_id: The search space ID to search in
+            top_k: Maximum number of results to return
+            search_mode: Search mode (CHUNKS or DOCUMENTS); any other value
+                yields an empty result instead of failing on an unbound local
+
+        Returns:
+            tuple: (sources_info, langchain_documents)
+        """
+        # Default keeps the name bound when search_mode matches neither branch
+        elasticsearch_chunks: list = []
+        if search_mode == SearchMode.CHUNKS:
+            elasticsearch_chunks = await self.chunk_retriever.hybrid_search(
+                query_text=user_query,
+                top_k=top_k,
+                user_id=user_id,
+                search_space_id=search_space_id,
+                document_type="ELASTICSEARCH_CONNECTOR",
+            )
+        elif search_mode == SearchMode.DOCUMENTS:
+            elasticsearch_chunks = await self.document_retriever.hybrid_search(
+                query_text=user_query,
+                top_k=top_k,
+                user_id=user_id,
+                search_space_id=search_space_id,
+                document_type="ELASTICSEARCH_CONNECTOR",
+            )
+            # Transform document retriever results to match expected format
+            elasticsearch_chunks = self._transform_document_results(
+                elasticsearch_chunks
+            )
+
+        # Early return if no results
+        if not elasticsearch_chunks:
+            return {
+                "id": 34,
+                "name": "Elasticsearch",
+                "type": "ELASTICSEARCH_CONNECTOR",
+                "sources": [],
+            }, []
+
+        # Process each chunk and create sources directly without deduplication
+        sources_list = []
+        async with self.counter_lock:
+            for chunk in elasticsearch_chunks:
+                # Extract document metadata
+                document = chunk.get("document", {})
+                metadata = document.get("metadata", {})
+
+                # Extract Elasticsearch-specific metadata
+                es_id = metadata.get("elasticsearch_id", "")
+                es_index = metadata.get("elasticsearch_index", "")
+                es_score = metadata.get("elasticsearch_score", "")
+
+                # Create a more descriptive title for Elasticsearch documents
+                title = document.get("title", "Elasticsearch Document")
+                if es_index:
+                    title = f"{title} (Index: {es_index})"
+
+                # Preview of the chunk; the ellipsis is added only when the
+                # content was actually cut, not when it is exactly 150 chars
+                content = chunk.get("content", "")
+                description = content[:150]
+                if len(content) > 150:
+                    description += "..."
+
+                # Add Elasticsearch info to description
+                info_parts = []
+                if es_id:
+                    info_parts.append(f"ID: {es_id}")
+                # Compare against the "missing" markers so a score of 0 is kept
+                if es_score is not None and es_score != "":
+                    info_parts.append(f"Score: {es_score}")
+
+                if info_parts:
+                    if description:
+                        description = f"{description} | {' | '.join(info_parts)}"
+                    else:
+                        description = " | ".join(info_parts)
+
+                # For URL, we could construct a URL to view the document if we have the Elasticsearch UI URL
+                url = ""
+                # Could be extended to include Kibana or other UI URLs if configured
+
+                source = {
+                    "id": chunk.get("chunk_id", self.source_id_counter),
+                    "title": title,
+                    "description": description,
+                    "url": url,
+                    "elasticsearch_id": es_id,
+                    "elasticsearch_index": es_index,
+                    "elasticsearch_score": es_score,
+                }
+
+                # The counter advances for every source, even when the chunk
+                # supplied its own id
+                self.source_id_counter += 1
+                sources_list.append(source)
+
+        # Create result object
+        result_object = {
+            "id": 34,  # Assign a unique ID for the Elasticsearch connector
+            "name": "Elasticsearch",
+            "type": "ELASTICSEARCH_CONNECTOR",
+            "sources": sources_list,
+        }
+
+        return result_object, elasticsearch_chunks
--- a/surfsense_backend/app/tasks/connector_indexers/__init__.py
+++ b/surfsense_backend/app/tasks/connector_indexers/__init__.py
@ -17,6 +17,7 @@ Available indexers:
 - Google Gmail: Index messages from Google Gmail
 - Google Calendar: Index events from Google Calendar
 - Luma: Index events from Luma
+- Elasticsearch: Index documents from Elasticsearch instances
 """

 # Communication platforms
@ -27,6 +28,7 @@ from .confluence_indexer import index_confluence_pages
 from .discord_indexer import index_discord_messages

 # Development platforms
+from .elasticsearch_indexer import index_elasticsearch_documents
 from .github_indexer import index_github_repos
 from .google_calendar_indexer import index_google_calendar_events
 from .google_gmail_indexer import index_google_gmail_messages
@ -46,6 +48,7 @@ __all__ = [  # noqa: RUF022
    "index_confluence_pages",
    "index_discord_messages",
    # Development platforms
+    "index_elasticsearch_documents",
    "index_github_repos",
    # Calendar and scheduling
    "index_google_calendar_events",
--- a/surfsense_backend/app/tasks/connector_indexers/elasticsearch_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/elasticsearch_indexer.py
@ -0,0 +1,354 @@
+"""
+Elasticsearch indexer for SurfSense
+"""
+
+import hashlib
+import json
+import logging
+from datetime import UTC, datetime
+from typing import Any
+
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
+
+from app.connectors.elasticsearch_connector import ElasticsearchConnector
+from app.db import Document, DocumentType, SearchSourceConnector
+
+logger = logging.getLogger(__name__)
+
+
+class _ChunkingService:
+    """Split text into fixed-size character windows that overlap.
+
+    ``chunk_size`` is clamped to at least 100 characters and ``overlap`` to
+    the range ``0 .. chunk_size - 1``, so the window always advances by at
+    least one character.
+    """
+
+    def __init__(self, chunk_size: int = 1000, overlap: int = 200) -> None:
+        self.chunk_size = max(100, chunk_size)
+        self.overlap = max(0, min(overlap, self.chunk_size - 1))
+
+    def chunk_text(self, text: str) -> list[str]:
+        """Return the stripped, non-empty chunks of ``text``.
+
+        Empty or whitespace-only input gives an empty list. Text that fits in
+        one window is returned as a single chunk. Otherwise windows of
+        ``chunk_size`` characters are taken every ``chunk_size - overlap``
+        characters, stopping as soon as a window reaches the end of the text
+        so no trailing chunk merely repeats the tail of the previous one.
+        """
+        if not text:
+            return []
+        text = text.strip()
+        if not text:
+            # Whitespace-only input must not produce an empty chunk
+            return []
+        if len(text) <= self.chunk_size:
+            return [text]
+
+        chunks: list[str] = []
+        step = self.chunk_size - self.overlap
+        for pos in range(0, len(text), step):
+            end = pos + self.chunk_size
+            piece = text[pos:end].strip()
+            if piece:
+                chunks.append(piece)
+            if end >= len(text):
+                # This window already covered the tail of the text
+                break
+        return chunks
+
+
+class _DocumentService:
+    """Small persistence helper bound to one database session."""
+
+    def __init__(self, session):
+        self.session = session
+
+    async def get_document_by_hash(self, content_hash: str):
+        """Return the first Document with this content hash, or None.
+
+        A falsy hash short-circuits to None without querying.
+        """
+        if not content_hash:
+            return None
+        # select and Document come from the module-level imports
+        lookup = select(Document).where(Document.content_hash == content_hash)
+        rows = await self.session.execute(lookup)
+        return rows.scalars().first()
+
+    async def create_chunks_for_document(self, document_id: int, chunks: list[str]):
+        """Add one Chunk row per text piece for the document, then flush."""
+        from app.db import Chunk
+
+        self.session.add_all(
+            [Chunk(content=piece, document_id=document_id) for piece in chunks]
+        )
+        await self.session.flush()
+
+
+async def index_elasticsearch_documents(
+    session: AsyncSession,
+    connector_id: int,
+    search_space_id: int,
+    user_id: str,
+    start_date: str,
+    end_date: str,
+    update_last_indexed: bool = True,
+) -> tuple[int, str | None]:
+    """
+    Index documents from Elasticsearch into SurfSense
+
+    Args:
+        session: Database session
+        connector_id: Elasticsearch connector ID
+        search_space_id: Search space ID
+        user_id: User ID
+        start_date: Start date for indexing (not used for Elasticsearch, kept for compatibility)
+        end_date: End date for indexing (not used for Elasticsearch, kept for compatibility)
+        update_last_indexed: Whether to update the last indexed timestamp
+
+    Returns:
+        Tuple of (number of documents processed, error message if any)
+    """
+    es_connector = None
+    try:
+        # Get the connector configuration
+        result = await session.execute(
+            select(SearchSourceConnector).filter(
+                SearchSourceConnector.id == connector_id
+            )
+        )
+        connector = result.scalars().first()
+
+        if not connector:
+            error_msg = f"Elasticsearch connector with ID {connector_id} not found"
+            logger.error(error_msg)
+            return 0, error_msg
+
+        # Get connector configuration
+        config = connector.config
+
+        # Validate required fields - now only URL and INDEX are required
+        # Authentication can be either API key OR username/password
+        if "ELASTICSEARCH_URL" not in config:
+            error_msg = "Missing required field in connector config: ELASTICSEARCH_URL"
+            logger.error(error_msg)
+            return 0, error_msg
+
+        if "ELASTICSEARCH_INDEX" not in config:
+            error_msg = (
+                "Missing required field in connector config: ELASTICSEARCH_INDEX"
+            )
+            logger.error(error_msg)
+            return 0, error_msg
+
+        # Check authentication - must have either API key or username+password
+        has_api_key = (
+            "ELASTICSEARCH_API_KEY" in config and config["ELASTICSEARCH_API_KEY"]
+        )
+        has_basic_auth = (
+            "ELASTICSEARCH_USERNAME" in config
+            and config["ELASTICSEARCH_USERNAME"]
+            and "ELASTICSEARCH_PASSWORD" in config
+            and config["ELASTICSEARCH_PASSWORD"]
+        )
+
+        if not has_api_key and not has_basic_auth:
+            error_msg = "Missing authentication: provide either ELASTICSEARCH_API_KEY or ELASTICSEARCH_USERNAME + ELASTICSEARCH_PASSWORD"
+            logger.error(error_msg)
+            return 0, error_msg
+
+        # Initialize document service
+        document_service = _DocumentService(session)
+        chunking_service = _ChunkingService()
+
+        # Initialize Elasticsearch connector
+        es_connector = ElasticsearchConnector(
+            url=config["ELASTICSEARCH_URL"],
+            api_key=config.get("ELASTICSEARCH_API_KEY"),
+            username=config.get("ELASTICSEARCH_USERNAME"),
+            password=config.get("ELASTICSEARCH_PASSWORD"),
+            verify_certs=config.get("ELASTICSEARCH_VERIFY_CERTS", True),
+            ca_certs=config.get("ELASTICSEARCH_CA_CERTS"),
+        )
+
+        # Build query based on configuration
+        query = _build_elasticsearch_query(config)
+
+        # Get the index name(s) - can be a string or list
+        index_name = config["ELASTICSEARCH_INDEX"]
+
+        # Get max documents to index
+        max_documents = config.get("ELASTICSEARCH_MAX_DOCUMENTS", 1000)
+
+        logger.info(
+            f"Starting Elasticsearch indexing for index '{index_name}' with max {max_documents} documents"
+        )
+
+        documents_processed = 0
+
+        try:
+            # Use scroll search for large result sets
+            async for hit in es_connector.scroll_search(
+                index=index_name,
+                query=query,
+                size=min(max_documents, 100),  # Scroll in batches
+                fields=config.get("ELASTICSEARCH_FIELDS"),
+            ):
+                if documents_processed >= max_documents:
+                    break
+
+                try:
+                    # Extract document data
+                    doc_id = hit["_id"]
+                    source = hit.get("_source", {})
+
+                    # Build document title
+                    title_field = config.get("ELASTICSEARCH_TITLE_FIELD")
+                    if not title_field:
+                        for candidate in ("title", "name", "subject"):
+                            if candidate in source:
+                                title_field = candidate
+                                break
+                    title = (
+                        str(source.get(title_field, doc_id))
+                        if title_field is not None
+                        else str(doc_id)
+                    )
+
+                    # Build document content
+                    content = _build_document_content(source, config)
+
+                    if not content.strip():
+                        logger.warning(f"Skipping document {doc_id} - no content found")
+                        continue
+
+                    # Create content hash
+                    content_hash = hashlib.sha256(content.encode()).hexdigest()
+
+                    # Build metadata
+                    metadata = {
+                        "elasticsearch_id": doc_id,
+                        "elasticsearch_index": hit.get("_index", index_name),
+                        "elasticsearch_score": hit.get("_score"),
+                        "indexed_at": datetime.now().isoformat(),
+                        "source": "ELASTICSEARCH_CONNECTOR",
+                    }
+
+                    # Add any additional metadata fields specified in config
+                    if "ELASTICSEARCH_METADATA_FIELDS" in config:
+                        for field in config["ELASTICSEARCH_METADATA_FIELDS"]:
+                            if field in source:
+                                metadata[f"es_{field}"] = source[field]
+
+                    # Check if document already exists
+                    existing_doc = await document_service.get_document_by_hash(
+                        content_hash
+                    )
+
+                    if existing_doc:
+                        logger.debug(f"Document {doc_id} already exists, skipping")
+                        continue
+
+                    # Create document
+                    document = Document(
+                        title=title,
+                        content=content,
+                        content_hash=content_hash,
+                        document_type=DocumentType.ELASTICSEARCH_CONNECTOR,
+                        document_metadata=metadata,
+                        search_space_id=search_space_id,
+                    )
+
+                    # Add document to session
+                    session.add(document)
+                    await session.flush()  # Get the document ID
+
+                    # Create chunks
+                    chunks = chunking_service.chunk_text(content)
+                    await document_service.create_chunks_for_document(
+                        document.id, chunks
+                    )
+
+                    documents_processed += 1
+
+                    if documents_processed % 10 == 0:
+                        logger.info(
+                            f"Processed {documents_processed} Elasticsearch documents"
+                        )
+                        await session.commit()
+
+                except Exception as e:
+                    logger.error(
+                        f"Error processing Elasticsearch document {hit.get('_id', 'unknown')}: {e}"
+                    )
+                    continue
+
+            # Final commit
+            await session.commit()
+
+            logger.info(
+                f"Successfully indexed {documents_processed} documents from Elasticsearch"
+            )
+
+            # Update last indexed timestamp if requested
+            if update_last_indexed and documents_processed > 0:
+                connector.last_indexed_at = datetime.now()
+                await session.commit()
+            if update_last_indexed and documents_processed > 0:
+                # store ISO-8601 UTC timestamp with 'Z' suffix, e.g. 2025-10-09T22:04:53.599658Z
+                connector.last_indexed_at = (
+                    datetime.now(UTC).isoformat().replace("+00:00", "Z")
+                )
+                await session.commit()
+
+            return documents_processed, None
+
+        finally:
+            # Clean up Elasticsearch connection
+            if es_connector:
+                await es_connector.close()
+
+    except Exception as e:
+        error_msg = f"Error indexing Elasticsearch documents: {e}"
+        logger.error(error_msg, exc_info=True)
+        await session.rollback()
+        if es_connector:
+            await es_connector.close()
+        return 0, error_msg
+
+
+def _build_elasticsearch_query(config: dict[str, Any]) -> dict[str, Any]:
+    """
+    Derive the Elasticsearch query DSL from the connector configuration
+
+    A truthy ``ELASTICSEARCH_QUERY`` entry takes precedence: a string value
+    is parsed as JSON, any other value is returned unchanged. When the entry
+    is missing or empty, or the string cannot be parsed, a ``match_all``
+    query is returned.
+
+    Args:
+        config: Connector configuration
+
+    Returns:
+        Elasticsearch query DSL
+    """
+    custom_query = config.get("ELASTICSEARCH_QUERY")
+
+    if custom_query:
+        if not isinstance(custom_query, str):
+            # Already a structured query; hand it back untouched.
+            return custom_query
+        try:
+            return json.loads(custom_query)
+        except (json.JSONDecodeError, TypeError) as e:
+            # Unparseable string: warn and fall through to the default.
+            logger.warning(f"Invalid custom query, using match_all: {e}")
+
+    # No usable custom query: match every document.
+    return {"match_all": {}}
+
+
+def _build_document_content(source: dict[str, Any], config: dict[str, Any]) -> str:
+    """
+    Build document content from Elasticsearch document source
+
+    Each selected field becomes one ``"<field>: <value>"`` line. Strings and
+    numbers are rendered as-is, lists and dicts are JSON-encoded, and values
+    of any other type (for example ``None``) are skipped.
+
+    Args:
+        source: Elasticsearch document source
+        config: Connector configuration. When ``ELASTICSEARCH_CONTENT_FIELDS``
+            is a non-empty list, only those fields are used, in the configured
+            order, and fields absent from ``source`` are ignored. Otherwise
+            every field of ``source`` is used.
+
+    Returns:
+        Formatted document content, one line per field; an empty string when
+        no field produced any content
+    """
+    content_fields = config.get("ELASTICSEARCH_CONTENT_FIELDS", [])
+
+    if content_fields:
+        # Restrict to the configured fields that the document actually has.
+        selected = [
+            (field, source[field]) for field in content_fields if field in source
+        ]
+    else:
+        # No explicit selection: fall back to every field in the document.
+        selected = list(source.items())
+
+    content_parts = []
+    for key, value in selected:
+        if isinstance(value, str | int | float):
+            content_parts.append(f"{key}: {value}")
+        elif isinstance(value, list | dict):
+            # Structured values are serialized so nested data stays searchable.
+            content_parts.append(f"{key}: {json.dumps(value)}")
+
+    return "\n".join(content_parts)
--- a/surfsense_backend/pyproject.toml
+++ b/surfsense_backend/pyproject.toml
@ -43,6 +43,7 @@ dependencies = [
    "youtube-transcript-api>=1.0.3",
    "litellm>=1.77.5",
    "langchain-litellm>=0.2.3",
+    "elasticsearch>=9.1.1",
 ]

 [dependency-groups]
--- a/surfsense_backend/uv.lock
+++ b/surfsense_backend/uv.lock