Mirror of https://github.com/MODSetter/SurfSense.git, synced 2026-05-03 21:02:40 +02:00

Webcrawler connector draft
parent 419f94e8ee, commit 896e410e2a
26 changed files with 1225 additions and 9 deletions
.vscode/settings.json (vendored, +10)

@@ -1,3 +1,11 @@
 {
-  "biome.configurationPath": "./surfsense_web/biome.json"
+  "biome.configurationPath": "./surfsense_web/biome.json",
+  "files.exclude": {
+    "**/.git": true,
+    "**/.svn": true,
+    "**/.hg": true,
+    "**/.DS_Store": true,
+    "**/Thumbs.db": true,
+    ".mule": true
+  }
 }
New file: Alembic migration "Add Webcrawler connector enums" (+59)

"""Add Webcrawler connector enums

Revision ID: 37
Revises: 36
Create Date: 2025-11-17 17:00:00.000000

"""

from collections.abc import Sequence

from alembic import op

revision: str = "37"
down_revision: str | None = "36"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None


def upgrade() -> None:
    """Safely add 'WEBCRAWLER_CONNECTOR' to enum types if missing."""
    # Add to searchsourceconnectortype enum
    op.execute(
        """
        DO $$
        BEGIN
            IF NOT EXISTS (
                SELECT 1 FROM pg_type t
                JOIN pg_enum e ON t.oid = e.enumtypid
                WHERE t.typname = 'searchsourceconnectortype' AND e.enumlabel = 'WEBCRAWLER_CONNECTOR'
            ) THEN
                ALTER TYPE searchsourceconnectortype ADD VALUE 'WEBCRAWLER_CONNECTOR';
            END IF;
        END
        $$;
        """
    )

    # Add to documenttype enum
    op.execute(
        """
        DO $$
        BEGIN
            IF NOT EXISTS (
                SELECT 1 FROM pg_type t
                JOIN pg_enum e ON t.oid = e.enumtypid
                WHERE t.typname = 'documenttype' AND e.enumlabel = 'WEBCRAWLER_CONNECTOR'
            ) THEN
                ALTER TYPE documenttype ADD VALUE 'WEBCRAWLER_CONNECTOR';
            END IF;
        END
        $$;
        """
    )


def downgrade() -> None:
    """Remove 'WEBCRAWLER_CONNECTOR' from enum types."""
    pass
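A verification sketch, not part of the commit: after running the migration (for example with the standard `alembic upgrade head`), one might confirm that the label was added to both enum types. The connection string is a placeholder assumption; the query mirrors the guard used in the migration itself.

# Verification sketch (assumption: not from this commit).
from sqlalchemy import create_engine, text

engine = create_engine("postgresql://user:pass@localhost/surfsense")  # hypothetical DSN

CHECK_SQL = text(
    """
    SELECT t.typname, e.enumlabel
    FROM pg_type t
    JOIN pg_enum e ON t.oid = e.enumtypid
    WHERE t.typname IN ('searchsourceconnectortype', 'documenttype')
      AND e.enumlabel = 'WEBCRAWLER_CONNECTOR'
    """
)

with engine.connect() as conn:
    rows = conn.execute(CHECK_SQL).all()
    # Expect two rows, one per enum type, once the migration has run.
    print(rows)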
@@ -667,7 +667,7 @@ async def fetch_relevant_documents(
                 }
             )

-        elif connector == "CRAWLED_URL":
+        elif connector == "WEBCRAWLER_CONNECTOR":
             (
                 source_object,
                 crawled_urls_chunks,
@@ -689,7 +689,7 @@ async def fetch_relevant_documents(
             writer(
                 {
                     "yield_value": streaming_service.format_terminal_info_delta(
-                        f"🌐 Found {len(crawled_urls_chunks)} Web Pages chunks related to your query"
+                        f"🌐 Found {len(crawled_urls_chunks)} Web Page chunks related to your query"
                     )
                 }
             )
@@ -17,7 +17,6 @@ You are SurfSense, an advanced AI research assistant that provides detailed, wel
 {chat_history_section}
 <knowledge_sources>
 - EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
-- CRAWLED_URL: "Webpages indexed by SurfSense web crawler" (personally selected websites)
 - FILE: "User-uploaded documents (PDFs, Word, etc.)" (personal files)
 - SLACK_CONNECTOR: "Slack conversations and shared content" (personal workspace communications)
 - NOTION_CONNECTOR: "Notion workspace pages and databases" (personal knowledge management)
@@ -35,6 +34,7 @@ You are SurfSense, an advanced AI research assistant that provides detailed, wel
 - TAVILY_API: "Tavily search API results" (personalized search results)
 - LINKUP_API: "Linkup search API results" (personalized search results)
 - LUMA_CONNECTOR: "Luma events"
+- WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense web crawler" (personally selected websites)
 </knowledge_sources>

 <instructions>
@@ -19,7 +19,6 @@ def get_connector_emoji(connector_name: str) -> str:
     connector_emojis = {
         "YOUTUBE_VIDEO": "📹",
         "EXTENSION": "🧩",
-        "CRAWLED_URL": "🌐",
         "FILE": "📄",
         "SLACK_CONNECTOR": "💬",
         "NOTION_CONNECTOR": "📘",
@@ -34,6 +33,7 @@ def get_connector_emoji(connector_name: str) -> str:
         "AIRTABLE_CONNECTOR": "🗃️",
         "LUMA_CONNECTOR": "✨",
         "ELASTICSEARCH_CONNECTOR": "⚡",
+        "WEBCRAWLER_CONNECTOR": "🌐",
     }
     return connector_emojis.get(connector_name, "🔎")
@@ -43,7 +43,6 @@ def get_connector_friendly_name(connector_name: str) -> str:
     connector_friendly_names = {
         "YOUTUBE_VIDEO": "YouTube",
         "EXTENSION": "Browser Extension",
-        "CRAWLED_URL": "Web Pages",
         "FILE": "Files",
         "SLACK_CONNECTOR": "Slack",
         "NOTION_CONNECTOR": "Notion",
@@ -59,6 +58,7 @@ def get_connector_friendly_name(connector_name: str) -> str:
         "AIRTABLE_CONNECTOR": "Airtable",
         "LUMA_CONNECTOR": "Luma",
         "ELASTICSEARCH_CONNECTOR": "Elasticsearch",
+        "WEBCRAWLER_CONNECTOR": "Web Pages",
     }
     return connector_friendly_names.get(connector_name, connector_name)
surfsense_backend/app/connectors/webcrawler_connector.py (new file, +191)

"""
WebCrawler Connector Module

A module for crawling web pages and extracting content using Firecrawl or AsyncChromiumLoader.
Provides a unified interface for web scraping.
"""

from typing import Any

import validators
from firecrawl import AsyncFirecrawlApp
from langchain_community.document_loaders import AsyncChromiumLoader


class WebCrawlerConnector:
    """Class for crawling web pages and extracting content."""

    def __init__(self, firecrawl_api_key: str | None = None):
        """
        Initialize the WebCrawlerConnector class.

        Args:
            firecrawl_api_key: Firecrawl API key (optional, will use AsyncChromiumLoader if not provided)
        """
        self.firecrawl_api_key = firecrawl_api_key
        self.use_firecrawl = bool(firecrawl_api_key)

    def set_api_key(self, api_key: str) -> None:
        """
        Set the Firecrawl API key and enable Firecrawl usage.

        Args:
            api_key: Firecrawl API key
        """
        self.firecrawl_api_key = api_key
        self.use_firecrawl = True

    async def crawl_url(
        self, url: str, formats: list[str] | None = None
    ) -> tuple[dict[str, Any] | None, str | None]:
        """
        Crawl a single URL and extract its content.

        Args:
            url: URL to crawl
            formats: List of formats to extract (e.g., ["markdown", "html"]) - only for Firecrawl

        Returns:
            Tuple containing (crawl result dict, error message or None)
            Result dict contains:
            - content: Extracted content (markdown or HTML)
            - metadata: Page metadata (title, description, etc.)
            - source: Original URL
            - crawler_type: Type of crawler used
        """
        try:
            # Validate URL
            if not validators.url(url):
                return None, f"Invalid URL: {url}"

            if self.use_firecrawl:
                result = await self._crawl_with_firecrawl(url, formats)
            else:
                result = await self._crawl_with_chromium(url)

            return result, None

        except Exception as e:
            return None, f"Error crawling URL {url}: {e!s}"

    async def _crawl_with_firecrawl(
        self, url: str, formats: list[str] | None = None
    ) -> dict[str, Any]:
        """
        Crawl URL using Firecrawl.

        Args:
            url: URL to crawl
            formats: List of formats to extract

        Returns:
            Dict containing crawled content and metadata

        Raises:
            ValueError: If Firecrawl scraping fails
        """
        if not self.firecrawl_api_key:
            raise ValueError("Firecrawl API key not set. Call set_api_key() first.")

        firecrawl_app = AsyncFirecrawlApp(api_key=self.firecrawl_api_key)

        # Default to markdown format
        if formats is None:
            formats = ["markdown"]

        scrape_result = await firecrawl_app.scrape_url(url=url, formats=formats)

        if not scrape_result or not scrape_result.success:
            error_msg = (
                scrape_result.error
                if scrape_result and hasattr(scrape_result, "error")
                else "Unknown error"
            )
            raise ValueError(f"Firecrawl failed to scrape URL: {error_msg}")

        # Extract content based on format
        content = scrape_result.markdown or scrape_result.html or ""

        # Extract metadata
        metadata = scrape_result.metadata if scrape_result.metadata else {}

        return {
            "content": content,
            "metadata": {
                "source": url,
                "title": metadata.get("title", url),
                "description": metadata.get("description", ""),
                "language": metadata.get("language", ""),
                "sourceURL": metadata.get("sourceURL", url),
                **metadata,
            },
            "crawler_type": "firecrawl",
        }

    async def _crawl_with_chromium(self, url: str) -> dict[str, Any]:
        """
        Crawl URL using AsyncChromiumLoader.

        Args:
            url: URL to crawl

        Returns:
            Dict containing crawled content and metadata

        Raises:
            Exception: If crawling fails
        """
        crawl_loader = AsyncChromiumLoader(urls=[url], headless=True)
        documents = await crawl_loader.aload()

        if not documents:
            raise ValueError(f"Failed to load content from {url}")

        doc = documents[0]

        # Extract basic metadata from the document
        metadata = doc.metadata if doc.metadata else {}

        return {
            "content": doc.page_content,
            "metadata": {
                "source": url,
                "title": metadata.get("title", url),
                **metadata,
            },
            "crawler_type": "chromium",
        }

    def format_to_structured_document(self, crawl_result: dict[str, Any]) -> str:
        """
        Format crawl result as a structured document (similar to url_crawler.py format).

        Args:
            crawl_result: Result from crawl_url method

        Returns:
            Structured document string
        """
        metadata = crawl_result["metadata"]
        content = crawl_result["content"]

        document_parts = ["<DOCUMENT>", "<METADATA>"]

        # Add all metadata fields
        for key, value in metadata.items():
            document_parts.append(f"{key.upper()}: {value}")

        document_parts.extend(
            [
                "</METADATA>",
                "<CONTENT>",
                "FORMAT: markdown",
                "TEXT_START",
                content,
                "TEXT_END",
                "</CONTENT>",
                "</DOCUMENT>",
            ]
        )

        return "\n".join(document_parts)
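A minimal usage sketch of the connector above, for illustration only (not code from the commit). The URL is a placeholder; because no Firecrawl key is supplied, this path exercises the AsyncChromiumLoader fallback, which assumes Playwright/Chromium is available in the environment.

import asyncio

from app.connectors.webcrawler_connector import WebCrawlerConnector


async def main() -> None:
    # No Firecrawl key, so the Chromium fallback is used.
    crawler = WebCrawlerConnector(firecrawl_api_key=None)
    result, error = await crawler.crawl_url("https://example.com")  # placeholder URL
    if error:
        print(f"Crawl failed: {error}")
        return
    print(result["metadata"]["title"])
    # The structured form is what the indexer hashes and summarizes.
    print(crawler.format_to_structured_document(result)[:200])


asyncio.run(main())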
@@ -73,6 +73,7 @@ class SearchSourceConnectorType(str, Enum):
     AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR"
     LUMA_CONNECTOR = "LUMA_CONNECTOR"
     ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR"
+    WEBCRAWLER_CONNECTOR = "WEBCRAWLER_CONNECTOR"


 class ChatType(str, Enum):
@@ -49,6 +49,7 @@ from app.tasks.connector_indexers import (
     index_luma_events,
     index_notion_pages,
     index_slack_messages,
+    index_webcrawler_urls,
 )
 from app.users import current_active_user
 from app.utils.check_ownership import check_ownership
@@ -1523,3 +1524,63 @@
         f"Critical error in run_elasticsearch_indexing for connector {connector_id}: {e}",
         exc_info=True,
     )
+
+
+# Add new helper functions for webcrawler indexing
+async def run_webcrawler_indexing_with_new_session(
+    connector_id: int,
+    search_space_id: int,
+    user_id: str,
+    start_date: str,
+    end_date: str,
+):
+    """
+    Create a new session and run the Webcrawler indexing task.
+    This prevents session leaks by creating a dedicated session for the background task.
+    """
+    async with async_session_maker() as session:
+        await run_webcrawler_indexing(
+            session, connector_id, search_space_id, user_id, start_date, end_date
+        )
+
+
+async def run_webcrawler_indexing(
+    session: AsyncSession,
+    connector_id: int,
+    search_space_id: int,
+    user_id: str,
+    start_date: str,
+    end_date: str,
+):
+    """
+    Background task to run Webcrawler indexing.
+
+    Args:
+        session: Database session
+        connector_id: ID of the webcrawler connector
+        search_space_id: ID of the search space
+        user_id: ID of the user
+        start_date: Start date for indexing
+        end_date: End date for indexing
+    """
+    try:
+        documents_processed, error_or_warning = await index_webcrawler_urls(
+            session=session,
+            connector_id=connector_id,
+            search_space_id=search_space_id,
+            user_id=user_id,
+            start_date=start_date,
+            end_date=end_date,
+            update_last_indexed=False,  # Don't update timestamp in the indexing function
+        )
+
+        # Only update last_indexed_at if indexing was successful (either new docs or updated docs)
+        if documents_processed > 0:
+            await update_connector_last_indexed(session, connector_id)
+            logger.info(
+                f"Webcrawler indexing completed successfully: {documents_processed} documents processed"
+            )
+        else:
+            logger.error(
+                f"Webcrawler indexing failed or no documents processed: {error_or_warning}"
+            )
+    except Exception as e:
+        logger.error(f"Error in background Webcrawler indexing task: {e!s}")
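For illustration only, a sketch of how the session-scoped helper added above might be driven outside a request context, for example from a one-off maintenance script. The IDs and date window are placeholder assumptions.

import asyncio

from app.routes.search_source_connectors_routes import (
    run_webcrawler_indexing_with_new_session,
)

# Hypothetical connector, search space, user, and date window for a manual run.
asyncio.run(
    run_webcrawler_indexing_with_new_session(
        connector_id=1,
        search_space_id=1,
        user_id="00000000-0000-0000-0000-000000000000",
        start_date="2025-01-01",
        end_date="2025-12-31",
    )
)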
@@ -600,3 +600,46 @@ async def _index_elasticsearch_documents(
     await run_elasticsearch_indexing(
         session, connector_id, search_space_id, user_id, start_date, end_date
     )
+
+
+@celery_app.task(name="index_webcrawler_urls", bind=True)
+def index_webcrawler_urls_task(
+    self,
+    connector_id: int,
+    search_space_id: int,
+    user_id: str,
+    start_date: str,
+    end_date: str,
+):
+    """Celery task to index webcrawler URLs."""
+    import asyncio
+
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+
+    try:
+        loop.run_until_complete(
+            _index_webcrawler_urls(
+                connector_id, search_space_id, user_id, start_date, end_date
+            )
+        )
+    finally:
+        loop.close()
+
+
+async def _index_webcrawler_urls(
+    connector_id: int,
+    search_space_id: int,
+    user_id: str,
+    start_date: str,
+    end_date: str,
+):
+    """Index webcrawler URLs with a new session."""
+    from app.routes.search_source_connectors_routes import (
+        run_webcrawler_indexing,
+    )
+
+    async with get_celery_session_maker()() as session:
+        await run_webcrawler_indexing(
+            session, connector_id, search_space_id, user_id, start_date, end_date
+        )
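A dispatch sketch, not from the commit: the task registered above can be enqueued by its name with Celery's send_task. The import path for the Celery app and the argument values are assumptions.

from app.celery_app import celery_app  # assumption: module path of the Celery app

celery_app.send_task(
    "index_webcrawler_urls",  # name registered by the @celery_app.task decorator above
    kwargs={
        "connector_id": 1,            # hypothetical connector
        "search_space_id": 1,         # hypothetical search space
        "user_id": "user-uuid",       # hypothetical user ID
        "start_date": "2025-01-01",
        "end_date": "2025-12-31",
    },
)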
@@ -77,6 +77,7 @@ async def _check_and_trigger_schedules():
     index_luma_events_task,
     index_notion_pages_task,
     index_slack_messages_task,
+    index_webcrawler_urls_task,
 )

 # Map connector types to their tasks
@@ -94,6 +95,7 @@ async def _check_and_trigger_schedules():
     SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task,
     SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
     SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
+    SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_webcrawler_urls_task,
 }

 # Trigger indexing for each due connector
@@ -17,6 +17,7 @@ Available indexers:
 - Google Gmail: Index messages from Google Gmail
 - Google Calendar: Index events from Google Calendar
 - Luma: Index events from Luma
+- Webcrawler: Index crawled URLs
 - Elasticsearch: Index documents from Elasticsearch instances
 """
@@ -41,6 +42,7 @@ from .luma_indexer import index_luma_events
 # Documentation and knowledge management
 from .notion_indexer import index_notion_pages
 from .slack_indexer import index_slack_messages
+from .webcrawler_indexer import index_webcrawler_urls

 __all__ = [  # noqa: RUF022
     "index_airtable_records",
@@ -58,6 +60,7 @@ __all__ = [  # noqa: RUF022
     "index_linear_issues",
     # Documentation and knowledge management
     "index_notion_pages",
+    "index_webcrawler_urls",
     # Communication platforms
     "index_slack_messages",
     "index_google_gmail_messages",
New file: webcrawler_indexer.py (app.tasks.connector_indexers, +439)

"""
Webcrawler connector indexer.
"""

from datetime import datetime

from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession

from app.config import config
from app.connectors.webcrawler_connector import WebCrawlerConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
    create_document_chunks,
    generate_content_hash,
    generate_document_summary,
    generate_unique_identifier_hash,
)

from .base import (
    check_document_by_unique_identifier,
    get_connector_by_id,
    logger,
    update_connector_last_indexed,
)


async def index_webcrawler_urls(
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    start_date: str | None = None,
    end_date: str | None = None,
    update_last_indexed: bool = True,
) -> tuple[int, str | None]:
    """
    Index webcrawler URLs.

    Args:
        session: Database session
        connector_id: ID of the webcrawler connector
        search_space_id: ID of the search space to store documents in
        user_id: User ID
        start_date: Start date for filtering (YYYY-MM-DD format) - optional
        end_date: End date for filtering (YYYY-MM-DD format) - optional
        update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)

    Returns:
        Tuple containing (number of documents indexed, error message or None)
    """
    task_logger = TaskLoggingService(session, search_space_id)

    # Log task start
    log_entry = await task_logger.log_task_start(
        task_name="webcrawler_url_indexing",
        source="connector_indexing_task",
        message=f"Starting webcrawler URL indexing for connector {connector_id}",
        metadata={
            "connector_id": connector_id,
            "user_id": str(user_id),
            "start_date": start_date,
            "end_date": end_date,
        },
    )

    try:
        # Get the connector
        await task_logger.log_task_progress(
            log_entry,
            f"Retrieving webcrawler connector {connector_id} from database",
            {"stage": "connector_retrieval"},
        )

        # Get the connector from the database
        connector = await get_connector_by_id(
            session, connector_id, SearchSourceConnectorType.WEBCRAWLER_CONNECTOR
        )

        if not connector:
            await task_logger.log_task_failure(
                log_entry,
                f"Connector with ID {connector_id} not found or is not a webcrawler connector",
                "Connector not found",
                {"error_type": "ConnectorNotFound"},
            )
            return (
                0,
                f"Connector with ID {connector_id} not found or is not a webcrawler connector",
            )

        # Get the Firecrawl API key from the connector config (optional)
        api_key = connector.config.get("FIRECRAWL_API_KEY")

        # Get URLs from connector config
        initial_urls = connector.config.get("INITIAL_URLS", "")
        if isinstance(initial_urls, str):
            urls = [url.strip() for url in initial_urls.split("\n") if url.strip()]
        elif isinstance(initial_urls, list):
            urls = [url.strip() for url in initial_urls if url.strip()]
        else:
            urls = []

        logger.info(
            f"Starting webcrawler indexing for connector {connector_id} with {len(urls)} URLs"
        )

        # Initialize webcrawler client
        await task_logger.log_task_progress(
            log_entry,
            f"Initializing webcrawler client for connector {connector_id}",
            {
                "stage": "client_initialization",
                "use_firecrawl": bool(api_key),
            },
        )

        crawler = WebCrawlerConnector(firecrawl_api_key=api_key)

        # Validate URLs
        if not urls:
            await task_logger.log_task_failure(
                log_entry,
                "No URLs provided for indexing",
                "Empty URL list",
                {"error_type": "ValidationError"},
            )
            return 0, "No URLs provided for indexing"

        await task_logger.log_task_progress(
            log_entry,
            f"Starting to crawl {len(urls)} URLs",
            {
                "stage": "crawling",
                "total_urls": len(urls),
            },
        )

        documents_indexed = 0
        documents_updated = 0
        documents_skipped = 0
        failed_urls = []

        for idx, url in enumerate(urls, 1):
            try:
                logger.info(f"Processing URL {idx}/{len(urls)}: {url}")

                await task_logger.log_task_progress(
                    log_entry,
                    f"Crawling URL {idx}/{len(urls)}: {url}",
                    {
                        "stage": "crawling_url",
                        "url_index": idx,
                        "url": url,
                    },
                )

                # Crawl the URL
                crawl_result, error = await crawler.crawl_url(url)

                if error or not crawl_result:
                    logger.warning(f"Failed to crawl URL {url}: {error}")
                    failed_urls.append((url, error or "Unknown error"))
                    continue

                # Extract content and metadata
                content = crawl_result.get("content", "")
                metadata = crawl_result.get("metadata", {})
                crawler_type = crawl_result.get("crawler_type", "unknown")

                if not content.strip():
                    logger.warning(f"Skipping URL with no content: {url}")
                    failed_urls.append((url, "No content extracted"))
                    documents_skipped += 1
                    continue

                # Format content as structured document
                structured_document = crawler.format_to_structured_document(crawl_result)

                # Generate unique identifier hash for this URL
                unique_identifier_hash = generate_unique_identifier_hash(
                    DocumentType.CRAWLED_URL, url, search_space_id
                )

                # Generate content hash
                content_hash = generate_content_hash(structured_document, search_space_id)

                # Check if document with this unique identifier already exists
                existing_document = await check_document_by_unique_identifier(
                    session, unique_identifier_hash
                )

                # Extract useful metadata
                title = metadata.get("title", url)
                description = metadata.get("description", "")
                language = metadata.get("language", "")

                if existing_document:
                    # Document exists - check if content has changed
                    if existing_document.content_hash == content_hash:
                        logger.info(f"Document for URL {url} unchanged. Skipping.")
                        documents_skipped += 1
                        continue
                    else:
                        # Content has changed - update the existing document
                        logger.info(f"Content changed for URL {url}. Updating document.")

                        # Generate summary with metadata
                        user_llm = await get_user_long_context_llm(
                            session, user_id, search_space_id
                        )

                        if user_llm:
                            document_metadata = {
                                "url": url,
                                "title": title,
                                "description": description,
                                "language": language,
                                "document_type": "Crawled URL",
                                "crawler_type": crawler_type,
                            }
                            (
                                summary_content,
                                summary_embedding,
                            ) = await generate_document_summary(
                                structured_document, user_llm, document_metadata
                            )
                        else:
                            # Fallback to simple summary if no LLM configured
                            summary_content = f"Crawled URL: {title}\n\n"
                            summary_content += f"URL: {url}\n"
                            if description:
                                summary_content += f"Description: {description}\n"
                            if language:
                                summary_content += f"Language: {language}\n"
                            summary_content += f"Crawler: {crawler_type}\n\n"

                            # Add content preview
                            content_preview = content[:1000]
                            if len(content) > 1000:
                                content_preview += "..."
                            summary_content += f"Content Preview:\n{content_preview}\n"

                            summary_embedding = config.embedding_model_instance.embed(
                                summary_content
                            )

                        # Process chunks
                        chunks = await create_document_chunks(content)

                        # Update existing document
                        existing_document.title = title
                        existing_document.content = summary_content
                        existing_document.content_hash = content_hash
                        existing_document.embedding = summary_embedding
                        existing_document.document_metadata = {
                            **metadata,
                            "crawler_type": crawler_type,
                            "last_crawled_at": datetime.now().strftime(
                                "%Y-%m-%d %H:%M:%S"
                            ),
                        }
                        existing_document.chunks = chunks

                        documents_updated += 1
                        logger.info(f"Successfully updated URL {url}")
                        continue

                # Document doesn't exist - create new one
                # Generate summary with metadata
                user_llm = await get_user_long_context_llm(
                    session, user_id, search_space_id
                )

                if user_llm:
                    document_metadata = {
                        "url": url,
                        "title": title,
                        "description": description,
                        "language": language,
                        "document_type": "Crawled URL",
                        "crawler_type": crawler_type,
                    }
                    (
                        summary_content,
                        summary_embedding,
                    ) = await generate_document_summary(
                        structured_document, user_llm, document_metadata
                    )
                else:
                    # Fallback to simple summary if no LLM configured
                    summary_content = f"Crawled URL: {title}\n\n"
                    summary_content += f"URL: {url}\n"
                    if description:
                        summary_content += f"Description: {description}\n"
                    if language:
                        summary_content += f"Language: {language}\n"
                    summary_content += f"Crawler: {crawler_type}\n\n"

                    # Add content preview
                    content_preview = content[:1000]
                    if len(content) > 1000:
                        content_preview += "..."
                    summary_content += f"Content Preview:\n{content_preview}\n"

                    summary_embedding = config.embedding_model_instance.embed(
                        summary_content
                    )

                chunks = await create_document_chunks(content)

                document = Document(
                    search_space_id=search_space_id,
                    title=title,
                    document_type=DocumentType.CRAWLED_URL,
                    document_metadata={
                        **metadata,
                        "crawler_type": crawler_type,
                        "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    },
                    content=summary_content,
                    content_hash=content_hash,
                    unique_identifier_hash=unique_identifier_hash,
                    embedding=summary_embedding,
                    chunks=chunks,
                )

                session.add(document)
                documents_indexed += 1
                logger.info(f"Successfully indexed new URL {url}")

                # Batch commit every 10 documents
                if (documents_indexed + documents_updated) % 10 == 0:
                    logger.info(
                        f"Committing batch: {documents_indexed + documents_updated} URLs processed so far"
                    )
                    await session.commit()

            except Exception as e:
                logger.error(
                    f"Error processing URL {url}: {e!s}",
                    exc_info=True,
                )
                failed_urls.append((url, str(e)))
                continue

        total_processed = documents_indexed + documents_updated

        if total_processed > 0:
            await update_connector_last_indexed(session, connector, update_last_indexed)

        # Final commit for any remaining documents not yet committed in batches
        logger.info(
            f"Final commit: Total {documents_indexed} new, {documents_updated} updated URLs processed"
        )
        await session.commit()

        # Build result message
        result_message = None
        if failed_urls:
            failed_summary = "; ".join([f"{url}: {error}" for url, error in failed_urls[:5]])
            if len(failed_urls) > 5:
                failed_summary += f" (and {len(failed_urls) - 5} more)"
            result_message = f"Completed with {len(failed_urls)} failures: {failed_summary}"

        await task_logger.log_task_success(
            log_entry,
            f"Successfully completed webcrawler indexing for connector {connector_id}",
            {
                "urls_processed": total_processed,
                "documents_indexed": documents_indexed,
                "documents_updated": documents_updated,
                "documents_skipped": documents_skipped,
                "failed_urls_count": len(failed_urls),
            },
        )

        logger.info(
            f"Webcrawler indexing completed: {documents_indexed} new, "
            f"{documents_updated} updated, {documents_skipped} skipped, "
            f"{len(failed_urls)} failed"
        )
        return total_processed, result_message

    except SQLAlchemyError as db_error:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Database error during webcrawler indexing for connector {connector_id}",
            str(db_error),
            {"error_type": "SQLAlchemyError"},
        )
        logger.error(f"Database error: {db_error!s}", exc_info=True)
        return 0, f"Database error: {db_error!s}"
    except Exception as e:
        await session.rollback()
        await task_logger.log_task_failure(
            log_entry,
            f"Failed to index webcrawler URLs for connector {connector_id}",
            str(e),
            {"error_type": type(e).__name__},
        )
        logger.error(f"Failed to index webcrawler URLs: {e!s}", exc_info=True)
        return 0, f"Failed to index webcrawler URLs: {e!s}"


async def get_crawled_url_documents(
    session: AsyncSession,
    search_space_id: int,
    connector_id: int | None = None,
) -> list[Document]:
    """
    Get all crawled URL documents for a search space.

    Args:
        session: Database session
        search_space_id: ID of the search space
        connector_id: Optional connector ID to filter by

    Returns:
        List of Document objects
    """
    from sqlalchemy import select

    query = select(Document).filter(
        Document.search_space_id == search_space_id,
        Document.document_type == DocumentType.CRAWLED_URL,
    )

    if connector_id:
        # Filter by connector if needed - you might need to add a connector_id field to Document
        # or filter by some other means depending on your schema
        pass

    result = await session.execute(query)
    documents = result.scalars().all()
    return list(documents)
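A sketch of calling the indexer above directly inside an async session and interpreting its (count, message) return value. This is illustrative only: the location of the session factory and the IDs are assumptions.

import asyncio

from app.tasks.connector_indexers import index_webcrawler_urls


async def run_once() -> None:
    # assumption: async_session_maker is the app's AsyncSession factory and lives in app.db
    from app.db import async_session_maker

    async with async_session_maker() as session:
        processed, message = await index_webcrawler_urls(
            session=session,
            connector_id=1,        # hypothetical connector
            search_space_id=1,     # hypothetical search space
            user_id="user-uuid",   # hypothetical user
        )
        # `processed` counts new plus updated documents; `message` carries failure details, if any.
        print(processed, message)


asyncio.run(run_once())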
@@ -31,6 +31,7 @@ CONNECTOR_TASK_MAP = {
     SearchSourceConnectorType.DISCORD_CONNECTOR: "index_discord_messages",
     SearchSourceConnectorType.LUMA_CONNECTOR: "index_luma_events",
     SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: "index_elasticsearch_documents",
+    SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: "index_webcrawler_urls",
 }
@@ -79,6 +80,7 @@ def create_periodic_schedule(
     index_luma_events_task,
     index_notion_pages_task,
     index_slack_messages_task,
+    index_webcrawler_urls_task,
 )

 # Map connector type to task
@@ -96,6 +98,7 @@ def create_periodic_schedule(
     SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task,
     SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
     SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
+    SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_webcrawler_urls_task,
 }

 # Trigger the first run immediately
@@ -468,6 +468,26 @@ def validate_connector_config(
         value = config.get(key)
         if not isinstance(value, list) or not value:
             raise ValueError(f"{field_name} must be a non-empty list of strings")

+    def validate_firecrawl_api_key_format() -> None:
+        api_key = config.get("FIRECRAWL_API_KEY", "")
+        if api_key and api_key.strip():
+            # Firecrawl API keys typically start with "fc-"
+            if not api_key.strip().startswith("fc-"):
+                raise ValueError(
+                    "Firecrawl API key should start with 'fc-'. Please verify your API key."
+                )
+
+    def validate_initial_urls() -> None:
+        initial_urls = config.get("INITIAL_URLS", "")
+        if initial_urls and initial_urls.strip():
+            urls = [url.strip() for url in initial_urls.split("\n") if url.strip()]
+            for url in urls:
+                if not validators.url(url):
+                    raise ValueError(
+                        f"Invalid URL format in INITIAL_URLS: {url}"
+                    )
+
     # Lookup table for connector validation rules
     connector_rules = {
@@ -550,6 +570,14 @@ def validate_connector_config(
         #     "validators": {}
         # },
         "LUMA_CONNECTOR": {"required": ["LUMA_API_KEY"], "validators": {}},
+        "WEBCRAWLER_CONNECTOR": {
+            "required": [],  # No required fields - API key is optional
+            "optional": ["FIRECRAWL_API_KEY", "INITIAL_URLS"],
+            "validators": {
+                "FIRECRAWL_API_KEY": lambda: validate_firecrawl_api_key_format(),
+                "INITIAL_URLS": lambda: validate_initial_urls(),
+            },
+        },
     }

     rules = connector_rules.get(connector_type_str)
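For illustration, a config that would satisfy the new WEBCRAWLER_CONNECTOR rules above. Both fields are optional; the key value and URLs below are made-up examples, not values from the commit.

# Example config sketch accepted by the new validators:
webcrawler_config = {
    # Optional: when present, must start with "fc-".
    "FIRECRAWL_API_KEY": "fc-xxxxxxxxxxxxx",
    # Optional: newline-separated URLs, each checked with validators.url().
    "INITIAL_URLS": "https://example.com\nhttps://docs.example.com",
}

# An empty config is also valid, since "required" is an empty list:
minimal_config = {}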
@@ -282,6 +282,18 @@ export default function EditConnectorPage() {
 							placeholder="Your Elasticsearch API Key"
 						/>
 					)}
+
+					{/* == Webcrawler == */}
+					{connector.connector_type === "WEBCRAWLER_CONNECTOR" && (
+						<EditSimpleTokenForm
+							control={editForm.control}
+							fieldName="FIRECRAWL_API_KEY"
+							fieldLabel="Firecrawl API Key (Optional)"
+							fieldDescription="Add a Firecrawl API key for enhanced crawling capabilities. If not provided, will use AsyncChromiumLoader as fallback."
+							placeholder="fc-xxxxxxxxxxxxx"
+						/>
+					)}
 				</CardContent>
 				<CardFooter className="border-t pt-6">
 					<Button type="submit" disabled={isSaving} className="w-full sm:w-auto">
@@ -55,6 +55,7 @@ const getConnectorTypeDisplay = (type: string): string => {
 		AIRTABLE_CONNECTOR: "Airtable Connector",
 		LUMA_CONNECTOR: "Luma Connector",
 		ELASTICSEARCH_CONNECTOR: "Elasticsearch Connector",
+		WEBCRAWLER_CONNECTOR: "Web Crawler Connector",
 		// Add other connector types here as needed
 	};
 	return typeMap[type] || type;
@@ -75,6 +76,7 @@ const getApiKeyFieldName = (connectorType: string): string => {
 		LINKUP_API: "LINKUP_API_KEY",
 		LUMA_CONNECTOR: "LUMA_API_KEY",
 		ELASTICSEARCH_CONNECTOR: "ELASTICSEARCH_API_KEY",
+		WEBCRAWLER_CONNECTOR: "FIRECRAWL_API_KEY",
 	};
 	return fieldMap[connectorType] || "";
 };
New file: Web Crawler connector setup page (+334)

"use client";

import { zodResolver } from "@hookform/resolvers/zod";
import { ArrowLeft, Check, Globe, Loader2 } from "lucide-react";
import { motion } from "motion/react";
import Link from "next/link";
import { useParams, useRouter } from "next/navigation";
import { useEffect, useState } from "react";
import { useForm } from "react-hook-form";
import { toast } from "sonner";
import * as z from "zod";
import { Button } from "@/components/ui/button";
import {
	Card,
	CardContent,
	CardDescription,
	CardFooter,
	CardHeader,
	CardTitle,
} from "@/components/ui/card";
import {
	Form,
	FormControl,
	FormDescription,
	FormField,
	FormItem,
	FormLabel,
	FormMessage,
} from "@/components/ui/form";
import { Input } from "@/components/ui/input";
import { Textarea } from "@/components/ui/textarea";
import { EnumConnectorName } from "@/contracts/enums/connector";
import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
import {
	type SearchSourceConnector,
	useSearchSourceConnectors,
} from "@/hooks/use-search-source-connectors";

// Define the form schema with Zod
const webcrawlerConnectorFormSchema = z.object({
	name: z.string().min(3, {
		message: "Connector name must be at least 3 characters.",
	}),
	api_key: z.string().optional(),
	initial_urls: z.string().optional(),
});

// Define the type for the form values
type WebcrawlerConnectorFormValues = z.infer<typeof webcrawlerConnectorFormSchema>;

export default function WebcrawlerConnectorPage() {
	const router = useRouter();
	const params = useParams();
	const searchSpaceId = params.search_space_id as string;
	const [isSubmitting, setIsSubmitting] = useState(false);
	const [doesConnectorExist, setDoesConnectorExist] = useState(false);

	const { fetchConnectors, createConnector } = useSearchSourceConnectors(
		true,
		parseInt(searchSpaceId)
	);

	// Initialize the form
	const form = useForm<WebcrawlerConnectorFormValues>({
		resolver: zodResolver(webcrawlerConnectorFormSchema),
		defaultValues: {
			name: "Web Crawler",
			api_key: "",
			initial_urls: "",
		},
	});

	useEffect(() => {
		fetchConnectors(parseInt(searchSpaceId))
			.then((data) => {
				if (data && Array.isArray(data)) {
					const connector = data.find(
						(c: SearchSourceConnector) => c.connector_type === EnumConnectorName.WEBCRAWLER_CONNECTOR
					);
					if (connector) {
						setDoesConnectorExist(true);
					}
				}
			})
			.catch((error) => {
				console.error("Error fetching connectors:", error);
			});
	}, [fetchConnectors, searchSpaceId]);

	// Handle form submission
	const onSubmit = async (values: WebcrawlerConnectorFormValues) => {
		setIsSubmitting(true);
		try {
			const config: Record<string, string> = {};

			// Only add API key to config if provided
			if (values.api_key && values.api_key.trim()) {
				config.FIRECRAWL_API_KEY = values.api_key;
			}

			// Parse initial URLs if provided
			if (values.initial_urls && values.initial_urls.trim()) {
				config.INITIAL_URLS = values.initial_urls;
			}

			await createConnector(
				{
					name: values.name,
					connector_type: EnumConnectorName.WEBCRAWLER_CONNECTOR,
					config: config,
					is_indexable: true,
					last_indexed_at: null,
					periodic_indexing_enabled: false,
					indexing_frequency_minutes: null,
					next_scheduled_at: null,
				},
				parseInt(searchSpaceId)
			);

			toast.success("Webcrawler connector created successfully!");

			// Navigate back to connectors page
			router.push(`/dashboard/${searchSpaceId}/connectors`);
		} catch (error) {
			console.error("Error creating connector:", error);
			toast.error(error instanceof Error ? error.message : "Failed to create connector");
		} finally {
			setIsSubmitting(false);
		}
	};

	return (
		<div className="container mx-auto py-8 max-w-2xl">
			<motion.div
				initial={{ opacity: 0, y: 20 }}
				animate={{ opacity: 1, y: 0 }}
				transition={{ duration: 0.5 }}
			>
				{/* Header */}
				<div className="mb-8">
					<Link
						href={`/dashboard/${searchSpaceId}/connectors/add`}
						className="inline-flex items-center text-sm text-muted-foreground hover:text-foreground mb-4"
					>
						<ArrowLeft className="mr-2 h-4 w-4" />
						Back to connectors
					</Link>
					<div className="flex items-center gap-4">
						<div className="flex h-12 w-12 items-center justify-center rounded-lg">
							{getConnectorIcon(EnumConnectorName.WEBCRAWLER_CONNECTOR, "h-6 w-6")}
						</div>
						<div>
							<h1 className="text-3xl font-bold tracking-tight">Connect Web Crawler</h1>
							<p className="text-muted-foreground">Crawl and index web pages for search.</p>
						</div>
					</div>
				</div>

				{/* Connection Card */}
				{!doesConnectorExist ? (
					<Card>
						<CardHeader>
							<CardTitle>Set Up Web Crawler</CardTitle>
							<CardDescription>
								Configure your web crawler to index web pages. Optionally add a Firecrawl API key
								for enhanced crawling capabilities.
							</CardDescription>
						</CardHeader>
						<Form {...form}>
							<form onSubmit={form.handleSubmit(onSubmit)}>
								<CardContent className="space-y-4">
									<FormField
										control={form.control}
										name="name"
										render={({ field }) => (
											<FormItem>
												<FormLabel>Connector Name</FormLabel>
												<FormControl>
													<Input placeholder="My Web Crawler" {...field} />
												</FormControl>
												<FormDescription>
													A friendly name to identify this connector.
												</FormDescription>
												<FormMessage />
											</FormItem>
										)}
									/>

									<FormField
										control={form.control}
										name="api_key"
										render={({ field }) => (
											<FormItem>
												<FormLabel>Firecrawl API Key (Optional)</FormLabel>
												<FormControl>
													<Input
														type="password"
														placeholder="fc-xxxxxxxxxxxxx"
														{...field}
													/>
												</FormControl>
												<FormDescription>
													Add a Firecrawl API key for enhanced crawling. If not provided, will use
													AsyncChromiumLoader as fallback.
												</FormDescription>
												<FormMessage />
											</FormItem>
										)}
									/>

									<FormField
										control={form.control}
										name="initial_urls"
										render={({ field }) => (
											<FormItem>
												<FormLabel>Initial URLs (Optional)</FormLabel>
												<FormControl>
													<Textarea
														placeholder="https://example.com https://docs.example.com https://blog.example.com"
														className="min-h-[100px] font-mono text-sm"
														{...field}
													/>
												</FormControl>
												<FormDescription>
													Enter URLs to crawl (one per line). You can add more URLs later.
												</FormDescription>
												<FormMessage />
											</FormItem>
										)}
									/>

									<div className="space-y-2 pt-2">
										<div className="flex items-center space-x-2 text-sm text-muted-foreground">
											<Check className="h-4 w-4 text-green-500" />
											<span>Crawl any public web page</span>
										</div>
										<div className="flex items-center space-x-2 text-sm text-muted-foreground">
											<Check className="h-4 w-4 text-green-500" />
											<span>Extract markdown content automatically</span>
										</div>
										<div className="flex items-center space-x-2 text-sm text-muted-foreground">
											<Check className="h-4 w-4 text-green-500" />
											<span>Detect content changes and update documents</span>
										</div>
										<div className="flex items-center space-x-2 text-sm text-muted-foreground">
											<Check className="h-4 w-4 text-green-500" />
											<span>Works with or without Firecrawl API key</span>
										</div>
									</div>
								</CardContent>
								<CardFooter className="flex justify-between">
									<Button
										type="button"
										variant="outline"
										onClick={() => router.push(`/dashboard/${searchSpaceId}/connectors/add`)}
									>
										Cancel
									</Button>
									<Button type="submit" disabled={isSubmitting}>
										{isSubmitting ? (
											<>
												<Loader2 className="mr-2 h-4 w-4 animate-spin" />
												Setting up...
											</>
										) : (
											<>
												<Globe className="mr-2 h-4 w-4" />
												Create Crawler
											</>
										)}
									</Button>
								</CardFooter>
							</form>
						</Form>
					</Card>
				) : (
					/* Success Card */
					<Card>
						<CardHeader>
							<CardTitle>✅ Your web crawler is successfully set up!</CardTitle>
							<CardDescription>
								You can now add URLs to crawl from the connector management page.
							</CardDescription>
						</CardHeader>
					</Card>
				)}

				{/* Help Section */}
				{!doesConnectorExist && (
					<Card className="mt-6">
						<CardHeader>
							<CardTitle className="text-lg">How It Works</CardTitle>
						</CardHeader>
						<CardContent className="space-y-4">
							<div>
								<h4 className="font-medium mb-2">1. Choose Your Crawler Method</h4>
								<p className="text-sm text-muted-foreground">
									<strong>With Firecrawl (Recommended):</strong> Get your API key from{" "}
									<a
										href="https://firecrawl.dev"
										target="_blank"
										rel="noopener noreferrer"
										className="text-primary hover:underline"
									>
										firecrawl.dev
									</a>{" "}
									for faster, more reliable crawling with better content extraction.
								</p>
								<p className="text-sm text-muted-foreground mt-2">
									<strong>Without Firecrawl:</strong> The crawler will use AsyncChromiumLoader as a
									free fallback option. This works well for most websites but may be slower.
								</p>
							</div>
							<div>
								<h4 className="font-medium mb-2">2. Add URLs to Crawl (Optional)</h4>
								<p className="text-sm text-muted-foreground">
									You can add initial URLs now or add them later from the connector management page.
									Enter one URL per line.
								</p>
							</div>
							<div>
								<h4 className="font-medium mb-2">3. Manage Your Crawler</h4>
								<p className="text-sm text-muted-foreground">
									After setup, you can add more URLs, trigger manual crawls, or set up periodic
									indexing to keep your content up-to-date.
								</p>
							</div>
						</CardContent>
					</Card>
				)}
			</motion.div>
		</div>
	);
}
@@ -138,6 +138,7 @@ export function DashboardBreadcrumb() {
 		"linkup-api": "LinkUp API",
 		"luma-connector": "Luma",
 		"elasticsearch-connector": "Elasticsearch",
+		"webcrawler-connector": "WebCrawler",
 	};

 	const connectorLabel = connectorLabels[connectorType] || connectorType;
@ -52,5 +52,6 @@ export const editConnectorSchema = z.object({
	GOOGLE_CALENDAR_CALENDAR_IDS: z.string().optional(),
	LUMA_API_KEY: z.string().optional(),
	ELASTICSEARCH_API_KEY: z.string().optional(),
+	FIRECRAWL_API_KEY: z.string().optional(),
});
export type EditConnectorFormValues = z.infer<typeof editConnectorSchema>;
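A small usage sketch for the extended schema; the import path is assumed, and safeParse is used because any required fields defined elsewhere in the schema are not shown in this hunk:

// Illustrative only: exercising the optional FIRECRAWL_API_KEY field added above.
import { editConnectorSchema } from "./types"; // assumed module path

const result = editConnectorSchema.safeParse({
	FIRECRAWL_API_KEY: "fc-example-key", // optional; omit it to rely on the free fallback crawler
});

if (!result.success) {
	// Required fields defined elsewhere in the schema would surface here.
	console.error(result.error.flatten().fieldErrors);
}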
@ -29,6 +29,7 @@ const INTEGRATIONS: Integration[] = [
	// Documentation & Knowledge
	{ name: "Confluence", icon: "https://cdn.simpleicons.org/confluence/172B4D" },
	{ name: "Notion", icon: "https://cdn.simpleicons.org/notion/000000/ffffff" },
+	{ name: "Web Crawler", icon: "https://cdn.jsdelivr.net/npm/lucide-static@0.294.0/icons/globe.svg" },

	// Cloud Storage
	{ name: "Google Drive", icon: "https://cdn.simpleicons.org/googledrive/4285F4" },
@ -138,6 +138,13 @@ export const connectorCategories: ConnectorCategory[] = [
				icon: getConnectorIcon(EnumConnectorName.LUMA_CONNECTOR, "h-6 w-6"),
				status: "available",
			},
+			{
+				id: "webcrawler-connector",
+				title: "Web Crawler",
+				description: "webcrawler_desc",
+				icon: getConnectorIcon(EnumConnectorName.WEBCRAWLER_CONNECTOR, "h-6 w-6"),
+				status: "available",
+			},
		],
	},
	{
@ -17,4 +17,5 @@ export enum EnumConnectorName {
	AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR",
	LUMA_CONNECTOR = "LUMA_CONNECTOR",
	ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR",
+	WEBCRAWLER_CONNECTOR = "WEBCRAWLER_CONNECTOR",
}
@ -59,11 +59,11 @@ export const getConnectorIcon = (connectorType: EnumConnectorName | string, clas
			return <IconSparkles {...iconProps} />;
		case EnumConnectorName.ELASTICSEARCH_CONNECTOR:
			return <IconBrandElastic {...iconProps} />;
+		case EnumConnectorName.WEBCRAWLER_CONNECTOR:
+			return <Globe {...iconProps} />;
		// Additional cases for non-enum connector types
		case "YOUTUBE_VIDEO":
			return <IconBrandYoutube {...iconProps} />;
-		case "CRAWLED_URL":
-			return <Globe {...iconProps} />;
		case "FILE":
			return <File {...iconProps} />;
		case "EXTENSION":
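A quick rendering sketch for the new enum value and icon case; the import path and component name are assumptions based on the identifiers visible in this diff:

// Illustrative component: resolves the Globe icon for the new connector type.
import { EnumConnectorName, getConnectorIcon } from "@/components/chat"; // assumed path

export function WebcrawlerSourceBadge() {
	return (
		<span className="inline-flex items-center gap-2 text-sm">
			{getConnectorIcon(EnumConnectorName.WEBCRAWLER_CONNECTOR, "h-4 w-4")}
			Web Crawler
		</span>
	);
}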
@ -97,6 +97,7 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
			JIRA_API_TOKEN: "",
			LUMA_API_KEY: "",
			ELASTICSEARCH_API_KEY: "",
+			FIRECRAWL_API_KEY: "",
		},
	});
@ -142,6 +143,7 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
				JIRA_API_TOKEN: config.JIRA_API_TOKEN || "",
				LUMA_API_KEY: config.LUMA_API_KEY || "",
				ELASTICSEARCH_API_KEY: config.ELASTICSEARCH_API_KEY || "",
+				FIRECRAWL_API_KEY: config.FIRECRAWL_API_KEY || "",
			});
			if (currentConnector.connector_type === "GITHUB_CONNECTOR") {
				const savedRepos = config.repo_full_names || [];
@ -469,6 +471,19 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
					newConfig = { ELASTICSEARCH_API_KEY: formData.ELASTICSEARCH_API_KEY };
				}
				break;
+			case "WEBCRAWLER_CONNECTOR":
+				if (formData.FIRECRAWL_API_KEY !== originalConfig.FIRECRAWL_API_KEY) {
+					if (formData.FIRECRAWL_API_KEY && formData.FIRECRAWL_API_KEY.trim()) {
+						if (!formData.FIRECRAWL_API_KEY.startsWith("fc-")) {
+							toast.warning("Firecrawl API keys typically start with 'fc-'. Please verify your key.");
+						}
+						newConfig = { FIRECRAWL_API_KEY: formData.FIRECRAWL_API_KEY };
+					} else {
+						newConfig = {};
+						toast.info("Firecrawl API key removed. Web crawler will use AsyncChromiumLoader as fallback.");
+					}
+				}
+				break;
		}

		if (newConfig !== null) {
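The nested conditionals above encode three outcomes: keep the key, warn when it does not look like a Firecrawl key, or clear the config so the backend falls back to AsyncChromiumLoader. The same rules, factored into a pure helper for readability; the helper and its return shape are illustrative and not part of this commit:

// Hypothetical refactor sketch: mirrors the switch case above, minus the toast wiring.
type WebcrawlerConfig = { FIRECRAWL_API_KEY?: string };

function buildWebcrawlerConfig(input: string | undefined): {
	config: WebcrawlerConfig;
	warning?: string;
	info?: string;
} {
	const key = input?.trim();
	if (!key) {
		// No key: store an empty config so the crawler uses the free fallback loader.
		return {
			config: {},
			info: "Firecrawl API key removed. Web crawler will use AsyncChromiumLoader as fallback.",
		};
	}
	return {
		config: { FIRECRAWL_API_KEY: key },
		warning: key.startsWith("fc-")
			? undefined
			: "Firecrawl API keys typically start with 'fc-'. Please verify your key.",
	};
}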
@ -562,6 +577,8 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
						"ELASTICSEARCH_API_KEY",
						newlySavedConfig.ELASTICSEARCH_API_KEY || ""
					);
+				} else if (connector.connector_type === "WEBCRAWLER_CONNECTOR") {
+					editForm.setValue("FIRECRAWL_API_KEY", newlySavedConfig.FIRECRAWL_API_KEY || "");
				}
			}
			if (connector.connector_type === "GITHUB_CONNECTOR") {
@ -18,6 +18,7 @@ export const getConnectorTypeDisplay = (type: string): string => {
		AIRTABLE_CONNECTOR: "Airtable",
		LUMA_CONNECTOR: "Luma",
		ELASTICSEARCH_CONNECTOR: "Elasticsearch",
+		WEBCRAWLER_CONNECTOR: "Web Crawler",
	};
	return typeMap[type] || type;
};
@ -331,7 +331,8 @@
		"luma_desc": "Connect to Luma to search events",
		"calendar_desc": "Connect to Google Calendar to search events, meetings and schedules.",
		"gmail_desc": "Connect to your Gmail account to search through your emails.",
-		"zoom_desc": "Connect to Zoom to access meeting recordings and transcripts."
+		"zoom_desc": "Connect to Zoom to access meeting recordings and transcripts.",
+		"webcrawler_desc": "Scrape web pages using Firecrawl."
	},
	"upload_documents": {
		"title": "Upload Documents",