Merge pull request #502 from MODSetter/dev

feat: reworked web crawlers and added it as connector
This commit is contained in:
Rohan Verma 2025-11-26 13:52:11 -08:00 committed by GitHub
commit bebc1c6bfc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
38 changed files with 1531 additions and 732 deletions

View file

@ -8,6 +8,8 @@ Create Date: 2025-11-13 23:20:12.912741
from collections.abc import Sequence from collections.abc import Sequence
from sqlalchemy import text
from alembic import op from alembic import op
# revision identifiers, used by Alembic. # revision identifiers, used by Alembic.
@ -17,6 +19,20 @@ branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None depends_on: str | Sequence[str] | None = None
def constraint_exists(connection, table_name: str, constraint_name: str) -> bool:
"""Check if a constraint exists on the given table."""
result = connection.execute(
text(
"""
SELECT 1 FROM information_schema.table_constraints
WHERE table_name = :table_name AND constraint_name = :constraint_name
"""
),
{"table_name": table_name, "constraint_name": constraint_name},
)
return result.fetchone() is not None
def upgrade() -> None: def upgrade() -> None:
""" """
Remove foreign key constraints on LLM preference columns to allow global configs (negative IDs). Remove foreign key constraints on LLM preference columns to allow global configs (negative IDs).
@ -24,50 +40,55 @@ def upgrade() -> None:
Global LLM configs use negative IDs and don't exist in the llm_configs table, Global LLM configs use negative IDs and don't exist in the llm_configs table,
so we need to remove the foreign key constraints that were preventing their use. so we need to remove the foreign key constraints that were preventing their use.
""" """
# Drop the foreign key constraints connection = op.get_bind()
op.drop_constraint(
# Drop the foreign key constraints if they exist
constraints_to_drop = [
"user_search_space_preferences_long_context_llm_id_fkey", "user_search_space_preferences_long_context_llm_id_fkey",
"user_search_space_preferences",
type_="foreignkey",
)
op.drop_constraint(
"user_search_space_preferences_fast_llm_id_fkey", "user_search_space_preferences_fast_llm_id_fkey",
"user_search_space_preferences",
type_="foreignkey",
)
op.drop_constraint(
"user_search_space_preferences_strategic_llm_id_fkey", "user_search_space_preferences_strategic_llm_id_fkey",
]
for constraint_name in constraints_to_drop:
if constraint_exists(
connection, "user_search_space_preferences", constraint_name
):
op.drop_constraint(
constraint_name,
"user_search_space_preferences", "user_search_space_preferences",
type_="foreignkey", type_="foreignkey",
) )
else:
print(f"Constraint '{constraint_name}' does not exist. Skipping.")
def downgrade() -> None: def downgrade() -> None:
""" """
Re-add foreign key constraints (will fail if any negative IDs exist in the table). Re-add foreign key constraints (will fail if any negative IDs exist in the table).
""" """
# Re-add the foreign key constraints connection = op.get_bind()
op.create_foreign_key(
# Re-add the foreign key constraints if they don't exist
constraints_to_create = [
(
"user_search_space_preferences_long_context_llm_id_fkey", "user_search_space_preferences_long_context_llm_id_fkey",
"user_search_space_preferences", "long_context_llm_id",
"llm_configs", ),
["long_context_llm_id"], ("user_search_space_preferences_fast_llm_id_fkey", "fast_llm_id"),
["id"], ("user_search_space_preferences_strategic_llm_id_fkey", "strategic_llm_id"),
ondelete="SET NULL", ]
)
for constraint_name, column_name in constraints_to_create:
if not constraint_exists(
connection, "user_search_space_preferences", constraint_name
):
op.create_foreign_key( op.create_foreign_key(
"user_search_space_preferences_fast_llm_id_fkey", constraint_name,
"user_search_space_preferences", "user_search_space_preferences",
"llm_configs", "llm_configs",
["fast_llm_id"], [column_name],
["id"],
ondelete="SET NULL",
)
op.create_foreign_key(
"user_search_space_preferences_strategic_llm_id_fkey",
"user_search_space_preferences",
"llm_configs",
["strategic_llm_id"],
["id"], ["id"],
ondelete="SET NULL", ondelete="SET NULL",
) )
else:
print(f"Constraint '{constraint_name}' already exists. Skipping.")

View file

@ -9,6 +9,7 @@ Create Date: 2025-11-19 00:00:00.000000
from collections.abc import Sequence from collections.abc import Sequence
import sqlalchemy as sa import sqlalchemy as sa
from sqlalchemy import text
from alembic import op from alembic import op
@ -19,24 +20,55 @@ branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None depends_on: str | Sequence[str] | None = None
def column_exists(connection, table_name: str, column_name: str) -> bool:
"""Check if a column exists on the given table."""
result = connection.execute(
text(
"""
SELECT 1 FROM information_schema.columns
WHERE table_name = :table_name AND column_name = :column_name
"""
),
{"table_name": table_name, "column_name": column_name},
)
return result.fetchone() is not None
def upgrade() -> None: def upgrade() -> None:
"""Add QnA configuration columns to searchspaces table.""" """Add QnA configuration columns to searchspaces table."""
connection = op.get_bind()
# Add citations_enabled boolean (default True) # Add citations_enabled boolean (default True)
if not column_exists(connection, "searchspaces", "citations_enabled"):
op.add_column( op.add_column(
"searchspaces", "searchspaces",
sa.Column( sa.Column(
"citations_enabled", sa.Boolean(), nullable=False, server_default="true" "citations_enabled", sa.Boolean(), nullable=False, server_default="true"
), ),
) )
else:
print("Column 'citations_enabled' already exists. Skipping.")
# Add custom instructions text field (nullable, defaults to empty) # Add custom instructions text field (nullable, defaults to empty)
if not column_exists(connection, "searchspaces", "qna_custom_instructions"):
op.add_column( op.add_column(
"searchspaces", "searchspaces",
sa.Column("qna_custom_instructions", sa.Text(), nullable=True), sa.Column("qna_custom_instructions", sa.Text(), nullable=True),
) )
else:
print("Column 'qna_custom_instructions' already exists. Skipping.")
def downgrade() -> None: def downgrade() -> None:
"""Remove QnA configuration columns from searchspaces table.""" """Remove QnA configuration columns from searchspaces table."""
connection = op.get_bind()
if column_exists(connection, "searchspaces", "qna_custom_instructions"):
op.drop_column("searchspaces", "qna_custom_instructions") op.drop_column("searchspaces", "qna_custom_instructions")
else:
print("Column 'qna_custom_instructions' does not exist. Skipping.")
if column_exists(connection, "searchspaces", "citations_enabled"):
op.drop_column("searchspaces", "citations_enabled") op.drop_column("searchspaces", "citations_enabled")
else:
print("Column 'citations_enabled' does not exist. Skipping.")

View file

@ -0,0 +1,59 @@
"""Add Webcrawler connector enums
Revision ID: 38
Revises: 37
Create Date: 2025-11-17 17:00:00.000000
"""
from collections.abc import Sequence
from alembic import op
revision: str = "38"
down_revision: str | None = "37"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
"""Safely add 'WEBCRAWLER_CONNECTOR' to enum types if missing."""
# Add to searchsourceconnectortype enum
op.execute(
"""
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_type t
JOIN pg_enum e ON t.oid = e.enumtypid
WHERE t.typname = 'searchsourceconnectortype' AND e.enumlabel = 'WEBCRAWLER_CONNECTOR'
) THEN
ALTER TYPE searchsourceconnectortype ADD VALUE 'WEBCRAWLER_CONNECTOR';
END IF;
END
$$;
"""
)
# Add to documenttype enum
op.execute(
"""
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_type t
JOIN pg_enum e ON t.oid = e.enumtypid
WHERE t.typname = 'documenttype' AND e.enumlabel = 'CRAWLED_URL'
) THEN
ALTER TYPE documenttype ADD VALUE 'CRAWLED_URL';
END IF;
END
$$;
"""
)
def downgrade() -> None:
"""Remove 'WEBCRAWLER_CONNECTOR' from enum types."""
pass

View file

@ -667,7 +667,7 @@ async def fetch_relevant_documents(
} }
) )
elif connector == "CRAWLED_URL": elif connector == "WEBCRAWLER_CONNECTOR":
( (
source_object, source_object,
crawled_urls_chunks, crawled_urls_chunks,
@ -689,7 +689,7 @@ async def fetch_relevant_documents(
writer( writer(
{ {
"yield_value": streaming_service.format_terminal_info_delta( "yield_value": streaming_service.format_terminal_info_delta(
f"🌐 Found {len(crawled_urls_chunks)} Web Pages chunks related to your query" f"🌐 Found {len(crawled_urls_chunks)} Web Page chunks related to your query"
) )
} }
) )

View file

@ -17,7 +17,6 @@ You are SurfSense, an advanced AI research assistant that provides detailed, wel
{chat_history_section} {chat_history_section}
<knowledge_sources> <knowledge_sources>
- EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history) - EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
- CRAWLED_URL: "Webpages indexed by SurfSense web crawler" (personally selected websites)
- FILE: "User-uploaded documents (PDFs, Word, etc.)" (personal files) - FILE: "User-uploaded documents (PDFs, Word, etc.)" (personal files)
- SLACK_CONNECTOR: "Slack conversations and shared content" (personal workspace communications) - SLACK_CONNECTOR: "Slack conversations and shared content" (personal workspace communications)
- NOTION_CONNECTOR: "Notion workspace pages and databases" (personal knowledge management) - NOTION_CONNECTOR: "Notion workspace pages and databases" (personal knowledge management)
@ -35,6 +34,7 @@ You are SurfSense, an advanced AI research assistant that provides detailed, wel
- TAVILY_API: "Tavily search API results" (personalized search results) - TAVILY_API: "Tavily search API results" (personalized search results)
- LINKUP_API: "Linkup search API results" (personalized search results) - LINKUP_API: "Linkup search API results" (personalized search results)
- LUMA_CONNECTOR: "Luma events" - LUMA_CONNECTOR: "Luma events"
- WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense" (personally selected websites)
</knowledge_sources> </knowledge_sources>
<instructions> <instructions>

View file

@ -19,7 +19,6 @@ def get_connector_emoji(connector_name: str) -> str:
connector_emojis = { connector_emojis = {
"YOUTUBE_VIDEO": "📹", "YOUTUBE_VIDEO": "📹",
"EXTENSION": "🧩", "EXTENSION": "🧩",
"CRAWLED_URL": "🌐",
"FILE": "📄", "FILE": "📄",
"SLACK_CONNECTOR": "💬", "SLACK_CONNECTOR": "💬",
"NOTION_CONNECTOR": "📘", "NOTION_CONNECTOR": "📘",
@ -34,6 +33,7 @@ def get_connector_emoji(connector_name: str) -> str:
"AIRTABLE_CONNECTOR": "🗃️", "AIRTABLE_CONNECTOR": "🗃️",
"LUMA_CONNECTOR": "", "LUMA_CONNECTOR": "",
"ELASTICSEARCH_CONNECTOR": "", "ELASTICSEARCH_CONNECTOR": "",
"WEBCRAWLER_CONNECTOR": "🌐",
} }
return connector_emojis.get(connector_name, "🔎") return connector_emojis.get(connector_name, "🔎")
@ -43,7 +43,6 @@ def get_connector_friendly_name(connector_name: str) -> str:
connector_friendly_names = { connector_friendly_names = {
"YOUTUBE_VIDEO": "YouTube", "YOUTUBE_VIDEO": "YouTube",
"EXTENSION": "Browser Extension", "EXTENSION": "Browser Extension",
"CRAWLED_URL": "Web Pages",
"FILE": "Files", "FILE": "Files",
"SLACK_CONNECTOR": "Slack", "SLACK_CONNECTOR": "Slack",
"NOTION_CONNECTOR": "Notion", "NOTION_CONNECTOR": "Notion",
@ -59,6 +58,7 @@ def get_connector_friendly_name(connector_name: str) -> str:
"AIRTABLE_CONNECTOR": "Airtable", "AIRTABLE_CONNECTOR": "Airtable",
"LUMA_CONNECTOR": "Luma", "LUMA_CONNECTOR": "Luma",
"ELASTICSEARCH_CONNECTOR": "Elasticsearch", "ELASTICSEARCH_CONNECTOR": "Elasticsearch",
"WEBCRAWLER_CONNECTOR": "Web Pages",
} }
return connector_friendly_names.get(connector_name, connector_name) return connector_friendly_names.get(connector_name, connector_name)

View file

@ -208,9 +208,6 @@ class Config:
# LlamaCloud API Key # LlamaCloud API Key
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY") LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
# Firecrawl API Key
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", None)
# Litellm TTS Configuration # Litellm TTS Configuration
TTS_SERVICE = os.getenv("TTS_SERVICE") TTS_SERVICE = os.getenv("TTS_SERVICE")
TTS_SERVICE_API_BASE = os.getenv("TTS_SERVICE_API_BASE") TTS_SERVICE_API_BASE = os.getenv("TTS_SERVICE_API_BASE")

View file

@ -0,0 +1,191 @@
"""
WebCrawler Connector Module
A module for crawling web pages and extracting content using Firecrawl or AsyncChromiumLoader.
Provides a unified interface for web scraping.
"""
from typing import Any
import validators
from firecrawl import AsyncFirecrawlApp
from langchain_community.document_loaders import AsyncChromiumLoader
class WebCrawlerConnector:
"""Class for crawling web pages and extracting content."""
def __init__(self, firecrawl_api_key: str | None = None):
"""
Initialize the WebCrawlerConnector class.
Args:
firecrawl_api_key: Firecrawl API key (optional, will use AsyncChromiumLoader if not provided)
"""
self.firecrawl_api_key = firecrawl_api_key
self.use_firecrawl = bool(firecrawl_api_key)
def set_api_key(self, api_key: str) -> None:
"""
Set the Firecrawl API key and enable Firecrawl usage.
Args:
api_key: Firecrawl API key
"""
self.firecrawl_api_key = api_key
self.use_firecrawl = True
async def crawl_url(
self, url: str, formats: list[str] | None = None
) -> tuple[dict[str, Any] | None, str | None]:
"""
Crawl a single URL and extract its content.
Args:
url: URL to crawl
formats: List of formats to extract (e.g., ["markdown", "html"]) - only for Firecrawl
Returns:
Tuple containing (crawl result dict, error message or None)
Result dict contains:
- content: Extracted content (markdown or HTML)
- metadata: Page metadata (title, description, etc.)
- source: Original URL
- crawler_type: Type of crawler used
"""
try:
# Validate URL
if not validators.url(url):
return None, f"Invalid URL: {url}"
if self.use_firecrawl:
result = await self._crawl_with_firecrawl(url, formats)
else:
result = await self._crawl_with_chromium(url)
return result, None
except Exception as e:
return None, f"Error crawling URL {url}: {e!s}"
async def _crawl_with_firecrawl(
self, url: str, formats: list[str] | None = None
) -> dict[str, Any]:
"""
Crawl URL using Firecrawl.
Args:
url: URL to crawl
formats: List of formats to extract
Returns:
Dict containing crawled content and metadata
Raises:
ValueError: If Firecrawl scraping fails
"""
if not self.firecrawl_api_key:
raise ValueError("Firecrawl API key not set. Call set_api_key() first.")
firecrawl_app = AsyncFirecrawlApp(api_key=self.firecrawl_api_key)
# Default to markdown format
if formats is None:
formats = ["markdown"]
scrape_result = await firecrawl_app.scrape_url(url=url, formats=formats)
if not scrape_result or not scrape_result.success:
error_msg = (
scrape_result.error
if scrape_result and hasattr(scrape_result, "error")
else "Unknown error"
)
raise ValueError(f"Firecrawl failed to scrape URL: {error_msg}")
# Extract content based on format
content = scrape_result.markdown or scrape_result.html or ""
# Extract metadata
metadata = scrape_result.metadata if scrape_result.metadata else {}
return {
"content": content,
"metadata": {
"source": url,
"title": metadata.get("title", url),
"description": metadata.get("description", ""),
"language": metadata.get("language", ""),
"sourceURL": metadata.get("sourceURL", url),
**metadata,
},
"crawler_type": "firecrawl",
}
async def _crawl_with_chromium(self, url: str) -> dict[str, Any]:
"""
Crawl URL using AsyncChromiumLoader.
Args:
url: URL to crawl
Returns:
Dict containing crawled content and metadata
Raises:
Exception: If crawling fails
"""
crawl_loader = AsyncChromiumLoader(urls=[url], headless=True)
documents = await crawl_loader.aload()
if not documents:
raise ValueError(f"Failed to load content from {url}")
doc = documents[0]
# Extract basic metadata from the document
metadata = doc.metadata if doc.metadata else {}
return {
"content": doc.page_content,
"metadata": {
"source": url,
"title": metadata.get("title", url),
**metadata,
},
"crawler_type": "chromium",
}
def format_to_structured_document(self, crawl_result: dict[str, Any]) -> str:
"""
Format crawl result as a structured document.
Args:
crawl_result: Result from crawl_url method
Returns:
Structured document string
"""
metadata = crawl_result["metadata"]
content = crawl_result["content"]
document_parts = ["<DOCUMENT>", "<METADATA>"]
# Add all metadata fields
for key, value in metadata.items():
document_parts.append(f"{key.upper()}: {value}")
document_parts.extend(
[
"</METADATA>",
"<CONTENT>",
"FORMAT: markdown",
"TEXT_START",
content,
"TEXT_END",
"</CONTENT>",
"</DOCUMENT>",
]
)
return "\n".join(document_parts)

View file

@ -73,6 +73,7 @@ class SearchSourceConnectorType(str, Enum):
AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR" AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR"
LUMA_CONNECTOR = "LUMA_CONNECTOR" LUMA_CONNECTOR = "LUMA_CONNECTOR"
ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR" ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR"
WEBCRAWLER_CONNECTOR = "WEBCRAWLER_CONNECTOR"
class ChatType(str, Enum): class ChatType(str, Enum):

View file

@ -65,13 +65,6 @@ async def create_documents(
process_extension_document_task.delay( process_extension_document_task.delay(
document_dict, request.search_space_id, str(user.id) document_dict, request.search_space_id, str(user.id)
) )
elif request.document_type == DocumentType.CRAWLED_URL:
from app.tasks.celery_tasks.document_tasks import process_crawled_url_task
for url in request.content:
process_crawled_url_task.delay(
url, request.search_space_id, str(user.id)
)
elif request.document_type == DocumentType.YOUTUBE_VIDEO: elif request.document_type == DocumentType.YOUTUBE_VIDEO:
from app.tasks.celery_tasks.document_tasks import process_youtube_video_task from app.tasks.celery_tasks.document_tasks import process_youtube_video_task

View file

@ -39,6 +39,7 @@ from app.tasks.connector_indexers import (
index_airtable_records, index_airtable_records,
index_clickup_tasks, index_clickup_tasks,
index_confluence_pages, index_confluence_pages,
index_crawled_urls,
index_discord_messages, index_discord_messages,
index_elasticsearch_documents, index_elasticsearch_documents,
index_github_repos, index_github_repos,
@ -482,6 +483,7 @@ async def index_connector_content(
- DISCORD_CONNECTOR: Indexes messages from all accessible Discord channels - DISCORD_CONNECTOR: Indexes messages from all accessible Discord channels
- LUMA_CONNECTOR: Indexes events from Luma - LUMA_CONNECTOR: Indexes events from Luma
- ELASTICSEARCH_CONNECTOR: Indexes documents from Elasticsearch - ELASTICSEARCH_CONNECTOR: Indexes documents from Elasticsearch
- WEBCRAWLER_CONNECTOR: Indexes web pages from crawled websites
Args: Args:
connector_id: ID of the connector to use connector_id: ID of the connector to use
@ -688,6 +690,17 @@ async def index_connector_content(
) )
response_message = "Elasticsearch indexing started in the background." response_message = "Elasticsearch indexing started in the background."
elif connector.connector_type == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR:
from app.tasks.celery_tasks.connector_tasks import index_crawled_urls_task
logger.info(
f"Triggering web pages indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
)
index_crawled_urls_task.delay(
connector_id, search_space_id, str(user.id), indexing_from, indexing_to
)
response_message = "Web page indexing started in the background."
else: else:
raise HTTPException( raise HTTPException(
status_code=400, status_code=400,
@ -1523,3 +1536,64 @@ async def run_elasticsearch_indexing(
f"Critical error in run_elasticsearch_indexing for connector {connector_id}: {e}", f"Critical error in run_elasticsearch_indexing for connector {connector_id}: {e}",
exc_info=True, exc_info=True,
) )
# Add new helper functions for crawled web page indexing
async def run_web_page_indexing_with_new_session(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""
Create a new session and run the Web page indexing task.
This prevents session leaks by creating a dedicated session for the background task.
"""
async with async_session_maker() as session:
await run_web_page_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
async def run_web_page_indexing(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""
Background task to run Web page indexing.
Args:
session: Database session
connector_id: ID of the webcrawler connector
search_space_id: ID of the search space
user_id: ID of the user
start_date: Start date for indexing
end_date: End date for indexing
"""
try:
documents_processed, error_or_warning = await index_crawled_urls(
session=session,
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
start_date=start_date,
end_date=end_date,
update_last_indexed=False, # Don't update timestamp in the indexing function
)
# Only update last_indexed_at if indexing was successful (either new docs or updated docs)
if documents_processed > 0:
await update_connector_last_indexed(session, connector_id)
logger.info(
f"Web page indexing completed successfully: {documents_processed} documents processed"
)
else:
logger.error(
f"Web page indexing failed or no documents processed: {error_or_warning}"
)
except Exception as e:
logger.error(f"Error in background Web page indexing task: {e!s}")

View file

@ -70,6 +70,13 @@ class ConnectorService:
""" """
Search for crawled URLs and return both the source information and langchain documents Search for crawled URLs and return both the source information and langchain documents
Args:
user_query: The user's query
user_id: The user's ID
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
Returns: Returns:
tuple: (sources_info, langchain_documents) tuple: (sources_info, langchain_documents)
""" """
@ -109,15 +116,43 @@ class ConnectorService:
document = chunk.get("document", {}) document = chunk.get("document", {})
metadata = document.get("metadata", {}) metadata = document.get("metadata", {})
# Create a source entry # Extract webcrawler-specific metadata
url = metadata.get("source", metadata.get("url", ""))
title = document.get(
"title", metadata.get("title", "Untitled Document")
)
description = metadata.get("description", "")
language = metadata.get("language", "")
last_crawled_at = metadata.get("last_crawled_at", "")
# Build description with crawler info
content_preview = chunk.get("content", "")
if not description and content_preview:
# Use content preview if no description
description = content_preview[:200]
if len(content_preview) > 200:
description += "..."
# Add crawler metadata to description if available
info_parts = []
if language:
info_parts.append(f"Language: {language}")
if last_crawled_at:
info_parts.append(f"Last crawled: {last_crawled_at}")
if info_parts:
if description:
description += f" | {' | '.join(info_parts)}"
else:
description = " | ".join(info_parts)
source = { source = {
"id": chunk.get("chunk_id", self.source_id_counter), "id": chunk.get("chunk_id", self.source_id_counter),
"title": document.get("title", "Untitled Document"), "title": title,
"description": metadata.get( "description": description,
"og:description", "url": url,
metadata.get("ogDescription", chunk.get("content", "")), "language": language,
), "last_crawled_at": last_crawled_at,
"url": metadata.get("url", ""),
} }
self.source_id_counter += 1 self.source_id_counter += 1

View file

@ -600,3 +600,46 @@ async def _index_elasticsearch_documents(
await run_elasticsearch_indexing( await run_elasticsearch_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date session, connector_id, search_space_id, user_id, start_date, end_date
) )
@celery_app.task(name="index_crawled_urls", bind=True)
def index_crawled_urls_task(
self,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Celery task to index Web page Urls."""
import asyncio
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(
_index_crawled_urls(
connector_id, search_space_id, user_id, start_date, end_date
)
)
finally:
loop.close()
async def _index_crawled_urls(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Index Web page Urls with new session."""
from app.routes.search_source_connectors_routes import (
run_web_page_indexing,
)
async with get_celery_session_maker()() as session:
await run_web_page_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)

View file

@ -9,7 +9,6 @@ from app.celery_app import celery_app
from app.config import config from app.config import config
from app.services.task_logging_service import TaskLoggingService from app.services.task_logging_service import TaskLoggingService
from app.tasks.document_processors import ( from app.tasks.document_processors import (
add_crawled_url_document,
add_extension_received_document, add_extension_received_document,
add_youtube_video_document, add_youtube_video_document,
) )
@ -120,71 +119,6 @@ async def _process_extension_document(
raise raise
@celery_app.task(name="process_crawled_url", bind=True)
def process_crawled_url_task(self, url: str, search_space_id: int, user_id: str):
"""
Celery task to process crawled URL.
Args:
url: URL to crawl and process
search_space_id: ID of the search space
user_id: ID of the user
"""
import asyncio
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(_process_crawled_url(url, search_space_id, user_id))
finally:
loop.close()
async def _process_crawled_url(url: str, search_space_id: int, user_id: str):
"""Process crawled URL with new session."""
async with get_celery_session_maker()() as session:
task_logger = TaskLoggingService(session, search_space_id)
log_entry = await task_logger.log_task_start(
task_name="process_crawled_url",
source="document_processor",
message=f"Starting URL crawling and processing for: {url}",
metadata={"document_type": "CRAWLED_URL", "url": url, "user_id": user_id},
)
try:
result = await add_crawled_url_document(
session, url, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully crawled and processed URL: {url}",
{
"document_id": result.id,
"title": result.title,
"content_hash": result.content_hash,
},
)
else:
await task_logger.log_task_success(
log_entry,
f"URL document already exists (duplicate): {url}",
{"duplicate_detected": True},
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to crawl URL: {url}",
str(e),
{"error_type": type(e).__name__},
)
logger.error(f"Error processing crawled URL: {e!s}")
raise
@celery_app.task(name="process_youtube_video", bind=True) @celery_app.task(name="process_youtube_video", bind=True)
def process_youtube_video_task(self, url: str, search_space_id: int, user_id: str): def process_youtube_video_task(self, url: str, search_space_id: int, user_id: str):
""" """

View file

@ -67,6 +67,7 @@ async def _check_and_trigger_schedules():
index_airtable_records_task, index_airtable_records_task,
index_clickup_tasks_task, index_clickup_tasks_task,
index_confluence_pages_task, index_confluence_pages_task,
index_crawled_urls_task,
index_discord_messages_task, index_discord_messages_task,
index_elasticsearch_documents_task, index_elasticsearch_documents_task,
index_github_repos_task, index_github_repos_task,
@ -94,6 +95,7 @@ async def _check_and_trigger_schedules():
SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task, SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task,
SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task, SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task, SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
} }
# Trigger indexing for each due connector # Trigger indexing for each due connector

View file

@ -17,6 +17,7 @@ Available indexers:
- Google Gmail: Index messages from Google Gmail - Google Gmail: Index messages from Google Gmail
- Google Calendar: Index events from Google Calendar - Google Calendar: Index events from Google Calendar
- Luma: Index events from Luma - Luma: Index events from Luma
- Webcrawler: Index crawled URLs
- Elasticsearch: Index documents from Elasticsearch instances - Elasticsearch: Index documents from Elasticsearch instances
""" """
@ -41,6 +42,7 @@ from .luma_indexer import index_luma_events
# Documentation and knowledge management # Documentation and knowledge management
from .notion_indexer import index_notion_pages from .notion_indexer import index_notion_pages
from .slack_indexer import index_slack_messages from .slack_indexer import index_slack_messages
from .webcrawler_indexer import index_crawled_urls
__all__ = [ # noqa: RUF022 __all__ = [ # noqa: RUF022
"index_airtable_records", "index_airtable_records",
@ -58,6 +60,7 @@ __all__ = [ # noqa: RUF022
"index_linear_issues", "index_linear_issues",
# Documentation and knowledge management # Documentation and knowledge management
"index_notion_pages", "index_notion_pages",
"index_crawled_urls",
# Communication platforms # Communication platforms
"index_slack_messages", "index_slack_messages",
"index_google_gmail_messages", "index_google_gmail_messages",

View file

@ -0,0 +1,450 @@
"""
Webcrawler connector indexer.
"""
from datetime import datetime
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.connectors.webcrawler_connector import WebCrawlerConnector
from app.db import Document, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
from .base import (
check_document_by_unique_identifier,
get_connector_by_id,
logger,
update_connector_last_indexed,
)
async def index_crawled_urls(
session: AsyncSession,
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str | None = None,
end_date: str | None = None,
update_last_indexed: bool = True,
) -> tuple[int, str | None]:
"""
Index web page URLs.
Args:
session: Database session
connector_id: ID of the webcrawler connector
search_space_id: ID of the search space to store documents in
user_id: User ID
start_date: Start date for filtering (YYYY-MM-DD format) - optional
end_date: End date for filtering (YYYY-MM-DD format) - optional
update_last_indexed: Whether to update the last_indexed_at timestamp (default: True)
Returns:
Tuple containing (number of documents indexed, error message or None)
"""
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="crawled_url_indexing",
source="connector_indexing_task",
message=f"Starting web page URL indexing for connector {connector_id}",
metadata={
"connector_id": connector_id,
"user_id": str(user_id),
"start_date": start_date,
"end_date": end_date,
},
)
try:
# Get the connector
await task_logger.log_task_progress(
log_entry,
f"Retrieving webcrawler connector {connector_id} from database",
{"stage": "connector_retrieval"},
)
# Get the connector from the database
connector = await get_connector_by_id(
session, connector_id, SearchSourceConnectorType.WEBCRAWLER_CONNECTOR
)
if not connector:
await task_logger.log_task_failure(
log_entry,
f"Connector with ID {connector_id} not found or is not a webcrawler connector",
"Connector not found",
{"error_type": "ConnectorNotFound"},
)
return (
0,
f"Connector with ID {connector_id} not found or is not a webcrawler connector",
)
# Get the Firecrawl API key from the connector config (optional)
api_key = connector.config.get("FIRECRAWL_API_KEY")
# Get URLs from connector config
initial_urls = connector.config.get("INITIAL_URLS", "")
if isinstance(initial_urls, str):
urls = [url.strip() for url in initial_urls.split("\n") if url.strip()]
elif isinstance(initial_urls, list):
urls = [url.strip() for url in initial_urls if url.strip()]
else:
urls = []
logger.info(
f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs"
)
# Initialize webcrawler client
await task_logger.log_task_progress(
log_entry,
f"Initializing webcrawler client for connector {connector_id}",
{
"stage": "client_initialization",
"use_firecrawl": bool(api_key),
},
)
crawler = WebCrawlerConnector(firecrawl_api_key=api_key)
# Validate URLs
if not urls:
await task_logger.log_task_failure(
log_entry,
"No URLs provided for indexing",
"Empty URL list",
{"error_type": "ValidationError"},
)
return 0, "No URLs provided for indexing"
await task_logger.log_task_progress(
log_entry,
f"Starting to crawl {len(urls)} URLs",
{
"stage": "crawling",
"total_urls": len(urls),
},
)
documents_indexed = 0
documents_updated = 0
documents_skipped = 0
failed_urls = []
for idx, url in enumerate(urls, 1):
try:
logger.info(f"Processing URL {idx}/{len(urls)}: {url}")
await task_logger.log_task_progress(
log_entry,
f"Crawling URL {idx}/{len(urls)}: {url}",
{
"stage": "crawling_url",
"url_index": idx,
"url": url,
},
)
# Crawl the URL
crawl_result, error = await crawler.crawl_url(url)
if error or not crawl_result:
logger.warning(f"Failed to crawl URL {url}: {error}")
failed_urls.append((url, error or "Unknown error"))
continue
# Extract content and metadata
content = crawl_result.get("content", "")
metadata = crawl_result.get("metadata", {})
crawler_type = crawl_result.get("crawler_type", "unknown")
if not content.strip():
logger.warning(f"Skipping URL with no content: {url}")
failed_urls.append((url, "No content extracted"))
documents_skipped += 1
continue
# Format content as structured document
structured_document = crawler.format_to_structured_document(
crawl_result
)
# Generate unique identifier hash for this URL
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.CRAWLED_URL, url, search_space_id
)
# Generate content hash
# TODO: To fix this by not including dynamic content like date, time, etc.
content_hash = generate_content_hash(
structured_document, search_space_id
)
# Check if document with this unique identifier already exists
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
# Extract useful metadata
title = metadata.get("title", url)
description = metadata.get("description", "")
language = metadata.get("language", "")
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
logger.info(f"Document for URL {url} unchanged. Skipping.")
documents_skipped += 1
continue
else:
# Content has changed - update the existing document
logger.info(
f"Content changed for URL {url}. Updating document."
)
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"url": url,
"title": title,
"description": description,
"language": language,
"document_type": "Crawled URL",
"crawler_type": crawler_type,
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
structured_document, user_llm, document_metadata
)
else:
# Fallback to simple summary if no LLM configured
summary_content = f"Crawled URL: {title}\n\n"
summary_content += f"URL: {url}\n"
if description:
summary_content += f"Description: {description}\n"
if language:
summary_content += f"Language: {language}\n"
summary_content += f"Crawler: {crawler_type}\n\n"
# Add content preview
content_preview = content[:1000]
if len(content) > 1000:
content_preview += "..."
summary_content += f"Content Preview:\n{content_preview}\n"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
# Process chunks
chunks = await create_document_chunks(content)
# Update existing document
existing_document.title = title
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = {
**metadata,
"crawler_type": crawler_type,
"last_crawled_at": datetime.now().strftime(
"%Y-%m-%d %H:%M:%S"
),
}
existing_document.chunks = chunks
documents_updated += 1
logger.info(f"Successfully updated URL {url}")
continue
# Document doesn't exist - create new one
# Generate summary with metadata
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
if user_llm:
document_metadata = {
"url": url,
"title": title,
"description": description,
"language": language,
"document_type": "Crawled URL",
"crawler_type": crawler_type,
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
structured_document, user_llm, document_metadata
)
else:
# Fallback to simple summary if no LLM configured
summary_content = f"Crawled URL: {title}\n\n"
summary_content += f"URL: {url}\n"
if description:
summary_content += f"Description: {description}\n"
if language:
summary_content += f"Language: {language}\n"
summary_content += f"Crawler: {crawler_type}\n\n"
# Add content preview
content_preview = content[:1000]
if len(content) > 1000:
content_preview += "..."
summary_content += f"Content Preview:\n{content_preview}\n"
summary_embedding = config.embedding_model_instance.embed(
summary_content
)
chunks = await create_document_chunks(content)
document = Document(
search_space_id=search_space_id,
title=title,
document_type=DocumentType.CRAWLED_URL,
document_metadata={
**metadata,
"crawler_type": crawler_type,
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
},
content=summary_content,
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
embedding=summary_embedding,
chunks=chunks,
)
session.add(document)
documents_indexed += 1
logger.info(f"Successfully indexed new URL {url}")
# Batch commit every 10 documents
if (documents_indexed + documents_updated) % 10 == 0:
logger.info(
f"Committing batch: {documents_indexed + documents_updated} URLs processed so far"
)
await session.commit()
except Exception as e:
logger.error(
f"Error processing URL {url}: {e!s}",
exc_info=True,
)
failed_urls.append((url, str(e)))
continue
total_processed = documents_indexed + documents_updated
if total_processed > 0:
await update_connector_last_indexed(session, connector, update_last_indexed)
# Final commit for any remaining documents not yet committed in batches
logger.info(
f"Final commit: Total {documents_indexed} new, {documents_updated} updated URLs processed"
)
await session.commit()
# Build result message
result_message = None
if failed_urls:
failed_summary = "; ".join(
[f"{url}: {error}" for url, error in failed_urls[:5]]
)
if len(failed_urls) > 5:
failed_summary += f" (and {len(failed_urls) - 5} more)"
result_message = (
f"Completed with {len(failed_urls)} failures: {failed_summary}"
)
await task_logger.log_task_success(
log_entry,
f"Successfully completed crawled web page indexing for connector {connector_id}",
{
"urls_processed": total_processed,
"documents_indexed": documents_indexed,
"documents_updated": documents_updated,
"documents_skipped": documents_skipped,
"failed_urls_count": len(failed_urls),
},
)
logger.info(
f"Web page indexing completed: {documents_indexed} new, "
f"{documents_updated} updated, {documents_skipped} skipped, "
f"{len(failed_urls)} failed"
)
return total_processed, result_message
except SQLAlchemyError as db_error:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Database error during web page indexing for connector {connector_id}",
str(db_error),
{"error_type": "SQLAlchemyError"},
)
logger.error(f"Database error: {db_error!s}", exc_info=True)
return 0, f"Database error: {db_error!s}"
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Failed to index web page URLs for connector {connector_id}",
str(e),
{"error_type": type(e).__name__},
)
logger.error(f"Failed to index web page URLs: {e!s}", exc_info=True)
return 0, f"Failed to index web page URLs: {e!s}"
async def get_crawled_url_documents(
session: AsyncSession,
search_space_id: int,
connector_id: int | None = None,
) -> list[Document]:
"""
Get all crawled URL documents for a search space.
Args:
session: Database session
search_space_id: ID of the search space
connector_id: Optional connector ID to filter by
Returns:
List of Document objects
"""
from sqlalchemy import select
query = select(Document).filter(
Document.search_space_id == search_space_id,
Document.document_type == DocumentType.CRAWLED_URL,
)
if connector_id:
# Filter by connector if needed - you might need to add a connector_id field to Document
# or filter by some other means depending on your schema
pass
result = await session.execute(query)
documents = result.scalars().all()
return list(documents)

View file

@ -6,7 +6,6 @@ and sources. Each processor is responsible for handling a specific type of docum
processing task in the background. processing task in the background.
Available processors: Available processors:
- URL crawler: Process web pages from URLs
- Extension processor: Handle documents from browser extension - Extension processor: Handle documents from browser extension
- Markdown processor: Process markdown files - Markdown processor: Process markdown files
- File processors: Handle files using different ETL services (Unstructured, LlamaCloud, Docling) - File processors: Handle files using different ETL services (Unstructured, LlamaCloud, Docling)
@ -26,14 +25,11 @@ from .file_processors import (
# Markdown processor # Markdown processor
from .markdown_processor import add_received_markdown_file_document from .markdown_processor import add_received_markdown_file_document
from .url_crawler import add_crawled_url_document
# YouTube processor # YouTube processor
from .youtube_processor import add_youtube_video_document from .youtube_processor import add_youtube_video_document
__all__ = [ __all__ = [
# URL processing
"add_crawled_url_document",
# Extension processing # Extension processing
"add_extension_received_document", "add_extension_received_document",
"add_received_file_document_using_docling", "add_received_file_document_using_docling",

View file

@ -1,330 +0,0 @@
"""
URL crawler document processor.
"""
import logging
import validators
from firecrawl import AsyncFirecrawlApp
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.documents import Document as LangchainDocument
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.db import Document, DocumentType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
from .base import (
check_document_by_unique_identifier,
md,
)
async def add_crawled_url_document(
session: AsyncSession, url: str, search_space_id: int, user_id: str
) -> Document | None:
"""
Process and store a document from a crawled URL.
Args:
session: Database session
url: URL to crawl
search_space_id: ID of the search space
user_id: ID of the user
Returns:
Document object if successful, None if failed
"""
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="crawl_url_document",
source="background_task",
message=f"Starting URL crawling process for: {url}",
metadata={"url": url, "user_id": str(user_id)},
)
try:
# URL validation step
await task_logger.log_task_progress(
log_entry, f"Validating URL: {url}", {"stage": "validation"}
)
if not validators.url(url):
raise ValueError(f"Url {url} is not a valid URL address")
# Set up crawler
await task_logger.log_task_progress(
log_entry,
f"Setting up crawler for URL: {url}",
{
"stage": "crawler_setup",
"firecrawl_available": bool(config.FIRECRAWL_API_KEY),
},
)
use_firecrawl = bool(config.FIRECRAWL_API_KEY)
if use_firecrawl:
# Use Firecrawl SDK directly
firecrawl_app = AsyncFirecrawlApp(api_key=config.FIRECRAWL_API_KEY)
else:
crawl_loader = AsyncChromiumLoader(urls=[url], headless=True)
# Perform crawling
await task_logger.log_task_progress(
log_entry,
f"Crawling URL content: {url}",
{
"stage": "crawling",
"crawler_type": "AsyncFirecrawlApp"
if use_firecrawl
else "AsyncChromiumLoader",
},
)
if use_firecrawl:
# Use async Firecrawl SDK with v1 API - properly awaited
scrape_result = await firecrawl_app.scrape_url(
url=url, formats=["markdown"]
)
# scrape_result is a Pydantic ScrapeResponse object
# Access attributes directly
if scrape_result and scrape_result.success:
# Extract markdown content
markdown_content = scrape_result.markdown or ""
# Extract metadata - this is a DICT
metadata = scrape_result.metadata if scrape_result.metadata else {}
# Convert to LangChain Document format
url_crawled = [
LangchainDocument(
page_content=markdown_content,
metadata={
"source": url,
"title": metadata.get("title", url),
"description": metadata.get("description", ""),
"language": metadata.get("language", ""),
"sourceURL": metadata.get("sourceURL", url),
**metadata, # Include all other metadata fields
},
)
]
content_in_markdown = url_crawled[0].page_content
else:
error_msg = (
scrape_result.error
if scrape_result and hasattr(scrape_result, "error")
else "Unknown error"
)
raise ValueError(f"Firecrawl failed to scrape URL: {error_msg}")
else:
# Use AsyncChromiumLoader as fallback
url_crawled = await crawl_loader.aload()
content_in_markdown = md.transform_documents(url_crawled)[0].page_content
# Format document
await task_logger.log_task_progress(
log_entry,
f"Processing crawled content from: {url}",
{"stage": "content_processing", "content_length": len(content_in_markdown)},
)
# Format document metadata in a more maintainable way
metadata_sections = [
(
"METADATA",
[
f"{key.upper()}: {value}"
for key, value in url_crawled[0].metadata.items()
],
),
(
"CONTENT",
["FORMAT: markdown", "TEXT_START", content_in_markdown, "TEXT_END"],
),
]
# Build the document string more efficiently
document_parts = []
document_parts.append("<DOCUMENT>")
for section_title, section_content in metadata_sections:
document_parts.append(f"<{section_title}>")
document_parts.extend(section_content)
document_parts.append(f"</{section_title}>")
document_parts.append("</DOCUMENT>")
combined_document_string = "\n".join(document_parts)
# Generate unique identifier hash for this URL
unique_identifier_hash = generate_unique_identifier_hash(
DocumentType.CRAWLED_URL, url, search_space_id
)
# Generate content hash
content_hash = generate_content_hash(combined_document_string, search_space_id)
# Check if document with this unique identifier already exists
await task_logger.log_task_progress(
log_entry,
f"Checking for existing URL: {url}",
{"stage": "duplicate_check", "url": url},
)
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash
)
if existing_document:
# Document exists - check if content has changed
if existing_document.content_hash == content_hash:
await task_logger.log_task_success(
log_entry,
f"URL document unchanged: {url}",
{
"duplicate_detected": True,
"existing_document_id": existing_document.id,
},
)
logging.info(f"Document for URL {url} unchanged. Skipping.")
return existing_document
else:
# Content has changed - update the existing document
logging.info(f"Content changed for URL {url}. Updating document.")
await task_logger.log_task_progress(
log_entry,
f"Updating URL document: {url}",
{"stage": "document_update", "url": url},
)
# Get LLM for summary generation (needed for both create and update)
await task_logger.log_task_progress(
log_entry,
f"Preparing for summary generation: {url}",
{"stage": "llm_setup"},
)
# Get user's long context LLM
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
if not user_llm:
raise RuntimeError(
f"No long context LLM configured for user {user_id} in search space {search_space_id}"
)
# Generate summary
await task_logger.log_task_progress(
log_entry,
f"Generating summary for URL content: {url}",
{"stage": "summary_generation"},
)
# Generate summary with metadata
document_metadata = {
"url": url,
"title": url_crawled[0].metadata.get("title", url),
"document_type": "Crawled URL Document",
"crawler_type": "FirecrawlApp" if use_firecrawl else "AsyncChromiumLoader",
}
summary_content, summary_embedding = await generate_document_summary(
combined_document_string, user_llm, document_metadata
)
# Process chunks
await task_logger.log_task_progress(
log_entry,
f"Processing content chunks for URL: {url}",
{"stage": "chunk_processing"},
)
chunks = await create_document_chunks(content_in_markdown)
# Update or create document
if existing_document:
# Update existing document
await task_logger.log_task_progress(
log_entry,
f"Updating document in database for URL: {url}",
{"stage": "document_update", "chunks_count": len(chunks)},
)
existing_document.title = url_crawled[0].metadata.get(
"title", url_crawled[0].metadata.get("source", url)
)
existing_document.content = summary_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.document_metadata = url_crawled[0].metadata
existing_document.chunks = chunks
document = existing_document
else:
# Create new document
await task_logger.log_task_progress(
log_entry,
f"Creating document in database for URL: {url}",
{"stage": "document_creation", "chunks_count": len(chunks)},
)
document = Document(
search_space_id=search_space_id,
title=url_crawled[0].metadata.get(
"title", url_crawled[0].metadata.get("source", url)
),
document_type=DocumentType.CRAWLED_URL,
document_metadata=url_crawled[0].metadata,
content=summary_content,
embedding=summary_embedding,
chunks=chunks,
content_hash=content_hash,
unique_identifier_hash=unique_identifier_hash,
)
session.add(document)
await session.commit()
await session.refresh(document)
# Log success
await task_logger.log_task_success(
log_entry,
f"Successfully crawled and processed URL: {url}",
{
"document_id": document.id,
"title": document.title,
"content_hash": content_hash,
"chunks_count": len(chunks),
"summary_length": len(summary_content),
},
)
return document
except SQLAlchemyError as db_error:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Database error while processing URL: {url}",
str(db_error),
{"error_type": "SQLAlchemyError"},
)
raise db_error
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Failed to crawl URL: {url}",
str(e),
{"error_type": type(e).__name__},
)
raise RuntimeError(f"Failed to crawl URL: {e!s}") from e

View file

@ -31,6 +31,7 @@ CONNECTOR_TASK_MAP = {
SearchSourceConnectorType.DISCORD_CONNECTOR: "index_discord_messages", SearchSourceConnectorType.DISCORD_CONNECTOR: "index_discord_messages",
SearchSourceConnectorType.LUMA_CONNECTOR: "index_luma_events", SearchSourceConnectorType.LUMA_CONNECTOR: "index_luma_events",
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: "index_elasticsearch_documents", SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: "index_elasticsearch_documents",
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: "index_crawled_urls",
} }
@ -69,6 +70,7 @@ def create_periodic_schedule(
index_airtable_records_task, index_airtable_records_task,
index_clickup_tasks_task, index_clickup_tasks_task,
index_confluence_pages_task, index_confluence_pages_task,
index_crawled_urls_task,
index_discord_messages_task, index_discord_messages_task,
index_elasticsearch_documents_task, index_elasticsearch_documents_task,
index_github_repos_task, index_github_repos_task,
@ -96,6 +98,7 @@ def create_periodic_schedule(
SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task, SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task,
SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task, SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task, SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
} }
# Trigger the first run immediately # Trigger the first run immediately

View file

@ -469,6 +469,22 @@ def validate_connector_config(
if not isinstance(value, list) or not value: if not isinstance(value, list) or not value:
raise ValueError(f"{field_name} must be a non-empty list of strings") raise ValueError(f"{field_name} must be a non-empty list of strings")
def validate_firecrawl_api_key_format() -> None:
"""Validate Firecrawl API key format if provided."""
api_key = config.get("FIRECRAWL_API_KEY", "")
if api_key and api_key.strip() and not api_key.strip().startswith("fc-"):
raise ValueError(
"Firecrawl API key should start with 'fc-'. Please verify your API key."
)
def validate_initial_urls() -> None:
initial_urls = config.get("INITIAL_URLS", "")
if initial_urls and initial_urls.strip():
urls = [url.strip() for url in initial_urls.split("\n") if url.strip()]
for url in urls:
if not validators.url(url):
raise ValueError(f"Invalid URL format in INITIAL_URLS: {url}")
# Lookup table for connector validation rules # Lookup table for connector validation rules
connector_rules = { connector_rules = {
"SERPER_API": {"required": ["SERPER_API_KEY"], "validators": {}}, "SERPER_API": {"required": ["SERPER_API_KEY"], "validators": {}},
@ -550,6 +566,14 @@ def validate_connector_config(
# "validators": {} # "validators": {}
# }, # },
"LUMA_CONNECTOR": {"required": ["LUMA_API_KEY"], "validators": {}}, "LUMA_CONNECTOR": {"required": ["LUMA_API_KEY"], "validators": {}},
"WEBCRAWLER_CONNECTOR": {
"required": [], # No required fields - API key is optional
"optional": ["FIRECRAWL_API_KEY", "INITIAL_URLS"],
"validators": {
"FIRECRAWL_API_KEY": lambda: validate_firecrawl_api_key_format(),
"INITIAL_URLS": lambda: validate_initial_urls(),
},
},
} }
rules = connector_rules.get(connector_type_str) rules = connector_rules.get(connector_type_str)

View file

@ -18,7 +18,16 @@ import {
CardHeader, CardHeader,
CardTitle, CardTitle,
} from "@/components/ui/card"; } from "@/components/ui/card";
import { Form } from "@/components/ui/form"; import {
Form,
FormControl,
FormDescription,
FormField,
FormItem,
FormLabel,
FormMessage,
} from "@/components/ui/form";
import { Textarea } from "@/components/ui/textarea";
import { getConnectorIcon } from "@/contracts/enums/connectorIcons"; import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
import { useConnectorEditPage } from "@/hooks/use-connector-edit-page"; import { useConnectorEditPage } from "@/hooks/use-connector-edit-page";
// Import Utils, Types, Hook, and Components // Import Utils, Types, Hook, and Components
@ -282,6 +291,40 @@ export default function EditConnectorPage() {
placeholder="Your Elasticsearch API Key" placeholder="Your Elasticsearch API Key"
/> />
)} )}
{/* == Webcrawler == */}
{connector.connector_type === "WEBCRAWLER_CONNECTOR" && (
<div className="space-y-4">
<EditSimpleTokenForm
control={editForm.control}
fieldName="FIRECRAWL_API_KEY"
fieldLabel="Firecrawl API Key (Optional)"
fieldDescription="Add a Firecrawl API key for enhanced crawling capabilities. If not provided, will use AsyncChromiumLoader as fallback."
placeholder="fc-xxxxxxxxxxxxx"
/>
<FormField
control={editForm.control}
name="INITIAL_URLS"
render={({ field }) => (
<FormItem>
<FormLabel>URLs to Crawl</FormLabel>
<FormControl>
<Textarea
placeholder="https://example.com&#10;https://docs.example.com&#10;https://blog.example.com"
className="min-h-[150px] font-mono text-sm"
{...field}
/>
</FormControl>
<FormDescription>
Enter URLs to crawl (one per line). These URLs will be indexed when you
trigger indexing.
</FormDescription>
<FormMessage />
</FormItem>
)}
/>
</div>
)}
</CardContent> </CardContent>
<CardFooter className="border-t pt-6"> <CardFooter className="border-t pt-6">
<Button type="submit" disabled={isSaving} className="w-full sm:w-auto"> <Button type="submit" disabled={isSaving} className="w-full sm:w-auto">

View file

@ -55,6 +55,7 @@ const getConnectorTypeDisplay = (type: string): string => {
AIRTABLE_CONNECTOR: "Airtable Connector", AIRTABLE_CONNECTOR: "Airtable Connector",
LUMA_CONNECTOR: "Luma Connector", LUMA_CONNECTOR: "Luma Connector",
ELASTICSEARCH_CONNECTOR: "Elasticsearch Connector", ELASTICSEARCH_CONNECTOR: "Elasticsearch Connector",
WEBCRAWLER_CONNECTOR: "Web Page Connector",
// Add other connector types here as needed // Add other connector types here as needed
}; };
return typeMap[type] || type; return typeMap[type] || type;
@ -75,6 +76,7 @@ const getApiKeyFieldName = (connectorType: string): string => {
LINKUP_API: "LINKUP_API_KEY", LINKUP_API: "LINKUP_API_KEY",
LUMA_CONNECTOR: "LUMA_API_KEY", LUMA_CONNECTOR: "LUMA_API_KEY",
ELASTICSEARCH_CONNECTOR: "ELASTICSEARCH_API_KEY", ELASTICSEARCH_CONNECTOR: "ELASTICSEARCH_API_KEY",
WEBCRAWLER_CONNECTOR: "FIRECRAWL_API_KEY",
}; };
return fieldMap[connectorType] || ""; return fieldMap[connectorType] || "";
}; };

View file

@ -0,0 +1,331 @@
"use client";
import { zodResolver } from "@hookform/resolvers/zod";
import { ArrowLeft, Check, Globe, Loader2 } from "lucide-react";
import { motion } from "motion/react";
import Link from "next/link";
import { useParams, useRouter } from "next/navigation";
import { useEffect, useState } from "react";
import { useForm } from "react-hook-form";
import { toast } from "sonner";
import * as z from "zod";
import { Button } from "@/components/ui/button";
import {
Card,
CardContent,
CardDescription,
CardFooter,
CardHeader,
CardTitle,
} from "@/components/ui/card";
import {
Form,
FormControl,
FormDescription,
FormField,
FormItem,
FormLabel,
FormMessage,
} from "@/components/ui/form";
import { Input } from "@/components/ui/input";
import { Textarea } from "@/components/ui/textarea";
import { EnumConnectorName } from "@/contracts/enums/connector";
import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
import {
type SearchSourceConnector,
useSearchSourceConnectors,
} from "@/hooks/use-search-source-connectors";
// Define the form schema with Zod
const webcrawlerConnectorFormSchema = z.object({
name: z.string().min(3, {
message: "Connector name must be at least 3 characters.",
}),
api_key: z.string().optional(),
initial_urls: z.string().optional(),
});
// Define the type for the form values
type WebcrawlerConnectorFormValues = z.infer<typeof webcrawlerConnectorFormSchema>;
export default function WebcrawlerConnectorPage() {
const router = useRouter();
const params = useParams();
const searchSpaceId = params.search_space_id as string;
const [isSubmitting, setIsSubmitting] = useState(false);
const [doesConnectorExist, setDoesConnectorExist] = useState(false);
const { fetchConnectors, createConnector } = useSearchSourceConnectors(
true,
parseInt(searchSpaceId)
);
// Initialize the form
const form = useForm<WebcrawlerConnectorFormValues>({
resolver: zodResolver(webcrawlerConnectorFormSchema),
defaultValues: {
name: "Web Pages",
api_key: "",
initial_urls: "",
},
});
useEffect(() => {
fetchConnectors(parseInt(searchSpaceId))
.then((data) => {
if (data && Array.isArray(data)) {
const connector = data.find(
(c: SearchSourceConnector) =>
c.connector_type === EnumConnectorName.WEBCRAWLER_CONNECTOR
);
if (connector) {
setDoesConnectorExist(true);
}
}
})
.catch((error) => {
console.error("Error fetching connectors:", error);
});
}, [fetchConnectors, searchSpaceId]);
// Handle form submission
const onSubmit = async (values: WebcrawlerConnectorFormValues) => {
setIsSubmitting(true);
try {
const config: Record<string, string> = {};
// Only add API key to config if provided
if (values.api_key && values.api_key.trim()) {
config.FIRECRAWL_API_KEY = values.api_key;
}
// Parse initial URLs if provided
if (values.initial_urls && values.initial_urls.trim()) {
config.INITIAL_URLS = values.initial_urls;
}
await createConnector(
{
name: values.name,
connector_type: EnumConnectorName.WEBCRAWLER_CONNECTOR,
config: config,
is_indexable: true,
last_indexed_at: null,
periodic_indexing_enabled: false,
indexing_frequency_minutes: null,
next_scheduled_at: null,
},
parseInt(searchSpaceId)
);
toast.success("Webcrawler connector created successfully!");
// Navigate back to connectors page
router.push(`/dashboard/${searchSpaceId}/connectors`);
} catch (error) {
console.error("Error creating connector:", error);
toast.error(error instanceof Error ? error.message : "Failed to create connector");
} finally {
setIsSubmitting(false);
}
};
return (
<div className="container mx-auto py-8 max-w-2xl">
<motion.div
initial={{ opacity: 0, y: 20 }}
animate={{ opacity: 1, y: 0 }}
transition={{ duration: 0.5 }}
>
{/* Header */}
<div className="mb-8">
<Link
href={`/dashboard/${searchSpaceId}/connectors/add`}
className="inline-flex items-center text-sm text-muted-foreground hover:text-foreground mb-4"
>
<ArrowLeft className="mr-2 h-4 w-4" />
Back to connectors
</Link>
<div className="flex items-center gap-4">
<div className="flex h-12 w-12 items-center justify-center rounded-lg">
{getConnectorIcon(EnumConnectorName.WEBCRAWLER_CONNECTOR, "h-6 w-6")}
</div>
<div>
<h1 className="text-3xl font-bold tracking-tight">Connect Web Pages</h1>
<p className="text-muted-foreground">Crawl and index web pages for search.</p>
</div>
</div>
</div>
{/* Connection Card */}
{!doesConnectorExist ? (
<Card>
<CardHeader>
<CardTitle>Set Up Web Page crawler</CardTitle>
<CardDescription>
Configure your web page crawler to index web pages. Optionally add a Firecrawl API
key for enhanced crawling capabilities.
</CardDescription>
</CardHeader>
<Form {...form}>
<form onSubmit={form.handleSubmit(onSubmit)}>
<CardContent className="space-y-4">
<FormField
control={form.control}
name="name"
render={({ field }) => (
<FormItem>
<FormLabel>Connector Name</FormLabel>
<FormControl>
<Input placeholder="My Web Crawler" {...field} />
</FormControl>
<FormDescription>
A friendly name to identify this connector.
</FormDescription>
<FormMessage />
</FormItem>
)}
/>
<FormField
control={form.control}
name="api_key"
render={({ field }) => (
<FormItem>
<FormLabel>Firecrawl API Key (Optional)</FormLabel>
<FormControl>
<Input type="password" placeholder="fc-xxxxxxxxxxxxx" {...field} />
</FormControl>
<FormDescription>
Add a Firecrawl API key for enhanced crawling. If not provided, will use
AsyncChromiumLoader as fallback.
</FormDescription>
<FormMessage />
</FormItem>
)}
/>
<FormField
control={form.control}
name="initial_urls"
render={({ field }) => (
<FormItem>
<FormLabel>Initial URLs (Optional)</FormLabel>
<FormControl>
<Textarea
placeholder="https://example.com&#10;https://docs.example.com&#10;https://blog.example.com"
className="min-h-[100px] font-mono text-sm"
{...field}
/>
</FormControl>
<FormDescription>
Enter URLs to crawl (one per line). You can add more URLs later.
</FormDescription>
<FormMessage />
</FormItem>
)}
/>
<div className="space-y-2 pt-2">
<div className="flex items-center space-x-2 text-sm text-muted-foreground">
<Check className="h-4 w-4 text-green-500" />
<span>Crawl any public web page</span>
</div>
<div className="flex items-center space-x-2 text-sm text-muted-foreground">
<Check className="h-4 w-4 text-green-500" />
<span>Extract markdown content automatically</span>
</div>
<div className="flex items-center space-x-2 text-sm text-muted-foreground">
<Check className="h-4 w-4 text-green-500" />
<span>Detect content changes and update documents</span>
</div>
<div className="flex items-center space-x-2 text-sm text-muted-foreground">
<Check className="h-4 w-4 text-green-500" />
<span>Works with or without Firecrawl API key</span>
</div>
</div>
</CardContent>
<CardFooter className="flex justify-between">
<Button
type="button"
variant="outline"
onClick={() => router.push(`/dashboard/${searchSpaceId}/connectors/add`)}
>
Cancel
</Button>
<Button type="submit" disabled={isSubmitting}>
{isSubmitting ? (
<>
<Loader2 className="mr-2 h-4 w-4 animate-spin" />
Setting up...
</>
) : (
<>
<Globe className="mr-2 h-4 w-4" />
Create Crawler
</>
)}
</Button>
</CardFooter>
</form>
</Form>
</Card>
) : (
/* Success Card */
<Card>
<CardHeader>
<CardTitle> Your web page crawler is successfully set up!</CardTitle>
<CardDescription>
You can now add URLs to crawl from the connector management page.
</CardDescription>
</CardHeader>
</Card>
)}
{/* Help Section */}
{!doesConnectorExist && (
<Card className="mt-6">
<CardHeader>
<CardTitle className="text-lg">How It Works</CardTitle>
</CardHeader>
<CardContent className="space-y-4">
<div>
<h4 className="font-medium mb-2">1. Choose Your Crawler Method</h4>
<p className="text-sm text-muted-foreground">
<strong>With Firecrawl (Recommended):</strong> Get your API key from{" "}
<a
href="https://firecrawl.dev"
target="_blank"
rel="noopener noreferrer"
className="text-primary hover:underline"
>
firecrawl.dev
</a>{" "}
for faster, more reliable crawling with better content extraction.
</p>
<p className="text-sm text-muted-foreground mt-2">
<strong>Without Firecrawl:</strong> The crawler will use AsyncChromiumLoader as a
free fallback option. This works well for most websites but may be slower.
</p>
</div>
<div>
<h4 className="font-medium mb-2">2. Add URLs to Crawl (Optional)</h4>
<p className="text-sm text-muted-foreground">
You can add initial URLs now or add them later from the connector management page.
Enter one URL per line.
</p>
</div>
<div>
<h4 className="font-medium mb-2">3. Manage Your Crawler</h4>
<p className="text-sm text-muted-foreground">
After setup, you can add more URLs, trigger manual crawls, or set up periodic
indexing to keep your content up-to-date.
</p>
</div>
</CardContent>
</Card>
)}
</motion.div>
</div>
);
}

View file

@ -1,201 +0,0 @@
"use client";
import { type Tag, TagInput } from "emblor";
import { Globe, Loader2 } from "lucide-react";
import { useParams, useRouter } from "next/navigation";
import { useTranslations } from "next-intl";
import { useState } from "react";
import { toast } from "sonner";
import { Button } from "@/components/ui/button";
import {
Card,
CardContent,
CardDescription,
CardFooter,
CardHeader,
CardTitle,
} from "@/components/ui/card";
import { Label } from "@/components/ui/label";
// URL validation regex
const urlRegex = /^(https?:\/\/)?([\da-z.-]+)\.([a-z.]{2,6})([/\w .-]*)*\/?$/;
export default function WebpageCrawler() {
const t = useTranslations("add_webpage");
const params = useParams();
const router = useRouter();
const search_space_id = params.search_space_id as string;
const [urlTags, setUrlTags] = useState<Tag[]>([]);
const [activeTagIndex, setActiveTagIndex] = useState<number | null>(null);
const [isSubmitting, setIsSubmitting] = useState(false);
const [error, setError] = useState<string | null>(null);
// Function to validate a URL
const isValidUrl = (url: string): boolean => {
return urlRegex.test(url);
};
// Function to handle URL submission
const handleSubmit = async () => {
// Validate that we have at least one URL
if (urlTags.length === 0) {
setError(t("error_no_url"));
return;
}
// Validate all URLs
const invalidUrls = urlTags.filter((tag) => !isValidUrl(tag.text));
if (invalidUrls.length > 0) {
setError(t("error_invalid_urls", { urls: invalidUrls.map((tag) => tag.text).join(", ") }));
return;
}
setError(null);
setIsSubmitting(true);
try {
toast(t("crawling_toast"), {
description: t("crawling_toast_desc"),
});
// Extract URLs from tags
const urls = urlTags.map((tag) => tag.text);
// Make API call to backend
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/documents`,
{
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
},
body: JSON.stringify({
document_type: "CRAWLED_URL",
content: urls,
search_space_id: parseInt(search_space_id),
}),
}
);
if (!response.ok) {
throw new Error("Failed to crawl URLs");
}
await response.json();
toast(t("success_toast"), {
description: t("success_toast_desc"),
});
// Redirect to documents page
router.push(`/dashboard/${search_space_id}/documents`);
} catch (error: any) {
setError(error.message || t("error_generic"));
toast(t("error_toast"), {
description: `${t("error_toast_desc")}: ${error.message}`,
});
} finally {
setIsSubmitting(false);
}
};
// Function to add a new URL tag
const handleAddTag = (text: string) => {
// Basic URL validation
if (!isValidUrl(text)) {
toast(t("invalid_url_toast"), {
description: t("invalid_url_toast_desc"),
});
return;
}
// Check for duplicates
if (urlTags.some((tag) => tag.text === text)) {
toast(t("duplicate_url_toast"), {
description: t("duplicate_url_toast_desc"),
});
return;
}
// Add the new tag
const newTag: Tag = {
id: Date.now().toString(),
text: text,
};
setUrlTags([...urlTags, newTag]);
};
return (
<div className="container mx-auto py-8">
<Card className="max-w-2xl mx-auto">
<CardHeader>
<CardTitle className="flex items-center gap-2">
<Globe className="h-5 w-5" />
{t("title")}
</CardTitle>
<CardDescription>{t("subtitle")}</CardDescription>
</CardHeader>
<CardContent>
<div className="space-y-4">
<div className="space-y-2">
<Label htmlFor="url-input">{t("label")}</Label>
<TagInput
id="url-input"
tags={urlTags}
setTags={setUrlTags}
placeholder={t("placeholder")}
onAddTag={handleAddTag}
styleClasses={{
inlineTagsContainer:
"border-input rounded-lg bg-background shadow-sm shadow-black/5 transition-shadow focus-within:border-ring focus-within:outline-none focus-within:ring-[3px] focus-within:ring-ring/20 p-1 gap-1",
input: "w-full min-w-[80px] focus-visible:outline-none shadow-none px-2 h-7",
tag: {
body: "h-7 relative bg-background border border-input hover:bg-background rounded-md font-medium text-xs ps-2 pe-7 flex",
closeButton:
"absolute -inset-y-px -end-px p-0 rounded-e-lg flex size-7 transition-colors outline-0 focus-visible:outline focus-visible:outline-2 focus-visible:outline-ring/70 text-muted-foreground/80 hover:text-foreground",
},
}}
activeTagIndex={activeTagIndex}
setActiveTagIndex={setActiveTagIndex}
/>
<p className="text-xs text-muted-foreground mt-1">{t("hint")}</p>
</div>
{error && <div className="text-sm text-red-500 mt-2">{error}</div>}
<div className="bg-muted/50 rounded-lg p-4 text-sm">
<h4 className="font-medium mb-2">{t("tips_title")}</h4>
<ul className="list-disc pl-5 space-y-1 text-muted-foreground">
<li>{t("tip_1")}</li>
<li>{t("tip_2")}</li>
<li>{t("tip_3")}</li>
<li>{t("tip_4")}</li>
</ul>
</div>
</div>
</CardContent>
<CardFooter className="flex justify-between">
<Button
variant="outline"
onClick={() => router.push(`/dashboard/${search_space_id}/documents`)}
>
{t("cancel")}
</Button>
<Button onClick={handleSubmit} disabled={isSubmitting || urlTags.length === 0}>
{isSubmitting ? (
<>
<Loader2 className="mr-2 h-4 w-4 animate-spin" />
{t("submitting")}
</>
) : (
t("submit")
)}
</Button>
</CardFooter>
</Card>
</div>
);
}

View file

@ -1,9 +1,9 @@
"use client"; "use client";
import { IconBrandYoutube } from "@tabler/icons-react"; import { IconBrandYoutube } from "@tabler/icons-react";
import { Cable, Database, Upload } from "lucide-react"; import { Cable, Database, Globe, Upload } from "lucide-react";
import { motion } from "motion/react"; import { motion } from "motion/react";
import { useParams, useSearchParams } from "next/navigation"; import { useParams, useRouter, useSearchParams } from "next/navigation";
import { useEffect, useState } from "react"; import { useEffect, useState } from "react";
import { ConnectorsTab } from "@/components/sources/ConnectorsTab"; import { ConnectorsTab } from "@/components/sources/ConnectorsTab";
import { DocumentUploadTab } from "@/components/sources/DocumentUploadTab"; import { DocumentUploadTab } from "@/components/sources/DocumentUploadTab";
@ -12,6 +12,7 @@ import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
export default function AddSourcesPage() { export default function AddSourcesPage() {
const params = useParams(); const params = useParams();
const router = useRouter();
const searchParams = useSearchParams(); const searchParams = useSearchParams();
const search_space_id = params.search_space_id as string; const search_space_id = params.search_space_id as string;
const [activeTab, setActiveTab] = useState("documents"); const [activeTab, setActiveTab] = useState("documents");
@ -24,6 +25,14 @@ export default function AddSourcesPage() {
} }
}, [searchParams]); }, [searchParams]);
const handleTabChange = (value: string) => {
if (value === "webpages") {
router.push(`/dashboard/${search_space_id}/connectors/add/webcrawler-connector`);
} else {
setActiveTab(value);
}
};
return ( return (
<div className="container mx-auto py-8 px-4"> <div className="container mx-auto py-8 px-4">
<motion.div <motion.div
@ -42,19 +51,26 @@ export default function AddSourcesPage() {
</div> </div>
{/* Tabs */} {/* Tabs */}
<Tabs value={activeTab} onValueChange={setActiveTab} className="w-full"> <Tabs value={activeTab} onValueChange={handleTabChange} className="w-full">
<TabsList className="grid w-full max-w-2xl mx-auto grid-cols-3 h-12"> <TabsList className="grid w-full max-w-3xl mx-auto grid-cols-4 h-12">
<TabsTrigger value="documents" className="flex items-center gap-2"> <TabsTrigger value="documents" className="flex items-center gap-2">
<Upload className="h-4 w-4" /> <Upload className="h-4 w-4" />
Documents <span className="hidden sm:inline">Documents</span>
<span className="sm:hidden">Docs</span>
</TabsTrigger> </TabsTrigger>
<TabsTrigger value="youtube" className="flex items-center gap-2"> <TabsTrigger value="youtube" className="flex items-center gap-2">
<IconBrandYoutube className="h-4 w-4" /> <IconBrandYoutube className="h-4 w-4" />
YouTube YouTube
</TabsTrigger> </TabsTrigger>
<TabsTrigger value="webpages" className="flex items-center gap-2">
<Globe className="h-4 w-4" />
<span className="hidden sm:inline">Web Pages</span>
<span className="sm:hidden">Web</span>
</TabsTrigger>
<TabsTrigger value="connectors" className="flex items-center gap-2"> <TabsTrigger value="connectors" className="flex items-center gap-2">
<Cable className="h-4 w-4" /> <Cable className="h-4 w-4" />
Connectors <span className="hidden sm:inline">Connectors</span>
<span className="sm:hidden">More</span>
</TabsTrigger> </TabsTrigger>
</TabsList> </TabsList>

View file

@ -138,6 +138,7 @@ export function DashboardBreadcrumb() {
"linkup-api": "LinkUp API", "linkup-api": "LinkUp API",
"luma-connector": "Luma", "luma-connector": "Luma",
"elasticsearch-connector": "Elasticsearch", "elasticsearch-connector": "Elasticsearch",
"webcrawler-connector": "Web Pages",
}; };
const connectorLabel = connectorLabels[connectorType] || connectorType; const connectorLabel = connectorLabels[connectorType] || connectorType;

View file

@ -52,5 +52,7 @@ export const editConnectorSchema = z.object({
GOOGLE_CALENDAR_CALENDAR_IDS: z.string().optional(), GOOGLE_CALENDAR_CALENDAR_IDS: z.string().optional(),
LUMA_API_KEY: z.string().optional(), LUMA_API_KEY: z.string().optional(),
ELASTICSEARCH_API_KEY: z.string().optional(), ELASTICSEARCH_API_KEY: z.string().optional(),
FIRECRAWL_API_KEY: z.string().optional(),
INITIAL_URLS: z.string().optional(),
}); });
export type EditConnectorFormValues = z.infer<typeof editConnectorSchema>; export type EditConnectorFormValues = z.infer<typeof editConnectorSchema>;

View file

@ -29,6 +29,7 @@ const INTEGRATIONS: Integration[] = [
// Documentation & Knowledge // Documentation & Knowledge
{ name: "Confluence", icon: "https://cdn.simpleicons.org/confluence/172B4D" }, { name: "Confluence", icon: "https://cdn.simpleicons.org/confluence/172B4D" },
{ name: "Notion", icon: "https://cdn.simpleicons.org/notion/000000/ffffff" }, { name: "Notion", icon: "https://cdn.simpleicons.org/notion/000000/ffffff" },
{ name: "Web Pages", icon: "https://cdn.jsdelivr.net/npm/lucide-static@0.294.0/icons/globe.svg" },
// Cloud Storage // Cloud Storage
{ name: "Google Drive", icon: "https://cdn.simpleicons.org/googledrive/4285F4" }, { name: "Google Drive", icon: "https://cdn.simpleicons.org/googledrive/4285F4" },

View file

@ -19,11 +19,14 @@ interface ConnectorsTabProps {
export function ConnectorsTab({ searchSpaceId }: ConnectorsTabProps) { export function ConnectorsTab({ searchSpaceId }: ConnectorsTabProps) {
const t = useTranslations("add_connector"); const t = useTranslations("add_connector");
const [expandedCategories, setExpandedCategories] = useState<string[]>([ const [expandedCategories, setExpandedCategories] = useState<string[]>([
"search-engines", "web-search",
"knowledge-bases", "messaging",
"project-management", "project-management",
"team-chats", "documentation",
"communication", "development",
"databases",
"productivity",
"web-crawling",
]); ]);
const toggleCategory = (categoryId: string) => { const toggleCategory = (categoryId: string) => {

View file

@ -5,8 +5,21 @@ import type { ConnectorCategory } from "./types";
export const connectorCategories: ConnectorCategory[] = [ export const connectorCategories: ConnectorCategory[] = [
{ {
id: "search-engines", id: "web-crawling",
title: "search_engines", title: "web_crawling",
connectors: [
{
id: "webcrawler-connector",
title: "Web Pages",
description: "webcrawler_desc",
icon: getConnectorIcon(EnumConnectorName.WEBCRAWLER_CONNECTOR, "h-6 w-6"),
status: "available",
},
],
},
{
id: "web-search",
title: "web_search",
connectors: [ connectors: [
{ {
id: "tavily-api", id: "tavily-api",
@ -29,13 +42,6 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.LINKUP_API, "h-6 w-6"), icon: getConnectorIcon(EnumConnectorName.LINKUP_API, "h-6 w-6"),
status: "available", status: "available",
}, },
{
id: "elasticsearch-connector",
title: "Elasticsearch",
description: "elasticsearch_desc",
icon: getConnectorIcon(EnumConnectorName.ELASTICSEARCH_CONNECTOR, "h-6 w-6"),
status: "available",
},
{ {
id: "baidu-search-api", id: "baidu-search-api",
title: "Baidu Search", title: "Baidu Search",
@ -46,8 +52,8 @@ export const connectorCategories: ConnectorCategory[] = [
], ],
}, },
{ {
id: "team-chats", id: "messaging",
title: "team_chats", title: "messaging",
connectors: [ connectors: [
{ {
id: "slack-connector", id: "slack-connector",
@ -56,13 +62,6 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.SLACK_CONNECTOR, "h-6 w-6"), icon: getConnectorIcon(EnumConnectorName.SLACK_CONNECTOR, "h-6 w-6"),
status: "available", status: "available",
}, },
{
id: "ms-teams",
title: "Microsoft Teams",
description: "teams_desc",
icon: <IconBrandWindows className="h-6 w-6" />,
status: "coming-soon",
},
{ {
id: "discord-connector", id: "discord-connector",
title: "Discord", title: "Discord",
@ -70,6 +69,13 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.DISCORD_CONNECTOR, "h-6 w-6"), icon: getConnectorIcon(EnumConnectorName.DISCORD_CONNECTOR, "h-6 w-6"),
status: "available", status: "available",
}, },
{
id: "ms-teams",
title: "Microsoft Teams",
description: "teams_desc",
icon: <IconBrandWindows className="h-6 w-6" />,
status: "coming-soon",
},
], ],
}, },
{ {
@ -100,8 +106,8 @@ export const connectorCategories: ConnectorCategory[] = [
], ],
}, },
{ {
id: "knowledge-bases", id: "documentation",
title: "knowledge_bases", title: "documentation",
connectors: [ connectors: [
{ {
id: "notion-connector", id: "notion-connector",
@ -110,6 +116,19 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.NOTION_CONNECTOR, "h-6 w-6"), icon: getConnectorIcon(EnumConnectorName.NOTION_CONNECTOR, "h-6 w-6"),
status: "available", status: "available",
}, },
{
id: "confluence-connector",
title: "Confluence",
description: "confluence_desc",
icon: getConnectorIcon(EnumConnectorName.CONFLUENCE_CONNECTOR, "h-6 w-6"),
status: "available",
},
],
},
{
id: "development",
title: "development",
connectors: [
{ {
id: "github-connector", id: "github-connector",
title: "GitHub", title: "GitHub",
@ -117,11 +136,17 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.GITHUB_CONNECTOR, "h-6 w-6"), icon: getConnectorIcon(EnumConnectorName.GITHUB_CONNECTOR, "h-6 w-6"),
status: "available", status: "available",
}, },
],
},
{ {
id: "confluence-connector", id: "databases",
title: "Confluence", title: "databases",
description: "confluence_desc", connectors: [
icon: getConnectorIcon(EnumConnectorName.CONFLUENCE_CONNECTOR, "h-6 w-6"), {
id: "elasticsearch-connector",
title: "Elasticsearch",
description: "elasticsearch_desc",
icon: getConnectorIcon(EnumConnectorName.ELASTICSEARCH_CONNECTOR, "h-6 w-6"),
status: "available", status: "available",
}, },
{ {
@ -131,18 +156,11 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.AIRTABLE_CONNECTOR, "h-6 w-6"), icon: getConnectorIcon(EnumConnectorName.AIRTABLE_CONNECTOR, "h-6 w-6"),
status: "available", status: "available",
}, },
{
id: "luma-connector",
title: "Luma",
description: "luma_desc",
icon: getConnectorIcon(EnumConnectorName.LUMA_CONNECTOR, "h-6 w-6"),
status: "available",
},
], ],
}, },
{ {
id: "communication", id: "productivity",
title: "communication", title: "productivity",
connectors: [ connectors: [
{ {
id: "google-calendar-connector", id: "google-calendar-connector",
@ -158,6 +176,13 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.GOOGLE_GMAIL_CONNECTOR, "h-6 w-6"), icon: getConnectorIcon(EnumConnectorName.GOOGLE_GMAIL_CONNECTOR, "h-6 w-6"),
status: "available", status: "available",
}, },
{
id: "luma-connector",
title: "Luma",
description: "luma_desc",
icon: getConnectorIcon(EnumConnectorName.LUMA_CONNECTOR, "h-6 w-6"),
status: "available",
},
{ {
id: "zoom", id: "zoom",
title: "Zoom", title: "Zoom",

View file

@ -17,4 +17,5 @@ export enum EnumConnectorName {
AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR", AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR",
LUMA_CONNECTOR = "LUMA_CONNECTOR", LUMA_CONNECTOR = "LUMA_CONNECTOR",
ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR", ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR",
WEBCRAWLER_CONNECTOR = "WEBCRAWLER_CONNECTOR",
} }

View file

@ -59,11 +59,13 @@ export const getConnectorIcon = (connectorType: EnumConnectorName | string, clas
return <IconSparkles {...iconProps} />; return <IconSparkles {...iconProps} />;
case EnumConnectorName.ELASTICSEARCH_CONNECTOR: case EnumConnectorName.ELASTICSEARCH_CONNECTOR:
return <IconBrandElastic {...iconProps} />; return <IconBrandElastic {...iconProps} />;
case EnumConnectorName.WEBCRAWLER_CONNECTOR:
return <Globe {...iconProps} />;
// Additional cases for non-enum connector types // Additional cases for non-enum connector types
case "YOUTUBE_VIDEO":
return <IconBrandYoutube {...iconProps} />;
case "CRAWLED_URL": case "CRAWLED_URL":
return <Globe {...iconProps} />; return <Globe {...iconProps} />;
case "YOUTUBE_VIDEO":
return <IconBrandYoutube {...iconProps} />;
case "FILE": case "FILE":
return <File {...iconProps} />; return <File {...iconProps} />;
case "EXTENSION": case "EXTENSION":

View file

@ -97,6 +97,8 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
JIRA_API_TOKEN: "", JIRA_API_TOKEN: "",
LUMA_API_KEY: "", LUMA_API_KEY: "",
ELASTICSEARCH_API_KEY: "", ELASTICSEARCH_API_KEY: "",
FIRECRAWL_API_KEY: "",
INITIAL_URLS: "",
}, },
}); });
@ -142,6 +144,8 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
JIRA_API_TOKEN: config.JIRA_API_TOKEN || "", JIRA_API_TOKEN: config.JIRA_API_TOKEN || "",
LUMA_API_KEY: config.LUMA_API_KEY || "", LUMA_API_KEY: config.LUMA_API_KEY || "",
ELASTICSEARCH_API_KEY: config.ELASTICSEARCH_API_KEY || "", ELASTICSEARCH_API_KEY: config.ELASTICSEARCH_API_KEY || "",
FIRECRAWL_API_KEY: config.FIRECRAWL_API_KEY || "",
INITIAL_URLS: config.INITIAL_URLS || "",
}); });
if (currentConnector.connector_type === "GITHUB_CONNECTOR") { if (currentConnector.connector_type === "GITHUB_CONNECTOR") {
const savedRepos = config.repo_full_names || []; const savedRepos = config.repo_full_names || [];
@ -469,6 +473,35 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
newConfig = { ELASTICSEARCH_API_KEY: formData.ELASTICSEARCH_API_KEY }; newConfig = { ELASTICSEARCH_API_KEY: formData.ELASTICSEARCH_API_KEY };
} }
break; break;
case "WEBCRAWLER_CONNECTOR":
if (
formData.FIRECRAWL_API_KEY !== originalConfig.FIRECRAWL_API_KEY ||
formData.INITIAL_URLS !== originalConfig.INITIAL_URLS
) {
newConfig = {};
if (formData.FIRECRAWL_API_KEY && formData.FIRECRAWL_API_KEY.trim()) {
if (!formData.FIRECRAWL_API_KEY.startsWith("fc-")) {
toast.warning(
"Firecrawl API keys typically start with 'fc-'. Please verify your key."
);
}
newConfig.FIRECRAWL_API_KEY = formData.FIRECRAWL_API_KEY.trim();
} else if (originalConfig.FIRECRAWL_API_KEY) {
toast.info(
"Firecrawl API key removed. Web crawler will use AsyncChromiumLoader as fallback."
);
}
if (formData.INITIAL_URLS !== undefined) {
if (formData.INITIAL_URLS && formData.INITIAL_URLS.trim()) {
newConfig.INITIAL_URLS = formData.INITIAL_URLS.trim();
} else if (originalConfig.INITIAL_URLS) {
toast.info("URLs removed from crawler configuration.");
}
}
}
break;
} }
if (newConfig !== null) { if (newConfig !== null) {
@ -562,6 +595,9 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
"ELASTICSEARCH_API_KEY", "ELASTICSEARCH_API_KEY",
newlySavedConfig.ELASTICSEARCH_API_KEY || "" newlySavedConfig.ELASTICSEARCH_API_KEY || ""
); );
} else if (connector.connector_type === "WEBCRAWLER_CONNECTOR") {
editForm.setValue("FIRECRAWL_API_KEY", newlySavedConfig.FIRECRAWL_API_KEY || "");
editForm.setValue("INITIAL_URLS", newlySavedConfig.INITIAL_URLS || "");
} }
} }
if (connector.connector_type === "GITHUB_CONNECTOR") { if (connector.connector_type === "GITHUB_CONNECTOR") {

View file

@ -18,6 +18,7 @@ export const getConnectorTypeDisplay = (type: string): string => {
AIRTABLE_CONNECTOR: "Airtable", AIRTABLE_CONNECTOR: "Airtable",
LUMA_CONNECTOR: "Luma", LUMA_CONNECTOR: "Luma",
ELASTICSEARCH_CONNECTOR: "Elasticsearch", ELASTICSEARCH_CONNECTOR: "Elasticsearch",
WEBCRAWLER_CONNECTOR: "Web Pages",
}; };
return typeMap[type] || type; return typeMap[type] || type;
}; };

View file

@ -304,11 +304,14 @@
"add_connector": { "add_connector": {
"title": "Connect Your Tools", "title": "Connect Your Tools",
"subtitle": "Integrate with your favorite services to enhance your research capabilities.", "subtitle": "Integrate with your favorite services to enhance your research capabilities.",
"search_engines": "Search Engines", "web_search": "Web Search",
"team_chats": "Team Chats", "messaging": "Messaging",
"project_management": "Project Management", "project_management": "Project Management",
"knowledge_bases": "Knowledge Bases", "documentation": "Documentation",
"communication": "Communication", "development": "Development",
"databases": "Databases",
"productivity": "Productivity",
"web_crawling": "Web Crawling",
"connect": "Connect", "connect": "Connect",
"coming_soon": "Coming Soon", "coming_soon": "Coming Soon",
"connected": "Connected", "connected": "Connected",
@ -328,10 +331,11 @@
"github_desc": "Connect a GitHub PAT to index code and docs from accessible repositories.", "github_desc": "Connect a GitHub PAT to index code and docs from accessible repositories.",
"confluence_desc": "Connect to Confluence to search pages, comments and documentation.", "confluence_desc": "Connect to Confluence to search pages, comments and documentation.",
"airtable_desc": "Connect to Airtable to search records, tables and database content.", "airtable_desc": "Connect to Airtable to search records, tables and database content.",
"luma_desc": "Connect to Luma to search events", "luma_desc": "Connect to Luma to search events, meetups and gatherings.",
"calendar_desc": "Connect to Google Calendar to search events, meetings and schedules.", "calendar_desc": "Connect to Google Calendar to search events, meetings and schedules.",
"gmail_desc": "Connect to your Gmail account to search through your emails.", "gmail_desc": "Connect to your Gmail account to search through your emails.",
"zoom_desc": "Connect to Zoom to access meeting recordings and transcripts." "zoom_desc": "Connect to Zoom to access meeting recordings and transcripts.",
"webcrawler_desc": "Crawl and index content from any public web pages."
}, },
"upload_documents": { "upload_documents": {
"title": "Upload Documents", "title": "Upload Documents",

View file

@ -304,11 +304,14 @@
"add_connector": { "add_connector": {
"title": "连接您的工具", "title": "连接您的工具",
"subtitle": "集成您喜欢的服务以增强研究能力。", "subtitle": "集成您喜欢的服务以增强研究能力。",
"search_engines": "搜索引擎", "web_search": "网络搜索",
"team_chats": "团队聊天", "messaging": "即时通讯",
"project_management": "项目管理", "project_management": "项目管理",
"knowledge_bases": "知识库", "documentation": "文档协作",
"communication": "通讯", "development": "开发工具",
"databases": "数据库",
"productivity": "效率工具",
"web_crawling": "网页爬取",
"connect": "连接", "connect": "连接",
"coming_soon": "即将推出", "coming_soon": "即将推出",
"connected": "已连接", "connected": "已连接",
@ -328,10 +331,11 @@
"github_desc": "连接 GitHub PAT 以索引可访问存储库的代码和文档。", "github_desc": "连接 GitHub PAT 以索引可访问存储库的代码和文档。",
"confluence_desc": "连接到 Confluence 以搜索页面、评论和文档。", "confluence_desc": "连接到 Confluence 以搜索页面、评论和文档。",
"airtable_desc": "连接到 Airtable 以搜索记录、表格和数据库内容。", "airtable_desc": "连接到 Airtable 以搜索记录、表格和数据库内容。",
"luma_desc": "连接到 Luma 以搜索活动", "luma_desc": "连接到 Luma 以搜索活动、聚会和集会。",
"calendar_desc": "连接到 Google 日历以搜索活动、会议和日程。", "calendar_desc": "连接到 Google 日历以搜索活动、会议和日程。",
"gmail_desc": "连接到您的 Gmail 账户以搜索您的电子邮件。", "gmail_desc": "连接到您的 Gmail 账户以搜索您的电子邮件。",
"zoom_desc": "连接到 Zoom 以访问会议录制和转录。" "zoom_desc": "连接到 Zoom 以访问会议录制和转录。",
"webcrawler_desc": "爬取和索引任何公开网页的内容。"
}, },
"upload_documents": { "upload_documents": {
"title": "上传文档", "title": "上传文档",