diff --git a/surfsense_backend/alembic/versions/37_add_webcrawler_connector_enum.py b/surfsense_backend/alembic/versions/38_add_webcrawler_connector_enum.py
similarity index 93%
rename from surfsense_backend/alembic/versions/37_add_webcrawler_connector_enum.py
rename to surfsense_backend/alembic/versions/38_add_webcrawler_connector_enum.py
index 3a76b762c..1b33c31b3 100644
--- a/surfsense_backend/alembic/versions/37_add_webcrawler_connector_enum.py
+++ b/surfsense_backend/alembic/versions/38_add_webcrawler_connector_enum.py
@@ -44,9 +44,9 @@ def upgrade() -> None:
         IF NOT EXISTS (
             SELECT 1 FROM pg_type t
             JOIN pg_enum e ON t.oid = e.enumtypid
-            WHERE t.typname = 'documenttype' AND e.enumlabel = 'WEBCRAWLER_CONNECTOR'
+            WHERE t.typname = 'documenttype' AND e.enumlabel = 'CRAWLED_URL'
         ) THEN
-            ALTER TYPE documenttype ADD VALUE 'WEBCRAWLER_CONNECTOR';
+            ALTER TYPE documenttype ADD VALUE 'CRAWLED_URL';
         END IF;
     END $$;
diff --git a/surfsense_backend/app/agents/researcher/nodes.py b/surfsense_backend/app/agents/researcher/nodes.py
index 8269f08de..efbd89c2b 100644
--- a/surfsense_backend/app/agents/researcher/nodes.py
+++ b/surfsense_backend/app/agents/researcher/nodes.py
@@ -671,7 +671,7 @@ async def fetch_relevant_documents(
                 (
                     source_object,
                     crawled_urls_chunks,
-                ) = await connector_service.search_crawled_urls(
+                ) = await connector_service.search_webcrawler(
                     user_query=reformulated_query,
                     user_id=user_id,
                     search_space_id=search_space_id,
diff --git a/surfsense_backend/app/agents/researcher/qna_agent/default_prompts.py b/surfsense_backend/app/agents/researcher/qna_agent/default_prompts.py
index 65c609d6f..7b5d251fe 100644
--- a/surfsense_backend/app/agents/researcher/qna_agent/default_prompts.py
+++ b/surfsense_backend/app/agents/researcher/qna_agent/default_prompts.py
@@ -34,7 +34,7 @@ You are SurfSense, an advanced AI research assistant that provides detailed, wel
 - TAVILY_API: "Tavily search API results" (personalized search results)
 - LINKUP_API: "Linkup search API results" (personalized search results)
 - LUMA_CONNECTOR: "Luma events"
-- WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense web crawler" (personally selected websites)
+- WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense" (personally selected websites)
diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py
index 7d06643e1..efa51d2db 100644
--- a/surfsense_backend/app/config/__init__.py
+++ b/surfsense_backend/app/config/__init__.py
@@ -208,9 +208,6 @@ class Config:
     # LlamaCloud API Key
     LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
 
-    # Firecrawl API Key
-    FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", None)
-
     # Litellm TTS Configuration
     TTS_SERVICE = os.getenv("TTS_SERVICE")
     TTS_SERVICE_API_BASE = os.getenv("TTS_SERVICE_API_BASE")
diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py
index 2c8ae232f..6bf7a97d5 100644
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@@ -49,7 +49,7 @@ from app.tasks.connector_indexers import (
     index_luma_events,
     index_notion_pages,
     index_slack_messages,
-    index_webcrawler_urls,
+    index_crawled_urls,
 )
 from app.users import current_active_user
 from app.utils.check_ownership import check_ownership
@@ -691,12 +691,12 @@ async def index_connector_content(
         response_message = "Elasticsearch indexing started in the background."
     elif connector.connector_type == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR:
-        from app.tasks.celery_tasks.connector_tasks import index_webcrawler_urls_task
+        from app.tasks.celery_tasks.connector_tasks import index_crawled_urls_task
 
         logger.info(
             f"Triggering web pages indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
         )
-        index_webcrawler_urls_task.delay(
+        index_crawled_urls_task.delay(
             connector_id, search_space_id, str(user.id), indexing_from, indexing_to
         )
         response_message = "Web page indexing started in the background."
@@ -1537,8 +1537,8 @@ async def run_elasticsearch_indexing(
             exc_info=True,
         )
 
-# Add new helper functions for webcrawler indexing
-async def run_webcrawler_indexing_with_new_session(
+# Add new helper functions for crawled web page indexing
+async def run_web_page_indexing_with_new_session(
     connector_id: int,
     search_space_id: int,
     user_id: str,
@@ -1546,16 +1546,16 @@ async def run_webcrawler_indexing_with_new_session(
     end_date: str,
 ):
     """
-    Create a new session and run the Webcrawler indexing task.
+    Create a new session and run the web page indexing task.
     This prevents session leaks by creating a dedicated session for the background task.
     """
     async with async_session_maker() as session:
-        await run_webcrawler_indexing(
+        await run_web_page_indexing(
             session, connector_id, search_space_id, user_id, start_date, end_date
         )
 
-async def run_webcrawler_indexing(
+async def run_web_page_indexing(
     session: AsyncSession,
     connector_id: int,
     search_space_id: int,
@@ -1564,7 +1564,7 @@ async def run_webcrawler_indexing(
     end_date: str,
 ):
     """
-    Background task to run Webcrawler indexing.
+    Background task to run web page indexing.
 
     Args:
         session: Database session
         connector_id: ID of the webcrawler connector
@@ -1574,7 +1574,7 @@
         end_date: End date for indexing
     """
     try:
-        documents_processed, error_or_warning = await index_webcrawler_urls(
+        documents_processed, error_or_warning = await index_crawled_urls(
             session=session,
             connector_id=connector_id,
             search_space_id=search_space_id,
@@ -1588,11 +1588,11 @@
         if documents_processed > 0:
             await update_connector_last_indexed(session, connector_id)
             logger.info(
-                f"Webcrawler indexing completed successfully: {documents_processed} documents processed"
+                f"Web page indexing completed successfully: {documents_processed} documents processed"
             )
         else:
             logger.error(
-                f"Webcrawler indexing failed or no documents processed: {error_or_warning}"
+                f"Web page indexing failed or no documents processed: {error_or_warning}"
             )
     except Exception as e:
-        logger.error(f"Error in background Webcrawler indexing task: {e!s}")
\ No newline at end of file
+        logger.error(f"Error in background web page indexing task: {e!s}")
\ No newline at end of file
diff --git a/surfsense_backend/app/services/connector_service.py b/surfsense_backend/app/services/connector_service.py
index 0fa174274..add938e88 100644
--- a/surfsense_backend/app/services/connector_service.py
+++ b/surfsense_backend/app/services/connector_service.py
@@ -2573,4 +2573,4 @@ class ConnectorService:
             "sources": sources_list,
         }
 
-        return result_object, elasticsearch_chunks
+        return result_object, elasticsearch_chunks
\ No newline at end of file
diff --git a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
index 29a82d77a..b735741fe 100644
--- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py
@@ -602,8 +602,8 @@ async def _index_elasticsearch_documents(
     )
 
 
-@celery_app.task(name="index_webcrawler_urls", bind=True)
-def index_webcrawler_urls_task(
+@celery_app.task(name="index_crawled_urls", bind=True)
+def index_crawled_urls_task(
     self,
     connector_id: int,
     search_space_id: int,
@@ -611,7 +611,7 @@ def index_webcrawler_urls_task(
     start_date: str,
     end_date: str,
 ):
-    """Celery task to index Webcrawler Urls."""
+    """Celery task to index web page URLs."""
     import asyncio
 
     loop = asyncio.new_event_loop()
@@ -619,7 +619,7 @@
     try:
         loop.run_until_complete(
-            _index_webcrawler_urls(
+            _index_crawled_urls(
                 connector_id, search_space_id, user_id, start_date, end_date
             )
         )
@@ -627,19 +627,19 @@
     finally:
         loop.close()
 
 
-async def _index_webcrawler_urls(
+async def _index_crawled_urls(
     connector_id: int,
     search_space_id: int,
     user_id: str,
     start_date: str,
     end_date: str,
 ):
-    """Index Webcrawler Urls with new session."""
+    """Index web page URLs with a new session."""
     from app.routes.search_source_connectors_routes import (
-        run_webcrawler_indexing,
+        run_web_page_indexing,
     )
 
     async with get_celery_session_maker()() as session:
-        await run_webcrawler_indexing(
+        await run_web_page_indexing(
             session, connector_id, search_space_id, user_id, start_date, end_date
         )
diff --git a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
index a0704fcfa..05a747230 100644
--- a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
+++ b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
@@ -77,7 +77,7 @@ async def _check_and_trigger_schedules():
         index_luma_events_task,
         index_notion_pages_task,
         index_slack_messages_task,
-        index_webcrawler_urls_task
+        index_crawled_urls_task
     )
 
     # Map connector types to their tasks
@@ -95,7 +95,7 @@
         SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task,
         SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
         SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
-        SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_webcrawler_urls_task,
+        SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
     }
 
     # Trigger indexing for each due connector
diff --git a/surfsense_backend/app/tasks/connector_indexers/__init__.py b/surfsense_backend/app/tasks/connector_indexers/__init__.py
index 2ec7ce497..f62739679 100644
--- a/surfsense_backend/app/tasks/connector_indexers/__init__.py
+++ b/surfsense_backend/app/tasks/connector_indexers/__init__.py
@@ -42,7 +42,7 @@ from .luma_indexer import index_luma_events
 # Documentation and knowledge management
 from .notion_indexer import index_notion_pages
 from .slack_indexer import index_slack_messages
-from .webcrawler_indexer import index_webcrawler_urls
+from .webcrawler_indexer import index_crawled_urls
 
 __all__ = [  # noqa: RUF022
     "index_airtable_records",
@@ -60,7 +60,7 @@ __all__ = [  # noqa: RUF022
     "index_linear_issues",
     # Documentation and knowledge management
     "index_notion_pages",
-    "index_webcrawler_urls",
+    "index_crawled_urls",
     # Communication platforms
     "index_slack_messages",
     "index_google_gmail_messages",
diff --git a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
index d68c989b4..6d16ae40b 100644
--- a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
@@ -27,7 +27,7 @@ from .base import (
 )
 
 
-async def index_webcrawler_urls(
+async def index_crawled_urls(
     session: AsyncSession,
     connector_id: int,
     search_space_id: int,
@@ -37,7 +37,7 @@
     update_last_indexed: bool = True,
 ) -> tuple[int, str | None]:
     """
-    Index webcrawler URLs.
+    Index web page URLs.
 
     Args:
         session: Database session
@@ -55,9 +55,9 @@
     # Log task start
     log_entry = await task_logger.log_task_start(
-        task_name="webcrawler_url_indexing",
+        task_name="crawled_url_indexing",
         source="connector_indexing_task",
-        message=f"Starting webcrawler URL indexing for connector {connector_id}",
+        message=f"Starting web page URL indexing for connector {connector_id}",
         metadata={
             "connector_id": connector_id,
             "user_id": str(user_id),
@@ -104,7 +104,7 @@
             urls = []
 
         logger.info(
-            f"Starting webcrawler indexing for connector {connector_id} with {len(urls)} URLs"
+            f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs"
        )
 
        # Initialize webcrawler client
@@ -367,7 +367,7 @@
         await task_logger.log_task_success(
             log_entry,
-            f"Successfully completed webcrawler indexing for connector {connector_id}",
+            f"Successfully completed crawled web page indexing for connector {connector_id}",
             {
                 "urls_processed": total_processed,
                 "documents_indexed": documents_indexed,
@@ -378,7 +378,7 @@
         )
 
         logger.info(
-            f"Webcrawler indexing completed: {documents_indexed} new, "
+            f"Web page indexing completed: {documents_indexed} new, "
             f"{documents_updated} updated, {documents_skipped} skipped, "
             f"{len(failed_urls)} failed"
         )
@@ -388,7 +388,7 @@
         await session.rollback()
         await task_logger.log_task_failure(
             log_entry,
-            f"Database error during webcrawler indexing for connector {connector_id}",
+            f"Database error during web page indexing for connector {connector_id}",
             str(db_error),
             {"error_type": "SQLAlchemyError"},
         )
@@ -398,12 +398,12 @@
         await session.rollback()
         await task_logger.log_task_failure(
             log_entry,
-            f"Failed to index webcrawler URLs for connector {connector_id}",
+            f"Failed to index web page URLs for connector {connector_id}",
             str(e),
             {"error_type": type(e).__name__},
         )
-        logger.error(f"Failed to index webcrawler URLs: {e!s}", exc_info=True)
-        return 0, f"Failed to index webcrawler URLs: {e!s}"
+        logger.error(f"Failed to index web page URLs: {e!s}", exc_info=True)
+        return 0, f"Failed to index web page URLs: {e!s}"
 
 
 async def get_crawled_url_documents(
diff --git a/surfsense_backend/app/utils/periodic_scheduler.py b/surfsense_backend/app/utils/periodic_scheduler.py
index e303a3be5..7ee8acf0a 100644
--- a/surfsense_backend/app/utils/periodic_scheduler.py
+++ b/surfsense_backend/app/utils/periodic_scheduler.py
@@ -31,7 +31,7 @@ CONNECTOR_TASK_MAP = {
     SearchSourceConnectorType.DISCORD_CONNECTOR: "index_discord_messages",
     SearchSourceConnectorType.LUMA_CONNECTOR: "index_luma_events",
     SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: "index_elasticsearch_documents",
-    SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: "index_webcrawler_urls",
+    SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: "index_crawled_urls",
 }
@@ -80,7 +80,7 @@ def create_periodic_schedule(
         index_luma_events_task,
         index_notion_pages_task,
         index_slack_messages_task,
-        index_webcrawler_urls_task,
+        index_crawled_urls_task,
     )
 
     # Map connector type to task
@@ -98,7 +98,7 @@
         SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task,
         SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
         SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
-        SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_webcrawler_urls_task,
+        SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
     }
 
     # Trigger the first run immediately
diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx
index b558a5858..5756278c4 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx
@@ -18,7 +18,16 @@ import {
 	CardHeader,
 	CardTitle,
 } from "@/components/ui/card";
-import { Form } from "@/components/ui/form";
+import {
+	Form,
+	FormControl,
+	FormDescription,
+	FormField,
+	FormItem,
+	FormLabel,
+	FormMessage,
+} from "@/components/ui/form";
+import { Textarea } from "@/components/ui/textarea";
 import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
 import { useConnectorEditPage } from "@/hooks/use-connector-edit-page";
 // Import Utils, Types, Hook, and Components
@@ -285,13 +294,35 @@ export default function EditConnectorPage() {
 							{/* == Webcrawler == */}
 							{connector.connector_type === "WEBCRAWLER_CONNECTOR" && (
-								[one removed line — JSX tag stripped in extraction]
+								[added lines — JSX tags stripped in extraction; the surviving fragments ("(", "URLs to Crawl") together with the imports above suggest a FormField render block whose FormLabel is "URLs to Crawl"; the diff is truncated here]
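Reviewer note: most of this diff is a mechanical symbol rename, but the Celery entry point it touches is worth seeing in isolation. Below is a minimal, self-contained sketch of the sync-to-async bridge that `index_crawled_urls_task` follows — a blocking task body that owns a private event loop. The app instance and the placeholder coroutine here are illustrative stand-ins, not code from the SurfSense repository:

```python
import asyncio

from celery import Celery

# Illustrative app instance; SurfSense wires up its own celery_app elsewhere.
celery_app = Celery("sketch")


async def _index_crawled_urls(connector_id: int, search_space_id: int) -> None:
    """Stand-in for the real async indexer (DB session + crawler I/O)."""
    await asyncio.sleep(0)


@celery_app.task(name="index_crawled_urls", bind=True)
def index_crawled_urls_task(self, connector_id: int, search_space_id: int) -> None:
    # Celery invokes task bodies synchronously, so bridge into asyncio by
    # creating a dedicated event loop per task run and always closing it in
    # a finally block, so loops (and their file descriptors) are not leaked
    # across task invocations.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(_index_crawled_urls(connector_id, search_space_id))
    finally:
        loop.close()
```

One deployment caveat implied by the diff: the task is re-registered under the new name `index_crawled_urls`, so any messages still queued under the old `index_webcrawler_urls` name when workers restart will fail to route; draining the queue first, or temporarily keeping the old name registered as an alias, would avoid that window.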