diff --git a/surfsense_backend/alembic/versions/36_remove_fk_constraints_for_global_llm_configs.py b/surfsense_backend/alembic/versions/36_remove_fk_constraints_for_global_llm_configs.py
index a750d6455..1faebf2ed 100644
--- a/surfsense_backend/alembic/versions/36_remove_fk_constraints_for_global_llm_configs.py
+++ b/surfsense_backend/alembic/versions/36_remove_fk_constraints_for_global_llm_configs.py
@@ -8,9 +8,10 @@ Create Date: 2025-11-13 23:20:12.912741
 
 from collections.abc import Sequence
 
-from alembic import op
 from sqlalchemy import text
 
+from alembic import op
+
 # revision identifiers, used by Alembic.
 revision: str = "36"
 down_revision: str | None = "35"
@@ -49,7 +50,9 @@ def upgrade() -> None:
     ]
 
     for constraint_name in constraints_to_drop:
-        if constraint_exists(connection, "user_search_space_preferences", constraint_name):
+        if constraint_exists(
+            connection, "user_search_space_preferences", constraint_name
+        ):
             op.drop_constraint(
                 constraint_name,
                 "user_search_space_preferences",
@@ -67,13 +70,18 @@ def downgrade() -> None:
 
     # Re-add the foreign key constraints if they don't exist
     constraints_to_create = [
-        ("user_search_space_preferences_long_context_llm_id_fkey", "long_context_llm_id"),
+        (
+            "user_search_space_preferences_long_context_llm_id_fkey",
+            "long_context_llm_id",
+        ),
         ("user_search_space_preferences_fast_llm_id_fkey", "fast_llm_id"),
         ("user_search_space_preferences_strategic_llm_id_fkey", "strategic_llm_id"),
     ]
 
     for constraint_name, column_name in constraints_to_create:
-        if not constraint_exists(connection, "user_search_space_preferences", constraint_name):
+        if not constraint_exists(
+            connection, "user_search_space_preferences", constraint_name
+        ):
             op.create_foreign_key(
                 constraint_name,
                 "user_search_space_preferences",
diff --git a/surfsense_backend/app/connectors/webcrawler_connector.py b/surfsense_backend/app/connectors/webcrawler_connector.py
index 871b4d4b3..9bb72f1ce 100644
--- a/surfsense_backend/app/connectors/webcrawler_connector.py
+++ b/surfsense_backend/app/connectors/webcrawler_connector.py
@@ -105,7 +105,7 @@ class WebCrawlerConnector:
 
             # Extract content based on format
             content = scrape_result.markdown or scrape_result.html or ""
-            
+
             # Extract metadata
             metadata = scrape_result.metadata if scrape_result.metadata else {}
 
diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py
index 6bf7a97d5..bf397a352 100644
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@@ -39,6 +39,7 @@ from app.tasks.connector_indexers import (
     index_airtable_records,
     index_clickup_tasks,
     index_confluence_pages,
+    index_crawled_urls,
     index_discord_messages,
     index_elasticsearch_documents,
     index_github_repos,
@@ -49,7 +50,6 @@ from app.tasks.connector_indexers import (
     index_luma_events,
     index_notion_pages,
     index_slack_messages,
-    index_crawled_urls,
 )
 from app.users import current_active_user
 from app.utils.check_ownership import check_ownership
@@ -1537,6 +1537,7 @@ async def run_elasticsearch_indexing(
             exc_info=True,
         )
 
+
 # Add new helper functions for crawled web page indexing
 async def run_web_page_indexing_with_new_session(
     connector_id: int,
@@ -1595,4 +1596,4 @@ async def run_web_page_indexing(
             f"Web page indexing failed or no documents processed: {error_or_warning}"
         )
     except Exception as e:
-        logger.error(f"Error in background Web page indexing task: {e!s}")
\ No newline at end of file
+        logger.error(f"Error in background Web page indexing task: {e!s}")
diff --git a/surfsense_backend/app/services/connector_service.py b/surfsense_backend/app/services/connector_service.py
index add938e88..3445d69f7 100644
--- a/surfsense_backend/app/services/connector_service.py
+++ b/surfsense_backend/app/services/connector_service.py
@@ -118,7 +118,9 @@ class ConnectorService:
 
                 # Extract webcrawler-specific metadata
                 url = metadata.get("source", metadata.get("url", ""))
-                title = document.get("title", metadata.get("title", "Untitled Document"))
+                title = document.get(
+                    "title", metadata.get("title", "Untitled Document")
+                )
                 description = metadata.get("description", "")
                 language = metadata.get("language", "")
                 last_crawled_at = metadata.get("last_crawled_at", "")
@@ -2573,4 +2575,4 @@ class ConnectorService:
             "sources": sources_list,
         }
 
-        return result_object, elasticsearch_chunks
\ No newline at end of file
+        return result_object, elasticsearch_chunks
diff --git a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
index 05a747230..dbc326406 100644
--- a/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
+++ b/surfsense_backend/app/tasks/celery_tasks/schedule_checker_task.py
@@ -67,6 +67,7 @@ async def _check_and_trigger_schedules():
         index_airtable_records_task,
         index_clickup_tasks_task,
         index_confluence_pages_task,
+        index_crawled_urls_task,
         index_discord_messages_task,
         index_elasticsearch_documents_task,
         index_github_repos_task,
@@ -77,7 +78,6 @@ async def _check_and_trigger_schedules():
         index_luma_events_task,
         index_notion_pages_task,
         index_slack_messages_task,
-        index_crawled_urls_task
     )
 
     # Map connector types to their tasks
diff --git a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
index 6d16ae40b..f74e37503 100644
--- a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
@@ -93,7 +93,7 @@ async def index_crawled_urls(
 
         # Get the Firecrawl API key from the connector config (optional)
         api_key = connector.config.get("FIRECRAWL_API_KEY")
-        
+
         # Get URLs from connector config
         initial_urls = connector.config.get("INITIAL_URLS", "")
         if isinstance(initial_urls, str):
@@ -177,7 +177,9 @@ async def index_crawled_urls(
                     continue
 
                 # Format content as structured document
-                structured_document = crawler.format_to_structured_document(crawl_result)
+                structured_document = crawler.format_to_structured_document(
+                    crawl_result
+                )
 
                 # Generate unique identifier hash for this URL
                 unique_identifier_hash = generate_unique_identifier_hash(
@@ -185,7 +187,9 @@ async def index_crawled_urls(
                 )
 
                 # Generate content hash
-                content_hash = generate_content_hash(structured_document, search_space_id)
+                content_hash = generate_content_hash(
+                    structured_document, search_space_id
+                )
 
                 # Check if document with this unique identifier already exists
                 existing_document = await check_document_by_unique_identifier(
@@ -205,7 +209,9 @@ async def index_crawled_urls(
                         continue
                     else:
                         # Content has changed - update the existing document
-                        logger.info(f"Content changed for URL {url}. Updating document.")
+                        logger.info(
+                            f"Content changed for URL {url}. Updating document."
+                        )
 
                         # Generate summary with metadata
                         user_llm = await get_user_long_context_llm(
@@ -236,7 +242,7 @@ async def index_crawled_urls(
                         if language:
                             summary_content += f"Language: {language}\n"
                         summary_content += f"Crawler: {crawler_type}\n\n"
-                        
+
                         # Add content preview
                         content_preview = content[:1000]
                         if len(content) > 1000:
@@ -298,7 +304,7 @@ async def index_crawled_urls(
                     if language:
                         summary_content += f"Language: {language}\n"
                     summary_content += f"Crawler: {crawler_type}\n\n"
-                    
+
                     # Add content preview
                     content_preview = content[:1000]
                     if len(content) > 1000:
@@ -347,7 +353,7 @@ async def index_crawled_urls(
                 continue
 
         total_processed = documents_indexed + documents_updated
-        
+
         if total_processed > 0:
             await update_connector_last_indexed(session, connector, update_last_indexed)
 
@@ -360,10 +366,14 @@ async def index_crawled_urls(
         # Build result message
         result_message = None
         if failed_urls:
-            failed_summary = "; ".join([f"{url}: {error}" for url, error in failed_urls[:5]])
+            failed_summary = "; ".join(
+                [f"{url}: {error}" for url, error in failed_urls[:5]]
+            )
             if len(failed_urls) > 5:
                 failed_summary += f" (and {len(failed_urls) - 5} more)"
-            result_message = f"Completed with {len(failed_urls)} failures: {failed_summary}"
+            result_message = (
+                f"Completed with {len(failed_urls)} failures: {failed_summary}"
+            )
 
         await task_logger.log_task_success(
             log_entry,
@@ -436,4 +446,4 @@ async def get_crawled_url_documents(
     result = await session.execute(query)
     documents = result.scalars().all()
 
-    return list(documents)
\ No newline at end of file
+    return list(documents)
diff --git a/surfsense_backend/app/utils/periodic_scheduler.py b/surfsense_backend/app/utils/periodic_scheduler.py
index 7ee8acf0a..e33661d65 100644
--- a/surfsense_backend/app/utils/periodic_scheduler.py
+++ b/surfsense_backend/app/utils/periodic_scheduler.py
@@ -70,6 +70,7 @@ def create_periodic_schedule(
         index_airtable_records_task,
         index_clickup_tasks_task,
         index_confluence_pages_task,
+        index_crawled_urls_task,
         index_discord_messages_task,
         index_elasticsearch_documents_task,
         index_github_repos_task,
@@ -80,7 +81,6 @@ def create_periodic_schedule(
         index_luma_events_task,
         index_notion_pages_task,
         index_slack_messages_task,
-        index_crawled_urls_task,
     )
 
     # Map connector type to task
diff --git a/surfsense_backend/app/utils/validators.py b/surfsense_backend/app/utils/validators.py
index d0bdd7928..6b69fb3e1 100644
--- a/surfsense_backend/app/utils/validators.py
+++ b/surfsense_backend/app/utils/validators.py
@@ -468,7 +468,7 @@ def validate_connector_config(
         value = config.get(key)
         if not isinstance(value, list) or not value:
             raise ValueError(f"{field_name} must be a non-empty list of strings")
-    
+
     def validate_firecrawl_api_key_format() -> None:
         """Validate Firecrawl API key format if provided."""
         api_key = config.get("FIRECRAWL_API_KEY", "")
@@ -477,16 +477,13 @@ def validate_connector_config(
                 "Firecrawl API key should start with 'fc-'. Please verify your API key."
             )
 
-
     def validate_initial_urls() -> None:
         initial_urls = config.get("INITIAL_URLS", "")
         if initial_urls and initial_urls.strip():
             urls = [url.strip() for url in initial_urls.split("\n") if url.strip()]
             for url in urls:
                 if not validators.url(url):
-                    raise ValueError(
-                        f"Invalid URL format in INITIAL_URLS: {url}"
-                    )
+                    raise ValueError(f"Invalid URL format in INITIAL_URLS: {url}")
 
     # Lookup table for connector validation rules
     connector_rules = {