diff --git a/surfsense_backend/app/connectors/webcrawler_connector.py b/surfsense_backend/app/connectors/webcrawler_connector.py
index 9434e046f..871b4d4b3 100644
--- a/surfsense_backend/app/connectors/webcrawler_connector.py
+++ b/surfsense_backend/app/connectors/webcrawler_connector.py
@@ -158,7 +158,7 @@ class WebCrawlerConnector:
     def format_to_structured_document(self, crawl_result: dict[str, Any]) -> str:
         """
-        Format crawl result as a structured document (similar to url_crawler.py format).
+        Format crawl result as a structured document.
 
         Args:
             crawl_result: Result from crawl_url method
 
diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py
index 344a2503d..ae9df0cf4 100644
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@@ -65,13 +65,6 @@ async def create_documents(
                 process_extension_document_task.delay(
                     document_dict, request.search_space_id, str(user.id)
                 )
-        elif request.document_type == DocumentType.CRAWLED_URL:
-            from app.tasks.celery_tasks.document_tasks import process_crawled_url_task
-
-            for url in request.content:
-                process_crawled_url_task.delay(
-                    url, request.search_space_id, str(user.id)
-                )
         elif request.document_type == DocumentType.YOUTUBE_VIDEO:
             from app.tasks.celery_tasks.document_tasks import process_youtube_video_task
 
diff --git a/surfsense_backend/app/services/connector_service.py b/surfsense_backend/app/services/connector_service.py
index 28f70d285..0fa174274 100644
--- a/surfsense_backend/app/services/connector_service.py
+++ b/surfsense_backend/app/services/connector_service.py
@@ -70,6 +70,13 @@ class ConnectorService:
         """
         Search for crawled URLs and return both the source information and langchain documents
 
+        Args:
+            user_query: The user's query
+            user_id: The user's ID
+            search_space_id: The search space ID to search in
+            top_k: Maximum number of results to return
+            search_mode: Search mode (CHUNKS or DOCUMENTS)
+
         Returns:
             tuple: (sources_info, langchain_documents)
         """
@@ -109,15 +116,41 @@
             document = chunk.get("document", {})
             metadata = document.get("metadata", {})
 
-            # Create a source entry
+            # Extract webcrawler-specific metadata
+            url = metadata.get("source", metadata.get("url", ""))
+            title = document.get("title", metadata.get("title", "Untitled Document"))
+            description = metadata.get("description", "")
+            language = metadata.get("language", "")
+            last_crawled_at = metadata.get("last_crawled_at", "")
+
+            # Build description with crawler info
+            content_preview = chunk.get("content", "")
+            if not description and content_preview:
+                # Use content preview if no description
+                description = content_preview[:200]
+                if len(content_preview) > 200:
+                    description += "..."
+
+            # Add crawler metadata to description if available
+            info_parts = []
+            if language:
+                info_parts.append(f"Language: {language}")
+            if last_crawled_at:
+                info_parts.append(f"Last crawled: {last_crawled_at}")
+
+            if info_parts:
+                if description:
+                    description += f" | {' | '.join(info_parts)}"
+                else:
+                    description = " | ".join(info_parts)
+
             source = {
                 "id": chunk.get("chunk_id", self.source_id_counter),
-                "title": document.get("title", "Untitled Document"),
-                "description": metadata.get(
-                    "og:description",
-                    metadata.get("ogDescription", chunk.get("content", "")),
-                ),
-                "url": metadata.get("url", ""),
+                "title": title,
+                "description": description,
+                "url": url,
+                "language": language,
+                "last_crawled_at": last_crawled_at,
             }
             self.source_id_counter += 1
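
A minimal standalone sketch of the description-building behavior that the connector_service.py hunk introduces, for illustration only. The helper name build_source_description and the sample metadata values are hypothetical, not part of the patch.

def build_source_description(
    metadata: dict, content_preview: str, max_preview: int = 200
) -> str:
    # Prefer an explicit description from the crawler metadata.
    description = metadata.get("description", "")

    # Otherwise fall back to a truncated preview of the chunk content.
    if not description and content_preview:
        description = content_preview[:max_preview]
        if len(content_preview) > max_preview:
            description += "..."

    # Append language / last-crawled info when the crawler recorded it.
    info_parts = []
    if metadata.get("language"):
        info_parts.append(f"Language: {metadata['language']}")
    if metadata.get("last_crawled_at"):
        info_parts.append(f"Last crawled: {metadata['last_crawled_at']}")

    if info_parts:
        description = (
            f"{description} | {' | '.join(info_parts)}"
            if description
            else " | ".join(info_parts)
        )
    return description


# Example: no explicit description, so the preview is truncated to 200
# characters with "..." appended, then the crawler info is joined with " | ".
meta = {"language": "en", "last_crawled_at": "2024-05-01T12:00:00Z"}
print(build_source_description(meta, "Example page body " * 20))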