diff --git a/surfsense_backend/app/tasks/stream_connector_search_results.py b/surfsense_backend/app/tasks/stream_connector_search_results.py
index 4fc918116..99b039489 100644
--- a/surfsense_backend/app/tasks/stream_connector_search_results.py
+++ b/surfsense_backend/app/tasks/stream_connector_search_results.py
@@ -59,6 +59,33 @@ async def stream_connector_search_results(
     # Process each selected connector
     for connector in selected_connectors:
         
+        # Extension Docs
+        if connector == "EXTENSION":
+            # Send terminal message about starting search
+            yield streaming_service.add_terminal_message("Starting to search for extension...")
+
+            # Search extension (browser history) documents using the reformulated query
+            result_object, extension_chunks = await connector_service.search_extension(
+                user_query=reformulated_query,
+                user_id=user_id,
+                search_space_id=search_space_id,
+                top_k=TOP_K
+            )
+
+            # Send terminal message about search results
+            yield streaming_service.add_terminal_message(
+                f"Found {len(result_object['sources'])} relevant extension documents",
+                "success"
+            )
+
+            # Update sources
+            all_sources.append(result_object)
+            yield streaming_service.update_sources(all_sources)
+
+            # Add documents to collection
+            all_raw_documents.extend(extension_chunks)
+
+
         # Crawled URLs
         if connector == "CRAWLED_URL":
             # Send terminal message about starting search
diff --git a/surfsense_backend/app/utils/connector_service.py b/surfsense_backend/app/utils/connector_service.py
index b26427b01..60dd1cf6d 100644
--- a/surfsense_backend/app/utils/connector_service.py
+++ b/surfsense_backend/app/utils/connector_service.py
@@ -382,4 +382,101 @@ class ConnectorService:
             "sources": sources_list,
         }
         
-        return result_object, notion_chunks
\ No newline at end of file
+        return result_object, notion_chunks
+
+    async def search_extension(self, user_query: str, user_id: int, search_space_id: int, top_k: int = 20) -> tuple:
+        """
+        Search for extension data and return both the source information and langchain documents
+
+        Args:
+            user_query: The user's query
+            user_id: The user's ID
+            search_space_id: The search space ID to search in
+            top_k: Maximum number of results to return
+
+        Returns:
+            tuple: (sources_info, langchain_documents)
+        """
+        extension_chunks = await self.retriever.hybrid_search(
+            query_text=user_query,
+            top_k=top_k,
+            user_id=user_id,
+            search_space_id=search_space_id,
+            document_type="EXTENSION"
+        )
+
+        # Map extension_chunks to the required format
+        mapped_sources = {}
+        for i, chunk in enumerate(extension_chunks):
+            # Fix for UI
+            extension_chunks[i]['document']['id'] = self.source_id_counter
+
+            # Extract document metadata
+            document = chunk.get('document', {})
+            metadata = document.get('metadata', {})
+
+            # Extract extension-specific metadata
+            webpage_title = metadata.get('VisitedWebPageTitle', 'Untitled Page')
+            webpage_url = metadata.get('VisitedWebPageURL', '')
+            visit_date = metadata.get('VisitedWebPageDateWithTimeInISOString', '')
+            visit_duration = metadata.get('VisitedWebPageVisitDurationInMilliseconds', '')
+            browsing_session_id = metadata.get('BrowsingSessionId', '')  # NOTE(review): currently unused
+
+            # Create a more descriptive title for extension data
+            title = webpage_title
+            if visit_date:
+                # Format the date for display (simplified)
+                try:
+                    # Just extract the date part for display
+                    formatted_date = visit_date.split('T')[0] if 'T' in visit_date else visit_date
+                    title += f" (visited: {formatted_date})"
+                except Exception:
+                    # Fallback if date parsing fails
+                    title += f" (visited: {visit_date})"
+
+            # Create a more descriptive description for extension data
+            content = chunk.get('content', '')
+            description = content[:100]
+            description += "..." if len(content) > 100 else ""
+
+            # Add visit duration if available
+            if visit_duration:
+                try:
+                    duration_seconds = int(visit_duration) / 1000
+                    if duration_seconds < 60:
+                        duration_text = f"{duration_seconds:.1f} seconds"
+                    else:
+                        duration_text = f"{duration_seconds/60:.1f} minutes"
+
+                    if description:
+                        description += f" | Duration: {duration_text}"
+                except (TypeError, ValueError):
+                    # Fallback if duration parsing fails
+                    pass
+
+            source = {
+                "id": self.source_id_counter,
+                "title": title,
+                "description": description,
+                "url": webpage_url
+            }
+
+            self.source_id_counter += 1
+
+            # Use URL and timestamp as a unique identifier for tracking unique sources
+            source_key = f"{webpage_url}_{visit_date}"
+            if source_key not in mapped_sources:
+                mapped_sources[source_key] = source
+
+        # Convert to list of sources
+        sources_list = list(mapped_sources.values())
+
+        # Create result object
+        result_object = {
+            "id": 6,
+            "name": "Extension",
+            "type": "EXTENSION",
+            "sources": sources_list,
+        }
+
+        return result_object, extension_chunks
\ No newline at end of file
diff --git a/surfsense_frontend b/surfsense_frontend
index 269cef484..1de756133 160000
--- a/surfsense_frontend
+++ b/surfsense_frontend
@@ -1 +1 @@
-Subproject commit 269cef48438adfba31f5405898a8ef1b1231020a
+Subproject commit 1de75613320f6d077ca04c6ec7a7441e07536613