Fixing search logic

samkul-swe 2025-11-22 13:33:16 -08:00
parent 1480f85431
commit 6d19e0fad8
3 changed files with 41 additions and 15 deletions

@@ -158,7 +158,7 @@ class WebCrawlerConnector:
     def format_to_structured_document(self, crawl_result: dict[str, Any]) -> str:
         """
-        Format crawl result as a structured document (similar to url_crawler.py format).
+        Format crawl result as a structured document.
 
         Args:
             crawl_result: Result from crawl_url method
 

@@ -65,13 +65,6 @@ async def create_documents(
             process_extension_document_task.delay(
                 document_dict, request.search_space_id, str(user.id)
             )
-        elif request.document_type == DocumentType.CRAWLED_URL:
-            from app.tasks.celery_tasks.document_tasks import process_crawled_url_task
-
-            for url in request.content:
-                process_crawled_url_task.delay(
-                    url, request.search_space_id, str(user.id)
-                )
         elif request.document_type == DocumentType.YOUTUBE_VIDEO:
             from app.tasks.celery_tasks.document_tasks import process_youtube_video_task
 
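
The removed branch fanned out one Celery task per submitted URL. For context, here is a minimal standalone sketch of that dispatch pattern; the broker URL and the enqueue_crawls wrapper are assumptions for illustration, while process_crawled_url_task mirrors the deleted import.

from celery import Celery

# Hypothetical Celery app; the real app object lives in the project's Celery config.
celery_app = Celery("tasks", broker="redis://localhost:6379/0")

@celery_app.task
def process_crawled_url_task(url: str, search_space_id: int, user_id: str) -> None:
    # Placeholder body: crawl the URL and index the result.
    ...

def enqueue_crawls(urls: list[str], search_space_id: int, user_id: str) -> None:
    # .delay() enqueues one asynchronous task per URL for a Celery worker to pick up.
    for url in urls:
        process_crawled_url_task.delay(url, search_space_id, user_id)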

@@ -70,6 +70,13 @@ class ConnectorService:
         """
         Search for crawled URLs and return both the source information and langchain documents
 
+        Args:
+            user_query: The user's query
+            user_id: The user's ID
+            search_space_id: The search space ID to search in
+            top_k: Maximum number of results to return
+            search_mode: Search mode (CHUNKS or DOCUMENTS)
+
         Returns:
             tuple: (sources_info, langchain_documents)
         """
@@ -109,15 +116,41 @@
             document = chunk.get("document", {})
             metadata = document.get("metadata", {})
 
-            # Create a source entry
+            # Extract webcrawler-specific metadata
+            url = metadata.get("source", metadata.get("url", ""))
+            title = document.get("title", metadata.get("title", "Untitled Document"))
+            description = metadata.get("description", "")
+            language = metadata.get("language", "")
+            last_crawled_at = metadata.get("last_crawled_at", "")
+
+            # Build description with crawler info
+            content_preview = chunk.get("content", "")
+            if not description and content_preview:
+                # Use content preview if no description
+                description = content_preview[:200]
+                if len(content_preview) > 200:
+                    description += "..."
+
+            # Add crawler metadata to description if available
+            info_parts = []
+            if language:
+                info_parts.append(f"Language: {language}")
+            if last_crawled_at:
+                info_parts.append(f"Last crawled: {last_crawled_at}")
+
+            if info_parts:
+                if description:
+                    description += f" | {' | '.join(info_parts)}"
+                else:
+                    description = " | ".join(info_parts)
+
             source = {
                 "id": chunk.get("chunk_id", self.source_id_counter),
-                "title": document.get("title", "Untitled Document"),
-                "description": metadata.get(
-                    "og:description",
-                    metadata.get("ogDescription", chunk.get("content", "")),
-                ),
-                "url": metadata.get("url", ""),
+                "title": title,
+                "description": description,
+                "url": url,
+                "language": language,
+                "last_crawled_at": last_crawled_at,
             }
 
             self.source_id_counter += 1
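
Pulled out of the class for illustration, the new description logic can be exercised on its own. A minimal runnable sketch (the build_description helper name is invented here) showing both the 200-character preview fallback and the pipe-separated crawler-metadata suffix:

def build_description(chunk: dict) -> str:
    document = chunk.get("document", {})
    metadata = document.get("metadata", {})
    description = metadata.get("description", "")
    language = metadata.get("language", "")
    last_crawled_at = metadata.get("last_crawled_at", "")

    # Fall back to a truncated content preview when the page has no description.
    content_preview = chunk.get("content", "")
    if not description and content_preview:
        description = content_preview[:200]
        if len(content_preview) > 200:
            description += "..."

    # Append crawler metadata, pipe-separated, when present.
    info_parts = []
    if language:
        info_parts.append(f"Language: {language}")
    if last_crawled_at:
        info_parts.append(f"Last crawled: {last_crawled_at}")
    if info_parts:
        description = f"{description} | {' | '.join(info_parts)}" if description else " | ".join(info_parts)
    return description

chunk = {
    "content": "Example page body. " * 20,  # long enough to trigger truncation
    "document": {"metadata": {"language": "en", "last_crawled_at": "2025-11-22"}},
}
# Prints the 200-character preview, "...", then " | Language: en | Last crawled: 2025-11-22".
print(build_description(chunk))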