mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-27 17:56:25 +02:00
Fixing search logic
This commit is contained in:
parent
1480f85431
commit
6d19e0fad8
3 changed files with 41 additions and 15 deletions
|
|
@@ -158,7 +158,7 @@ class WebCrawlerConnector:
|
||||||
|
|
||||||
def format_to_structured_document(self, crawl_result: dict[str, Any]) -> str:
|
def format_to_structured_document(self, crawl_result: dict[str, Any]) -> str:
|
||||||
"""
|
"""
|
||||||
Format crawl result as a structured document (similar to url_crawler.py format).
|
Format crawl result as a structured document.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
crawl_result: Result from crawl_url method
|
crawl_result: Result from crawl_url method
|
||||||
|
|
|
||||||
|
|
@@ -65,13 +65,6 @@ async def create_documents(
|
||||||
process_extension_document_task.delay(
|
process_extension_document_task.delay(
|
||||||
document_dict, request.search_space_id, str(user.id)
|
document_dict, request.search_space_id, str(user.id)
|
||||||
)
|
)
|
||||||
elif request.document_type == DocumentType.CRAWLED_URL:
|
|
||||||
from app.tasks.celery_tasks.document_tasks import process_crawled_url_task
|
|
||||||
|
|
||||||
for url in request.content:
|
|
||||||
process_crawled_url_task.delay(
|
|
||||||
url, request.search_space_id, str(user.id)
|
|
||||||
)
|
|
||||||
elif request.document_type == DocumentType.YOUTUBE_VIDEO:
|
elif request.document_type == DocumentType.YOUTUBE_VIDEO:
|
||||||
from app.tasks.celery_tasks.document_tasks import process_youtube_video_task
|
from app.tasks.celery_tasks.document_tasks import process_youtube_video_task
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -70,6 +70,13 @@ class ConnectorService:
|
||||||
"""
|
"""
|
||||||
Search for crawled URLs and return both the source information and langchain documents
|
Search for crawled URLs and return both the source information and langchain documents
|
||||||
|
|
||||||
|
Args:
|
||||||
|
user_query: The user's query
|
||||||
|
user_id: The user's ID
|
||||||
|
search_space_id: The search space ID to search in
|
||||||
|
top_k: Maximum number of results to return
|
||||||
|
search_mode: Search mode (CHUNKS or DOCUMENTS)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
tuple: (sources_info, langchain_documents)
|
tuple: (sources_info, langchain_documents)
|
||||||
"""
|
"""
|
||||||
|
|
@@ -109,15 +116,41 @@ class ConnectorService:
|
||||||
document = chunk.get("document", {})
|
document = chunk.get("document", {})
|
||||||
metadata = document.get("metadata", {})
|
metadata = document.get("metadata", {})
|
||||||
|
|
||||||
# Create a source entry
|
# Extract webcrawler-specific metadata
|
||||||
|
url = metadata.get("source", metadata.get("url", ""))
|
||||||
|
title = document.get("title", metadata.get("title", "Untitled Document"))
|
||||||
|
description = metadata.get("description", "")
|
||||||
|
language = metadata.get("language", "")
|
||||||
|
last_crawled_at = metadata.get("last_crawled_at", "")
|
||||||
|
|
||||||
|
# Build description with crawler info
|
||||||
|
content_preview = chunk.get("content", "")
|
||||||
|
if not description and content_preview:
|
||||||
|
# Use content preview if no description
|
||||||
|
description = content_preview[:200]
|
||||||
|
if len(content_preview) > 200:
|
||||||
|
description += "..."
|
||||||
|
|
||||||
|
# Add crawler metadata to description if available
|
||||||
|
info_parts = []
|
||||||
|
if language:
|
||||||
|
info_parts.append(f"Language: {language}")
|
||||||
|
if last_crawled_at:
|
||||||
|
info_parts.append(f"Last crawled: {last_crawled_at}")
|
||||||
|
|
||||||
|
if info_parts:
|
||||||
|
if description:
|
||||||
|
description += f" | {' | '.join(info_parts)}"
|
||||||
|
else:
|
||||||
|
description = " | ".join(info_parts)
|
||||||
|
|
||||||
source = {
|
source = {
|
||||||
"id": chunk.get("chunk_id", self.source_id_counter),
|
"id": chunk.get("chunk_id", self.source_id_counter),
|
||||||
"title": document.get("title", "Untitled Document"),
|
"title": title,
|
||||||
"description": metadata.get(
|
"description": description,
|
||||||
"og:description",
|
"url": url,
|
||||||
metadata.get("ogDescription", chunk.get("content", "")),
|
"language": language,
|
||||||
),
|
"last_crawled_at": last_crawled_at,
|
||||||
"url": metadata.get("url", ""),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
self.source_id_counter += 1
|
self.source_id_counter += 1
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue