Use trafilatura to extract page content from the Chromium result

This commit is contained in:
CREDO23 2025-12-19 08:36:25 +02:00
parent c6cc7c2a6a
commit 64cd65bc1f
4 changed files with 202 additions and 8 deletions

View file

@@ -197,6 +197,10 @@ async def index_crawled_urls(
structured_document_for_hash, search_space_id
)
logger.info(
f"structured_document_for_hash {structured_document_for_hash} ========="
)
# Check if document with this unique identifier already exists
existing_document = await check_document_by_unique_identifier(
session, unique_identifier_hash