use trafilatura to extract page content from the chromium result

2026-05-31 19:45:15 +02:00 · 2025-12-19 08:36:25 +02:00 · 2025-12-19 08:36:25 +02:00 · 64cd65bc1f
commit 64cd65bc1f
parent c6cc7c2a6a
4 changed files with 202 additions and 8 deletions
--- a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py
@@ -197,6 +197,10 @@ async def index_crawled_urls(
                    structured_document_for_hash, search_space_id
                )

+                logger.info(
+                    f"structured_document_for_hash {structured_document_for_hash} ========="
+                )
+
                # Check if document with this unique identifier already exists
                existing_document = await check_document_by_unique_identifier(
                    session, unique_identifier_hash