From 4cfeffb38aaedaf84d729b52d6db001ab170c5e7 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Wed, 17 Dec 2025 18:42:37 +0200 Subject: [PATCH] refactor: update the webcrawler connector formatter --- .../app/connectors/webcrawler_connector.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/surfsense_backend/app/connectors/webcrawler_connector.py b/surfsense_backend/app/connectors/webcrawler_connector.py index edd7f8800..4bb4ec3e3 100644 --- a/surfsense_backend/app/connectors/webcrawler_connector.py +++ b/surfsense_backend/app/connectors/webcrawler_connector.py @@ -153,12 +153,18 @@ class WebCrawlerConnector: "crawler_type": "chromium", } - def format_to_structured_document(self, crawl_result: dict[str, Any]) -> str: + def format_to_structured_document( + self, crawl_result: dict[str, Any], exclude_metadata: bool = False + ) -> str: """ Format crawl result as a structured document. Args: crawl_result: Result from crawl_url method + exclude_metadata: If True, excludes ALL metadata fields from the document. + This is useful for content hash generation to ensure the hash + only changes when actual content changes, not when metadata + (which often contains dynamic fields like timestamps, IDs, etc.) changes. Returns: Structured document string @@ -166,15 +172,17 @@ class WebCrawlerConnector: metadata = crawl_result["metadata"] content = crawl_result["content"] - document_parts = ["", ""] + document_parts = [""] - # Add all metadata fields - for key, value in metadata.items(): - document_parts.append(f"{key.upper()}: {value}") + # Include metadata section only if not excluded + if not exclude_metadata: + document_parts.append("") + for key, value in metadata.items(): + document_parts.append(f"{key.upper()}: {value}") + document_parts.append("") document_parts.extend( [ - "", "", "FORMAT: markdown", "TEXT_START",