From 7d1bd1fab4db02342d90454fb738fd381f6b7a2c Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Tue, 17 Feb 2026 20:30:12 +0200 Subject: [PATCH] Implement KB sync after Notion page updates with block ID verification - Add NotionKBSyncService for immediate KB updates after page changes - Implement block ID verification to ensure content freshness - Refactor duplicate block processing logic to shared utils - Add user-friendly status messages - Include debug logging for troubleshooting --- .../new_chat/tools/notion/update_page.py | 7 +- .../app/connectors/notion_history.py | 14 +++- .../app/services/notion/kb_sync_service.py | 81 +++++++++++++++++-- .../connector_indexers/notion_indexer.py | 48 +---------- surfsense_backend/app/utils/notion_utils.py | 58 +++++++++++++ 5 files changed, 150 insertions(+), 58 deletions(-) create mode 100644 surfsense_backend/app/utils/notion_utils.py diff --git a/surfsense_backend/app/agents/new_chat/tools/notion/update_page.py b/surfsense_backend/app/agents/new_chat/tools/notion/update_page.py index bfec96ac4..088041add 100644 --- a/surfsense_backend/app/agents/new_chat/tools/notion/update_page.py +++ b/surfsense_backend/app/agents/new_chat/tools/notion/update_page.py @@ -230,22 +230,23 @@ def create_update_notion_page_tool( appended_content=final_content, user_id=user_id, search_space_id=search_space_id, + appended_block_ids=result.get("appended_block_ids"), ) if kb_result["status"] == "success": result["message"] = ( - f"{result['message']}. Knowledge base updated - your search results now reflect the latest content." + f"{result['message']}. Your knowledge base has also been updated." ) logger.info( f"Knowledge base successfully updated for page {final_page_id}" ) elif kb_result["status"] == "not_indexed": result["message"] = ( - f"{result['message']}. (Note: This page hasn't been indexed yet, so it won't appear in search until the next scheduled indexing.)" + f"{result['message']}. This page will be added to your knowledge base in the next scheduled sync." ) else: result["message"] = ( - f"{result['message']}. However, knowledge base update failed: {kb_result['message']}. The page will sync on the next scheduled indexing." + f"{result['message']}. Your knowledge base will be updated in the next scheduled sync." ) logger.warning( f"KB update failed for page {final_page_id}: {kb_result['message']}" diff --git a/surfsense_backend/app/connectors/notion_history.py b/surfsense_backend/app/connectors/notion_history.py index 311f97ddd..2539c952f 100644 --- a/surfsense_backend/app/connectors/notion_history.py +++ b/surfsense_backend/app/connectors/notion_history.py @@ -1041,7 +1041,7 @@ class NotionHistoryConnector: try: notion = await self._get_client() - # Append content if provided + appended_block_ids = [] if content: # Convert new content to blocks try: @@ -1065,14 +1065,23 @@ class NotionHistoryConnector: try: for i in range(0, len(children), 100): batch = children[i : i + 100] - await self._api_call_with_retry( + response = await self._api_call_with_retry( notion.blocks.children.append, block_id=page_id, children=batch, ) + batch_block_ids = [ + block["id"] for block in response.get("results", []) + ] + appended_block_ids.extend(batch_block_ids) logger.info( f"Successfully appended {len(children)} new blocks to page {page_id}" ) + logger.debug( + f"Appended block IDs: {appended_block_ids[:5]}..." + if len(appended_block_ids) > 5 + else f"Appended block IDs: {appended_block_ids}" + ) except Exception as e: logger.error(f"Failed to append content blocks: {e}") return { @@ -1092,6 +1101,7 @@ class NotionHistoryConnector: "page_id": page_id, "url": page_url, "title": page_title, + "appended_block_ids": appended_block_ids, "message": f"Updated Notion page '{page_title}' (content appended)", } diff --git a/surfsense_backend/app/services/notion/kb_sync_service.py b/surfsense_backend/app/services/notion/kb_sync_service.py index 6941824a1..de06135b9 100644 --- a/surfsense_backend/app/services/notion/kb_sync_service.py +++ b/surfsense_backend/app/services/notion/kb_sync_service.py @@ -26,6 +26,7 @@ class NotionKBSyncService: appended_content: str, user_id: str, search_space_id: int, + appended_block_ids: list[str] | None = None, ) -> dict: from app.tasks.connector_indexers.base import ( get_current_timestamp, @@ -33,13 +34,70 @@ class NotionKBSyncService: ) try: + logger.debug(f"Starting KB sync for document {document_id}") document = await self.db_session.get(Document, document_id) if not document: + logger.warning(f"Document {document_id} not found in KB") return {"status": "not_indexed"} - new_content = document.content + "\n\n" + appended_content + page_id = document.document_metadata.get("page_id") + if not page_id: + logger.error(f"Document {document_id} missing page_id in metadata") + return {"status": "error", "message": "Missing page_id in metadata"} + logger.debug( + f"Document found: id={document_id}, page_id={page_id}, connector_id={document.connector_id}" + ) + + from app.connectors.notion_history import NotionHistoryConnector + + notion_connector = NotionHistoryConnector( + session=self.db_session, connector_id=document.connector_id + ) + + logger.debug(f"Fetching page content from Notion for page {page_id}") + blocks, _ = await notion_connector.get_page_content(page_id, page_title=None) + + from app.utils.notion_utils import extract_all_block_ids, process_blocks + + fetched_content = process_blocks(blocks) + logger.debug(f"Fetched content length: {len(fetched_content)} chars") + + content_verified = False + if appended_block_ids: + fetched_block_ids = set(extract_all_block_ids(blocks)) + found_blocks = [ + bid for bid in appended_block_ids if bid in fetched_block_ids + ] + + logger.debug( + f"Block verification: {len(found_blocks)}/{len(appended_block_ids)} blocks found" + ) + logger.debug( + f"Appended IDs (first 3): {appended_block_ids[:3]}, Fetched IDs count: {len(fetched_block_ids)}" + ) + + if len(found_blocks) >= 1: + logger.info( + f"Content verified fresh: found {len(found_blocks)} appended blocks" + ) + full_content = fetched_content + content_verified = True + else: + logger.warning( + "No appended blocks found in fetched content - appending manually" + ) + full_content = fetched_content + "\n\n" + appended_content + content_verified = False + else: + logger.warning("No block IDs provided - using fetched content as-is") + full_content = fetched_content + content_verified = False + + logger.debug(f"Final content length: {len(full_content)} chars, verified={content_verified}") + + logger.debug("Generating summary and embeddings") user_llm = await get_user_long_context_llm( self.db_session, user_id, search_space_id ) @@ -52,22 +110,28 @@ class NotionKBSyncService: "connector_type": "Notion", } summary_content, summary_embedding = await generate_document_summary( - new_content, user_llm, document_metadata_for_summary + full_content, user_llm, document_metadata_for_summary ) + logger.debug(f"Generated summary length: {len(summary_content)} chars") else: - summary_content = f"Notion Page: {document.document_metadata.get('page_title')}\n\n{new_content[:500]}..." + logger.warning("No LLM configured - using fallback summary") + summary_content = f"Notion Page: {document.document_metadata.get('page_title')}\n\n{full_content[:500]}..." summary_embedding = config.embedding_model_instance.embed( summary_content ) + logger.debug(f"Deleting old chunks for document {document_id}") await self.db_session.execute( delete(Chunk).where(Chunk.document_id == document.id) ) - chunks = await create_document_chunks(new_content) + logger.debug("Creating new chunks") + chunks = await create_document_chunks(full_content) + logger.debug(f"Created {len(chunks)} chunks") + logger.debug("Updating document fields") document.content = summary_content - document.content_hash = generate_content_hash(new_content) + document.content_hash = generate_content_hash(full_content, search_space_id) document.embedding = summary_embedding document.document_metadata = { **document.document_metadata, @@ -76,9 +140,14 @@ class NotionKBSyncService: safe_set_chunks(document, chunks) document.updated_at = get_current_timestamp() + logger.debug("Committing changes to database") await self.db_session.commit() - logger.info(f"Successfully synced KB for document {document_id}") + logger.info( + f"Successfully synced KB for document {document_id}: " + f"summary={len(summary_content)} chars, chunks={len(chunks)}, " + f"content_verified={content_verified}" + ) return {"status": "success"} except Exception as e: diff --git a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py index 1a67ee7fc..d2b1c9137 100644 --- a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py @@ -24,6 +24,7 @@ from app.utils.document_converters import ( generate_document_summary, generate_unique_identifier_hash, ) +from app.utils.notion_utils import process_blocks from .base import ( build_document_metadata_string, @@ -280,53 +281,6 @@ async def index_notion_pages( pages_to_process = [] # List of dicts with document and page data new_documents_created = False - # Helper function to convert page content to markdown - def process_blocks(blocks, level=0): - result = "" - for block in blocks: - block_type = block.get("type") - block_content = block.get("content", "") - children = block.get("children", []) - - # Add indentation based on level - indent = " " * level - - # Format based on block type - if block_type in ["paragraph", "text"]: - result += f"{indent}{block_content}\n\n" - elif block_type in ["heading_1", "header"]: - result += f"{indent}# {block_content}\n\n" - elif block_type == "heading_2": - result += f"{indent}## {block_content}\n\n" - elif block_type == "heading_3": - result += f"{indent}### {block_content}\n\n" - elif block_type == "bulleted_list_item": - result += f"{indent}* {block_content}\n" - elif block_type == "numbered_list_item": - result += f"{indent}1. {block_content}\n" - elif block_type == "to_do": - result += f"{indent}- [ ] {block_content}\n" - elif block_type == "toggle": - result += f"{indent}> {block_content}\n" - elif block_type == "code": - result += f"{indent}```\n{block_content}\n```\n\n" - elif block_type == "quote": - result += f"{indent}> {block_content}\n\n" - elif block_type == "callout": - result += f"{indent}> **Note:** {block_content}\n\n" - elif block_type == "image": - result += f"{indent}![Image]({block_content})\n\n" - else: - # Default for other block types - if block_content: - result += f"{indent}{block_content}\n\n" - - # Process children recursively - if children: - result += process_blocks(children, level + 1) - - return result - for page in pages: try: page_id = page.get("page_id") diff --git a/surfsense_backend/app/utils/notion_utils.py b/surfsense_backend/app/utils/notion_utils.py new file mode 100644 index 000000000..8f833fab6 --- /dev/null +++ b/surfsense_backend/app/utils/notion_utils.py @@ -0,0 +1,58 @@ +"""Utility functions for processing Notion blocks and content.""" + + +def extract_all_block_ids(blocks_list): + ids = [] + for block in blocks_list: + if isinstance(block, dict) and "id" in block: + ids.append(block["id"]) + if isinstance(block, dict) and block.get("children"): + ids.extend(extract_all_block_ids(block["children"])) + return ids + + +def process_blocks(blocks, level=0): + result = "" + for block in blocks: + block_type = block.get("type") + block_content = block.get("content", "") + children = block.get("children", []) + + # Add indentation based on level + indent = " " * level + + # Format based on block type + if block_type in ["paragraph", "text"]: + result += f"{indent}{block_content}\n\n" + elif block_type in ["heading_1", "header"]: + result += f"{indent}# {block_content}\n\n" + elif block_type == "heading_2": + result += f"{indent}## {block_content}\n\n" + elif block_type == "heading_3": + result += f"{indent}### {block_content}\n\n" + elif block_type == "bulleted_list_item": + result += f"{indent}* {block_content}\n" + elif block_type == "numbered_list_item": + result += f"{indent}1. {block_content}\n" + elif block_type == "to_do": + result += f"{indent}- [ ] {block_content}\n" + elif block_type == "toggle": + result += f"{indent}> {block_content}\n" + elif block_type == "code": + result += f"{indent}```\n{block_content}\n```\n\n" + elif block_type == "quote": + result += f"{indent}> {block_content}\n\n" + elif block_type == "callout": + result += f"{indent}> **Note:** {block_content}\n\n" + elif block_type == "image": + result += f"{indent}![Image]({block_content})\n\n" + else: + # Default for other block types + if block_content: + result += f"{indent}{block_content}\n\n" + + # Process children recursively + if children: + result += process_blocks(children, level + 1) + + return result