diff --git a/surfsense_backend/app/connectors/notion_history.py b/surfsense_backend/app/connectors/notion_history.py index e38218a6e..a79168fdf 100644 --- a/surfsense_backend/app/connectors/notion_history.py +++ b/surfsense_backend/app/connectors/notion_history.py @@ -1,6 +1,7 @@ import logging from notion_client import AsyncClient +from notion_client.errors import APIResponseError from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select @@ -12,6 +13,17 @@ from app.utils.oauth_security import TokenEncryption logger = logging.getLogger(__name__) +# Known unsupported block types that Notion API doesn't expose +# These will be skipped gracefully instead of failing the entire sync +UNSUPPORTED_BLOCK_TYPE_ERRORS = [ + "transcription is not supported", + "ai_block is not supported", + "is not supported via the API", +] + +# Known unsupported block types to check before API calls +UNSUPPORTED_BLOCK_TYPES = ["transcription", "ai_block"] + class NotionHistoryConnector: def __init__( @@ -32,6 +44,8 @@ class NotionHistoryConnector: self._connector_id = connector_id self._credentials = credentials self._notion_client: AsyncClient | None = None + # Track pages with skipped unsupported content (for user notifications) + self._pages_with_skipped_content: list[str] = [] async def _get_valid_token(self) -> str: """ @@ -163,6 +177,34 @@ class NotionHistoryConnector: await self._notion_client.aclose() self._notion_client = None + def get_pages_with_skipped_content(self) -> list[str]: + """ + Get list of page titles that had unsupported content skipped. + + Returns: + List of page titles with skipped content + """ + return self._pages_with_skipped_content + + def get_skipped_content_count(self) -> int: + """ + Get count of pages that had unsupported content skipped. + + Returns: + Number of pages with skipped content + """ + return len(self._pages_with_skipped_content) + + def _record_skipped_content(self, page_title: str): + """ + Record that a page had unsupported content skipped. + + Args: + page_title: Title of the page with skipped content + """ + if page_title not in self._pages_with_skipped_content: + self._pages_with_skipped_content.append(page_title) + async def __aenter__(self): """Async context manager entry.""" return self @@ -229,14 +271,21 @@ class NotionHistoryConnector: for page in pages: page_id = page["id"] + page_title = self.get_page_title(page) - # Get detailed page information - page_content = await self.get_page_content(page_id) + # Get detailed page information (pass title for skip tracking) + page_content, had_skipped_content = await self.get_page_content( + page_id, page_title + ) + + # Record if this page had skipped content + if had_skipped_content: + self._record_skipped_content(page_title) all_page_data.append( { "page_id": page_id, - "title": self.get_page_title(page), + "title": page_title, "content": page_content, } ) @@ -265,46 +314,85 @@ class NotionHistoryConnector: # If no title found, return the page ID as fallback return f"Untitled page ({page['id']})" - async def get_page_content(self, page_id): + async def get_page_content( + self, page_id: str, page_title: str | None = None + ) -> tuple[list, bool]: """ Fetches the content (blocks) of a specific page. Args: page_id (str): The ID of the page to fetch + page_title (str, optional): Title of the page (for logging) Returns: - list: List of processed blocks from the page + tuple: (List of processed blocks, bool indicating if content was skipped) """ notion = await self._get_client() blocks = [] has_more = True cursor = None + skipped_blocks_count = 0 + had_skipped_content = False # Paginate through all blocks while has_more: - if cursor: - response = await notion.blocks.children.list( - block_id=page_id, start_cursor=cursor - ) - else: - response = await notion.blocks.children.list(block_id=page_id) + try: + if cursor: + response = await notion.blocks.children.list( + block_id=page_id, start_cursor=cursor + ) + else: + response = await notion.blocks.children.list(block_id=page_id) - blocks.extend(response["results"]) - has_more = response["has_more"] + blocks.extend(response["results"]) + has_more = response["has_more"] - if has_more: - cursor = response["next_cursor"] + if has_more: + cursor = response["next_cursor"] + + except APIResponseError as e: + error_message = str(e) + # Check if this is an unsupported block type error + if any( + err in error_message for err in UNSUPPORTED_BLOCK_TYPE_ERRORS + ): + logger.warning( + f"Skipping page blocks due to unsupported block type in page {page_id}: {error_message}" + ) + skipped_blocks_count += 1 + had_skipped_content = True + # If we haven't fetched any blocks yet, return empty + # If we have some blocks, continue with what we have + has_more = False + continue + elif "Could not find block" in error_message: + logger.warning( + f"Block not found in page {page_id}, continuing with available blocks: {error_message}" + ) + has_more = False + continue + # Re-raise other API errors + raise + + if skipped_blocks_count > 0: + logger.info( + f"Page {page_id}: Skipped {skipped_blocks_count} unsupported block sections, " + f"successfully processed {len(blocks)} blocks" + ) # Process nested blocks recursively processed_blocks = [] for block in blocks: - processed_block = await self.process_block(block) - processed_blocks.append(processed_block) + processed_block, block_had_skips = await self.process_block(block) + if processed_block: # Only add if block was processed successfully + processed_blocks.append(processed_block) + if block_had_skips: + had_skipped_content = True - return processed_blocks + return processed_blocks, had_skipped_content - async def process_block(self, block): + async def process_block(self, block) -> tuple[dict | None, bool]: """ Processes a block and recursively fetches any child blocks. @@ -312,12 +400,28 @@ class NotionHistoryConnector: block (dict): The block to process Returns: - dict: Processed block with content and children + tuple: (Processed block dict or None, bool indicating if content was skipped) """ notion = await self._get_client() block_id = block["id"] block_type = block["type"] + had_skipped_content = False + + # Check if this is a known unsupported block type before processing + if block_type in UNSUPPORTED_BLOCK_TYPES: + logger.debug( + f"Skipping unsupported block type: {block_type} (block_id: {block_id})" + ) + return ( + { + "id": block_id, + "type": block_type, + "content": f"[{block_type} block - not supported by Notion API]", + "children": [], + }, + True, # Content was skipped + ) # Extract block content based on its type content = self.extract_block_content(block) @@ -327,17 +431,48 @@ class NotionHistoryConnector: child_blocks = [] if has_children: - # Fetch and process child blocks - children_response = await notion.blocks.children.list(block_id=block_id) - for child_block in children_response["results"]: - child_blocks.append(await self.process_block(child_block)) + try: + # Fetch and process child blocks + children_response = await notion.blocks.children.list( + block_id=block_id + ) + for child_block in children_response["results"]: + processed_child, child_had_skips = await self.process_block( + child_block + ) + if processed_child: + child_blocks.append(processed_child) + if child_had_skips: + had_skipped_content = True + except APIResponseError as e: + error_message = str(e) + # Check if this is an unsupported block type error + if any( + err in error_message for err in UNSUPPORTED_BLOCK_TYPE_ERRORS + ): + logger.warning( + f"Skipping children of block {block_id} due to unsupported block type: {error_message}" + ) + had_skipped_content = True + # Continue without children instead of failing + elif "Could not find block" in error_message: + logger.warning( + f"Block {block_id} children not accessible, skipping: {error_message}" + ) + # Continue without children + else: + # Re-raise other API errors + raise - return { - "id": block_id, - "type": block_type, - "content": content, - "children": child_blocks, - } + return ( + { + "id": block_id, + "type": block_type, + "content": content, + "children": child_blocks, + }, + had_skipped_content, + ) def extract_block_content(self, block): """ diff --git a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py index 2d36351fa..eee668198 100644 --- a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py @@ -156,6 +156,13 @@ async def index_notion_pages( start_date=start_date_iso, end_date=end_date_iso ) logger.info(f"Found {len(pages)} Notion pages") + + # Get count of pages that had unsupported content skipped + pages_with_skipped_content = notion_client.get_skipped_content_count() + if pages_with_skipped_content > 0: + logger.info( + f"{pages_with_skipped_content} pages had Notion AI content skipped (not available via API)" + ) except Exception as e: await task_logger.log_task_failure( log_entry, @@ -437,13 +444,23 @@ async def index_notion_pages( logger.info(f"Final commit: Total {documents_indexed} documents processed") await session.commit() - # Prepare result message + # Get final count of pages with skipped Notion AI content + pages_with_skipped_ai_content = notion_client.get_skipped_content_count() + + # Prepare result message with user-friendly notification about skipped content result_message = None if skipped_pages: result_message = f"Processed {total_processed} pages. Skipped {len(skipped_pages)} pages: {', '.join(skipped_pages)}" else: result_message = f"Processed {total_processed} pages." + # Add user-friendly message about skipped Notion AI content + if pages_with_skipped_ai_content > 0: + result_message += ( + f" Audio transcriptions and AI summaries from Notion aren't accessible " + f"via their API — all other content was saved." + ) + # Log success await task_logger.log_task_success( log_entry, @@ -453,6 +470,7 @@ async def index_notion_pages( "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, "skipped_pages_count": len(skipped_pages), + "pages_with_skipped_ai_content": pages_with_skipped_ai_content, "result_message": result_message, }, ) @@ -464,10 +482,18 @@ async def index_notion_pages( # Clean up the async client await notion_client.close() + # Return user-friendly message about skipped AI content (if any) + # This will be shown in the notification to inform users + user_notification_message = None + if pages_with_skipped_ai_content > 0: + user_notification_message = ( + "Some Notion AI content couldn't be synced (Notion API limitation)" + ) + return ( total_processed, - None, - ) # Return None on success (result_message is for logging only) + user_notification_message, + ) # Return message about skipped AI content if any except SQLAlchemyError as db_error: await session.rollback() diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx index 35815b0b7..72069441a 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx @@ -218,7 +218,7 @@ export const IndexingConfigurationView: FC = ({ {isStartingIndexing ? ( <> - Starting... + Starting ) : ( "Start Indexing" diff --git a/surfsense_web/content/docs/connectors/notion.mdx b/surfsense_web/content/docs/connectors/notion.mdx index 0612c4f4f..6fcda8dae 100644 --- a/surfsense_web/content/docs/connectors/notion.mdx +++ b/surfsense_web/content/docs/connectors/notion.mdx @@ -66,6 +66,29 @@ Click **Save** to apply the capabilities. --- +## Limitations & Unsupported Content + +Notion's API has limitations on certain block types that cannot be retrieved. SurfSense will automatically skip these unsupported blocks and continue syncing all other content. + +### Unsupported Block Types + +The following Notion features are **not accessible via the Notion API** and will be skipped during sync: + +- **Transcription blocks** - Audio/video transcriptions from Notion AI +- **AI blocks** - AI-generated content blocks + +### Learn More + +The Notion API only supports specific block types for retrieval. The official list of **supported block types** is documented in Notion's Block reference: + +- **[Block Object Reference](https://developers.notion.com/reference/block)** - Official documentation listing all supported block types. Any block type not listed here (such as `transcription` and `ai_block`) is not accessible via the Notion API. + +For additional information: +- [Working with Page Content](https://developers.notion.com/docs/working-with-page-content) - Guide on how the Notion API handles page content +- [Notion API Reference](https://developers.notion.com/reference) - Complete API documentation + +--- + ## Running SurfSense with Notion Connector Add the Notion environment variables to your Docker run command: