mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-06 20:15:17 +02:00
feat(backend): Implement handling of unsupported Notion block types and track skipped content, add documentation for it
This commit is contained in:
parent
5eca07f24f
commit
41ebe162b0
4 changed files with 218 additions and 34 deletions
|
|
@ -1,6 +1,7 @@
|
|||
import logging
|
||||
|
||||
from notion_client import AsyncClient
|
||||
from notion_client.errors import APIResponseError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.future import select
|
||||
|
||||
|
|
@ -12,6 +13,17 @@ from app.utils.oauth_security import TokenEncryption
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Known unsupported block types that Notion API doesn't expose
|
||||
# These will be skipped gracefully instead of failing the entire sync
|
||||
UNSUPPORTED_BLOCK_TYPE_ERRORS = [
|
||||
"transcription is not supported",
|
||||
"ai_block is not supported",
|
||||
"is not supported via the API",
|
||||
]
|
||||
|
||||
# Known unsupported block types to check before API calls
|
||||
UNSUPPORTED_BLOCK_TYPES = ["transcription", "ai_block"]
|
||||
|
||||
|
||||
class NotionHistoryConnector:
|
||||
def __init__(
|
||||
|
|
@ -32,6 +44,8 @@ class NotionHistoryConnector:
|
|||
self._connector_id = connector_id
|
||||
self._credentials = credentials
|
||||
self._notion_client: AsyncClient | None = None
|
||||
# Track pages with skipped unsupported content (for user notifications)
|
||||
self._pages_with_skipped_content: list[str] = []
|
||||
|
||||
async def _get_valid_token(self) -> str:
|
||||
"""
|
||||
|
|
@ -163,6 +177,34 @@ class NotionHistoryConnector:
|
|||
await self._notion_client.aclose()
|
||||
self._notion_client = None
|
||||
|
||||
def get_pages_with_skipped_content(self) -> list[str]:
|
||||
"""
|
||||
Get list of page titles that had unsupported content skipped.
|
||||
|
||||
Returns:
|
||||
List of page titles with skipped content
|
||||
"""
|
||||
return self._pages_with_skipped_content
|
||||
|
||||
def get_skipped_content_count(self) -> int:
|
||||
"""
|
||||
Get count of pages that had unsupported content skipped.
|
||||
|
||||
Returns:
|
||||
Number of pages with skipped content
|
||||
"""
|
||||
return len(self._pages_with_skipped_content)
|
||||
|
||||
def _record_skipped_content(self, page_title: str):
|
||||
"""
|
||||
Record that a page had unsupported content skipped.
|
||||
|
||||
Args:
|
||||
page_title: Title of the page with skipped content
|
||||
"""
|
||||
if page_title not in self._pages_with_skipped_content:
|
||||
self._pages_with_skipped_content.append(page_title)
|
||||
|
||||
async def __aenter__(self):
|
||||
"""Async context manager entry."""
|
||||
return self
|
||||
|
|
@ -229,14 +271,21 @@ class NotionHistoryConnector:
|
|||
|
||||
for page in pages:
|
||||
page_id = page["id"]
|
||||
page_title = self.get_page_title(page)
|
||||
|
||||
# Get detailed page information
|
||||
page_content = await self.get_page_content(page_id)
|
||||
# Get detailed page information (pass title for skip tracking)
|
||||
page_content, had_skipped_content = await self.get_page_content(
|
||||
page_id, page_title
|
||||
)
|
||||
|
||||
# Record if this page had skipped content
|
||||
if had_skipped_content:
|
||||
self._record_skipped_content(page_title)
|
||||
|
||||
all_page_data.append(
|
||||
{
|
||||
"page_id": page_id,
|
||||
"title": self.get_page_title(page),
|
||||
"title": page_title,
|
||||
"content": page_content,
|
||||
}
|
||||
)
|
||||
|
|
@ -265,46 +314,85 @@ class NotionHistoryConnector:
|
|||
# If no title found, return the page ID as fallback
|
||||
return f"Untitled page ({page['id']})"
|
||||
|
||||
async def get_page_content(self, page_id):
|
||||
async def get_page_content(
|
||||
self, page_id: str, page_title: str | None = None
|
||||
) -> tuple[list, bool]:
|
||||
"""
|
||||
Fetches the content (blocks) of a specific page.
|
||||
|
||||
Args:
|
||||
page_id (str): The ID of the page to fetch
|
||||
page_title (str, optional): Title of the page (for logging)
|
||||
|
||||
Returns:
|
||||
list: List of processed blocks from the page
|
||||
tuple: (List of processed blocks, bool indicating if content was skipped)
|
||||
"""
|
||||
notion = await self._get_client()
|
||||
|
||||
blocks = []
|
||||
has_more = True
|
||||
cursor = None
|
||||
skipped_blocks_count = 0
|
||||
had_skipped_content = False
|
||||
|
||||
# Paginate through all blocks
|
||||
while has_more:
|
||||
if cursor:
|
||||
response = await notion.blocks.children.list(
|
||||
block_id=page_id, start_cursor=cursor
|
||||
)
|
||||
else:
|
||||
response = await notion.blocks.children.list(block_id=page_id)
|
||||
try:
|
||||
if cursor:
|
||||
response = await notion.blocks.children.list(
|
||||
block_id=page_id, start_cursor=cursor
|
||||
)
|
||||
else:
|
||||
response = await notion.blocks.children.list(block_id=page_id)
|
||||
|
||||
blocks.extend(response["results"])
|
||||
has_more = response["has_more"]
|
||||
blocks.extend(response["results"])
|
||||
has_more = response["has_more"]
|
||||
|
||||
if has_more:
|
||||
cursor = response["next_cursor"]
|
||||
if has_more:
|
||||
cursor = response["next_cursor"]
|
||||
|
||||
except APIResponseError as e:
|
||||
error_message = str(e)
|
||||
# Check if this is an unsupported block type error
|
||||
if any(
|
||||
err in error_message for err in UNSUPPORTED_BLOCK_TYPE_ERRORS
|
||||
):
|
||||
logger.warning(
|
||||
f"Skipping page blocks due to unsupported block type in page {page_id}: {error_message}"
|
||||
)
|
||||
skipped_blocks_count += 1
|
||||
had_skipped_content = True
|
||||
# If we haven't fetched any blocks yet, return empty
|
||||
# If we have some blocks, continue with what we have
|
||||
has_more = False
|
||||
continue
|
||||
elif "Could not find block" in error_message:
|
||||
logger.warning(
|
||||
f"Block not found in page {page_id}, continuing with available blocks: {error_message}"
|
||||
)
|
||||
has_more = False
|
||||
continue
|
||||
# Re-raise other API errors
|
||||
raise
|
||||
|
||||
if skipped_blocks_count > 0:
|
||||
logger.info(
|
||||
f"Page {page_id}: Skipped {skipped_blocks_count} unsupported block sections, "
|
||||
f"successfully processed {len(blocks)} blocks"
|
||||
)
|
||||
|
||||
# Process nested blocks recursively
|
||||
processed_blocks = []
|
||||
for block in blocks:
|
||||
processed_block = await self.process_block(block)
|
||||
processed_blocks.append(processed_block)
|
||||
processed_block, block_had_skips = await self.process_block(block)
|
||||
if processed_block: # Only add if block was processed successfully
|
||||
processed_blocks.append(processed_block)
|
||||
if block_had_skips:
|
||||
had_skipped_content = True
|
||||
|
||||
return processed_blocks
|
||||
return processed_blocks, had_skipped_content
|
||||
|
||||
async def process_block(self, block):
|
||||
async def process_block(self, block) -> tuple[dict | None, bool]:
|
||||
"""
|
||||
Processes a block and recursively fetches any child blocks.
|
||||
|
||||
|
|
@ -312,12 +400,28 @@ class NotionHistoryConnector:
|
|||
block (dict): The block to process
|
||||
|
||||
Returns:
|
||||
dict: Processed block with content and children
|
||||
tuple: (Processed block dict or None, bool indicating if content was skipped)
|
||||
"""
|
||||
notion = await self._get_client()
|
||||
|
||||
block_id = block["id"]
|
||||
block_type = block["type"]
|
||||
had_skipped_content = False
|
||||
|
||||
# Check if this is a known unsupported block type before processing
|
||||
if block_type in UNSUPPORTED_BLOCK_TYPES:
|
||||
logger.debug(
|
||||
f"Skipping unsupported block type: {block_type} (block_id: {block_id})"
|
||||
)
|
||||
return (
|
||||
{
|
||||
"id": block_id,
|
||||
"type": block_type,
|
||||
"content": f"[{block_type} block - not supported by Notion API]",
|
||||
"children": [],
|
||||
},
|
||||
True, # Content was skipped
|
||||
)
|
||||
|
||||
# Extract block content based on its type
|
||||
content = self.extract_block_content(block)
|
||||
|
|
@ -327,17 +431,48 @@ class NotionHistoryConnector:
|
|||
child_blocks = []
|
||||
|
||||
if has_children:
|
||||
# Fetch and process child blocks
|
||||
children_response = await notion.blocks.children.list(block_id=block_id)
|
||||
for child_block in children_response["results"]:
|
||||
child_blocks.append(await self.process_block(child_block))
|
||||
try:
|
||||
# Fetch and process child blocks
|
||||
children_response = await notion.blocks.children.list(
|
||||
block_id=block_id
|
||||
)
|
||||
for child_block in children_response["results"]:
|
||||
processed_child, child_had_skips = await self.process_block(
|
||||
child_block
|
||||
)
|
||||
if processed_child:
|
||||
child_blocks.append(processed_child)
|
||||
if child_had_skips:
|
||||
had_skipped_content = True
|
||||
except APIResponseError as e:
|
||||
error_message = str(e)
|
||||
# Check if this is an unsupported block type error
|
||||
if any(
|
||||
err in error_message for err in UNSUPPORTED_BLOCK_TYPE_ERRORS
|
||||
):
|
||||
logger.warning(
|
||||
f"Skipping children of block {block_id} due to unsupported block type: {error_message}"
|
||||
)
|
||||
had_skipped_content = True
|
||||
# Continue without children instead of failing
|
||||
elif "Could not find block" in error_message:
|
||||
logger.warning(
|
||||
f"Block {block_id} children not accessible, skipping: {error_message}"
|
||||
)
|
||||
# Continue without children
|
||||
else:
|
||||
# Re-raise other API errors
|
||||
raise
|
||||
|
||||
return {
|
||||
"id": block_id,
|
||||
"type": block_type,
|
||||
"content": content,
|
||||
"children": child_blocks,
|
||||
}
|
||||
return (
|
||||
{
|
||||
"id": block_id,
|
||||
"type": block_type,
|
||||
"content": content,
|
||||
"children": child_blocks,
|
||||
},
|
||||
had_skipped_content,
|
||||
)
|
||||
|
||||
def extract_block_content(self, block):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -156,6 +156,13 @@ async def index_notion_pages(
|
|||
start_date=start_date_iso, end_date=end_date_iso
|
||||
)
|
||||
logger.info(f"Found {len(pages)} Notion pages")
|
||||
|
||||
# Get count of pages that had unsupported content skipped
|
||||
pages_with_skipped_content = notion_client.get_skipped_content_count()
|
||||
if pages_with_skipped_content > 0:
|
||||
logger.info(
|
||||
f"{pages_with_skipped_content} pages had Notion AI content skipped (not available via API)"
|
||||
)
|
||||
except Exception as e:
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
|
|
@ -437,13 +444,23 @@ async def index_notion_pages(
|
|||
logger.info(f"Final commit: Total {documents_indexed} documents processed")
|
||||
await session.commit()
|
||||
|
||||
# Prepare result message
|
||||
# Get final count of pages with skipped Notion AI content
|
||||
pages_with_skipped_ai_content = notion_client.get_skipped_content_count()
|
||||
|
||||
# Prepare result message with user-friendly notification about skipped content
|
||||
result_message = None
|
||||
if skipped_pages:
|
||||
result_message = f"Processed {total_processed} pages. Skipped {len(skipped_pages)} pages: {', '.join(skipped_pages)}"
|
||||
else:
|
||||
result_message = f"Processed {total_processed} pages."
|
||||
|
||||
# Add user-friendly message about skipped Notion AI content
|
||||
if pages_with_skipped_ai_content > 0:
|
||||
result_message += (
|
||||
f" Audio transcriptions and AI summaries from Notion aren't accessible "
|
||||
f"via their API — all other content was saved."
|
||||
)
|
||||
|
||||
# Log success
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
|
|
@ -453,6 +470,7 @@ async def index_notion_pages(
|
|||
"documents_indexed": documents_indexed,
|
||||
"documents_skipped": documents_skipped,
|
||||
"skipped_pages_count": len(skipped_pages),
|
||||
"pages_with_skipped_ai_content": pages_with_skipped_ai_content,
|
||||
"result_message": result_message,
|
||||
},
|
||||
)
|
||||
|
|
@ -464,10 +482,18 @@ async def index_notion_pages(
|
|||
# Clean up the async client
|
||||
await notion_client.close()
|
||||
|
||||
# Return user-friendly message about skipped AI content (if any)
|
||||
# This will be shown in the notification to inform users
|
||||
user_notification_message = None
|
||||
if pages_with_skipped_ai_content > 0:
|
||||
user_notification_message = (
|
||||
"Some Notion AI content couldn't be synced (Notion API limitation)"
|
||||
)
|
||||
|
||||
return (
|
||||
total_processed,
|
||||
None,
|
||||
) # Return None on success (result_message is for logging only)
|
||||
user_notification_message,
|
||||
) # Return message about skipped AI content if any
|
||||
|
||||
except SQLAlchemyError as db_error:
|
||||
await session.rollback()
|
||||
|
|
|
|||
|
|
@ -218,7 +218,7 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
|
|||
{isStartingIndexing ? (
|
||||
<>
|
||||
<Spinner size="sm" className="mr-2" />
|
||||
Starting...
|
||||
Starting
|
||||
</>
|
||||
) : (
|
||||
"Start Indexing"
|
||||
|
|
|
|||
|
|
@ -66,6 +66,29 @@ Click **Save** to apply the capabilities.
|
|||
|
||||
---
|
||||
|
||||
## Limitations & Unsupported Content
|
||||
|
||||
Notion's API has limitations on certain block types that cannot be retrieved. SurfSense will automatically skip these unsupported blocks and continue syncing all other content.
|
||||
|
||||
### Unsupported Block Types
|
||||
|
||||
The following Notion features are **not accessible via the Notion API** and will be skipped during sync:
|
||||
|
||||
- **Transcription blocks** - Audio/video transcriptions from Notion AI
|
||||
- **AI blocks** - AI-generated content blocks
|
||||
|
||||
### Learn More
|
||||
|
||||
The Notion API only supports specific block types for retrieval. The official list of **supported block types** is documented in Notion's Block reference:
|
||||
|
||||
- **[Block Object Reference](https://developers.notion.com/reference/block)** - Official documentation listing all supported block types. Any block type not listed here (such as `transcription` and `ai_block`) is not accessible via the Notion API.
|
||||
|
||||
For additional information:
|
||||
- [Working with Page Content](https://developers.notion.com/docs/working-with-page-content) - Guide on how the Notion API handles page content
|
||||
- [Notion API Reference](https://developers.notion.com/reference) - Complete API documentation
|
||||
|
||||
---
|
||||
|
||||
## Running SurfSense with Notion Connector
|
||||
|
||||
Add the Notion environment variables to your Docker run command:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue