mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 01:06:23 +02:00
1179 lines
44 KiB
Python
1179 lines
44 KiB
Python
import asyncio
|
|
import contextlib
|
|
import logging
|
|
import re
|
|
from collections.abc import Awaitable, Callable
|
|
from typing import Any, TypeVar
|
|
|
|
from notion_client import AsyncClient
|
|
from notion_client.errors import APIResponseError
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
from sqlalchemy.future import select
|
|
|
|
from app.config import config
|
|
from app.db import SearchSourceConnector
|
|
from app.schemas.notion_auth_credentials import NotionAuthCredentialsBase
|
|
from app.utils.oauth_security import TokenEncryption
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class NotionAPIError(Exception):
    """Error type for failed interactions with the Notion API.

    The exception message is written for end users: callers are expected to
    show it as-is, without adding a prefix or wrapping it in another message.
    """
|
|
|
|
|
|
# Generic result type of the API coroutine wrapped by the retry helper.
T = TypeVar("T")

# ============================================================================
# Retry configuration (per Notion API docs)
# https://developers.notion.com/reference/request-limits
# https://developers.notion.com/reference/status-codes
# ============================================================================
MAX_RETRIES = 5  # attempts per API call, counting the first one
BASE_RETRY_DELAY = 1.0  # seconds; first backoff delay, doubled after each retry
MAX_RETRY_DELAY = 60.0  # seconds (Notion's max request timeout)

# Longest single wait honoured for a 429 response; read from config when set.
MAX_RATE_LIMIT_WAIT_SECONDS = float(
    getattr(config, "NOTION_MAX_RETRY_AFTER_SECONDS", 30.0)
)

# Total time one API call may spend sleeping between retries; read from
# config when set.
MAX_TOTAL_RETRY_WAIT_SECONDS = float(
    getattr(config, "NOTION_MAX_TOTAL_RETRY_WAIT_SECONDS", 120.0)
)

# Retry-progress callback:
#   async callback(retry_reason, attempt, max_attempts, wait_seconds) -> None
# where retry_reason is 'rate_limit', 'server_error' or 'timeout'.
# It can be used to update notifications while a call is being retried.
RetryCallbackType = Callable[[str, int, int, float], Awaitable[None]]

# HTTP status codes that trigger a retry:
#   429 rate_limited          - wait according to the Retry-After header
#   500 internal_server_error - unexpected error
#   502 bad_gateway           - failed upstream connection
#   503 service_unavailable   - Notion unavailable or timeout
#   504 gateway_timeout       - Notion timed out
RETRYABLE_STATUS_CODES = frozenset({429, 500, 502, 503, 504})

# Error-message fragments that identify block types the Notion API does not
# expose. Matching errors are skipped gracefully instead of failing the sync.
UNSUPPORTED_BLOCK_TYPE_ERRORS = [
    "transcription is not supported",
    "ai_block is not supported",
    "is not supported via the API",
]

# Block types known to be unsupported; checked before making an API call.
UNSUPPORTED_BLOCK_TYPES = ["transcription", "ai_block"]
|
|
|
|
|
|
class NotionHistoryConnector:
|
|
def __init__(
    self,
    session: AsyncSession,
    connector_id: int,
    credentials: NotionAuthCredentialsBase | None = None,
):
    """
    Set up a connector bound to one stored Notion connector record.

    No I/O happens here; the token and the API client are resolved lazily
    on first use.

    Args:
        session: Database session used to load (and refresh) the connector
        connector_id: ID of the connector row to work with
        credentials: Notion OAuth credentials; when omitted they are loaded
            from the database the first time a token is needed
    """
    # Collaborators handed in by the caller
    self._session, self._connector_id = session, connector_id
    self._credentials = credentials

    # API client, created on demand once a valid token is available
    self._notion_client: AsyncClient | None = None

    # Titles of pages where unsupported content was skipped (for user notifications)
    self._pages_with_skipped_content: list[str] = []

    # Optional hook informed about retries (for user notifications)
    self._on_retry_callback: RetryCallbackType | None = None

    # Becomes True when a legacy integration token is in use (for upgrade notification)
    self._using_legacy_token: bool = False
|
|
|
|
def set_retry_callback(self, callback: RetryCallbackType | None) -> None:
    """
    Register (or clear) the hook that is told about retried API calls.

    The stored callback is forwarded to the retry helper by the read
    operations of this class, so an indexer can surface rate limits and
    other transient errors to the user while a sync is waiting.

    Args:
        callback: Async function called as
            callback(retry_reason, attempt, max_attempts, wait_seconds),
            where retry_reason is 'rate_limit', 'server_error' or 'timeout'.
            Pass None to disable callbacks.
    """
    self._on_retry_callback = callback
|
|
|
|
async def _get_valid_token(self) -> str:
    """
    Return a usable Notion access token, loading and refreshing as needed.

    When no credentials were supplied to the constructor, they are read from
    the connector row: encrypted tokens are decrypted, and a legacy
    NOTION_INTEGRATION_TOKEN is accepted when no OAuth access token exists.
    If the resulting credentials are expired and refreshable, the token is
    refreshed and the cached API client is dropped so it gets rebuilt.

    Returns:
        Valid access token

    Raises:
        ValueError: If the connector is missing, no token is stored, the
            stored tokens cannot be decrypted, or the credentials are malformed
        NotionAPIError: If refreshing an expired token fails
    """

    async def fetch_connector():
        # Load the connector row this instance is bound to (None if absent).
        query = select(SearchSourceConnector).filter(
            SearchSourceConnector.id == self._connector_id
        )
        return (await self._session.execute(query)).scalars().first()

    def decrypt_tokens_in_place(cfg) -> None:
        # Decrypt whichever of the two token fields are present and non-empty.
        cipher = TokenEncryption(config.SECRET_KEY)
        for field in ("access_token", "refresh_token"):
            if cfg.get(field):
                cfg[field] = cipher.decrypt_token(cfg[field])

    # Credentials not supplied up front: build them from the stored config
    if self._credentials is None:
        connector = await fetch_connector()
        if not connector:
            raise ValueError(f"Connector {self._connector_id} not found")

        # Work on a copy so the decrypted values never touch the ORM object
        config_data = connector.config.copy()

        # Connectors created before OAuth store a plain integration token
        legacy_token = config_data.get("NOTION_INTEGRATION_TOKEN")
        raw_access_token = config_data.get("access_token")

        # Some form of token must be present
        if not (raw_access_token or legacy_token):
            raise ValueError(
                "Notion integration not properly connected. "
                "Please remove and re-add the Notion connector."
            )

        # Decrypt credentials if they are stored encrypted
        if config_data.get("_token_encrypted", False) and config.SECRET_KEY:
            try:
                decrypt_tokens_in_place(config_data)
                logger.info(
                    f"Decrypted Notion credentials for connector {self._connector_id}"
                )
            except Exception as e:
                logger.error(
                    f"Failed to decrypt Notion credentials for connector {self._connector_id}: {e!s}"
                )
                raise ValueError(
                    "Notion credentials could not be decrypted. "
                    "Please remove and re-add the Notion connector."
                ) from e

        # Legacy format: promote NOTION_INTEGRATION_TOKEN to access_token
        if legacy_token and not config_data.get("access_token"):
            config_data["access_token"] = legacy_token
            self._using_legacy_token = True
            logger.info(
                f"Using legacy NOTION_INTEGRATION_TOKEN for connector {self._connector_id}"
            )

        # After all processing the access token must be non-empty
        # (a whitespace-only string counts as empty)
        final_token = config_data.get("access_token")
        if not final_token or (
            isinstance(final_token, str) and not final_token.strip()
        ):
            raise ValueError(
                "Notion access token is invalid or empty. "
                "Please remove and re-add the Notion connector."
            )

        try:
            self._credentials = NotionAuthCredentialsBase.from_dict(config_data)
        except KeyError as e:
            raise ValueError(
                f"Notion credentials are incomplete (missing {e}). "
                "Please reconnect your Notion account."
            ) from e
        except Exception as e:
            raise ValueError(
                f"Notion credentials format error: {e!s}. "
                "Please reconnect your Notion account."
            ) from e

    # Nothing more to do unless the token is both expired and refreshable
    if not (self._credentials.is_expired and self._credentials.is_refreshable):
        return self._credentials.access_token

    try:
        logger.info(
            f"Notion token expired for connector {self._connector_id}, refreshing..."
        )

        # The refresh routine operates on the connector row
        connector = await fetch_connector()
        if not connector:
            raise RuntimeError(
                f"Connector {self._connector_id} not found; cannot refresh token."
            )

        # Imported at call time rather than at module import
        # (presumably to avoid a circular import - TODO confirm)
        from app.routes.notion_add_connector_route import refresh_notion_token

        connector = await refresh_notion_token(self._session, connector)

        # Rebuild the credentials from the refreshed config
        config_data = connector.config.copy()
        if config_data.get("_token_encrypted", False) and config.SECRET_KEY:
            decrypt_tokens_in_place(config_data)
        self._credentials = NotionAuthCredentialsBase.from_dict(config_data)

        # Drop the cached client so the next call builds one with the new token
        self._notion_client = None

        logger.info(
            f"Successfully refreshed Notion token for connector {self._connector_id}"
        )
    except Exception as e:
        logger.error(
            f"Failed to refresh Notion token for connector {self._connector_id}: {e!s}"
        )
        raise NotionAPIError(
            "Failed to refresh your Notion connection. "
            "Please try again or reconnect your Notion account."
        ) from e

    return self._credentials.access_token
|
|
|
|
async def _get_client(self) -> AsyncClient:
    """
    Return the cached Notion AsyncClient, creating it on first use.

    A token is only resolved when a client has to be built; an existing
    client is returned unchanged.

    Returns:
        Notion AsyncClient instance
    """
    client = self._notion_client
    if client is None:
        client = AsyncClient(auth=await self._get_valid_token())
        self._notion_client = client
    return client
|
|
|
|
async def _api_call_with_retry(
    self,
    api_func: Callable[..., Awaitable[T]],
    *args: Any,
    on_retry: RetryCallbackType | None = None,
    **kwargs: Any,
) -> T:
    """
    Await a Notion API call, retrying transient failures with backoff.

    Retryable statuses (see RETRYABLE_STATUS_CODES):
    - 429 rate_limited: waits for the Retry-After header value when it can be
      parsed (capped at MAX_RATE_LIMIT_WAIT_SECONDS), else the backoff delay
    - 500 / 502 / 503: exponential backoff, reported as 'server_error'
    - 504 gateway_timeout: exponential backoff, reported as 'timeout'

    At most MAX_RETRIES attempts are made, and the summed sleep time never
    exceeds MAX_TOTAL_RETRY_WAIT_SECONDS: a wait that would cross that budget
    re-raises the error instead of sleeping.

    Args:
        api_func: The async Notion API function to call
        *args: Positional arguments passed to the API function
        on_retry: Optional callback invoked before each wait as
            callback(retry_reason, attempt, max_attempts, wait_seconds);
            retry_reason is one of 'rate_limit', 'server_error', 'timeout'.
            Exceptions raised by the callback are logged and ignored.
        **kwargs: Keyword arguments passed to the API function

    Returns:
        The result from the API call

    Raises:
        APIResponseError: If the error is not retryable, the attempts are
            exhausted, or the total wait budget would be exceeded
    """
    last_exception: APIResponseError | None = None
    backoff_delay = BASE_RETRY_DELAY
    waited_so_far = 0.0
    final_attempt = MAX_RETRIES - 1

    for attempt in range(MAX_RETRIES):
        try:
            return await api_func(*args, **kwargs)

        except APIResponseError as e:
            last_exception = e

            # Non-retryable (e.g., 400, 401, 403, 404): surface immediately
            if e.status not in RETRYABLE_STATUS_CODES:
                raise

            # No attempts left
            if attempt == final_attempt:
                logger.error(
                    f"Notion API call failed after {MAX_RETRIES} retries. "
                    f"Last error: {e.status} {e.code}"
                )
                raise

            # Pick the retry reason and the wait time from the status code
            if e.status == 429:
                retry_reason = "rate_limit"

                # Prefer the server-provided Retry-After; fall back to the
                # backoff delay when it is absent or not a number
                wait_time = backoff_delay
                retry_after = e.headers.get("Retry-After") if e.headers else None
                if retry_after:
                    try:
                        wait_time = float(retry_after)
                    except (ValueError, TypeError):
                        wait_time = backoff_delay

                # Avoid very long worker sleeps from external Retry-After values.
                if wait_time > MAX_RATE_LIMIT_WAIT_SECONDS:
                    logger.warning(
                        f"Notion Retry-After ({wait_time}s) exceeds cap "
                        f"({MAX_RATE_LIMIT_WAIT_SECONDS}s). Clamping wait time."
                    )
                    wait_time = MAX_RATE_LIMIT_WAIT_SECONDS

                logger.warning(
                    f"Notion API rate limited (429). "
                    f"Waiting {wait_time}s. Attempt {attempt + 1}/{MAX_RETRIES}"
                )
            else:
                # 504 is reported as a timeout, 500/502/503 as a server
                # error; both wait for the current backoff delay
                wait_time = min(backoff_delay, MAX_RETRY_DELAY)
                if e.status == 504:
                    retry_reason = "timeout"
                    logger.warning(
                        f"Notion API timeout ({e.status}). "
                        f"Retrying in {wait_time}s. Attempt {attempt + 1}/{MAX_RETRIES}"
                    )
                else:
                    retry_reason = "server_error"
                    logger.warning(
                        f"Notion API error {e.status} ({e.code}). "
                        f"Retrying in {wait_time}s. Attempt {attempt + 1}/{MAX_RETRIES}"
                    )

            # Enforce the overall wait budget before committing to the sleep
            if waited_so_far + wait_time > MAX_TOTAL_RETRY_WAIT_SECONDS:
                logger.error(
                    "Notion API retry budget exceeded "
                    f"({waited_so_far + wait_time:.1f}s > "
                    f"{MAX_TOTAL_RETRY_WAIT_SECONDS:.1f}s). Failing fast."
                )
                raise

            # Notify before sleeping so the user sees the message while we wait
            if on_retry:
                try:
                    await on_retry(
                        retry_reason,
                        attempt + 1,  # 1-based for display
                        MAX_RETRIES,
                        wait_time,
                    )
                except Exception as callback_error:
                    # Don't let callback errors break the retry logic
                    logger.warning(f"Retry callback failed: {callback_error}")

            await asyncio.sleep(wait_time)
            waited_so_far += wait_time

            # Exponential backoff for the next attempt
            backoff_delay = min(backoff_delay * 2, MAX_RETRY_DELAY)

    # Only reachable when the loop body never ran or fell through
    if last_exception:
        raise last_exception
    raise RuntimeError("Unexpected state in retry logic")
|
|
|
|
async def close(self):
    """Close the cached Notion client, if one exists, and forget it."""
    client = self._notion_client
    if not client:
        return
    await client.aclose()
    self._notion_client = None
|
|
|
|
def get_pages_with_skipped_content(self) -> list[str]:
    """
    Return the titles of pages whose unsupported content was skipped.

    Returns:
        The connector's own list of page titles (not a copy)
    """
    skipped_titles = self._pages_with_skipped_content
    return skipped_titles
|
|
|
|
def get_skipped_content_count(self) -> int:
    """
    Return how many pages had unsupported content skipped.

    Returns:
        Number of recorded page titles
    """
    skipped_titles = self._pages_with_skipped_content
    return len(skipped_titles)
|
|
|
|
def is_using_legacy_token(self) -> bool:
    """
    Report whether the connector authenticates with a legacy integration token.

    The flag is set while credentials are loaded from the database, so it
    stays False until that has happened.

    Returns:
        True if the legacy NOTION_INTEGRATION_TOKEN is in use, False otherwise
    """
    legacy = self._using_legacy_token
    return legacy
|
|
|
|
def _record_skipped_content(self, page_title: str):
    """
    Remember that a page had unsupported content skipped.

    Each title is stored at most once, in first-seen order.

    Args:
        page_title: Title of the page with skipped content
    """
    recorded = self._pages_with_skipped_content
    if page_title in recorded:
        return
    recorded.append(page_title)
|
|
|
|
@staticmethod
def _api_error_message(error: APIResponseError) -> str:
    """Return a stable, human-readable message for a Notion API error.

    Preference order: the ``message`` entry of a dict body, then any other
    non-empty body rendered as text, then the error's own string form.
    """
    body = getattr(error, "body", None)
    fallback = str(error)
    if isinstance(body, dict):
        return str(body.get("message", fallback))
    return str(body) if body else fallback
|
|
|
|
async def __aenter__(self):
    """Enter ``async with``: the connector itself is the managed object."""
    return self
|
|
|
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Leave ``async with``: close the client; exceptions are not suppressed."""
    await self.close()
|
|
|
|
async def get_all_pages(self, start_date=None, end_date=None):
    """
    Fetch every page shared with the integration, together with its content.

    Pages are collected through the paginated search endpoint and then each
    page's blocks are fetched. Pages where unsupported content had to be
    skipped are recorded on the connector.

    Args:
        start_date (str, optional): ISO 8601 date string (e.g., "2023-01-01T00:00:00Z")
        end_date (str, optional): ISO 8601 date string (e.g., "2023-12-31T23:59:59Z")

    NOTE(review): the date arguments are NOT used to filter results. When
    either one is given, the search is merely sorted by last_edited_time
    (descending); all pages are still returned. Callers that need a date
    window must filter the result themselves.

    Returns:
        list: One dict per page with the keys "page_id", "title" and "content"

    Raises:
        APIResponseError: For search errors other than an invalid pagination
            cursor, and for errors raised while fetching page content
    """
    notion = await self._get_client()

    # Only pages are requested (not databases)
    search_params: dict[str, Any] = {
        "filter": {"value": "page", "property": "object"},
    }

    # A date bound only switches on most-recently-edited-first ordering
    if start_date or end_date:
        search_params["sort"] = {
            "direction": "descending",
            "timestamp": "last_edited_time",
        }

    # Paginate through all pages the integration has access to
    pages = []
    cursor = None
    while True:
        if cursor:
            search_params["start_cursor"] = cursor

        try:
            search_results = await self._api_call_with_retry(
                notion.search, on_retry=self._on_retry_callback, **search_params
            )
        except APIResponseError as e:
            # An invalid cursor ends pagination; keep what was fetched so far
            if "start_cursor provided is invalid" in str(e):
                logger.warning(
                    f"Invalid pagination cursor encountered. "
                    f"Continuing with {len(pages)} pages already fetched."
                )
                break
            raise

        pages.extend(search_results["results"])

        if not search_results.get("has_more", False):
            break

        cursor = search_results.get("next_cursor")
        if not cursor:
            # Without a new cursor the previous request would be repeated
            # forever, so stop here instead.
            logger.warning(
                "Notion search reported more results but returned no cursor. "
                f"Continuing with {len(pages)} pages already fetched."
            )
            break

    all_page_data = []
    for page in pages:
        page_id = page["id"]
        page_title = self.get_page_title(page)

        # Fetch the page's blocks (title is passed along for skip tracking)
        page_content, had_skipped_content = await self.get_page_content(
            page_id, page_title
        )

        if had_skipped_content:
            self._record_skipped_content(page_title)

        all_page_data.append(
            {
                "page_id": page_id,
                "title": page_title,
                "content": page_content,
            }
        )

    return all_page_data
|
|
|
|
def get_page_title(self, page):
    """
    Extract the title from a page object.

    The title lives in whichever property has type "title"; its name varies
    with the page type. The rich-text segments of that property are
    concatenated without a separator, because each segment's plain_text
    already carries its own whitespace.

    Args:
        page (dict): Notion page object

    Returns:
        str: Page title, or "Untitled page (<id>)" when the page has no
        non-empty title property
    """
    for prop_data in page.get("properties", {}).values():
        if prop_data["type"] == "title" and prop_data["title"]:
            return "".join(segment["plain_text"] for segment in prop_data["title"])

    # No title found: fall back to the page ID
    return f"Untitled page ({page['id']})"
|
|
|
|
async def get_page_content(
    self, page_id: str, page_title: str | None = None
) -> tuple[list, bool]:
    """
    Fetch and process the content (blocks) of one page.

    The page's top-level blocks are read page by page; each block is then
    processed, which also pulls in its nested children. Errors about
    unsupported block types or missing blocks end the listing early and keep
    whatever was fetched so far.

    Args:
        page_id (str): The ID of the page to fetch
        page_title (str, optional): Title of the page (accepted for the
            caller's skip tracking; not used inside this method)

    Returns:
        tuple: (List of processed blocks, bool indicating if content was skipped)

    Raises:
        APIResponseError: For any other API error (after retry exhaustion)
    """
    notion = await self._get_client()

    blocks = []
    cursor = None
    has_more = True
    skipped_blocks_count = 0
    had_skipped_content = False

    # Walk the paginated list of top-level blocks
    while has_more:
        request = {"block_id": page_id}
        if cursor:
            request["start_cursor"] = cursor

        try:
            response = await self._api_call_with_retry(
                notion.blocks.children.list,
                on_retry=self._on_retry_callback,
                **request,
            )

            blocks.extend(response["results"])
            has_more = response["has_more"]
            if has_more:
                cursor = response["next_cursor"]

        except APIResponseError as e:
            error_message = str(e)

            if any(err in error_message for err in UNSUPPORTED_BLOCK_TYPE_ERRORS):
                # Unsupported block type: stop listing, keep what we have
                logger.warning(
                    f"Skipping page blocks due to unsupported block type in page {page_id}: {error_message}"
                )
                skipped_blocks_count += 1
                had_skipped_content = True
                has_more = False
            elif "Could not find block" in error_message:
                logger.warning(
                    f"Block not found in page {page_id}, continuing with available blocks: {error_message}"
                )
                has_more = False
            else:
                # Anything else is a real failure (after retry exhaustion)
                raise

    if skipped_blocks_count > 0:
        logger.info(
            f"Page {page_id}: Skipped {skipped_blocks_count} unsupported block sections, "
            f"successfully processed {len(blocks)} blocks"
        )

    # Process every block, descending into nested blocks
    processed_blocks = []
    for raw_block in blocks:
        processed_block, block_had_skips = await self.process_block(raw_block)
        if processed_block:  # Only keep blocks that were processed successfully
            processed_blocks.append(processed_block)
        if block_had_skips:
            had_skipped_content = True

    return processed_blocks, had_skipped_content
|
|
|
|
async def process_block(self, block) -> tuple[dict | None, bool]:
    """
    Process one block and recursively fetch all of its child blocks.

    Known-unsupported block types are replaced by a placeholder without any
    API call. For other blocks the text content is extracted and, when the
    block has children, every page of the children listing is fetched and
    processed.

    Args:
        block (dict): The block to process

    Returns:
        tuple: (Processed block dict with the keys "id", "type", "content"
        and "children", bool indicating if content was skipped)

    Raises:
        APIResponseError: For API errors other than unsupported-block-type
            and block-not-found errors (after retry exhaustion)
    """
    notion = await self._get_client()

    block_id = block["id"]
    block_type = block["type"]

    # Known-unsupported types get a placeholder instead of an API round trip
    if block_type in UNSUPPORTED_BLOCK_TYPES:
        logger.debug(
            f"Skipping unsupported block type: {block_type} (block_id: {block_id})"
        )
        return (
            {
                "id": block_id,
                "type": block_type,
                "content": f"[{block_type} block - not supported by Notion API]",
                "children": [],
            },
            True,  # Content was skipped
        )

    content = self.extract_block_content(block)
    child_blocks = []
    had_skipped_content = False

    if block.get("has_children", False):
        cursor = None
        try:
            # The children listing is paginated; follow next_cursor until
            # the API reports no more results so no child is dropped.
            while True:
                request = {"block_id": block_id}
                if cursor:
                    request["start_cursor"] = cursor

                children_response = await self._api_call_with_retry(
                    notion.blocks.children.list,
                    on_retry=self._on_retry_callback,
                    **request,
                )

                for child_block in children_response["results"]:
                    processed_child, child_had_skips = await self.process_block(
                        child_block
                    )
                    if processed_child:
                        child_blocks.append(processed_child)
                    if child_had_skips:
                        had_skipped_content = True

                cursor = children_response.get("next_cursor")
                # A missing cursor also ends the loop, so a malformed
                # response cannot cause endless refetching.
                if not children_response.get("has_more") or not cursor:
                    break

        except APIResponseError as e:
            error_message = str(e)
            if any(err in error_message for err in UNSUPPORTED_BLOCK_TYPE_ERRORS):
                # Keep the children gathered so far instead of failing
                logger.warning(
                    f"Skipping children of block {block_id} due to unsupported block type: {error_message}"
                )
                had_skipped_content = True
            elif "Could not find block" in error_message:
                # Keep the children gathered so far
                logger.warning(
                    f"Block {block_id} children not accessible, skipping: {error_message}"
                )
            else:
                # Re-raise other API errors (after retry exhaustion)
                raise

    return (
        {
            "id": block_id,
            "type": block_type,
            "content": content,
            "children": child_blocks,
        },
        had_skipped_content,
    )
|
|
|
|
def extract_block_content(self, block):
    """
    Extract the textual content of a block based on its type.

    - code: the code text wrapped in a fenced block tagged with its language
    - any other type whose payload has "rich_text": the concatenated plain text
    - image: a placeholder; URLs are never returned because Notion-hosted
      files use pre-signed URLs and external URLs may carry sensitive
      parameters (only the host name is kept for external images)
    - equation: the expression
    - everything else: an empty string

    Args:
        block (dict): The block to extract content from

    Returns:
        str: Extracted content as a string
    """
    block_type = block["type"]
    payload = block.get(block_type)

    # Code must be handled before the generic rich_text case: code blocks
    # carry "rich_text" too and would otherwise lose fence and language.
    if block_type == "code" and isinstance(payload, dict):
        language = payload.get("language", "")
        code_text = "".join(
            text_obj["plain_text"] for text_obj in payload.get("rich_text", [])
        )
        return f"```{language}\n{code_text}\n```"

    if isinstance(payload, dict) and "rich_text" in payload:
        return "".join(text_obj["plain_text"] for text_obj in payload["rich_text"])

    if block_type == "image":
        image = block["image"]
        if "file" in image:
            # Notion-hosted image (pre-signed URL): never expose the URL
            return "[Notion Image]"
        if "external" in image:
            url = image["external"]["url"]
            try:
                from urllib.parse import urlparse

                return f"[External Image from {urlparse(url).netloc}]"
            except (ValueError, TypeError, AttributeError):
                return "[External Image]"
    elif block_type == "equation":
        return block["equation"]["expression"]

    # Unsupported or content-less block types
    return ""
|
|
|
|
# =========================================================================
|
|
# WRITE OPERATIONS (create, update, delete pages)
|
|
# =========================================================================
|
|
|
|
async def _get_first_accessible_parent(self) -> str | None:
    """
    Find a page that can serve as the parent of a newly created page.

    Searches for a single page, most recently edited first. Any failure is
    logged and reported as "nothing found".

    Returns:
        Page ID string, or None if no accessible page was found or the
        lookup failed
    """
    try:
        notion = await self._get_client()

        # One result is enough: the most recently edited page
        response = await self._api_call_with_retry(
            notion.search,
            filter={"property": "object", "value": "page"},
            sort={"direction": "descending", "timestamp": "last_edited_time"},
            page_size=1,
        )

        candidates = response.get("results", [])
        return candidates[0]["id"] if candidates else None

    except Exception as e:
        logger.error(f"Error finding accessible parent page: {e}")
        return None
|
|
|
|
def _markdown_to_blocks(self, markdown: str) -> list[dict[str, Any]]:
    """
    Convert markdown content to Notion blocks.

    This is a simple line-based converter: every non-blank line becomes one
    block. Recognised forms are "# ", "## " and "### " headings, "- " / "* "
    bullets and "<number>. " numbered items; any other line becomes a
    paragraph. Inline markdown (bold, links, ...) is passed through as plain
    text. For more complex markdown, consider using a proper markdown parser.

    Text longer than 2000 characters is split across several rich-text
    objects within the same block, because the Notion API rejects a text
    object whose content exceeds that size.

    Args:
        markdown: Markdown content

    Returns:
        List of Notion block objects
    """
    max_text_length = 2000  # Notion's per-object limit for text.content
    numbered_item = re.compile(r"^(\d+)\.\s+(.*)$")
    prefixed_types = (
        ("# ", "heading_1"),
        ("## ", "heading_2"),
        ("### ", "heading_3"),
        ("- ", "bulleted_list_item"),
        ("* ", "bulleted_list_item"),
    )

    def make_block(block_type: str, text: str) -> dict[str, Any]:
        # Build one block, chunking the text to respect the size limit.
        chunks = [
            text[start : start + max_text_length]
            for start in range(0, len(text), max_text_length)
        ] or [""]
        return {
            "object": "block",
            "type": block_type,
            block_type: {
                "rich_text": [
                    {"type": "text", "text": {"content": chunk}} for chunk in chunks
                ]
            },
        }

    blocks = []
    for raw_line in markdown.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        for prefix, block_type in prefixed_types:
            if line.startswith(prefix):
                blocks.append(make_block(block_type, line[len(prefix) :]))
                break
        else:
            match = numbered_item.match(line)
            if match:
                # Keep only the text after "number. "
                blocks.append(make_block("numbered_list_item", match.group(2)))
            else:
                blocks.append(make_block("paragraph", line))

    return blocks
|
|
|
|
async def create_page(
    self, title: str, content: str, parent_page_id: str | None = None
) -> dict[str, Any]:
    """
    Create a new Notion page from markdown content.

    The page is created under ``parent_page_id``; when no parent is given,
    the most recently edited accessible page is used instead. Blocks are
    sent in batches of 100 (the Notion API limit per request): the first
    batch with the create call, the rest appended afterwards.

    Args:
        title: Page title
        content: Page content (markdown format)
        parent_page_id: Optional parent page ID (creates as subpage if provided)

    Returns:
        Dictionary describing the outcome. On success:
            - status: "success"
            - page_id: Created page ID
            - url: Page URL
            - title: Page title
            - message: Success message
        On failure only "status" ("error") and "message" are present.
        Failures are reported this way rather than raised.
    """
    try:
        logger.info(
            f"Creating Notion page: title='{title}', parent_page_id={parent_page_id}"
        )

        notion = await self._get_client()
        children = self._markdown_to_blocks(content)

        # Without an explicit parent, fall back to the first accessible page
        if not parent_page_id:
            logger.info(
                "No parent_page_id provided, searching for first accessible page..."
            )
            parent_page_id = await self._get_first_accessible_parent()
            if not parent_page_id:
                logger.warning("No accessible parent pages found")
                return {
                    "status": "error",
                    "message": "Could not find any accessible Notion pages to use as parent. "
                    "Please make sure your Notion integration has access to at least one page.",
                }
            logger.info(f"Using parent_page_id: {parent_page_id}")

        # Create the page with the standard title property and the first batch
        created = await self._api_call_with_retry(
            notion.pages.create,
            parent={"type": "page_id", "page_id": parent_page_id},
            properties={
                "title": {"title": [{"type": "text", "text": {"content": title}}]}
            },
            children=children[:100],  # Notion API limit: 100 blocks per request
        )
        page_id = created["id"]
        page_url = created["url"]

        # Append whatever did not fit into the create call
        for start in range(100, len(children), 100):
            await self._api_call_with_retry(
                notion.blocks.children.append,
                block_id=page_id,
                children=children[start : start + 100],
            )

        return {
            "status": "success",
            "page_id": page_id,
            "url": page_url,
            "title": title,
            "message": f"Created Notion page '{title}'",
        }

    except APIResponseError as e:
        logger.error(f"Notion API error creating page: {e}")
        error_msg = self._api_error_message(e)
        return {
            "status": "error",
            "message": f"Failed to create Notion page: {error_msg}",
        }
    except Exception as e:
        logger.error(f"Unexpected error creating Notion page: {e}")
        return {
            "status": "error",
            "message": f"Failed to create Notion page: {e!s}",
        }
|
|
|
|
async def update_page(
    self, page_id: str, content: str | None = None
) -> dict[str, Any]:
    """
    Update an existing Notion page by appending new content.

    Note: Content is appended to the page, not replaced. Blocks are sent in
    batches of 100. After appending, the page is retrieved to report its URL
    and title; a page whose title cannot be read from the standard "title"
    property is reported with the title "Unknown" instead of turning an
    otherwise successful update into an error.

    Args:
        page_id: Page ID to update
        content: New markdown content to append to the page (optional)

    Returns:
        Dictionary describing the outcome. On success it holds "status"
        ("success"), "page_id", "url", "title", "appended_block_ids" and
        "message"; on failure only "status" ("error") and "message".
        Failures are reported this way rather than raised.
    """
    try:
        notion = await self._get_client()

        appended_block_ids = []
        if content:
            # Convert the new content to blocks
            try:
                children = self._markdown_to_blocks(content)
                if not children:
                    logger.warning(
                        "No blocks generated from content, skipping append"
                    )
                    return {
                        "status": "error",
                        "message": "Content conversion failed: no valid blocks generated",
                    }
            except Exception as e:
                logger.error(f"Failed to convert markdown to blocks: {e}")
                return {
                    "status": "error",
                    "message": f"Failed to parse content: {e!s}",
                }

            # Append the blocks, 100 per request
            try:
                for i in range(0, len(children), 100):
                    batch = children[i : i + 100]
                    response = await self._api_call_with_retry(
                        notion.blocks.children.append,
                        block_id=page_id,
                        children=batch,
                    )
                    appended_block_ids.extend(
                        block["id"] for block in response.get("results", [])
                    )
                logger.info(
                    f"Successfully appended {len(children)} new blocks to page {page_id}"
                )
                logger.debug(
                    f"Appended block IDs: {appended_block_ids[:5]}..."
                    if len(appended_block_ids) > 5
                    else f"Appended block IDs: {appended_block_ids}"
                )
            except Exception as e:
                logger.error(f"Failed to append content blocks: {e}")
                return {
                    "status": "error",
                    "message": f"Failed to append content: {e!s}",
                }

        # Get updated page info
        response = await self._api_call_with_retry(
            notion.pages.retrieve, page_id=page_id
        )
        page_url = response["url"]

        # Pages inside a database name their title property differently and
        # titles may be empty; neither should fail the update after the
        # content has already been appended.
        page_title = "Unknown"
        with contextlib.suppress(KeyError, IndexError, TypeError):
            page_title = response["properties"]["title"]["title"][0]["text"][
                "content"
            ]

        return {
            "status": "success",
            "page_id": page_id,
            "url": page_url,
            "title": page_title,
            "appended_block_ids": appended_block_ids,
            "message": f"Updated Notion page '{page_title}' (content appended)",
        }

    except APIResponseError as e:
        logger.error(f"Notion API error updating page: {e}")
        error_msg = self._api_error_message(e)
        return {
            "status": "error",
            "message": f"Failed to update Notion page: {error_msg}",
        }
    except Exception as e:
        logger.error(f"Unexpected error updating Notion page: {e}")
        return {
            "status": "error",
            "message": f"Failed to update Notion page: {e!s}",
        }
|
|
|
|
async def delete_page(self, page_id: str) -> dict[str, Any]:
    """
    Delete (archive) a Notion page.

    Note: Notion doesn't truly delete pages, it archives them; this method
    sets the page's ``archived`` flag.

    Args:
        page_id: Page ID to delete

    Returns:
        Dictionary describing the outcome. On success it holds "status"
        ("success"), "page_id" and "message"; on failure only "status"
        ("error") and "message". Failures are reported this way rather
        than raised.
    """
    try:
        notion = await self._get_client()

        # Archiving is Notion's way of "deleting"
        archived_page = await self._api_call_with_retry(
            notion.pages.update, page_id=page_id, archived=True
        )

        # The title is only used for the message; fall back when it is missing
        try:
            page_title = archived_page["properties"]["title"]["title"][0]["text"][
                "content"
            ]
        except (KeyError, IndexError):
            page_title = "Unknown"

        return {
            "status": "success",
            "page_id": page_id,
            "message": f"Deleted Notion page '{page_title}'",
        }

    except APIResponseError as e:
        logger.error(f"Notion API error deleting page: {e}")
        error_msg = self._api_error_message(e)
        return {
            "status": "error",
            "message": f"Failed to delete Notion page: {error_msg}",
        }
    except Exception as e:
        logger.error(f"Unexpected error deleting Notion page: {e}")
        return {
            "status": "error",
            "message": f"Failed to delete Notion page: {e!s}",
        }
|