diff --git a/surfsense_backend/app/connectors/dropbox/client.py b/surfsense_backend/app/connectors/dropbox/client.py index 27bffcb91..530059143 100644 --- a/surfsense_backend/app/connectors/dropbox/client.py +++ b/surfsense_backend/app/connectors/dropbox/client.py @@ -264,6 +264,24 @@ class DropboxClient: f.write(chunk) return None + async def export_file( + self, + path: str, + export_format: str | None = None, + ) -> tuple[bytes | None, str | None]: + """Export a non-downloadable file (e.g. .paper) via /2/files/export. + + Uses the recommended new API for Paper-as-files. + Returns (content_bytes, error_message). + """ + api_arg: dict[str, str] = {"path": path} + if export_format: + api_arg["export_format"] = export_format + resp = await self._content_request("/2/files/export", api_arg) + if resp.status_code != 200: + return None, f"Export failed: {resp.status_code} - {resp.text}" + return resp.content, None + async def upload_file( self, path: str, diff --git a/surfsense_backend/app/connectors/dropbox/content_extractor.py b/surfsense_backend/app/connectors/dropbox/content_extractor.py index 9e83f3474..226a643c7 100644 --- a/surfsense_backend/app/connectors/dropbox/content_extractor.py +++ b/surfsense_backend/app/connectors/dropbox/content_extractor.py @@ -12,11 +12,36 @@ from pathlib import Path from typing import Any from .client import DropboxClient -from .file_types import get_extension_from_name, should_skip_file +from .file_types import get_extension_from_name, is_paper_file, should_skip_file logger = logging.getLogger(__name__) +async def _export_paper_content( + client: DropboxClient, + file: dict[str, Any], + metadata: dict[str, Any], +) -> tuple[str | None, dict[str, Any], str | None]: + """Export a Dropbox Paper doc as markdown via ``/2/files/export``.""" + file_path_lower = file.get("path_lower", "") + file_name = file.get("name", "Unknown") + + logger.info(f"Exporting Paper doc as markdown: {file_name}") + + content_bytes, error = await 
def is_paper_file(item: dict) -> bool:
    """Return True when the item's name carries the Dropbox Paper extension.

    Paper docs are exported via /2/files/export rather than fetched via
    /2/files/download, so callers use this check to route them.

    The extension is compared case-insensitively against PAPER_EXTENSION;
    an item without a "name" key is treated as having an empty name.
    """
    extension = get_extension_from_name(item.get("name", ""))
    return extension.lower() == PAPER_EXTENSION
+ """ if is_folder(item): return True + if is_paper_file(item): + return False if not item.get("is_downloadable", True): return True name = item.get("name", "")