feat: add support for exporting Dropbox Paper .paper documents as markdown, enhancing content extraction capabilities

This commit is contained in:
Anish Sarkar 2026-03-31 18:47:11 +05:30
parent 9d7b172ae7
commit 8d591ce03c
3 changed files with 63 additions and 7 deletions

View file

@ -264,6 +264,24 @@ class DropboxClient:
f.write(chunk)
return None
async def export_file(
self,
path: str,
export_format: str | None = None,
) -> tuple[bytes | None, str | None]:
"""Export a non-downloadable file (e.g. .paper) via /2/files/export.
Uses the recommended new API for Paper-as-files.
Returns (content_bytes, error_message).
"""
api_arg: dict[str, str] = {"path": path}
if export_format:
api_arg["export_format"] = export_format
resp = await self._content_request("/2/files/export", api_arg)
if resp.status_code != 200:
return None, f"Export failed: {resp.status_code} - {resp.text}"
return resp.content, None
async def upload_file(
self,
path: str,

View file

@ -12,11 +12,36 @@ from pathlib import Path
from typing import Any
from .client import DropboxClient
from .file_types import get_extension_from_name, should_skip_file
from .file_types import get_extension_from_name, is_paper_file, should_skip_file
logger = logging.getLogger(__name__)
async def _export_paper_content(
    client: DropboxClient,
    file: dict[str, Any],
    metadata: dict[str, Any],
) -> tuple[str | None, dict[str, Any], str | None]:
    """Export a Dropbox Paper doc as markdown via ``/2/files/export``.

    Args:
        client: Client whose ``export_file`` coroutine performs the export.
        file: Entry describing the doc; ``path_lower`` and ``name`` are read.
        metadata: Metadata dict for the doc. On success it is updated in
            place with ``exported_as`` and ``original_type`` and returned.

    Returns:
        ``(markdown, metadata, None)`` on success, or
        ``(None, metadata, error_message)`` when the path is missing, the
        export fails, or the export returns no content.
    """
    file_name = file.get("name", "Unknown")
    file_path_lower = file.get("path_lower")
    if not file_path_lower:
        # Nothing to export without a path; skip a request that can only fail.
        return None, metadata, f"Cannot export Paper doc without a path: {file_name}"

    logger.info("Exporting Paper doc as markdown: %s", file_name)

    content_bytes, error = await client.export_file(
        file_path_lower, export_format="markdown"
    )
    if error:
        return None, metadata, error
    if not content_bytes:
        return None, metadata, "Export returned empty content"

    # Undecodable bytes are replaced rather than failing the whole export.
    markdown = content_bytes.decode("utf-8", errors="replace")
    metadata["exported_as"] = "markdown"
    metadata["original_type"] = "paper"
    return markdown, metadata, None
async def download_and_extract_content(
client: DropboxClient,
file: dict[str, Any],
@ -50,6 +75,9 @@ async def download_and_extract_content(
if "content_hash" in file:
metadata["content_hash"] = file["content_hash"]
if is_paper_file(file):
return await _export_paper_content(client, file, metadata)
temp_file_path = None
try:
extension = get_extension_from_name(file_name) or ".bin"

View file

@ -1,10 +1,8 @@
"""File type handlers for Dropbox."""
SKIP_EXTENSIONS = frozenset(
{
".paper", # Dropbox Paper docs are not downloadable via /files/download
}
)
# Extension identifying Dropbox Paper docs; is_paper_file compares a lowercased extension against it.
PAPER_EXTENSION = ".paper"
# Currently empty. NOTE(review): presumably the extensions should_skip_file excludes — confirm against its body.
SKIP_EXTENSIONS: frozenset[str] = frozenset()
MIME_TO_EXTENSION: dict[str, str] = {
"application/pdf": ".pdf",
@ -37,10 +35,22 @@ def is_folder(item: dict) -> bool:
return item.get(".tag") == "folder"
def is_paper_file(item: dict) -> bool:
    """Detect Dropbox Paper docs (exported via /2/files/export, not /2/files/download).

    Args:
        item: Entry dict; only its ``name`` value is inspected.

    Returns:
        True when the extension of ``name``, compared case-insensitively,
        equals ``PAPER_EXTENSION``; False otherwise, including when the
        name is missing or has no extension.
    """
    name = item.get("name") or ""
    # Other call sites treat the helper's result as possibly falsy, so
    # normalise it before calling .lower() to avoid an AttributeError on None.
    ext = get_extension_from_name(name) or ""
    return ext.lower() == PAPER_EXTENSION
def should_skip_file(item: dict) -> bool:
"""Skip folders and non-downloadable files."""
"""Skip folders and truly non-indexable files.
Paper docs are non-downloadable but exportable, so they are NOT skipped.
"""
if is_folder(item):
return True
if is_paper_file(item):
return False
if not item.get("is_downloadable", True):
return True
name = item.get("name", "")