mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-02 19:55:18 +02:00
feat: add support for exporting Dropbox Paper .paper documents as markdown, enhancing content extraction capabilities
This commit is contained in:
parent
9d7b172ae7
commit
8d591ce03c
3 changed files with 63 additions and 7 deletions
|
|
@ -264,6 +264,24 @@ class DropboxClient:
|
|||
f.write(chunk)
|
||||
return None
|
||||
|
||||
async def export_file(
|
||||
self,
|
||||
path: str,
|
||||
export_format: str | None = None,
|
||||
) -> tuple[bytes | None, str | None]:
|
||||
"""Export a non-downloadable file (e.g. .paper) via /2/files/export.
|
||||
|
||||
Uses the recommended new API for Paper-as-files.
|
||||
Returns (content_bytes, error_message).
|
||||
"""
|
||||
api_arg: dict[str, str] = {"path": path}
|
||||
if export_format:
|
||||
api_arg["export_format"] = export_format
|
||||
resp = await self._content_request("/2/files/export", api_arg)
|
||||
if resp.status_code != 200:
|
||||
return None, f"Export failed: {resp.status_code} - {resp.text}"
|
||||
return resp.content, None
|
||||
|
||||
async def upload_file(
|
||||
self,
|
||||
path: str,
|
||||
|
|
|
|||
|
|
@ -12,11 +12,36 @@ from pathlib import Path
|
|||
from typing import Any
|
||||
|
||||
from .client import DropboxClient
|
||||
from .file_types import get_extension_from_name, should_skip_file
|
||||
from .file_types import get_extension_from_name, is_paper_file, should_skip_file
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def _export_paper_content(
    client: DropboxClient,
    file: dict[str, Any],
    metadata: dict[str, Any],
) -> tuple[str | None, dict[str, Any], str | None]:
    """Export a Dropbox Paper doc as markdown via ``/2/files/export``.

    Args:
        client: Client whose ``export_file`` coroutine performs the export.
        file: File entry; ``path_lower`` (or, failing that, ``id``)
            identifies the document and ``name`` is used for logging.
        metadata: Metadata dict for the file. On success it is updated in
            place with ``exported_as`` and ``original_type``.

    Returns:
        ``(markdown, metadata, None)`` on success, or
        ``(None, metadata, error_message)`` when the document cannot be
        identified, the export fails, or the export yields no content.
    """
    file_name = file.get("name", "Unknown")

    # Never send an empty path to the export endpoint. The Dropbox API also
    # accepts an "id:..." reference in the ``path`` argument, so fall back
    # to the entry's id when path_lower is missing or empty.
    export_target = file.get("path_lower") or file.get("id")
    if not export_target:
        return (
            None,
            metadata,
            f"Cannot export Paper doc without a path or id: {file_name}",
        )

    logger.info("Exporting Paper doc as markdown: %s", file_name)

    content_bytes, error = await client.export_file(
        export_target, export_format="markdown"
    )
    if error:
        return None, metadata, error
    if not content_bytes:
        return None, metadata, "Export returned empty content"

    # Undecodable bytes are replaced rather than failing the whole export.
    markdown = content_bytes.decode("utf-8", errors="replace")
    metadata["exported_as"] = "markdown"
    metadata["original_type"] = "paper"
    return markdown, metadata, None
|
||||
|
||||
|
||||
async def download_and_extract_content(
|
||||
client: DropboxClient,
|
||||
file: dict[str, Any],
|
||||
|
|
@ -50,6 +75,9 @@ async def download_and_extract_content(
|
|||
if "content_hash" in file:
|
||||
metadata["content_hash"] = file["content_hash"]
|
||||
|
||||
if is_paper_file(file):
|
||||
return await _export_paper_content(client, file, metadata)
|
||||
|
||||
temp_file_path = None
|
||||
try:
|
||||
extension = get_extension_from_name(file_name) or ".bin"
|
||||
|
|
|
|||
|
|
@ -1,10 +1,8 @@
|
|||
"""File type handlers for Dropbox."""
|
||||
|
||||
SKIP_EXTENSIONS = frozenset(
|
||||
{
|
||||
".paper", # Dropbox Paper docs are not downloadable via /files/download
|
||||
}
|
||||
)
|
||||
PAPER_EXTENSION = ".paper"
|
||||
|
||||
SKIP_EXTENSIONS: frozenset[str] = frozenset()
|
||||
|
||||
MIME_TO_EXTENSION: dict[str, str] = {
|
||||
"application/pdf": ".pdf",
|
||||
|
|
@ -37,10 +35,22 @@ def is_folder(item: dict) -> bool:
|
|||
return item.get(".tag") == "folder"
|
||||
|
||||
|
||||
def is_paper_file(item: dict) -> bool:
    """Return True when the item's name carries the Paper extension.

    The comparison is case-insensitive. Paper docs are fetched via
    /2/files/export rather than /2/files/download.
    """
    extension = get_extension_from_name(item.get("name", ""))
    return extension.lower() == PAPER_EXTENSION
|
||||
|
||||
|
||||
def should_skip_file(item: dict) -> bool:
|
||||
"""Skip folders and non-downloadable files."""
|
||||
"""Skip folders and truly non-indexable files.
|
||||
|
||||
Paper docs are non-downloadable but exportable, so they are NOT skipped.
|
||||
"""
|
||||
if is_folder(item):
|
||||
return True
|
||||
if is_paper_file(item):
|
||||
return False
|
||||
if not item.get("is_downloadable", True):
|
||||
return True
|
||||
name = item.get("name", "")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue