mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-02 19:55:18 +02:00
feat: add support for exporting Dropbox Paper .paper documents as markdown, enhancing content extraction capabilities
This commit is contained in:
parent
9d7b172ae7
commit
8d591ce03c
3 changed files with 63 additions and 7 deletions
|
|
@ -264,6 +264,24 @@ class DropboxClient:
|
||||||
f.write(chunk)
|
f.write(chunk)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
async def export_file(
|
||||||
|
self,
|
||||||
|
path: str,
|
||||||
|
export_format: str | None = None,
|
||||||
|
) -> tuple[bytes | None, str | None]:
|
||||||
|
"""Export a non-downloadable file (e.g. .paper) via /2/files/export.
|
||||||
|
|
||||||
|
Uses the recommended new API for Paper-as-files.
|
||||||
|
Returns (content_bytes, error_message).
|
||||||
|
"""
|
||||||
|
api_arg: dict[str, str] = {"path": path}
|
||||||
|
if export_format:
|
||||||
|
api_arg["export_format"] = export_format
|
||||||
|
resp = await self._content_request("/2/files/export", api_arg)
|
||||||
|
if resp.status_code != 200:
|
||||||
|
return None, f"Export failed: {resp.status_code} - {resp.text}"
|
||||||
|
return resp.content, None
|
||||||
|
|
||||||
async def upload_file(
|
async def upload_file(
|
||||||
self,
|
self,
|
||||||
path: str,
|
path: str,
|
||||||
|
|
|
||||||
|
|
@ -12,11 +12,36 @@ from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from .client import DropboxClient
|
from .client import DropboxClient
|
||||||
from .file_types import get_extension_from_name, should_skip_file
|
from .file_types import get_extension_from_name, is_paper_file, should_skip_file
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def _export_paper_content(
    client: DropboxClient,
    file: dict[str, Any],
    metadata: dict[str, Any],
) -> tuple[str | None, dict[str, Any], str | None]:
    """Export a Dropbox Paper doc as markdown via ``/2/files/export``.

    Args:
        client: Dropbox client whose ``export_file`` performs the export.
        file: Dropbox file entry; ``path_lower`` identifies the document and
            ``name`` is used for logging and error messages.
        metadata: Metadata collected so far for this file. On success it is
            updated IN PLACE with ``exported_as`` and ``original_type`` and
            the same dict is returned; on failure it is returned untouched.

    Returns:
        ``(markdown, metadata, error)``. Exactly one of ``markdown`` and
        ``error`` is ``None``.
    """
    file_name = file.get("name", "Unknown")
    file_path_lower = file.get("path_lower") or ""

    if not file_path_lower:
        # An empty path can never resolve to a document, so report the
        # problem directly instead of spending an API round trip on a
        # request that is bound to fail.
        return None, metadata, f"Cannot export Paper doc without a path: {file_name}"

    logger.info("Exporting Paper doc as markdown: %s", file_name)

    content_bytes, error = await client.export_file(
        file_path_lower, export_format="markdown"
    )
    if error:
        return None, metadata, error
    if not content_bytes:
        return None, metadata, "Export returned empty content"

    # errors="replace" keeps the export usable even if the payload contains
    # bytes that are not valid UTF-8.
    markdown = content_bytes.decode("utf-8", errors="replace")
    metadata["exported_as"] = "markdown"
    metadata["original_type"] = "paper"
    return markdown, metadata, None
|
||||||
|
|
||||||
|
|
||||||
async def download_and_extract_content(
|
async def download_and_extract_content(
|
||||||
client: DropboxClient,
|
client: DropboxClient,
|
||||||
file: dict[str, Any],
|
file: dict[str, Any],
|
||||||
|
|
@ -50,6 +75,9 @@ async def download_and_extract_content(
|
||||||
if "content_hash" in file:
|
if "content_hash" in file:
|
||||||
metadata["content_hash"] = file["content_hash"]
|
metadata["content_hash"] = file["content_hash"]
|
||||||
|
|
||||||
|
if is_paper_file(file):
|
||||||
|
return await _export_paper_content(client, file, metadata)
|
||||||
|
|
||||||
temp_file_path = None
|
temp_file_path = None
|
||||||
try:
|
try:
|
||||||
extension = get_extension_from_name(file_name) or ".bin"
|
extension = get_extension_from_name(file_name) or ".bin"
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,8 @@
|
||||||
"""File type handlers for Dropbox."""
|
"""File type handlers for Dropbox."""
|
||||||
|
|
||||||
SKIP_EXTENSIONS = frozenset(
|
PAPER_EXTENSION = ".paper"
|
||||||
{
|
|
||||||
".paper", # Dropbox Paper docs are not downloadable via /files/download
|
SKIP_EXTENSIONS: frozenset[str] = frozenset()
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
MIME_TO_EXTENSION: dict[str, str] = {
|
MIME_TO_EXTENSION: dict[str, str] = {
|
||||||
"application/pdf": ".pdf",
|
"application/pdf": ".pdf",
|
||||||
|
|
@ -37,10 +35,22 @@ def is_folder(item: dict) -> bool:
|
||||||
return item.get(".tag") == "folder"
|
return item.get(".tag") == "folder"
|
||||||
|
|
||||||
|
|
||||||
|
def is_paper_file(item: dict) -> bool:
    """Detect Dropbox Paper docs (exported via /2/files/export, not /2/files/download).

    Args:
        item: Dropbox entry dict; only its ``name`` is inspected.

    Returns:
        True when the name's extension equals ``PAPER_EXTENSION``, compared
        case-insensitively; False otherwise, including when the entry has
        no name or no extension.
    """
    name = item.get("name") or ""
    # Other callers treat the helper's result as possibly falsy
    # (``get_extension_from_name(...) or ".bin"``), so normalise it before
    # calling ``.lower()`` rather than risking an AttributeError.
    ext = get_extension_from_name(name) or ""
    return ext.lower() == PAPER_EXTENSION
|
||||||
|
|
||||||
|
|
||||||
def should_skip_file(item: dict) -> bool:
|
def should_skip_file(item: dict) -> bool:
|
||||||
"""Skip folders and non-downloadable files."""
|
"""Skip folders and truly non-indexable files.
|
||||||
|
|
||||||
|
Paper docs are non-downloadable but exportable, so they are NOT skipped.
|
||||||
|
"""
|
||||||
if is_folder(item):
|
if is_folder(item):
|
||||||
return True
|
return True
|
||||||
|
if is_paper_file(item):
|
||||||
|
return False
|
||||||
if not item.get("is_downloadable", True):
|
if not item.get("is_downloadable", True):
|
||||||
return True
|
return True
|
||||||
name = item.get("name", "")
|
name = item.get("name", "")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue