# Mirror of https://github.com/MODSetter/SurfSense.git
# Synced 2026-04-25 00:36:31 +02:00
"""
|
|
Editor routes for document editing with markdown (Plate.js frontend).
|
|
Includes multi-format export (PDF, DOCX, HTML, LaTeX, EPUB, ODT, plain text).
|
|
"""
|
|
|
|
import asyncio
|
|
import io
|
|
import logging
|
|
import os
|
|
import tempfile
|
|
from datetime import UTC, datetime
|
|
from typing import Any
|
|
|
|
import pypandoc
|
|
import typst
|
|
from fastapi import APIRouter, Depends, HTTPException, Query
|
|
from fastapi.responses import StreamingResponse
|
|
from sqlalchemy import func, select
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.db import Chunk, Document, DocumentType, Permission, User, get_async_session
|
|
from app.routes.reports_routes import (
|
|
_FILE_EXTENSIONS,
|
|
_MEDIA_TYPES,
|
|
ExportFormat,
|
|
_normalize_latex_delimiters,
|
|
_strip_wrapping_code_fences,
|
|
)
|
|
from app.templates.export_helpers import (
|
|
get_html_css_path,
|
|
get_reference_docx_path,
|
|
get_typst_template_path,
|
|
)
|
|
from app.users import current_active_user
|
|
from app.utils.rbac import check_permission
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
router = APIRouter()
|
|
|
|
|
|
@router.get("/search-spaces/{search_space_id}/documents/{document_id}/editor-content")
|
|
async def get_editor_content(
|
|
search_space_id: int,
|
|
document_id: int,
|
|
max_length: int | None = Query(
|
|
None, description="Truncate source_markdown to this many characters"
|
|
),
|
|
session: AsyncSession = Depends(get_async_session),
|
|
user: User = Depends(current_active_user),
|
|
):
|
|
"""
|
|
Get document content for editing.
|
|
|
|
Returns source_markdown for the Plate.js editor.
|
|
Falls back to blocknote_document → markdown conversion, then chunk reconstruction.
|
|
|
|
Requires DOCUMENTS_READ permission.
|
|
"""
|
|
# Check RBAC permission
|
|
await check_permission(
|
|
session,
|
|
user,
|
|
search_space_id,
|
|
Permission.DOCUMENTS_READ.value,
|
|
"You don't have permission to read documents in this search space",
|
|
)
|
|
|
|
result = await session.execute(
|
|
select(Document).filter(
|
|
Document.id == document_id,
|
|
Document.search_space_id == search_space_id,
|
|
)
|
|
)
|
|
document = result.scalars().first()
|
|
|
|
if not document:
|
|
raise HTTPException(status_code=404, detail="Document not found")
|
|
|
|
count_result = await session.execute(
|
|
select(func.count()).select_from(Chunk).filter(Chunk.document_id == document_id)
|
|
)
|
|
chunk_count = count_result.scalar() or 0
|
|
|
|
def _build_response(md: str) -> dict:
|
|
size_bytes = len(md.encode("utf-8"))
|
|
truncated = False
|
|
output_md = md
|
|
if max_length is not None and size_bytes > max_length:
|
|
output_md = md[:max_length]
|
|
truncated = True
|
|
return {
|
|
"document_id": document.id,
|
|
"title": document.title,
|
|
"document_type": document.document_type.value,
|
|
"source_markdown": output_md,
|
|
"content_size_bytes": size_bytes,
|
|
"chunk_count": chunk_count,
|
|
"truncated": truncated,
|
|
"updated_at": document.updated_at.isoformat()
|
|
if document.updated_at
|
|
else None,
|
|
}
|
|
|
|
if document.source_markdown is not None:
|
|
return _build_response(document.source_markdown)
|
|
|
|
if document.blocknote_document:
|
|
from app.utils.blocknote_to_markdown import blocknote_to_markdown
|
|
|
|
markdown = blocknote_to_markdown(document.blocknote_document)
|
|
if markdown:
|
|
document.source_markdown = markdown
|
|
await session.commit()
|
|
return _build_response(markdown)
|
|
|
|
if document.document_type == DocumentType.NOTE:
|
|
empty_markdown = ""
|
|
document.source_markdown = empty_markdown
|
|
await session.commit()
|
|
return _build_response(empty_markdown)
|
|
|
|
chunk_contents_result = await session.execute(
|
|
select(Chunk.content)
|
|
.filter(Chunk.document_id == document_id)
|
|
.order_by(Chunk.id)
|
|
)
|
|
chunk_contents = chunk_contents_result.scalars().all()
|
|
|
|
if not chunk_contents:
|
|
doc_status = document.status or {}
|
|
state = (
|
|
doc_status.get("state", "ready")
|
|
if isinstance(doc_status, dict)
|
|
else "ready"
|
|
)
|
|
if state in ("pending", "processing"):
|
|
raise HTTPException(
|
|
status_code=409,
|
|
detail="This document is still being processed. Please wait a moment and try again.",
|
|
)
|
|
if state == "failed":
|
|
reason = (
|
|
doc_status.get("reason", "Unknown error")
|
|
if isinstance(doc_status, dict)
|
|
else "Unknown error"
|
|
)
|
|
raise HTTPException(
|
|
status_code=422,
|
|
detail=f"Processing failed: {reason}. You can delete this document and re-upload it.",
|
|
)
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail="This document has no content. It may not have been processed correctly. Try deleting and re-uploading it.",
|
|
)
|
|
|
|
markdown_content = "\n\n".join(chunk_contents)
|
|
|
|
if not markdown_content.strip():
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail="This document appears to be empty. Try re-uploading or editing it to add content.",
|
|
)
|
|
|
|
document.source_markdown = markdown_content
|
|
await session.commit()
|
|
|
|
return _build_response(markdown_content)
|
|
|
|
|
|
@router.get(
    "/search-spaces/{search_space_id}/documents/{document_id}/download-markdown"
)
async def download_document_markdown(
    search_space_id: int,
    document_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Download the full document content as a .md file.
    Reconstructs markdown from source_markdown or chunks.
    """
    await check_permission(
        session,
        user,
        search_space_id,
        Permission.DOCUMENTS_READ.value,
        "You don't have permission to read documents in this search space",
    )

    doc_result = await session.execute(
        select(Document).filter(
            Document.id == document_id,
            Document.search_space_id == search_space_id,
        )
    )
    document = doc_result.scalars().first()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")

    # Preferred source: the stored markdown column.
    markdown: str | None = document.source_markdown

    # Fallback 1: convert the legacy BlockNote JSON representation.
    if markdown is None and document.blocknote_document:
        from app.utils.blocknote_to_markdown import blocknote_to_markdown

        markdown = blocknote_to_markdown(document.blocknote_document)

    # Fallback 2: stitch the document back together from its chunks.
    if markdown is None:
        chunks_result = await session.execute(
            select(Chunk.content)
            .filter(Chunk.document_id == document_id)
            .order_by(Chunk.id)
        )
        pieces = chunks_result.scalars().all()
        if pieces:
            markdown = "\n\n".join(pieces)

    if not markdown or not markdown.strip():
        raise HTTPException(
            status_code=400, detail="Document has no content to download"
        )

    # Sanitize the title for use as a filename: keep alphanumerics, spaces,
    # dashes, underscores; replace everything else; cap at 80 chars.
    raw_title = document.title or "document"
    cleaned = "".join(ch if ch.isalnum() or ch in " -_" else "_" for ch in raw_title)
    safe_title = cleaned.strip()[:80] or "document"

    return StreamingResponse(
        io.BytesIO(markdown.encode("utf-8")),
        media_type="text/markdown; charset=utf-8",
        headers={"Content-Disposition": f'attachment; filename="{safe_title}.md"'},
    )
|
|
|
|
|
|
@router.post("/search-spaces/{search_space_id}/documents/{document_id}/save")
|
|
async def save_document(
|
|
search_space_id: int,
|
|
document_id: int,
|
|
data: dict[str, Any],
|
|
session: AsyncSession = Depends(get_async_session),
|
|
user: User = Depends(current_active_user),
|
|
):
|
|
"""
|
|
Save document markdown and trigger reindexing.
|
|
Called when user clicks 'Save & Exit'.
|
|
|
|
Accepts { "source_markdown": "...", "title": "..." (optional) }.
|
|
|
|
Requires DOCUMENTS_UPDATE permission.
|
|
"""
|
|
from app.tasks.celery_tasks.document_reindex_tasks import reindex_document_task
|
|
|
|
# Check RBAC permission
|
|
await check_permission(
|
|
session,
|
|
user,
|
|
search_space_id,
|
|
Permission.DOCUMENTS_UPDATE.value,
|
|
"You don't have permission to update documents in this search space",
|
|
)
|
|
|
|
result = await session.execute(
|
|
select(Document).filter(
|
|
Document.id == document_id,
|
|
Document.search_space_id == search_space_id,
|
|
)
|
|
)
|
|
document = result.scalars().first()
|
|
|
|
if not document:
|
|
raise HTTPException(status_code=404, detail="Document not found")
|
|
|
|
source_markdown = data.get("source_markdown")
|
|
if source_markdown is None:
|
|
raise HTTPException(status_code=400, detail="source_markdown is required")
|
|
|
|
if not isinstance(source_markdown, str):
|
|
raise HTTPException(status_code=400, detail="source_markdown must be a string")
|
|
|
|
# For NOTE type, extract title from first heading line if present
|
|
if document.document_type == DocumentType.NOTE:
|
|
# If the frontend sends a title, use it; otherwise extract from markdown
|
|
new_title = data.get("title")
|
|
if not new_title:
|
|
# Extract title from the first line of markdown (# Heading)
|
|
for line in source_markdown.split("\n"):
|
|
stripped = line.strip()
|
|
if stripped.startswith("# "):
|
|
new_title = stripped[2:].strip()
|
|
break
|
|
elif stripped:
|
|
# First non-empty non-heading line
|
|
new_title = stripped[:100]
|
|
break
|
|
|
|
if new_title:
|
|
document.title = new_title.strip()
|
|
else:
|
|
document.title = "Untitled"
|
|
|
|
# Save source_markdown
|
|
document.source_markdown = source_markdown
|
|
document.updated_at = datetime.now(UTC)
|
|
document.content_needs_reindexing = True
|
|
|
|
await session.commit()
|
|
|
|
# Queue reindex task
|
|
reindex_document_task.delay(document_id, str(user.id))
|
|
|
|
return {
|
|
"status": "saved",
|
|
"document_id": document_id,
|
|
"message": "Document saved and will be reindexed in the background",
|
|
"updated_at": document.updated_at.isoformat(),
|
|
}
|
|
|
|
|
|
@router.get("/search-spaces/{search_space_id}/documents/{document_id}/export")
|
|
async def export_document(
|
|
search_space_id: int,
|
|
document_id: int,
|
|
format: ExportFormat = Query(
|
|
ExportFormat.PDF,
|
|
description="Export format: pdf, docx, html, latex, epub, odt, or plain",
|
|
),
|
|
session: AsyncSession = Depends(get_async_session),
|
|
user: User = Depends(current_active_user),
|
|
):
|
|
"""Export a document in the requested format (reuses the report export pipeline)."""
|
|
await check_permission(
|
|
session,
|
|
user,
|
|
search_space_id,
|
|
Permission.DOCUMENTS_READ.value,
|
|
"You don't have permission to read documents in this search space",
|
|
)
|
|
|
|
result = await session.execute(
|
|
select(Document).filter(
|
|
Document.id == document_id,
|
|
Document.search_space_id == search_space_id,
|
|
)
|
|
)
|
|
document = result.scalars().first()
|
|
if not document:
|
|
raise HTTPException(status_code=404, detail="Document not found")
|
|
|
|
markdown_content: str | None = document.source_markdown
|
|
if markdown_content is None and document.blocknote_document:
|
|
from app.utils.blocknote_to_markdown import blocknote_to_markdown
|
|
|
|
markdown_content = blocknote_to_markdown(document.blocknote_document)
|
|
if markdown_content is None:
|
|
chunk_contents_result = await session.execute(
|
|
select(Chunk.content)
|
|
.filter(Chunk.document_id == document_id)
|
|
.order_by(Chunk.id)
|
|
)
|
|
chunk_contents = chunk_contents_result.scalars().all()
|
|
if chunk_contents:
|
|
markdown_content = "\n\n".join(chunk_contents)
|
|
|
|
if not markdown_content or not markdown_content.strip():
|
|
raise HTTPException(status_code=400, detail="Document has no content to export")
|
|
|
|
markdown_content = _strip_wrapping_code_fences(markdown_content)
|
|
markdown_content = _normalize_latex_delimiters(markdown_content)
|
|
|
|
doc_title = document.title or "Document"
|
|
formatted_date = (
|
|
document.created_at.strftime("%B %d, %Y") if document.created_at else ""
|
|
)
|
|
input_fmt = "gfm+tex_math_dollars"
|
|
meta_args = ["-M", f"title:{doc_title}", "-M", f"date:{formatted_date}"]
|
|
|
|
def _convert_and_read() -> bytes:
|
|
if format == ExportFormat.PDF:
|
|
typst_template = str(get_typst_template_path())
|
|
typst_markup: str = pypandoc.convert_text(
|
|
markdown_content,
|
|
"typst",
|
|
format=input_fmt,
|
|
extra_args=[
|
|
"--standalone",
|
|
f"--template={typst_template}",
|
|
"-V",
|
|
"mainfont:Libertinus Serif",
|
|
"-V",
|
|
"codefont:DejaVu Sans Mono",
|
|
*meta_args,
|
|
],
|
|
)
|
|
return typst.compile(typst_markup.encode("utf-8"))
|
|
|
|
if format == ExportFormat.DOCX:
|
|
return _pandoc_to_tempfile(
|
|
format.value,
|
|
[
|
|
"--standalone",
|
|
f"--reference-doc={get_reference_docx_path()}",
|
|
*meta_args,
|
|
],
|
|
)
|
|
|
|
if format == ExportFormat.HTML:
|
|
html_str: str = pypandoc.convert_text(
|
|
markdown_content,
|
|
"html5",
|
|
format=input_fmt,
|
|
extra_args=[
|
|
"--standalone",
|
|
"--embed-resources",
|
|
f"--css={get_html_css_path()}",
|
|
"--syntax-highlighting=pygments",
|
|
*meta_args,
|
|
],
|
|
)
|
|
return html_str.encode("utf-8")
|
|
|
|
if format == ExportFormat.EPUB:
|
|
return _pandoc_to_tempfile("epub3", ["--standalone", *meta_args])
|
|
|
|
if format == ExportFormat.ODT:
|
|
return _pandoc_to_tempfile("odt", ["--standalone", *meta_args])
|
|
|
|
if format == ExportFormat.LATEX:
|
|
tex_str: str = pypandoc.convert_text(
|
|
markdown_content,
|
|
"latex",
|
|
format=input_fmt,
|
|
extra_args=["--standalone", *meta_args],
|
|
)
|
|
return tex_str.encode("utf-8")
|
|
|
|
plain_str: str = pypandoc.convert_text(
|
|
markdown_content,
|
|
"plain",
|
|
format=input_fmt,
|
|
extra_args=["--wrap=auto", "--columns=80"],
|
|
)
|
|
return plain_str.encode("utf-8")
|
|
|
|
def _pandoc_to_tempfile(output_format: str, extra_args: list[str]) -> bytes:
|
|
fd, tmp_path = tempfile.mkstemp(suffix=f".{output_format}")
|
|
os.close(fd)
|
|
try:
|
|
pypandoc.convert_text(
|
|
markdown_content,
|
|
output_format,
|
|
format=input_fmt,
|
|
extra_args=extra_args,
|
|
outputfile=tmp_path,
|
|
)
|
|
with open(tmp_path, "rb") as f:
|
|
return f.read()
|
|
finally:
|
|
os.unlink(tmp_path)
|
|
|
|
try:
|
|
loop = asyncio.get_running_loop()
|
|
output = await loop.run_in_executor(None, _convert_and_read)
|
|
except Exception as e:
|
|
logger.exception("Document export failed")
|
|
raise HTTPException(status_code=500, detail=f"Export failed: {e!s}") from e
|
|
|
|
safe_title = (
|
|
"".join(c if c.isalnum() or c in " -_" else "_" for c in doc_title).strip()[:80]
|
|
or "document"
|
|
)
|
|
ext = _FILE_EXTENSIONS[format]
|
|
|
|
return StreamingResponse(
|
|
io.BytesIO(output),
|
|
media_type=_MEDIA_TYPES[format],
|
|
headers={"Content-Disposition": f'attachment; filename="{safe_title}.{ext}"'},
|
|
)
|