mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 08:46:22 +02:00
291 lines
10 KiB
Python
291 lines
10 KiB
Python
"""Pure-Python converter: BlockNote JSON → Markdown.
|
|
|
|
No external dependencies (no Node.js, no npm packages, no HTTP calls).
|
|
Handles all standard BlockNote block types. Produces output equivalent to
|
|
BlockNote's own ``blocksToMarkdownLossy()``.
|
|
|
|
Usage:
|
|
from app.utils.blocknote_to_markdown import blocknote_to_markdown
|
|
|
|
markdown = blocknote_to_markdown(blocknote_json)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from typing import Any
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Inline content → markdown text
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _render_inline_content(content: list[dict[str, Any]] | None) -> str:
    """Convert a BlockNote inline content array to a markdown string.

    Args:
        content: List of inline content items (``text`` / ``link`` dicts).
            A plain string is returned unchanged, since BlockNote also accepts
            string shorthand for content. ``None``, empty values and any other
            non-list input yield an empty string.

    Returns:
        The concatenated markdown for all items. Non-dict items are skipped.
        Underline and other styles without a markdown equivalent are dropped
        (lossy).
    """
    if isinstance(content, str):
        return content
    if not content or not isinstance(content, list):
        return ""

    parts: list[str] = []
    for item in content:
        if not isinstance(item, dict):
            continue

        item_type = item.get("type", "text")

        if item_type == "link":
            href = item.get("href") or ""
            # Fall back to the URL when the link has no (or only empty) content,
            # so we never emit an invisible "[](url)" link.
            link_text = _render_inline_content(item.get("content")) or href
            parts.append(f"[{link_text}]({href})")
            continue

        raw_text = item.get("text")
        if raw_text is None:
            text = ""
        elif isinstance(raw_text, str):
            text = raw_text
        else:
            text = str(raw_text)

        if item_type != "text":
            # Unknown inline type — keep its text if it has any.
            if text:
                parts.append(text)
            continue

        styles = item.get("styles")
        if not isinstance(styles, dict):
            # "styles" may be missing or explicitly null in stored JSON.
            styles = {}

        if not text:
            # Wrapping empty text would emit bare markers such as "****",
            # which markdown parses as a thematic break.
            parts.append(text)
        elif styles.get("code"):
            # Code wins over other marks: nested emphasis markers would be
            # shown literally inside a code span.
            parts.append(_wrap_inline_code(text))
        else:
            parts.append(_wrap_emphasis(text, styles))

    return "".join(parts)


def _wrap_inline_code(text: str) -> str:
    """Wrap *text* in a backtick fence longer than any backtick run it contains."""
    longest = run = 0
    for char in text:
        run = run + 1 if char == "`" else 0
        longest = max(longest, run)
    fence = "`" * (longest + 1)
    # Content that starts or ends with a backtick must be padded with a space
    # so that it is not merged into the fence.
    pad = " " if text.startswith("`") or text.endswith("`") else ""
    return f"{fence}{pad}{text}{pad}{fence}"


def _wrap_emphasis(text: str, styles: dict[str, Any]) -> str:
    """Apply bold / italic / strikethrough markers around the non-blank core of *text*."""
    left_stripped = text.lstrip()
    leading = text[: len(text) - len(left_stripped)]
    core = left_stripped.rstrip()
    trailing = left_stripped[len(core):]
    if not core:
        # Whitespace only: nothing to emphasise.
        return text

    # Markers must touch non-whitespace characters to be recognised as
    # emphasis, so surrounding whitespace is kept outside of them.
    if styles.get("bold"):
        core = f"**{core}**"
    if styles.get("italic"):
        core = f"*{core}*"
    if styles.get("strikethrough"):
        core = f"~~{core}~~"
    # underline has no markdown equivalent — keep as plain text (lossy)
    return f"{leading}{core}{trailing}"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Block → markdown lines
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _render_block(
    block: dict[str, Any], indent: int = 0, numbered_list_counter: int = 0
) -> tuple[list[str], int]:
    """Convert a single BlockNote block (and its children) to markdown lines.

    Args:
        block: A BlockNote block dict.
        indent: Current indentation level (for nested children).
        numbered_list_counter: Current counter for consecutive numbered list
            items at this nesting level.

    Returns:
        A tuple of (list of markdown lines without trailing newlines,
        updated numbered_list_counter). The counter is only changed by the
        block itself; nested children are numbered independently.
    """
    block_type = block.get("type", "paragraph")
    # "props" / "children" may be missing or explicitly null in stored JSON.
    props = block.get("props")
    if not isinstance(props, dict):
        props = {}
    content = block.get("content")
    children = block.get("children")
    if not isinstance(children, list):
        children = []
    prefix = "  " * indent  # 2-space indent per nesting level

    lines: list[str] = []

    # --- Block type handlers ---

    if block_type == "paragraph":
        lines.append(f"{prefix}{_block_text(content)}")

    elif block_type == "heading":
        try:
            level = int(props.get("level", 1))
        except (TypeError, ValueError):
            level = 1
        hashes = "#" * min(max(level, 1), 6)  # markdown supports h1..h6 only
        lines.append(f"{prefix}{hashes} {_block_text(content)}")

    elif block_type == "bulletListItem":
        lines.append(f"{prefix}- {_block_text(content)}")

    elif block_type == "numberedListItem":
        # Use props.start if present and numeric, otherwise increment counter
        start = props.get("start")
        try:
            numbered_list_counter = (
                int(start) if start is not None else numbered_list_counter + 1
            )
        except (TypeError, ValueError):
            numbered_list_counter += 1
        lines.append(f"{prefix}{numbered_list_counter}. {_block_text(content)}")

    elif block_type == "checkListItem":
        marker = "[x]" if props.get("checked", False) else "[ ]"
        lines.append(f"{prefix}- {marker} {_block_text(content)}")

    elif block_type == "codeBlock":
        language = props.get("language") or ""
        # Code blocks store content as a single text item
        code_text = _block_text(content)
        lines.append(f"{prefix}```{language}")
        lines.extend(f"{prefix}{code_line}" for code_line in code_text.split("\n"))
        lines.append(f"{prefix}```")

    elif block_type == "table":
        lines.extend(_render_table(content, prefix))

    elif block_type == "image":
        url = props.get("url") or ""
        caption = props.get("caption") or props.get("name") or ""
        if url:
            lines.append(f"{prefix}![{caption}]({url})")

    elif block_type in ("video", "audio"):
        # Markdown has no media embeds; emit a plain link labelled by the
        # caption, or by the block type when there is no caption.
        url = props.get("url") or ""
        caption = props.get("caption") or block_type
        if url:
            lines.append(f"{prefix}[{caption}]({url})")

    elif block_type == "file":
        url = props.get("url") or ""
        name = props.get("name") or props.get("caption") or "file"
        if url:
            lines.append(f"{prefix}[{name}]({url})")

    else:
        # Unknown block type — extract text content if possible, skip otherwise
        text = _block_text(content)
        if text:
            lines.append(f"{prefix}{text}")
        # If no content at all, silently skip (lossy)

    # --- Render nested children (indented) ---
    # Children get their own numbering so a nested list restarts at 1 and does
    # not advance the counter of the list this block belongs to.
    child_counter = 0
    for child in children:
        if not isinstance(child, dict):
            continue
        if child.get("type", "paragraph") != "numberedListItem":
            child_counter = 0  # a non-numbered sibling ends the numbered run
        child_lines, child_counter = _render_block(
            child, indent=indent + 1, numbered_list_counter=child_counter
        )
        lines.extend(child_lines)

    return lines, numbered_list_counter


def _block_text(content: Any) -> str:
    """Return the markdown text for a block's ``content`` (string or inline list)."""
    if isinstance(content, str):
        return content
    if isinstance(content, list) and content:
        return _render_inline_content(content)
    return ""


def _table_cell_text(cell: Any) -> str:
    """Render one table cell to single-line text that is safe inside a table row."""
    if cell is None:
        text = ""
    elif isinstance(cell, list):
        # Cell is a list of inline content
        text = _render_inline_content(cell)
    elif isinstance(cell, dict):
        # Cell is a tableCell object with its own content
        cell_content = cell.get("content")
        text = (
            _render_inline_content(cell_content)
            if isinstance(cell_content, list)
            else ""
        )
    elif isinstance(cell, str):
        text = cell
    else:
        text = str(cell)
    # A raw pipe or newline would split the cell / terminate the row.
    return text.replace("|", "\\|").replace("\n", " ")


def _render_table(table_content: Any, prefix: str) -> list[str]:
    """Render table content (``{"rows": [...]}`` or a bare row list) as table lines."""
    if isinstance(table_content, dict):
        rows = table_content.get("rows") or []
    elif isinstance(table_content, list):
        # Some versions store rows directly as a list
        rows = table_content
    else:
        rows = []

    table_lines: list[str] = []
    for row in rows:
        cells = row.get("cells") if isinstance(row, dict) else row
        if not isinstance(cells, list):
            continue  # malformed row — skip rather than crash
        cell_texts = [_table_cell_text(cell) for cell in cells]
        table_lines.append(f"{prefix}| {' | '.join(cell_texts)} |")
        if len(table_lines) == 1:
            # Add header separator after the first rendered row
            table_lines.append(f"{prefix}| {' | '.join('---' for _ in cell_texts)} |")
    return table_lines
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Public API
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def blocknote_to_markdown(
    blocks: list[dict[str, Any]] | dict[str, Any] | None,
) -> str | None:
    """Convert a BlockNote document (list of blocks) to a markdown string.

    Top-level blocks are separated by one blank line, except consecutive list
    items of the same type, which stay on adjacent lines. Numbered list items
    are counted per consecutive run; the count restarts once a block of
    another type is seen. Non-dict entries in the list are skipped.

    Args:
        blocks: BlockNote JSON — either a list of block dicts, or a single
            block dict, or None.

    Returns:
        Markdown string (stripped of surrounding whitespace), or None if the
        input is empty, of an unexpected type (a warning is logged), or
        produces no output.

    Examples:
        >>> blocknote_to_markdown([
        ...     {"type": "heading", "props": {"level": 2},
        ...      "content": [{"type": "text", "text": "Hello", "styles": {}}],
        ...      "children": []},
        ...     {"type": "paragraph",
        ...      "content": [{"type": "text", "text": "World", "styles": {"bold": True}}],
        ...      "children": []},
        ... ])
        '## Hello\\n\\n**World**'
    """
    if not blocks:
        return None

    # Normalise: accept a single block as well as a list
    if isinstance(blocks, dict):
        blocks = [blocks]

    if not isinstance(blocks, list):
        # Lazy %-formatting: the message is only built if the record is emitted.
        logger.warning(
            "blocknote_to_markdown received unexpected type: %s", type(blocks)
        )
        return None

    list_item_types = ("bulletListItem", "numberedListItem", "checkListItem")

    all_lines: list[str] = []
    prev_type: str | None = None
    numbered_list_counter: int = 0

    for block in blocks:
        if not isinstance(block, dict):
            continue

        block_type = block.get("type", "paragraph")

        # Reset numbered list counter when we leave a numbered list run
        if block_type != "numberedListItem" and prev_type == "numberedListItem":
            numbered_list_counter = 0

        block_lines, numbered_list_counter = _render_block(
            block, numbered_list_counter=numbered_list_counter
        )

        # Add a blank line between blocks (standard markdown spacing).
        # Exception: consecutive list items of the same type don't get extra
        # blank lines.
        if all_lines and block_lines:
            same_list = block_type == prev_type and block_type in list_item_types
            if not same_list:
                all_lines.append("")

        all_lines.extend(block_lines)
        prev_type = block_type

    result = "\n".join(all_lines).strip()
    return result if result else None
|