SurfSense/surfsense_backend/app/gateway/telegram/formatting.py

55 lines
1.5 KiB
Python

"""Telegram formatting helpers."""
from __future__ import annotations
import re
# Characters that Telegram's MarkdownV2 parse mode treats as markup and that
# therefore must be escaped in literal text. The backslash is not listed here
# because it is the escape character itself; the regex below handles it.
MARKDOWN_V2_RESERVED = r"_*[]()~`>#+-=|{}.!"

# Telegram's per-message text limit, measured in UTF-16 code units.
MAX_TELEGRAM_MESSAGE_UNITS = 4096

# Every reserved character plus the backslash. A literal backslash has to be
# escaped too: otherwise input such as ``\_`` would become ``\\_``, which
# Telegram reads as an escaped backslash followed by a *live* underscore.
_RESERVED_RE = re.compile(r"([\\_\*\[\]\(\)~`>#+\-=|{}\.!])")


def escape_markdown_v2(text: str) -> str:
    """Escape *text* so Telegram MarkdownV2 renders it literally.

    Each MarkdownV2 reserved character, and each backslash, is prefixed with
    a backslash. All other characters are passed through unchanged.

    Args:
        text: Plain text that must not be interpreted as markup.

    Returns:
        The escaped string; an empty input yields an empty string.
    """
    return _RESERVED_RE.sub(r"\\\1", text)
def _utf16_len(text: str) -> int:
    """Return the length of *text* in UTF-16 code units.

    Code points above U+FFFF take two units (a surrogate pair); every other
    code point takes one.
    """
    # "surrogatepass" lets a lone surrogate (e.g. from badly decoded input)
    # be measured as one unit instead of raising UnicodeEncodeError.
    return len(text.encode("utf-16-le", "surrogatepass")) // 2


def _split_at_boundary(text: str, max_units: int) -> tuple[str, str]:
    """Split *text* into a head of at most *max_units* UTF-16 units and a rest.

    The head is the longest prefix that fits. If that prefix contains a line
    break or a sentence end (``". "``) late enough in the text, the cut is
    moved back to just after the last such boundary so the head ends cleanly.

    Args:
        text: The text to split.
        max_units: Maximum size of the head in UTF-16 code units (>= 1).

    Returns:
        ``(head, rest)`` with ``head + rest == text``. ``rest`` is empty when
        the whole text fits. ``head`` is never empty when ``rest`` is not.

    Raises:
        ValueError: If ``max_units`` is below 1, or if it is 1 and the next
            code point needs two units. In both cases no non-empty head can
            fit, and returning an empty one would make a chunking loop spin
            forever.
    """
    if max_units < 1:
        raise ValueError(f"max_units must be at least 1, got {max_units}")
    if _utf16_len(text) <= max_units:
        return text, ""

    # A prefix of k code points is at least k units long, so the cut can never
    # lie beyond min(len(text), max_units) code points. Telegram's limit is in
    # UTF-16 code units, so shrink from that upper bound until the prefix fits.
    end = min(len(text), max_units)
    while end > 0:
        excess = _utf16_len(text[:end]) - max_units
        if excess <= 0:
            break
        # Dropping one code point frees at most two units, so at least
        # ceil(excess / 2) code points must go. Stepping by that amount never
        # overshoots the longest fitting prefix and avoids re-encoding the
        # prefix once per dropped character.
        end -= (excess + 1) // 2
    if end == 0:
        raise ValueError(
            f"max_units={max_units} is too small to hold the next character"
        )

    candidate = text[:end]
    sentence_end = candidate.rfind(". ")
    # The last newline is always at or after any blank-line ("\n\n") break,
    # so searching for "\n" alone covers paragraph boundaries as well.
    line_break = candidate.rfind("\n")
    boundary = max(sentence_end, line_break)
    # Only honour a boundary that keeps the head reasonably full: it must lie
    # past offset 200 and past the midpoint of the fitting prefix.
    if boundary > max(200, end // 2):
        # Keep the delimiter in the head: two characters for ". ", one for "\n".
        end = boundary + (2 if boundary == sentence_end else 1)
    return text[:end], text[end:]
def chunk_message(
    text: str,
    *,
    max_units: int = MAX_TELEGRAM_MESSAGE_UNITS,
) -> list[str]:
    """Break *text* into consecutive pieces sized for Telegram messages.

    The text is consumed front to back: ``_split_at_boundary`` peels off one
    piece at a time, preferring paragraph/sentence boundaries, until nothing
    is left.

    Args:
        text: The message text to split.
        max_units: Size budget per piece, passed through to
            ``_split_at_boundary`` (UTF-16 code units).

    Returns:
        The pieces in their original order. Empty input yields ``[""]`` so
        the result always holds at least one element.
    """
    pieces: list[str] = []
    rest = text
    while rest:
        head, rest = _split_at_boundary(rest, max_units)
        pieces.append(head)
    # No iteration ran only when the input was empty; keep the one-element
    # contract for that case.
    return pieces or [""]