mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-02 19:55:18 +02:00
55 lines
1.5 KiB
Python
55 lines
1.5 KiB
Python
"""Telegram formatting helpers."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
# Characters that Telegram's MarkdownV2 parse mode treats as reserved markup.
MARKDOWN_V2_RESERVED = r"_*[]()~`>#+-=|{}.!"
|
|
# Default per-message size limit used by chunk_message, counted in UTF-16
# code units rather than Python code points.
MAX_TELEGRAM_MESSAGE_UNITS = 4096
|
|
|
|
# Every character that must be backslash-escaped in MarkdownV2 text. The
# backslash itself is included: it is Telegram's escape character, so a
# literal one left untouched would pair up with the escape inserted for the
# following character and leave that character unescaped.
_RESERVED_RE = re.compile(r"([\\_*\[\]()~`>#+\-=|{}.!])")


def escape_markdown_v2(text: str) -> str:
    """Escape ``text`` so Telegram renders it literally under MarkdownV2.

    Each reserved character (``_ * [ ] ( ) ~ ` > # + - = | { } . !``) and each
    literal backslash is prefixed with a backslash. The input is treated as
    plain text: already-escaped input will be escaped again.

    Args:
        text: Plain text that should appear verbatim in the message.

    Returns:
        The escaped text; unchanged if it contains nothing to escape.
    """
    return _RESERVED_RE.sub(r"\\\1", text)
|
|
|
|
|
|
def _utf16_len(text: str) -> int:
|
|
return len(text.encode("utf-16-le")) // 2
|
|
|
|
|
|
def _split_at_boundary(text: str, max_units: int) -> tuple[str, str]:
    """Split ``text`` into a head that fits ``max_units`` and the remainder.

    The head is the longest prefix whose UTF-16 length does not exceed
    ``max_units``. If that prefix contains a paragraph break, a sentence end
    (``". "``) or a newline late enough (past both index 200 and its own
    midpoint), the head is shortened to end just after the last such boundary.

    Args:
        text: The text still to be sent.
        max_units: Maximum size of the head in UTF-16 code units.

    Returns:
        ``(head, tail)`` with ``head + tail == text``. ``head`` is non-empty
        whenever ``text`` is non-empty, so repeated splitting always
        terminates. The only case where ``head`` exceeds ``max_units`` is when
        not even the first code point fits.
    """
    if _utf16_len(text) <= max_units:
        return text, ""

    # Slicing works on code points, but the limit is in UTF-16 code units:
    # start from a code-point upper bound and shrink until the head fits.
    end = min(len(text), max_units)
    while end > 0 and _utf16_len(text[:end]) > max_units:
        end -= 1

    if end <= 0:
        # Not even the first code point fits (max_units < 1, or an astral
        # character with max_units == 1). Returning an empty head would make
        # the caller loop forever, and a negative end would slice from the
        # wrong side of the string, so emit the first code point on its own.
        return text[:1], text[1:]

    candidate = text[:end]
    boundary = max(candidate.rfind("\n\n"), candidate.rfind(". "), candidate.rfind("\n"))
    # Only honour a boundary that keeps the chunk reasonably full; otherwise
    # fall back to the hard cut computed above.
    if boundary > max(200, end // 2):
        # Keep the delimiter with the head: two characters for a paragraph
        # break or sentence end, one for a single newline.
        end = boundary + (2 if candidate[boundary : boundary + 2] in {"\n\n", ". "} else 1)

    return text[:end], text[end:]
|
|
|
|
|
|
def chunk_message(
    text: str,
    *,
    max_units: int = MAX_TELEGRAM_MESSAGE_UNITS,
) -> list[str]:
    """Split a Telegram message at paragraph/sentence boundaries.

    Args:
        text: The full message text.
        max_units: Maximum size of each chunk in UTF-16 code units.

    Returns:
        The chunks in order; concatenating them reproduces ``text``. Empty
        input yields ``[""]`` so callers always receive at least one chunk.

    Raises:
        ValueError: If ``text`` is non-empty and ``max_units`` is less than 1,
            since no chunk could ever satisfy such a limit.
    """
    if not text:
        return [""]
    if max_units < 1:
        raise ValueError(f"max_units must be at least 1, got {max_units}")

    chunks: list[str] = []
    remaining = text
    while remaining:
        chunk, remaining = _split_at_boundary(remaining, max_units)
        if not chunk:
            # A code point that needs more units than max_units (an astral
            # character with max_units == 1) cannot be shrunk any further;
            # emit it alone so the loop always advances instead of spinning.
            chunk, remaining = remaining[:1], remaining[1:]
        chunks.append(chunk)
    return chunks
|
|
|