mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-04 20:05:16 +02:00
feat(gateway): add Telegram adapter and formatting
This commit is contained in:
parent
c9b7d7b572
commit
59e6475348
5 changed files with 298 additions and 0 deletions
55
surfsense_backend/app/gateway/telegram/formatting.py
Normal file
55
surfsense_backend/app/gateway/telegram/formatting.py
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
"""Telegram formatting helpers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
# Characters that must be backslash-escaped in ordinary MarkdownV2 text.
MARKDOWN_V2_RESERVED = r"_*[]()~`>#+-=|{}.!"
# Maximum message size, measured in UTF-16 code units.
MAX_TELEGRAM_MESSAGE_UNITS = 4096

# The backslash is matched in addition to MARKDOWN_V2_RESERVED: it is the
# escape character itself, so a literal "\" in the input has to be doubled.
# Otherwise "\." would become "\\." (an escaped backslash followed by a bare,
# unescaped ".") and the message would be rejected by the parser.
_RESERVED_RE = re.compile(r"([\\_\*\[\]\(\)~`>#+\-=|{}\.!])")


def escape_markdown_v2(text: str) -> str:
    """Escape *text* so MarkdownV2 renders it as literal plain text.

    Every character in ``MARKDOWN_V2_RESERVED``, as well as the backslash
    escape character itself, is prefixed with a backslash. The input is
    treated as plain text: escaping an already-escaped string escapes it a
    second time.

    Args:
        text: Plain text to escape.

    Returns:
        The escaped text; an empty string stays empty.
    """
    return _RESERVED_RE.sub(r"\\\1", text)
|
||||
|
||||
|
||||
def _utf16_len(text: str) -> int:
    """Return the size of *text* in UTF-16 code units."""
    # Each UTF-16 code unit is two bytes, so halve the encoded byte count.
    encoded = text.encode("utf-16-le")
    byte_count = len(encoded)
    return byte_count // 2
|
||||
|
||||
|
||||
def _split_at_boundary(text: str, max_units: int) -> tuple[str, str]:
    """Split *text* into a head of at most *max_units* UTF-16 units and a tail.

    The head is cut just after the latest newline or sentence end (``". "``)
    when that boundary lies far enough into the allowed window; otherwise it
    is cut at the hard limit. ``head + tail`` always equals *text*.

    Args:
        text: Text to split.
        max_units: Maximum head size in UTF-16 code units; must be >= 1.

    Returns:
        ``(head, tail)``. ``tail`` is empty when *text* already fits. For
        non-empty *text* the head is never empty: if the first code point
        alone needs more units than *max_units* allows, it is returned on its
        own so that callers looping on the tail always make progress.

    Raises:
        ValueError: If *max_units* is smaller than 1.
    """
    if max_units < 1:
        raise ValueError(f"max_units must be >= 1, got {max_units}")

    # The limit is counted in UTF-16 code units, where code points above the
    # BMP take two units. Count incrementally in a single pass instead of
    # re-encoding the prefix for every candidate cut.
    units = 0
    for end, char in enumerate(text):
        units += 2 if ord(char) > 0xFFFF else 1
        if units > max_units:
            # `end` now equals the length of the longest prefix that fits.
            break
    else:
        return text, ""

    if end == 0:
        # A single two-unit code point cannot be divided; emit it alone
        # rather than an empty head.
        end = 1

    candidate = text[:end]
    boundary = max(candidate.rfind("\n\n"), candidate.rfind(". "), candidate.rfind("\n"))
    # Only honour a natural boundary in the second half of the window (and
    # past 200 code points) so chunks do not become needlessly short.
    if boundary > max(200, end // 2):
        end = boundary + (2 if candidate[boundary : boundary + 2] in {"\n\n", ". "} else 1)

    return text[:end], text[end:]
|
||||
|
||||
|
||||
def chunk_message(
    text: str,
    *,
    max_units: int = MAX_TELEGRAM_MESSAGE_UNITS,
) -> list[str]:
    """Split a Telegram message at paragraph/sentence boundaries.

    Args:
        text: Message text to split.
        max_units: Maximum size of each chunk in UTF-16 code units; must be
            >= 1.

    Returns:
        The chunks in order; joining them reproduces *text*. Empty *text*
        yields ``[""]``. A chunk exceeds *max_units* only when a single code
        point does not fit within the limit on its own.

    Raises:
        ValueError: If *max_units* is smaller than 1.
    """
    if max_units < 1:
        raise ValueError(f"max_units must be >= 1, got {max_units}")
    if not text:
        return [""]

    chunks: list[str] = []
    remaining = text
    while remaining:
        chunk, remaining = _split_at_boundary(remaining, max_units)
        if not chunk:
            # No progress was made (the next code point alone is larger than
            # the limit). Emit that code point by itself; without this the
            # loop would append empty chunks forever.
            chunk, remaining = remaining[:1], remaining[1:]
        chunks.append(chunk)
    return chunks
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue