SurfSense/surfsense_backend/app/services/memory/document.py

"""Memory-specific markdown document model and canonical renderer.

This intentionally parses only SurfSense memory's small markdown contract:
``##`` sections with dated bullet items. Unknown lines are preserved so user
edits are not lost, while legacy marker bullets are normalized on render.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from datetime import date

DEFAULT_LEGACY_SECTION = "Memory"
LEGACY_MARKERS = frozenset({"fact", "pref", "instr"})


@dataclass(frozen=True)
class MemoryBullet:
    entry_date: date
    text: str


@dataclass(frozen=True)
class MemoryRawLine:
    text: str


MemoryLine = MemoryBullet | MemoryRawLine


@dataclass(frozen=True)
class MemorySection:
    heading: str
    lines: list[MemoryLine] = field(default_factory=list)
    explicit_heading: bool = True


@dataclass(frozen=True)
class MemoryDocument:
    sections: list[MemorySection] = field(default_factory=list)

    @property
    def has_explicit_heading(self) -> bool:
        return any(section.explicit_heading for section in self.sections)


def is_section_heading(line: str) -> bool:
    return line.startswith("## ") and bool(line[3:].strip())


def heading_text(line: str) -> str:
    return line[3:].strip()


def normalize_heading(heading: str) -> str:
    chars: list[str] = []
    previous_was_space = True
    for char in heading.strip().lower():
        if char.isalnum():
            chars.append(char)
            previous_was_space = False
        elif not previous_was_space:
            chars.append(" ")
            previous_was_space = True
    return "".join(chars).strip()


def parse_bullet_line(line: str) -> MemoryBullet | None:
    stripped = line.strip()
    if not stripped.startswith("- "):
        return None

    body = stripped[2:]
    parsed = _parse_canonical_bullet(body)
    if parsed is not None:
        return parsed
    return _parse_legacy_bullet(body)


def _parse_canonical_bullet(body: str) -> MemoryBullet | None:
    if len(body) < 13 or body[10:12] != ": ":
        return None
    try:
        entry_date = date.fromisoformat(body[:10])
    except ValueError:
        return None
    text = body[12:].strip()
    if not text:
        return None
    return MemoryBullet(entry_date=entry_date, text=text)


def _parse_legacy_bullet(body: str) -> MemoryBullet | None:
    if len(body) < 20 or not body.startswith("("):
        return None
    if len(body) < 14 or body[11:14] != ") [":
        return None
    try:
        entry_date = date.fromisoformat(body[1:11])
    except ValueError:
        return None

    marker_end = body.find("] ", 14)
    if marker_end == -1:
        return None
    marker = body[14:marker_end]
    if marker not in LEGACY_MARKERS:
        return None

    text = body[marker_end + 2 :].strip()
    if not text:
        return None
    return MemoryBullet(entry_date=entry_date, text=text)


def parse_memory_document(content: str | None) -> MemoryDocument:
    if not content:
        return MemoryDocument()

    sections: list[MemorySection] = []
    current_heading: str | None = None
    current_explicit = True
    current_lines: list[MemoryLine] = []

    def flush_current() -> None:
        nonlocal current_heading, current_explicit, current_lines
        if current_heading is None:
            return
        sections.append(
            MemorySection(
                heading=current_heading,
                lines=current_lines,
                explicit_heading=current_explicit,
            )
        )
        current_heading = None
        current_explicit = True
        current_lines = []

    for raw_line in content.strip().splitlines():
        line = raw_line.rstrip()
        if is_section_heading(line):
            flush_current()
            current_heading = heading_text(line)
            current_explicit = True
            current_lines = []
            continue

        bullet = parse_bullet_line(line)
        if current_heading is None:
            if bullet is None:
                continue
            current_heading = DEFAULT_LEGACY_SECTION
            current_explicit = False
            current_lines = [bullet]
            continue

        current_lines.append(bullet if bullet is not None else MemoryRawLine(text=line))

    flush_current()
    return MemoryDocument(sections=sections)


def render_memory_document(document: MemoryDocument) -> str:
    rendered_sections: list[str] = []
    for section in document.sections:
        section_lines = [f"## {section.heading}"]
        for line in section.lines:
            if isinstance(line, MemoryBullet):
                section_lines.append(f"- {line.entry_date.isoformat()}: {line.text}")
            else:
                section_lines.append(line.text)
        rendered_sections.append("\n".join(section_lines).strip())
    return "\n\n".join(section for section in rendered_sections if section).strip()


def extract_headings(memory: str | None) -> set[str]:
    document = parse_memory_document(memory)
    return {
        normalize_heading(section.heading)
        for section in document.sections
        if section.explicit_heading
    }


def has_explicit_heading(content: str) -> bool:
    return parse_memory_document(content).has_explicit_heading


def nonstandard_bullets(content: str) -> list[str]:
    warnings: list[str] = []
    for line in content.splitlines():
        stripped = line.strip()
        if not stripped.startswith("- "):
            continue
        if parse_bullet_line(stripped) is not None:
            continue
        short = stripped[:80] + ("..." if len(stripped) > 80 else "")
        warnings.append(f"Non-standard memory bullet: {short}")
    return warnings