feat: add memory document model and parsing functionality for markdown handling

2026-07-20 23:21:06 +02:00 · 2026-05-20 13:20:05 +05:30 · 2026-05-20 13:20:05 +05:30 · a0ff86e0e8
commit a0ff86e0e8
parent fe07de3f9c
5 changed files with 241 additions and 37 deletions
--- a/surfsense_backend/app/services/memory/document.py
+++ b/surfsense_backend/app/services/memory/document.py
@ -0,0 +1,200 @@
+"""Memory-specific markdown document model and canonical renderer.
+
+This intentionally parses only SurfSense memory's small markdown contract:
+``##`` sections with dated bullet items. Unknown lines inside a section are
+preserved so user edits are not lost (non-bullet lines that precede the first
+section are dropped), while legacy marker bullets are normalized on render.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from datetime import date
+
+# Heading given to the implicit section that collects bullets appearing before
+# any ``##`` heading (see ``parse_memory_document``).
+DEFAULT_LEGACY_SECTION = "Memory"
+# Marker names accepted between the brackets of legacy
+# ``(YYYY-MM-DD) [marker] text`` bullets.
+LEGACY_MARKERS = frozenset({"fact", "pref", "instr"})
+
+
+@dataclass(frozen=True)
+class MemoryBullet:
+    """A dated memory entry parsed from a canonical or legacy bullet line."""
+
+    # Date read from the ISO ``YYYY-MM-DD`` part of the bullet.
+    entry_date: date
+    # Entry text; the parsers in this module strip it and reject empty text.
+    text: str
+
+
+@dataclass(frozen=True)
+class MemoryRawLine:
+    """A section line that is not a recognized bullet, kept as-is for rendering."""
+
+    # Line content; ``parse_memory_document`` stores it with trailing
+    # whitespace removed, and rendering emits it unchanged.
+    text: str
+
+
+# A line inside a section: either a recognized dated bullet or a raw line.
+MemoryLine = MemoryBullet | MemoryRawLine
+
+
+@dataclass(frozen=True)
+class MemorySection:
+    """One ``##`` section: its heading and the lines that follow it."""
+
+    # Heading text without the leading ``## `` prefix.
+    heading: str
+    # Bullets and raw lines in document order. ``frozen=True`` only blocks
+    # rebinding the attribute; the list object itself remains mutable.
+    lines: list[MemoryLine] = field(default_factory=list)
+    # False when the parser synthesized the section for bullets that appeared
+    # before any ``##`` heading; True for headings present in the source text.
+    explicit_heading: bool = True
+
+
+@dataclass(frozen=True)
+class MemoryDocument:
+    """Parsed memory markdown: an ordered list of sections."""
+
+    # Sections in the order they were parsed, which is also the render order.
+    sections: list[MemorySection] = field(default_factory=list)
+
+    @property
+    def has_explicit_heading(self) -> bool:
+        """Return True if at least one section has ``explicit_heading`` set."""
+        return any(section.explicit_heading for section in self.sections)
+
+
+def is_section_heading(line: str) -> bool:
+    """Return True when *line* is a ``## `` heading with a non-blank title.
+
+    The prefix must start at column 0; deeper headings such as ``### x`` and
+    a bare ``## `` with nothing after it are not section headings.
+    """
+    if not line.startswith("## "):
+        return False
+    title = line[3:].strip()
+    return title != ""
+
+
+def heading_text(line: str) -> str:
+    """Return the title of a heading line.
+
+    Drops the first three characters (the ``## `` prefix) and strips the rest.
+    No check is made that *line* really is a heading; callers are expected to
+    test it with ``is_section_heading`` first.
+    """
+    title = line[3:]
+    return title.strip()
+
+
+def normalize_heading(heading: str) -> str:
+    """Return a comparison key for *heading*.
+
+    The heading is stripped and lower-cased, every run of non-alphanumeric
+    characters becomes a single space, and leading/trailing separators are
+    removed. ``"Work & Projects!"`` becomes ``"work projects"``.
+    """
+    lowered = heading.strip().lower()
+    # Blank out every separator character, then let ``split`` collapse the
+    # runs; alphanumeric characters are never whitespace, so words stay whole.
+    blanked = "".join(char if char.isalnum() else " " for char in lowered)
+    return " ".join(blanked.split())
+
+
+def parse_bullet_line(line: str) -> MemoryBullet | None:
+    """Parse one markdown line as a dated memory bullet.
+
+    Surrounding whitespace is ignored. The line must start with ``- ``; the
+    remainder is tried as a canonical bullet first and as a legacy bullet
+    second. Returns None when the line matches neither form.
+    """
+    candidate = line.strip()
+    if candidate.startswith("- "):
+        remainder = candidate[2:]
+        canonical = _parse_canonical_bullet(remainder)
+        if canonical is not None:
+            return canonical
+        return _parse_legacy_bullet(remainder)
+    return None
+
+
+def _parse_canonical_bullet(body: str) -> MemoryBullet | None:
+    """Parse a canonical ``YYYY-MM-DD: text`` bullet body.
+
+    *body* is the bullet line without its leading ``- ``. Returns None when
+    the body is too short, the ``": "`` separator is not at offset 10, the
+    first ten characters are rejected by ``date.fromisoformat``, or the text
+    after the separator is blank.
+    """
+    date_part, separator, remainder = body[:10], body[10:12], body[12:]
+    if len(body) < 13 or separator != ": ":
+        return None
+    try:
+        parsed_date = date.fromisoformat(date_part)
+    except ValueError:
+        return None
+    entry_text = remainder.strip()
+    if entry_text:
+        return MemoryBullet(entry_date=parsed_date, text=entry_text)
+    return None
+
+
+def _parse_legacy_bullet(body: str) -> MemoryBullet | None:
+    """Parse a legacy ``(YYYY-MM-DD) [marker] text`` bullet body.
+
+    *body* is the bullet line without its leading ``- ``. The layout is
+    positional: ``(`` at offset 0, the ISO date at offsets 1-10, ``") ["`` at
+    offsets 11-13, then the marker up to the first ``"] "``, then the text.
+
+    Returns None when the body is shorter than 20 characters, any fixed piece
+    is missing, the date is rejected by ``date.fromisoformat``, the marker is
+    not in ``LEGACY_MARKERS``, or the text is blank. The marker itself is
+    discarded; only the date and text are kept.
+    """
+    # A single length guard suffices: anything that passes it is long enough
+    # for the fixed-offset slices below.
+    if len(body) < 20 or not body.startswith("("):
+        return None
+    if body[11:14] != ") [":
+        return None
+    try:
+        entry_date = date.fromisoformat(body[1:11])
+    except ValueError:
+        return None
+
+    # The marker runs from offset 14 up to the first "] " after it.
+    marker_end = body.find("] ", 14)
+    if marker_end == -1:
+        return None
+    marker = body[14:marker_end]
+    if marker not in LEGACY_MARKERS:
+        return None
+
+    text = body[marker_end + 2 :].strip()
+    if not text:
+        return None
+    return MemoryBullet(entry_date=entry_date, text=text)
+
+
+def parse_memory_document(content: str | None) -> MemoryDocument:
+    """Parse memory markdown into a ``MemoryDocument``.
+
+    The content is stripped and split into lines, and each line has its
+    trailing whitespace removed. Then:
+
+    * a ``## `` heading starts a new explicit section;
+    * inside a section, a recognized bullet becomes a ``MemoryBullet`` and any
+      other line (blank lines included) becomes a ``MemoryRawLine``;
+    * before the first section, a recognized bullet opens an implicit
+      ``DEFAULT_LEGACY_SECTION`` section with ``explicit_heading=False``,
+      while any other line is dropped.
+
+    ``None`` or empty content yields an empty document.
+    """
+    if not content:
+        return MemoryDocument()
+
+    # One (heading, explicit, lines) entry per section, in document order;
+    # the last entry is the section currently being filled.
+    pending: list[tuple[str, bool, list[MemoryLine]]] = []
+
+    for source_line in content.strip().splitlines():
+        text = source_line.rstrip()
+
+        if is_section_heading(text):
+            pending.append((heading_text(text), True, []))
+            continue
+
+        entry = parse_bullet_line(text)
+        if pending:
+            open_lines = pending[-1][2]
+            open_lines.append(entry if entry is not None else MemoryRawLine(text=text))
+        elif entry is not None:
+            # First bullet before any heading: open the implicit section.
+            pending.append((DEFAULT_LEGACY_SECTION, False, [entry]))
+        # Otherwise: a non-bullet line ahead of every section is skipped.
+
+    return MemoryDocument(
+        sections=[
+            MemorySection(heading=heading, lines=lines, explicit_heading=explicit)
+            for heading, explicit, lines in pending
+        ]
+    )
+
+
+def render_memory_document(document: MemoryDocument) -> str:
+    """Render *document* as canonical memory markdown.
+
+    Each section becomes a ``## heading`` line followed by its lines: bullets
+    are written as ``- YYYY-MM-DD: text`` and raw lines are emitted unchanged.
+    Each section block is stripped, blocks are joined with one blank line, and
+    the final result is stripped.
+    """
+    blocks: list[str] = []
+    for section in document.sections:
+        body = [
+            f"- {item.entry_date.isoformat()}: {item.text}"
+            if isinstance(item, MemoryBullet)
+            else item.text
+            for item in section.lines
+        ]
+        block = "\n".join([f"## {section.heading}", *body]).strip()
+        if block:
+            blocks.append(block)
+    return "\n\n".join(blocks).strip()
+
+
+def extract_headings(memory: str | None) -> set[str]:
+    """Return the normalized headings of all explicit sections in *memory*.
+
+    The implicit section created for bullets that precede any heading is
+    excluded. Headings are passed through ``normalize_heading``, so headings
+    differing only in case or punctuation collapse to one entry.
+    """
+    normalized: set[str] = set()
+    for section in parse_memory_document(memory).sections:
+        if section.explicit_heading:
+            normalized.add(normalize_heading(section.heading))
+    return normalized
+
+
+def has_explicit_heading(content: str) -> bool:
+    """Return True when *content* contains at least one ``## `` section heading."""
+    document = parse_memory_document(content)
+    return document.has_explicit_heading
+
+
+def nonstandard_bullets(content: str | None) -> list[str]:
+    """Return one warning per ``- `` bullet that is not a valid memory bullet.
+
+    Every line of *content* is stripped; lines that start with ``- `` but are
+    rejected by ``parse_bullet_line`` produce a warning containing the line,
+    truncated to 80 characters with a ``...`` suffix when longer.
+
+    ``None`` or empty content yields an empty list, matching the
+    ``str | None`` inputs accepted by the other helpers in this module.
+    """
+    if not content:
+        return []
+
+    warnings: list[str] = []
+    for line in content.splitlines():
+        stripped = line.strip()
+        if not stripped.startswith("- "):
+            continue
+        if parse_bullet_line(stripped) is not None:
+            continue
+        short = stripped[:80] + ("..." if len(stripped) > 80 else "")
+        warnings.append(f"Non-standard memory bullet: {short}")
+    return warnings