mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-27 19:25:15 +02:00
200 lines
5.6 KiB
Python
200 lines
5.6 KiB
Python
"""Memory-specific markdown document model and canonical renderer.
|
|
|
|
This intentionally parses only SurfSense memory's small markdown contract:
|
|
``##`` sections with dated bullet items. Unknown lines are preserved so user
|
|
edits are not lost, while legacy marker bullets are normalized on render.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from datetime import date
|
|
|
|
DEFAULT_LEGACY_SECTION = "Memory"
|
|
LEGACY_MARKERS = frozenset({"fact", "pref", "instr"})
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class MemoryBullet:
|
|
entry_date: date
|
|
text: str
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class MemoryRawLine:
|
|
text: str
|
|
|
|
|
|
MemoryLine = MemoryBullet | MemoryRawLine
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class MemorySection:
|
|
heading: str
|
|
lines: list[MemoryLine] = field(default_factory=list)
|
|
explicit_heading: bool = True
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class MemoryDocument:
|
|
sections: list[MemorySection] = field(default_factory=list)
|
|
|
|
@property
|
|
def has_explicit_heading(self) -> bool:
|
|
return any(section.explicit_heading for section in self.sections)
|
|
|
|
|
|
def is_section_heading(line: str) -> bool:
|
|
return line.startswith("## ") and bool(line[3:].strip())
|
|
|
|
|
|
def heading_text(line: str) -> str:
|
|
return line[3:].strip()
|
|
|
|
|
|
def normalize_heading(heading: str) -> str:
|
|
chars: list[str] = []
|
|
previous_was_space = True
|
|
for char in heading.strip().lower():
|
|
if char.isalnum():
|
|
chars.append(char)
|
|
previous_was_space = False
|
|
elif not previous_was_space:
|
|
chars.append(" ")
|
|
previous_was_space = True
|
|
return "".join(chars).strip()
|
|
|
|
|
|
def parse_bullet_line(line: str) -> MemoryBullet | None:
|
|
stripped = line.strip()
|
|
if not stripped.startswith("- "):
|
|
return None
|
|
|
|
body = stripped[2:]
|
|
parsed = _parse_canonical_bullet(body)
|
|
if parsed is not None:
|
|
return parsed
|
|
return _parse_legacy_bullet(body)
|
|
|
|
|
|
def _parse_canonical_bullet(body: str) -> MemoryBullet | None:
|
|
if len(body) < 13 or body[10:12] != ": ":
|
|
return None
|
|
try:
|
|
entry_date = date.fromisoformat(body[:10])
|
|
except ValueError:
|
|
return None
|
|
text = body[12:].strip()
|
|
if not text:
|
|
return None
|
|
return MemoryBullet(entry_date=entry_date, text=text)
|
|
|
|
|
|
def _parse_legacy_bullet(body: str) -> MemoryBullet | None:
|
|
if len(body) < 20 or not body.startswith("("):
|
|
return None
|
|
if len(body) < 14 or body[11:14] != ") [":
|
|
return None
|
|
try:
|
|
entry_date = date.fromisoformat(body[1:11])
|
|
except ValueError:
|
|
return None
|
|
|
|
marker_end = body.find("] ", 14)
|
|
if marker_end == -1:
|
|
return None
|
|
marker = body[14:marker_end]
|
|
if marker not in LEGACY_MARKERS:
|
|
return None
|
|
|
|
text = body[marker_end + 2 :].strip()
|
|
if not text:
|
|
return None
|
|
return MemoryBullet(entry_date=entry_date, text=text)
|
|
|
|
|
|
def parse_memory_document(content: str | None) -> MemoryDocument:
|
|
if not content:
|
|
return MemoryDocument()
|
|
|
|
sections: list[MemorySection] = []
|
|
current_heading: str | None = None
|
|
current_explicit = True
|
|
current_lines: list[MemoryLine] = []
|
|
|
|
def flush_current() -> None:
|
|
nonlocal current_heading, current_explicit, current_lines
|
|
if current_heading is None:
|
|
return
|
|
sections.append(
|
|
MemorySection(
|
|
heading=current_heading,
|
|
lines=current_lines,
|
|
explicit_heading=current_explicit,
|
|
)
|
|
)
|
|
current_heading = None
|
|
current_explicit = True
|
|
current_lines = []
|
|
|
|
for raw_line in content.strip().splitlines():
|
|
line = raw_line.rstrip()
|
|
if is_section_heading(line):
|
|
flush_current()
|
|
current_heading = heading_text(line)
|
|
current_explicit = True
|
|
current_lines = []
|
|
continue
|
|
|
|
bullet = parse_bullet_line(line)
|
|
if current_heading is None:
|
|
if bullet is None:
|
|
continue
|
|
current_heading = DEFAULT_LEGACY_SECTION
|
|
current_explicit = False
|
|
current_lines = [bullet]
|
|
continue
|
|
|
|
current_lines.append(bullet if bullet is not None else MemoryRawLine(text=line))
|
|
|
|
flush_current()
|
|
return MemoryDocument(sections=sections)
|
|
|
|
|
|
def render_memory_document(document: MemoryDocument) -> str:
|
|
rendered_sections: list[str] = []
|
|
for section in document.sections:
|
|
section_lines = [f"## {section.heading}"]
|
|
for line in section.lines:
|
|
if isinstance(line, MemoryBullet):
|
|
section_lines.append(f"- {line.entry_date.isoformat()}: {line.text}")
|
|
else:
|
|
section_lines.append(line.text)
|
|
rendered_sections.append("\n".join(section_lines).strip())
|
|
return "\n\n".join(section for section in rendered_sections if section).strip()
|
|
|
|
|
|
def extract_headings(memory: str | None) -> set[str]:
|
|
document = parse_memory_document(memory)
|
|
return {
|
|
normalize_heading(section.heading)
|
|
for section in document.sections
|
|
if section.explicit_heading
|
|
}
|
|
|
|
|
|
def has_explicit_heading(content: str) -> bool:
|
|
return parse_memory_document(content).has_explicit_heading
|
|
|
|
|
|
def nonstandard_bullets(content: str) -> list[str]:
|
|
warnings: list[str] = []
|
|
for line in content.splitlines():
|
|
stripped = line.strip()
|
|
if not stripped.startswith("- "):
|
|
continue
|
|
if parse_bullet_line(stripped) is not None:
|
|
continue
|
|
short = stripped[:80] + ("..." if len(stripped) > 80 else "")
|
|
warnings.append(f"Non-standard memory bullet: {short}")
|
|
return warnings
|