mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-27 19:25:15 +02:00
feat: add memory document model and parsing functionality for markdown handling
This commit is contained in:
parent
fe07de3f9c
commit
a0ff86e0e8
5 changed files with 241 additions and 37 deletions
200
surfsense_backend/app/services/memory/document.py
Normal file
200
surfsense_backend/app/services/memory/document.py
Normal file
|
|
@ -0,0 +1,200 @@
|
|||
"""Memory-specific markdown document model and canonical renderer.
|
||||
|
||||
This intentionally parses only SurfSense memory's small markdown contract:
|
||||
``##`` sections with dated bullet items. Unknown lines are preserved so user
|
||||
edits are not lost, while legacy marker bullets are normalized on render.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import date
|
||||
|
||||
DEFAULT_LEGACY_SECTION = "Memory"
|
||||
LEGACY_MARKERS = frozenset({"fact", "pref", "instr"})
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class MemoryBullet:
|
||||
entry_date: date
|
||||
text: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class MemoryRawLine:
|
||||
text: str
|
||||
|
||||
|
||||
MemoryLine = MemoryBullet | MemoryRawLine
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class MemorySection:
|
||||
heading: str
|
||||
lines: list[MemoryLine] = field(default_factory=list)
|
||||
explicit_heading: bool = True
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class MemoryDocument:
|
||||
sections: list[MemorySection] = field(default_factory=list)
|
||||
|
||||
@property
|
||||
def has_explicit_heading(self) -> bool:
|
||||
return any(section.explicit_heading for section in self.sections)
|
||||
|
||||
|
||||
def is_section_heading(line: str) -> bool:
|
||||
return line.startswith("## ") and bool(line[3:].strip())
|
||||
|
||||
|
||||
def heading_text(line: str) -> str:
|
||||
return line[3:].strip()
|
||||
|
||||
|
||||
def normalize_heading(heading: str) -> str:
|
||||
chars: list[str] = []
|
||||
previous_was_space = True
|
||||
for char in heading.strip().lower():
|
||||
if char.isalnum():
|
||||
chars.append(char)
|
||||
previous_was_space = False
|
||||
elif not previous_was_space:
|
||||
chars.append(" ")
|
||||
previous_was_space = True
|
||||
return "".join(chars).strip()
|
||||
|
||||
|
||||
def parse_bullet_line(line: str) -> MemoryBullet | None:
|
||||
stripped = line.strip()
|
||||
if not stripped.startswith("- "):
|
||||
return None
|
||||
|
||||
body = stripped[2:]
|
||||
parsed = _parse_canonical_bullet(body)
|
||||
if parsed is not None:
|
||||
return parsed
|
||||
return _parse_legacy_bullet(body)
|
||||
|
||||
|
||||
def _parse_canonical_bullet(body: str) -> MemoryBullet | None:
|
||||
if len(body) < 13 or body[10:12] != ": ":
|
||||
return None
|
||||
try:
|
||||
entry_date = date.fromisoformat(body[:10])
|
||||
except ValueError:
|
||||
return None
|
||||
text = body[12:].strip()
|
||||
if not text:
|
||||
return None
|
||||
return MemoryBullet(entry_date=entry_date, text=text)
|
||||
|
||||
|
||||
def _parse_legacy_bullet(body: str) -> MemoryBullet | None:
|
||||
if len(body) < 20 or not body.startswith("("):
|
||||
return None
|
||||
if len(body) < 14 or body[11:14] != ") [":
|
||||
return None
|
||||
try:
|
||||
entry_date = date.fromisoformat(body[1:11])
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
marker_end = body.find("] ", 14)
|
||||
if marker_end == -1:
|
||||
return None
|
||||
marker = body[14:marker_end]
|
||||
if marker not in LEGACY_MARKERS:
|
||||
return None
|
||||
|
||||
text = body[marker_end + 2 :].strip()
|
||||
if not text:
|
||||
return None
|
||||
return MemoryBullet(entry_date=entry_date, text=text)
|
||||
|
||||
|
||||
def parse_memory_document(content: str | None) -> MemoryDocument:
|
||||
if not content:
|
||||
return MemoryDocument()
|
||||
|
||||
sections: list[MemorySection] = []
|
||||
current_heading: str | None = None
|
||||
current_explicit = True
|
||||
current_lines: list[MemoryLine] = []
|
||||
|
||||
def flush_current() -> None:
|
||||
nonlocal current_heading, current_explicit, current_lines
|
||||
if current_heading is None:
|
||||
return
|
||||
sections.append(
|
||||
MemorySection(
|
||||
heading=current_heading,
|
||||
lines=current_lines,
|
||||
explicit_heading=current_explicit,
|
||||
)
|
||||
)
|
||||
current_heading = None
|
||||
current_explicit = True
|
||||
current_lines = []
|
||||
|
||||
for raw_line in content.strip().splitlines():
|
||||
line = raw_line.rstrip()
|
||||
if is_section_heading(line):
|
||||
flush_current()
|
||||
current_heading = heading_text(line)
|
||||
current_explicit = True
|
||||
current_lines = []
|
||||
continue
|
||||
|
||||
bullet = parse_bullet_line(line)
|
||||
if current_heading is None:
|
||||
if bullet is None:
|
||||
continue
|
||||
current_heading = DEFAULT_LEGACY_SECTION
|
||||
current_explicit = False
|
||||
current_lines = [bullet]
|
||||
continue
|
||||
|
||||
current_lines.append(bullet if bullet is not None else MemoryRawLine(text=line))
|
||||
|
||||
flush_current()
|
||||
return MemoryDocument(sections=sections)
|
||||
|
||||
|
||||
def render_memory_document(document: MemoryDocument) -> str:
|
||||
rendered_sections: list[str] = []
|
||||
for section in document.sections:
|
||||
section_lines = [f"## {section.heading}"]
|
||||
for line in section.lines:
|
||||
if isinstance(line, MemoryBullet):
|
||||
section_lines.append(f"- {line.entry_date.isoformat()}: {line.text}")
|
||||
else:
|
||||
section_lines.append(line.text)
|
||||
rendered_sections.append("\n".join(section_lines).strip())
|
||||
return "\n\n".join(section for section in rendered_sections if section).strip()
|
||||
|
||||
|
||||
def extract_headings(memory: str | None) -> set[str]:
|
||||
document = parse_memory_document(memory)
|
||||
return {
|
||||
normalize_heading(section.heading)
|
||||
for section in document.sections
|
||||
if section.explicit_heading
|
||||
}
|
||||
|
||||
|
||||
def has_explicit_heading(content: str) -> bool:
|
||||
return parse_memory_document(content).has_explicit_heading
|
||||
|
||||
|
||||
def nonstandard_bullets(content: str) -> list[str]:
|
||||
warnings: list[str] = []
|
||||
for line in content.splitlines():
|
||||
stripped = line.strip()
|
||||
if not stripped.startswith("- "):
|
||||
continue
|
||||
if parse_bullet_line(stripped) is not None:
|
||||
continue
|
||||
short = stripped[:80] + ("..." if len(stripped) > 80 else "")
|
||||
warnings.append(f"Non-standard memory bullet: {short}")
|
||||
return warnings
|
||||
Loading…
Add table
Add a link
Reference in a new issue