From a0ff86e0e8e88c17ec83f26c50530137447d2809 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Wed, 20 May 2026 13:20:05 +0530 Subject: [PATCH] feat: add memory document model and parsing functionality for markdown handling --- .../app/services/memory/document.py | 200 ++++++++++++++++++ .../app/services/memory/service.py | 3 + .../app/services/memory/validation.py | 52 ++--- .../tools/test_update_memory_scope.py | 21 ++ .../unit/services/test_memory_service.py | 2 +- 5 files changed, 241 insertions(+), 37 deletions(-) create mode 100644 surfsense_backend/app/services/memory/document.py diff --git a/surfsense_backend/app/services/memory/document.py b/surfsense_backend/app/services/memory/document.py new file mode 100644 index 000000000..498195e25 --- /dev/null +++ b/surfsense_backend/app/services/memory/document.py @@ -0,0 +1,200 @@ +"""Memory-specific markdown document model and canonical renderer. + +This intentionally parses only SurfSense memory's small markdown contract: ``##`` +sections with dated bullet items. Unknown lines inside a section are kept (text before +the first heading or bullet is dropped); legacy marker bullets are normalized on render. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from datetime import date + +DEFAULT_LEGACY_SECTION = "Memory" +LEGACY_MARKERS = frozenset({"fact", "pref", "instr"}) + + +@dataclass(frozen=True) +class MemoryBullet: + entry_date: date + text: str + + +@dataclass(frozen=True) +class MemoryRawLine: + text: str + + +MemoryLine = MemoryBullet | MemoryRawLine + + +@dataclass(frozen=True) +class MemorySection: + heading: str + lines: list[MemoryLine] = field(default_factory=list) + explicit_heading: bool = True + + +@dataclass(frozen=True) +class MemoryDocument: + sections: list[MemorySection] = field(default_factory=list) + + @property + def has_explicit_heading(self) -> bool: + return any(section.explicit_heading for section in self.sections) + + +def is_section_heading(line: str) -> bool: + return line.startswith("## ") and bool(line[3:].strip()) + + +def heading_text(line: str) -> str: + return line[3:].strip() + + +def normalize_heading(heading: str) -> str: + chars: list[str] = [] + previous_was_space = True + for char in heading.strip().lower(): + if char.isalnum(): + chars.append(char) + previous_was_space = False + elif not previous_was_space: + chars.append(" ") + previous_was_space = True + return "".join(chars).strip() + + +def parse_bullet_line(line: str) -> MemoryBullet | None: + stripped = line.strip() + if not stripped.startswith("- "): + return None + + body = stripped[2:] + parsed = _parse_canonical_bullet(body) + if parsed is not None: + return parsed + return _parse_legacy_bullet(body) + + +def _parse_canonical_bullet(body: str) -> MemoryBullet | None: + if len(body) < 13 or body[10:12] != ": ": + return None + try: + entry_date = date.fromisoformat(body[:10]) + except ValueError: + return None + text = body[12:].strip() + if not text: + return None + return MemoryBullet(entry_date=entry_date, text=text) + + +def _parse_legacy_bullet(body: str) -> MemoryBullet | None: + if len(body) < 20 or not 
body.startswith("("): + return None + if len(body) < 14 or body[11:14] != ") [": + return None + try: + entry_date = date.fromisoformat(body[1:11]) + except ValueError: + return None + + marker_end = body.find("] ", 14) + if marker_end == -1: + return None + marker = body[14:marker_end] + if marker not in LEGACY_MARKERS: + return None + + text = body[marker_end + 2 :].strip() + if not text: + return None + return MemoryBullet(entry_date=entry_date, text=text) + + +def parse_memory_document(content: str | None) -> MemoryDocument: + if not content: + return MemoryDocument() + + sections: list[MemorySection] = [] + current_heading: str | None = None + current_explicit = True + current_lines: list[MemoryLine] = [] + + def flush_current() -> None: + nonlocal current_heading, current_explicit, current_lines + if current_heading is None: + return + sections.append( + MemorySection( + heading=current_heading, + lines=current_lines, + explicit_heading=current_explicit, + ) + ) + current_heading = None + current_explicit = True + current_lines = [] + + for raw_line in content.strip().splitlines(): + line = raw_line.rstrip() + if is_section_heading(line): + flush_current() + current_heading = heading_text(line) + current_explicit = True + current_lines = [] + continue + + bullet = parse_bullet_line(line) + if current_heading is None: + if bullet is None: + continue + current_heading = DEFAULT_LEGACY_SECTION + current_explicit = False + current_lines = [bullet] + continue + + current_lines.append(bullet if bullet is not None else MemoryRawLine(text=line)) + + flush_current() + return MemoryDocument(sections=sections) + + +def render_memory_document(document: MemoryDocument) -> str: + rendered_sections: list[str] = [] + for section in document.sections: + section_lines = [f"## {section.heading}"] + for line in section.lines: + if isinstance(line, MemoryBullet): + section_lines.append(f"- {line.entry_date.isoformat()}: {line.text}") + else: + section_lines.append(line.text) + 
rendered_sections.append("\n".join(section_lines).strip()) + return "\n\n".join(section for section in rendered_sections if section).strip() + + +def extract_headings(memory: str | None) -> set[str]: + document = parse_memory_document(memory) + return { + normalize_heading(section.heading) + for section in document.sections + if section.explicit_heading + } + + +def has_explicit_heading(content: str) -> bool: + return parse_memory_document(content).has_explicit_heading + + +def nonstandard_bullets(content: str) -> list[str]: + warnings: list[str] = [] + for line in content.splitlines(): + stripped = line.strip() + if not stripped.startswith("- "): + continue + if parse_bullet_line(stripped) is not None: + continue + short = stripped[:80] + ("..." if len(stripped) > 80 else "") + warnings.append(f"Non-standard memory bullet: {short}") + return warnings diff --git a/surfsense_backend/app/services/memory/service.py b/surfsense_backend/app/services/memory/service.py index d4a7d0974..dd4459e77 100644 --- a/surfsense_backend/app/services/memory/service.py +++ b/surfsense_backend/app/services/memory/service.py @@ -13,6 +13,7 @@ from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from app.db import SearchSpace, User +from app.services.memory.document import parse_memory_document, render_memory_document from app.services.memory.prompts import ( TEAM_MEMORY_EXTRACT_PROMPT, USER_MEMORY_EXTRACT_PROMPT, @@ -184,6 +185,8 @@ async def save_memory( warnings=warnings, ) + next_content = render_memory_document(parse_memory_document(next_content)) + try: _set_memory(target, normalized, next_content) session.add(target) diff --git a/surfsense_backend/app/services/memory/validation.py b/surfsense_backend/app/services/memory/validation.py index f9c5007d9..6565f39c7 100644 --- a/surfsense_backend/app/services/memory/validation.py +++ b/surfsense_backend/app/services/memory/validation.py @@ -2,20 +2,18 @@ from __future__ import annotations -import re from typing 
import Literal +from app.services.memory.document import ( + extract_headings, + has_explicit_heading, + nonstandard_bullets, + parse_memory_document, +) + MEMORY_SOFT_LIMIT = 18_000 MEMORY_HARD_LIMIT = 25_000 -_SECTION_HEADING_RE = re.compile(r"^##\s+(.+)$", re.MULTILINE) -_HEADING_LINE_RE = re.compile(r"^##\s+\S+", re.MULTILINE) -_HEADING_NORMALIZE_RE = re.compile(r"[^a-z0-9]+") -_LEGACY_BULLET_RE = re.compile( - r"^-\s+\(\d{4}-\d{2}-\d{2}\)\s+\[(fact|pref|instr)\]\s+.+$" -) -_NEW_BULLET_RE = re.compile(r"^-\s+\d{4}-\d{2}-\d{2}:\s+.+$") - _FORBIDDEN_TEAM_HEADINGS = { "preferences", "instructions", @@ -25,25 +23,16 @@ _FORBIDDEN_TEAM_HEADINGS = { def has_markdown_heading(content: str) -> bool: - return bool(_HEADING_LINE_RE.search(content)) + return has_explicit_heading(content) def strip_preamble_to_first_heading(content: str) -> str: """Drop model preamble before the first ``##`` heading, if one exists.""" - match = _HEADING_LINE_RE.search(content) - if not match: - return content.strip() - return content[match.start() :].strip() - - -def extract_headings(memory: str | None) -> set[str]: - if not memory: - return set() - return {_normalize_heading(h) for h in _SECTION_HEADING_RE.findall(memory)} - - -def _normalize_heading(heading: str) -> str: - return _HEADING_NORMALIZE_RE.sub(" ", heading.strip().lower()).strip() + lines = content.splitlines() + for index, line in enumerate(lines): + if line.startswith("## ") and line[3:].strip(): + return "\n".join(lines[index:]).strip() + return content.strip() def validate_memory_size(content: str) -> dict[str, str] | None: @@ -69,7 +58,7 @@ def validate_heading_sanity(content: str) -> dict[str, str] | None: return None if len(stripped) <= 40: return None - if any(_LEGACY_BULLET_RE.match(line.strip()) for line in stripped.splitlines()): + if parse_memory_document(stripped).sections: return None return { "status": "error", @@ -115,16 +104,7 @@ def validate_memory_scope( def validate_bullet_format(content: str) -> list[str]: 
- warnings: list[str] = [] - for line in content.splitlines(): - stripped = line.strip() - if not stripped.startswith("- "): - continue - if _NEW_BULLET_RE.match(stripped) or _LEGACY_BULLET_RE.match(stripped): - continue - short = stripped[:80] + ("..." if len(stripped) > 80 else "") - warnings.append(f"Non-standard memory bullet: {short}") - return warnings + return nonstandard_bullets(content) def validate_diff(old_memory: str | None, new_memory: str) -> list[str]: @@ -138,7 +118,7 @@ def validate_diff(old_memory: str | None, new_memory: str) -> list[str]: if dropped: names = ", ".join(sorted(dropped)) warnings.append( - f"Sections removed: {names}. If unintentional, restore from the settings page." + f"Sections removed: {names}. If unintentional, restore them from the memory document." ) old_len = len(old_memory) diff --git a/surfsense_backend/tests/unit/agents/new_chat/tools/test_update_memory_scope.py b/surfsense_backend/tests/unit/agents/new_chat/tools/test_update_memory_scope.py index f1a0f97f0..c941d7d65 100644 --- a/surfsense_backend/tests/unit/agents/new_chat/tools/test_update_memory_scope.py +++ b/surfsense_backend/tests/unit/agents/new_chat/tools/test_update_memory_scope.py @@ -64,6 +64,27 @@ def test_validate_bullet_format_warns_on_nonstandard_bullet() -> None: assert "Non-standard memory bullet" in warnings[0] +@pytest.mark.asyncio +async def test_save_memory_normalizes_legacy_marker_bullets(monkeypatch) -> None: + target = type("Target", (), {"memory_md": ""})() + session = _FakeSession() + + async def fake_load_target(**_kwargs): + return target + + monkeypatch.setattr("app.services.memory.service._load_target", fake_load_target) + + result = await save_memory( + scope=MemoryScope.USER, + target_id="00000000-0000-0000-0000-000000000000", + content="- (2026-04-10) [fact] Legacy fact is preserved\n", + session=session, + ) + + assert result.status == "saved" + assert target.memory_md == "## Memory\n- 2026-04-10: Legacy fact is preserved" + + 
@pytest.mark.asyncio async def test_save_memory_blocks_new_personal_heading_in_team_before_commit( monkeypatch, diff --git a/surfsense_backend/tests/unit/services/test_memory_service.py b/surfsense_backend/tests/unit/services/test_memory_service.py index e7fef2cac..0a45bf3aa 100644 --- a/surfsense_backend/tests/unit/services/test_memory_service.py +++ b/surfsense_backend/tests/unit/services/test_memory_service.py @@ -82,7 +82,7 @@ async def test_save_memory_accepts_legacy_marker_payload(monkeypatch) -> None: ) assert result.status == "saved" - assert "[fact]" in target.memory_md + assert target.memory_md == "## Memory\n- 2026-05-19: Legacy marker memory" @pytest.mark.asyncio