refactor: remove memory extraction functionality and update memory management protocols to ensure immediate updates for user and team interactions

2026-07-14 22:52:15 +02:00 · 2026-04-09 23:30:17 +05:30 · 2026-04-09 23:30:17 +05:30 · cd72fa9a48
commit cd72fa9a48
parent f38ea77940
7 changed files with 80 additions and 337 deletions
--- a/surfsense_backend/app/agents/new_chat/memory_extraction.py
+++ b/surfsense_backend/app/agents/new_chat/memory_extraction.py
@ -1,223 +0,0 @@
 """Post-response memory extraction for the SurfSense agent.
 After each agent response, a background task calls a lightweight LLM to decide
 whether the user's message contains any long-term information worth persisting
 (preferences, background, goals, instructions, etc.).  This ensures memory
 updates are never missed regardless of whether the main agent called
 ``update_memory`` during the conversation.
 The function re-reads memory from the database so it always sees the latest
 state — including any updates the agent may have already made.
 """
 from __future__ import annotations
 import logging
 from typing import Any
 from uuid import UUID
 from langchain_core.messages import HumanMessage
 from sqlalchemy import select
 from app.agents.new_chat.tools.update_memory import _save_memory
 from app.db import ChatVisibility, SearchSpace, User, shielded_async_session
 logger = logging.getLogger(__name__)
 # Prompt template used to extract personal (per-user) memory.
 # It is rendered with str.format() in _call_extraction_llm, which fills the
 # {current_memory} and {user_message} placeholders -- any literal brace added
 # to this text must therefore be doubled.  The model is instructed to answer
 # with either the full rewritten memory document or the exact sentinel
 # NO_UPDATE, which _call_extraction_llm maps to "nothing to save".
 _MEMORY_EXTRACT_PROMPT = """\
 You are a memory extraction assistant. Analyze the user's message and decide \
 if it contains any long-term information worth persisting to memory.
 Worth remembering: preferences, background/identity, goals, projects, \
 instructions, tools/languages they use, decisions, expertise, workplace.
 NOT worth remembering: greetings, one-off factual questions, session \
 logistics, ephemeral requests, follow-up clarifications with no new personal info.
 If the message contains memorizable information, output the FULL updated \
 memory document with the new facts merged into the existing content. Follow \
 these rules:
 - Use the same ## section structure as the existing memory.
 - Keep entries as single concise bullet points (under 120 chars each).
 - Add (YYYY-MM) date suffixes on time-sensitive entries.
 - Never remove or modify sections marked with (pinned).
 - If a new fact contradicts an existing entry, update the existing entry.
 - Do not duplicate information that is already present.
 - Standard sections: \
 "## About the user (pinned)", "## Preferences", "## Instructions (pinned)", \
 "## Current context"
 If nothing is worth remembering, output exactly: NO_UPDATE
 <current_memory>
 {current_memory}
 </current_memory>
 <user_message>
 {user_message}
 </user_message>"""
 # Prompt template used to extract shared (team / search-space) memory.
 # Same contract as the personal template: rendered with str.format() using
 # the {current_memory} and {user_message} placeholders, and the model replies
 # with either the full rewritten document or the exact sentinel NO_UPDATE.
 # It is passed to _call_extraction_llm by _extract_team_memory.
 _TEAM_MEMORY_EXTRACT_PROMPT = """\
 You are a memory extraction assistant for a team workspace. Analyze the \
 user's message and decide if it contains any long-term team information \
 worth persisting to the shared memory.
 Worth remembering: team decisions, conventions, coding standards, key facts \
 about the project/team, processes, architecture decisions.
 NOT worth remembering: greetings, personal preferences, one-off questions, \
 ephemeral requests.
 If the message contains memorizable information, output the FULL updated \
 memory document with the new facts merged into the existing content. Follow \
 these rules:
 - Use the same ## section structure as the existing memory.
 - Keep entries as single concise bullet points (under 120 chars each).
 - Add (YYYY-MM) date suffixes on time-sensitive entries.
 - Never remove or modify sections marked with (pinned).
 - Standard sections: \
 "## Team decisions (pinned)", "## Conventions (pinned)", "## Key facts", \
 "## Current priorities"
 If nothing is worth remembering, output exactly: NO_UPDATE
 <current_memory>
 {current_memory}
 </current_memory>
 <user_message>
 {user_message}
 </user_message>"""
 def _response_text(content: Any) -> str:
    """Flatten a chat-model ``content`` payload into plain text.

    ``content`` may be a plain string or a list of content blocks (strings
    and/or dicts).  Only textual blocks are kept; anything else in the list
    (e.g. non-text blocks) is ignored so it cannot leak into stored memory.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        pieces: list[str] = []
        for block in content:
            if isinstance(block, str):
                pieces.append(block)
            elif (
                isinstance(block, dict)
                and block.get("type", "text") == "text"
                and isinstance(block.get("text"), str)
            ):
                pieces.append(block["text"])
        return "".join(pieces)
    # Unknown payload type: keep the original best-effort stringification.
    return str(content)


 async def _call_extraction_llm(
    llm: Any,
    prompt_template: str,
    current_memory: str,
    user_message: str,
 ) -> str | None:
    """Run the extraction LLM and return the updated memory, or ``None``.

    Args:
        llm: Chat model exposing an async ``ainvoke(messages, config=...)``.
        prompt_template: ``str.format`` template with ``{current_memory}``
            and ``{user_message}`` placeholders.
        current_memory: Existing memory document; an empty value is shown to
            the model as ``(empty)``.
        user_message: The user message to analyse.

    Returns:
        The model's reply (stripped) when it proposes a new memory document,
        or ``None`` when the reply is empty or is the ``NO_UPDATE`` sentinel.
    """
    prompt = prompt_template.format(
        current_memory=current_memory or "(empty)",
        user_message=user_message,
    )
    response = await llm.ainvoke(
        [HumanMessage(content=prompt)],
        config={"tags": ["surfsense:internal", "memory-extraction"]},
    )
    # Content may be a list of blocks; str() on it would yield a Python repr
    # that would then be persisted as the memory document.
    text = _response_text(response.content).strip()
    if not text:
        return None
    # Models often decorate the sentinel (quotes, backticks, trailing period,
    # different case).  Treat those as "no update" too, otherwise the literal
    # sentinel text would overwrite the stored memory.
    if text.strip("`'\"*. \t\r\n").upper() == "NO_UPDATE":
        return None
    return text
 async def extract_and_save_memory(
    *,
    user_message: str,
    user_id: str | None,
    search_space_id: int,
    thread_visibility: ChatVisibility | None,
    llm: Any,
 ) -> None:
    """Fire-and-forget entry point for post-response memory extraction.

    Does nothing when ``user_id`` is falsy.  Otherwise the user's personal
    memory is always processed, and the shared team memory is processed as
    well when the thread visibility is ``ChatVisibility.SEARCH_SPACE`` (a
    missing visibility is treated as ``ChatVisibility.PRIVATE``).

    Each of the two steps is wrapped in its own ``except Exception`` handler
    that only logs, so a failure in one step neither propagates to the caller
    nor prevents the other step from running.
    """
    if not user_id:
        return

    is_team_thread = (
        thread_visibility or ChatVisibility.PRIVATE
    ) == ChatVisibility.SEARCH_SPACE

    try:
        await _extract_user_memory(user_message, user_id, llm)
    except Exception:
        logger.exception("Background user memory extraction failed")

    if not is_team_thread:
        return

    try:
        await _extract_team_memory(user_message, search_space_id, llm)
    except Exception:
        logger.exception("Background team memory extraction failed")
 async def _extract_user_memory(user_message: str, user_id: str, llm: Any) -> None:
    """Refresh one user's personal memory document from a single message.

    Loads the ``User`` row for ``user_id`` (a string id is converted to a
    ``UUID`` first), asks the extraction LLM for a merged memory document
    and, unless it reports that nothing changed, hands the result to
    ``_save_memory`` together with this session's commit/rollback callables.
    Returns silently when the user row does not exist.
    """
    if isinstance(user_id, str):
        uid = UUID(user_id)
    else:
        uid = user_id

    async with shielded_async_session() as session:
        lookup = await session.execute(select(User).where(User.id == uid))
        user = lookup.scalars().first()
        if not user:
            return

        previous = user.memory_md
        proposed = await _call_extraction_llm(
            llm, _MEMORY_EXTRACT_PROMPT, previous or "", user_message
        )
        if proposed is None:
            logger.debug("Memory extraction: no update needed (user %s)", uid)
            return

        def _apply(content):
            # Stage the new document on the loaded row; the commit itself is
            # triggered through commit_fn.
            user.memory_md = content

        outcome = await _save_memory(
            updated_memory=proposed,
            old_memory=previous,
            llm=llm,
            apply_fn=_apply,
            commit_fn=session.commit,
            rollback_fn=session.rollback,
            label="memory",
        )
        logger.info(
            "Background memory extraction for user %s: %s",
            uid,
            outcome.get("status"),
        )
 async def _extract_team_memory(user_message: str, search_space_id: int, llm: Any) -> None:
    """Refresh a search space's shared team memory from a single message.

    Loads the ``SearchSpace`` row for ``search_space_id``, asks the extraction
    LLM (team prompt) for a merged memory document and, unless it reports
    that nothing changed, hands the result to ``_save_memory`` together with
    this session's commit/rollback callables.  Returns silently when the
    search space does not exist.
    """
    async with shielded_async_session() as session:
        query = select(SearchSpace).where(SearchSpace.id == search_space_id)
        lookup = await session.execute(query)
        space = lookup.scalars().first()
        if not space:
            return

        previous = space.shared_memory_md
        proposed = await _call_extraction_llm(
            llm, _TEAM_MEMORY_EXTRACT_PROMPT, previous or "", user_message
        )
        if proposed is None:
            logger.debug(
                "Team memory extraction: no update needed (space %s)",
                search_space_id,
            )
            return

        def _apply(content):
            # Stage the new document on the loaded row; the commit itself is
            # triggered through commit_fn.
            space.shared_memory_md = content

        outcome = await _save_memory(
            updated_memory=proposed,
            old_memory=previous,
            llm=llm,
            apply_fn=_apply,
            commit_fn=session.commit,
            rollback_fn=session.rollback,
            label="team memory",
        )
        logger.info(
            "Background team memory extraction for space %s: %s",
            search_space_id,
            outcome.get("status"),
        )
--- a/surfsense_backend/app/agents/new_chat/middleware/memory_injection.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/memory_injection.py
@ -59,12 +59,14 @@ class MemoryInjectionMiddleware(AgentMiddleware):  # type: ignore[type-arg]
        async with shielded_async_session() as session:
            if self.user_id is not None:
-                user_memory, is_persisted = await self._load_user_memory(session)
+                user_memory, display_name = await self._load_user_memory(session)
                if display_name:
                    first_name = display_name.split()[0]
                    memory_blocks.append(f"<user_name>{first_name}</user_name>")
                if user_memory:
                    chars = len(user_memory)
                    persisted = "true" if is_persisted else "false"
                    memory_blocks.append(
-                        f'<user_memory chars="{chars}" limit="{MEMORY_HARD_LIMIT}" persisted="{persisted}">\n'
+                        f'<user_memory chars="{chars}" limit="{MEMORY_HARD_LIMIT}">\n'
                        f"{user_memory}\n"
                        f"</user_memory>"
                    )
@ -91,42 +93,19 @@ class MemoryInjectionMiddleware(AgentMiddleware):  # type: ignore[type-arg]
        return {"messages": new_messages}
-    async def _load_user_memory(self, session: AsyncSession) -> tuple[str | None, bool]:
+    async def _load_user_memory(self, session: AsyncSession) -> tuple[str | None, str | None]:
-        """Return (memory_content, is_persisted).
+        """Return (memory_content, display_name)."""
        When the user has no saved memory but has a display name, a seed
        document is created and **persisted to the database immediately**
        so the LLM doesn't need to make a tool call to save it.
        """
        try:
            result = await session.execute(
                select(User.memory_md, User.display_name).where(User.id == self.user_id)
            )
            row = result.one_or_none()
            if row is None:
-                return None, True
+                return None, None
-
+            return row.memory_md or None, row.display_name
            memory_md, display_name = row
            if memory_md:
                return memory_md, True
            if display_name:
                first_name = display_name.split()[0]
                seed = f"## About the user (pinned)\n- Name: {first_name}"
                await session.execute(
                    User.__table__.update()
                    .where(User.id == self.user_id)
                    .values(memory_md=seed)
                )
                await session.commit()
                logger.info("Auto-persisted memory seed for user %s", self.user_id)
                return seed, True
            return None, True
        except Exception:
            logger.exception("Failed to load user memory")
-            return None, True
+            return None, None
    async def _load_team_memory(self, session: AsyncSession) -> str | None:
        try:
--- a/surfsense_backend/app/agents/new_chat/system_prompt.py
+++ b/surfsense_backend/app/agents/new_chat/system_prompt.py
@ -40,6 +40,13 @@ CRITICAL RULE — KNOWLEDGE BASE FIRST, NEVER DEFAULT TO GENERAL KNOWLEDGE:
  * Tool-usage actions like generating reports, podcasts, images, or scraping webpages
 </knowledge_base_only_policy>
 <memory_protocol>
 IMPORTANT — After understanding each user message, ALWAYS check: does this message
 reveal durable facts about the user (role, interests, preferences, projects,
 background, or standing instructions)? If yes, you MUST call update_memory
 alongside your normal response — do not defer this to a later turn.
 </memory_protocol>
 </system_instruction>
 """
@ -71,6 +78,13 @@ CRITICAL RULE — KNOWLEDGE BASE FIRST, NEVER DEFAULT TO GENERAL KNOWLEDGE:
  * Tool-usage actions like generating reports, podcasts, images, or scraping webpages
 </knowledge_base_only_policy>
 <memory_protocol>
 IMPORTANT — After understanding each user message, ALWAYS check: does this message
 reveal durable facts about the team (decisions, conventions, architecture, processes,
 or key facts)? If yes, you MUST call update_memory alongside your normal response —
 do not defer this to a later turn.
 </memory_protocol>
 </system_instruction>
 """
@ -257,56 +271,52 @@ _MEMORY_TOOL_INSTRUCTIONS: dict[str, dict[str, str]] = {
    `limit` attributes show your current usage and the maximum allowed size.
  - This is your curated long-term memory — the distilled essence of what you know about
    the user, not raw conversation logs.
-  - Note: The system automatically extracts memorizable information from every
+  - You are the sole mechanism for persisting memory — there is no background extraction.
-    conversation in the background. Use this tool primarily for:
+    Call update_memory when:
-    * Explicit user requests: "remember this", "keep in mind", "note that", "forget X"
+    * The user explicitly asks to remember or forget something
-    * Restructuring or reorganizing the memory document
+    * The user shares durable facts or preferences that will matter in future conversations
-    * Correcting outdated or wrong entries
+  - The user's name is already provided via <user_name> — do not store it in memory.
-    * **If <user_memory> has persisted="false"** — you MUST still call update_memory
+  - Do not store short-lived or ephemeral info: one-off questions, greetings,
-      to persist the seed.
+    session logistics, or things that only matter for the current task.
  - Skip truly ephemeral info (one-off questions, greetings, session logistics).
  - Args:
    - updated_memory: The FULL updated markdown document (not a diff).
      Merge new facts with existing ones, update contradictions, remove outdated entries.
      Treat every update as a curation pass — consolidate, don't just append.
-      Include inline dates (YYYY-MM) on entries where temporal context matters (facts that
+  - Every bullet MUST start with a (YYYY-MM-DD) date prefix indicating when it was recorded or last updated.
      may change, decisions, context).  Skip dates on timeless preferences and instructions.
  - Keep it concise and well under the character limit shown in <user_memory>.
  - You MUST organize memory using these standard sections (add new `##` sections only if none of the standard ones fit):
-    ## About the user (pinned) — name, role, background, company (with date if it may change)
+    ## About the user (pinned) — role, background, company
    ## Preferences — languages, tools, frameworks, response style
    ## Instructions (pinned) — standing instructions, things to always/never do
    ## Current context — ongoing projects, goals, deadlines (with date)
  - Each entry MUST be a single bullet point. Keep entries concise (aim for under 120 chars each).
  - Each time-sensitive entry MUST include a (YYYY-MM) date suffix.
  - Sections with `(pinned)` in the heading are protected — the system will reject any
    update that removes them. Users can add `(pinned)` to any `##` heading to protect it.
-  - During consolidation, prioritize keeping: pinned sections > preferences > current context.
+  - During consolidation, prioritize keeping: pinned sections > preferences.
 """,
        "shared": """
 - update_memory: Update the team's shared memory document for this search space.
  - Your current team memory is already in <team_memory> in your context.  The `chars`
    and `limit` attributes show current usage and the maximum allowed size.
  - This is the team's curated long-term memory — decisions, conventions, key facts.
-  - Note: The system automatically extracts memorizable team information from every
+  - You are the sole mechanism for persisting team memory — there is no background extraction.
-    conversation in the background. Use this tool primarily for:
+    Call update_memory when:
-    * Explicit requests: "let's remember that", "note this decision", "forget X"
+    * A team member explicitly asks to remember or forget something
-    * Restructuring or reorganizing the team memory document
+    * The conversation surfaces durable team decisions, conventions, or facts
-    * Correcting outdated or wrong entries
+      that will matter in future conversations
-  - Skip truly ephemeral info (one-off questions, greetings, session logistics).
+  - Do not store short-lived or ephemeral info: one-off questions, greetings,
    session logistics, or things that only matter for the current task.
  - Args:
    - updated_memory: The FULL updated markdown document (not a diff).
      Merge new facts with existing ones, update contradictions, remove outdated entries.
      Treat every update as a curation pass — consolidate, don't just append.
-      Include inline dates (YYYY-MM) on decisions and time-sensitive entries.
+  - Every bullet MUST start with a (YYYY-MM-DD) date prefix indicating when it was recorded or last updated.
  - Keep it concise and well under the character limit shown in <team_memory>.
  - You MUST organize memory using these standard sections (add new `##` sections only if none of the standard ones fit):
-    ## Team decisions (pinned) — agreed-upon choices with rationale and date
+    ## Team decisions (pinned) — agreed-upon choices with rationale
    ## Conventions (pinned) — coding standards, tools, processes, naming patterns
    ## Key facts — where things are, how things work, team structure
    ## Current priorities — active projects, deadlines, blockers
  - Each entry MUST be a single bullet point. Keep entries concise (aim for under 120 chars each).
  - Each time-sensitive entry MUST include a (YYYY-MM) date suffix.
  - Sections with `(pinned)` in the heading are protected — the system will reject any
    update that removes them. Users can add `(pinned)` to any `##` heading to protect it.
  - During consolidation, prioritize keeping: pinned sections > key facts > current priorities.
@ -317,28 +327,26 @@ _MEMORY_TOOL_INSTRUCTIONS: dict[str, dict[str, str]] = {
 _MEMORY_TOOL_EXAMPLES: dict[str, dict[str, str]] = {
    "update_memory": {
        "private": """
- <user_memory persisted="false"> contains "## About the user (pinned)\\n- Name: Alex"
+- <user_memory> is empty. User: "I'm a space enthusiast, explain astrophage to me"
-  User: "I'm a university student, explain astrophage to me"
+  - The user casually shared a durable fact about themselves. Save it:
-  - Memory is not yet persisted AND the user casually shared that they are a student.
+    update_memory(updated_memory="## About the user (pinned)\\n- (2025-03-15) Space enthusiast\\n")
-    You MUST call update_memory to persist the seed plus the new fact:
+- User: "Remember that I prefer concise answers over detailed explanations"
-    update_memory(updated_memory="## About the user (pinned)\\n- Name: Alex\\n- University student\\n")
+  - Durable preference. You see the current <user_memory> and merge:
- User: "Remember that I prefer TypeScript over JavaScript"
+    update_memory(updated_memory="## About the user (pinned)\\n- (2025-03-15) Space enthusiast\\n\\n## Preferences\\n- (2025-03-15) Prefers concise answers over detailed explanations\\n...")
-  - Timeless preference, no date needed.  You see the current <user_memory> and merge:
+- User: "I actually moved to Tokyo last month"
-    update_memory(updated_memory="## About the user (pinned)\\n- Senior developer\\n\\n## Preferences\\n- Prefers TypeScript over JavaScript\\n...")
+  - Updated fact, date prefix reflects when recorded:
- User: "I actually moved to Google last month"
+    update_memory(updated_memory="## About the user (pinned)\\n- (2025-03-15) Lives in Tokyo (previously London)\\n...")
-  - Fact that changes over time, include date:
+- User: "I'm a freelance photographer working on a nature documentary"
-    update_memory(updated_memory="## About the user (pinned)\\n- Senior developer at Google (since 2026-03, previously Acme Corp)\\n...")
+  - Durable background info. Save it under About the user:
- User: "I'm building a SaaS app with Next.js and Supabase"
+    update_memory(updated_memory="## About the user (pinned)\\n- (2025-03-15) Freelance photographer\\n- (2025-03-15) Working on a nature documentary\\n")
  - Implicit project info shared as context. Save it:
    update_memory(updated_memory="## About the user (pinned)\\n- Name: Alex\\n\\n## Current context\\n- Building a SaaS app with Next.js and Supabase (2026-04)\\n")
 """,
        "shared": """
- User: "Let's remember that we decided to use GraphQL"
+- User: "Let's remember that we decided to do weekly standup meetings on Mondays"
-  - Decision with date:
+  - Durable team decision:
-    update_memory(updated_memory="## Team decisions (pinned)\\n- 2026-04: Adopted GraphQL over REST for new APIs\\n...")
+    update_memory(updated_memory="## Team decisions (pinned)\\n- (2025-03-15) Weekly standup meetings on Mondays\\n...")
- User: "Our deploy process uses Railway auto-deploys"
+- User: "Our office is in downtown Seattle, 5th floor"
-  - Key fact, no date needed:
+  - Durable team fact:
-    update_memory(updated_memory="## Key facts\\n- Deploy pipeline: git push -> Railway auto-deploys in ~3min\\n...")
+    update_memory(updated_memory="## Key facts\\n- (2025-03-15) Office location: downtown Seattle, 5th floor\\n...")
 """,
    },
 }
--- a/surfsense_backend/app/agents/new_chat/tools/update_memory.py
+++ b/surfsense_backend/app/agents/new_chat/tools/update_memory.py
@ -6,9 +6,10 @@ always sees the current memory in <user_memory> / <team_memory> tags injected
 by MemoryInjectionMiddleware, so it passes the FULL updated document each time.
 Overflow handling:
-  - Soft limit (15K chars): advisory warning returned alongside a successful save.
+  - Soft limit (18K chars): an automatic LLM-driven consolidation is attempted
-  - Hard limit (25K chars): save rejected; an automatic LLM-driven consolidation
+    to proactively keep memory lean.  The save always succeeds.
-    is attempted before falling back to the error.
+  - Hard limit (25K chars): save rejected if memory still exceeds this after
    consolidation.
  - Pinned sections: headings containing ``(pinned)`` are protected — the system
    rejects any update that drops them and auto-restores them during consolidation.
  - Diff validation: warns when entire ``##`` sections are dropped or when the
@ -31,7 +32,7 @@ from app.db import SearchSpace, User
 logger = logging.getLogger(__name__)
-MEMORY_SOFT_LIMIT = 15_000
+MEMORY_SOFT_LIMIT = 18_000
 MEMORY_HARD_LIMIT = 25_000
 _PINNED_RE = re.compile(r"^##\s+.+\(pinned\)", re.MULTILINE)
@ -188,7 +189,7 @@ RULES:
   preferences > current context.
 5. Merge duplicate entries, remove outdated entries, shorten verbose descriptions.
 6. Each entry must be a single bullet point.
-7. Preserve (YYYY-MM) date suffixes on time-sensitive entries.
+7. Every bullet MUST keep its (YYYY-MM-DD) date prefix.
 8. Output ONLY the consolidated markdown — no explanations, no wrapping.
 <memory_document>
@ -259,25 +260,19 @@ async def _save_memory(
    if pinned_err:
        return {"status": "error", "message": pinned_err}
-    # --- hard-limit gate with auto-consolidation fallback ---
+    # --- auto-consolidate proactively at the soft limit ---
    if len(content) > MEMORY_SOFT_LIMIT and llm is not None:
        consolidated = await _auto_consolidate(content, llm)
        if consolidated is not None:
            if old_memory:
                consolidated = _restore_missing_pinned(old_memory, consolidated)
            if len(consolidated) < len(content):
                content = consolidated
    # --- hard-limit gate (reject if still too large after consolidation) ---
    size_err = _validate_memory_size(content)
    if size_err:
-        if llm is None:
+        return size_err
            return size_err
        consolidated = await _auto_consolidate(content, llm)
        if consolidated is None:
            return size_err
        # Restore any pinned sections the consolidation LLM may have dropped
        if old_memory:
            consolidated = _restore_missing_pinned(old_memory, consolidated)
        recheck = _validate_memory_size(consolidated)
        if recheck:
            return recheck
        content = consolidated
    # --- persist ---
    try:
--- a/surfsense_backend/app/tasks/chat/stream_new_chat.py
+++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py
@ -37,7 +37,6 @@ from app.agents.new_chat.llm_config import (
    load_agent_config,
    load_llm_config_from_yaml,
 )
 from app.agents.new_chat.memory_extraction import extract_and_save_memory
 from app.db import (
    ChatVisibility,
    NewChatMessage,
@ -60,8 +59,6 @@ from app.utils.perf import get_perf_logger, log_system_snapshot, trim_native_hea
 _perf_log = get_perf_logger()
 _background_tasks: set[asyncio.Task] = set()
 def format_mentioned_surfsense_docs_as_context(
    documents: list[SurfsenseDocsDocument],
@ -1525,19 +1522,6 @@ async def stream_new_chat(
            yield streaming_service.format_done()
            return
        if user_id and llm is not None:
            _mem_task = asyncio.create_task(
                extract_and_save_memory(
                    user_message=user_query,
                    user_id=user_id,
                    search_space_id=search_space_id,
                    thread_visibility=visibility,
                    llm=llm,
                )
            )
            _background_tasks.add(_mem_task)
            _mem_task.add_done_callback(_background_tasks.discard)
        # If the title task didn't finish during streaming, await it now
        if title_task is not None and not title_emitted:
            generated_title = await title_task
--- a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/MemoryContent.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/MemoryContent.tsx
@ -135,7 +135,7 @@ export function MemoryContent() {
 					onClick={handleClear}
 					disabled={saving || !savedMemory}
 				>
-					Clear All
+					Reset Memory
 				</Button>
 				<Button
 					type="button"
--- a/surfsense_web/components/settings/team-memory-manager.tsx
+++ b/surfsense_web/components/settings/team-memory-manager.tsx
@ -134,7 +134,7 @@ export function TeamMemoryManager({ searchSpaceId }: TeamMemoryManagerProps) {
 					onClick={handleClear}
 					disabled={saving || !searchSpace?.shared_memory_md}
 				>
-					Clear All
+					Clear Memory
 				</Button>
 				<Button
 					type="button"