# Mirror of https://github.com/MODSetter/SurfSense.git
# Synced 2026-05-04 05:12:38 +02:00
# 351 lines, 12 KiB, Python
"""Canonical virtual-path resolver for SurfSense knowledge-base documents.
|
|
|
|
This module is the single source of truth for mapping ``Document`` rows to
|
|
virtual paths under ``/documents/`` and back. It is used by:
|
|
|
|
* :class:`KnowledgeTreeMiddleware` (rendering the workspace tree)
|
|
* :class:`KnowledgePriorityMiddleware` (computing priority paths)
|
|
* :class:`KBPostgresBackend` (``als_info`` / ``aread`` / move operations)
|
|
* :class:`KnowledgeBasePersistenceMiddleware` (resolving moves and creates)
|
|
|
|
Centralising the logic ensures that title-collision suffixes, folder paths,
|
|
and ``unique_identifier_hash`` lookups never drift between renders and
|
|
commits.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
|
|
from sqlalchemy import select
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.db import Document, DocumentType, Folder
|
|
from app.utils.document_converters import generate_unique_identifier_hash
|
|
|
|
# Prefix shared by every virtual path this module builds or parses; folder
# paths and document paths are always formed as ``DOCUMENTS_ROOT + "/..."``.
DOCUMENTS_ROOT = "/documents"
"""Root virtual folder for all KB documents."""
|
|
|
|
# Runs of characters that are replaced by a single underscore in path segments.
_INVALID_FILENAME_CHARS = re.compile(r"[\\/:*?\"<>|]+")
# Any run of whitespace (spaces, tabs, newlines) collapses to one space.
_WHITESPACE_RUN = re.compile(r"\s+")


def safe_filename(value: str, *, fallback: str = "untitled.xml") -> str:
    """Turn arbitrary text into a sanitized filename ending in ``.xml``.

    Steps, in order: each run of invalid characters becomes ``_``; the result
    is stripped; whitespace runs collapse to a single space; an empty result
    is replaced by ``fallback``; anything longer than 180 characters is cut to
    180 and right-stripped; finally ``.xml`` is appended unless the name
    already ends with it (case-insensitively).

    Args:
        value: Raw text, typically a document title.
        fallback: Name used when sanitizing leaves nothing behind.

    Returns:
        The sanitized filename, always carrying an ``.xml`` extension.
    """
    without_invalid = _INVALID_FILENAME_CHARS.sub("_", value).strip()
    candidate = _WHITESPACE_RUN.sub(" ", without_invalid) or fallback
    if len(candidate) > 180:
        candidate = candidate[:180].rstrip()
    if candidate.lower().endswith(".xml"):
        return candidate
    return f"{candidate}.xml"
|
|
|
|
|
|
def safe_folder_segment(value: str, *, fallback: str = "folder") -> str:
    """Sanitize one folder name so it can be used as a single path segment.

    Invalid-character runs become ``_``, the result is stripped, and
    whitespace runs collapse to one space. An empty result yields ``fallback``
    unchanged; otherwise the segment is capped at 180 characters
    (right-stripped after the cut).

    Args:
        value: Raw folder name.
        fallback: Segment returned when sanitizing leaves nothing behind.

    Returns:
        The sanitized segment, or ``fallback``.
    """
    without_invalid = _INVALID_FILENAME_CHARS.sub("_", value).strip()
    segment = _WHITESPACE_RUN.sub(" ", without_invalid)
    if segment == "":
        return fallback
    return segment[:180].rstrip() if len(segment) > 180 else segment
|
|
|
|
|
|
def _suffix_with_doc_id(filename: str, doc_id: int | None) -> str:
|
|
if doc_id is None:
|
|
return filename
|
|
if not filename.lower().endswith(".xml"):
|
|
return f"{filename} ({doc_id}).xml"
|
|
stem = filename[:-4]
|
|
return f"{stem} ({doc_id}).xml"
|
|
|
|
|
|
# Matches a trailing " (<digits>).xml"; group 1 captures the digits.
_SUFFIX_PATTERN = re.compile(r"\s\((\d+)\)\.xml$", re.IGNORECASE)


def parse_doc_id_suffix(filename: str) -> tuple[str, int | None]:
    """Split a filename into its stem and optional ``" (<doc_id>).xml"`` id.

    Args:
        filename: Basename of a virtual document path.

    Returns:
        ``(stem, doc_id)`` when the disambiguation suffix is present.
        Otherwise ``(stem, None)``, where the stem has a trailing ``.xml``
        (any letter case) removed if there was one.
    """
    found = _SUFFIX_PATTERN.search(filename)
    if found is not None:
        return filename[: found.start()], int(found.group(1))
    if filename.lower().endswith(".xml"):
        return filename[:-4], None
    return filename, None
|
|
|
|
|
|
@dataclass
class PathIndex:
    """Snapshot of folder paths and claimed document paths for one render.

    :func:`doc_to_virtual_path` reads ``folder_paths`` to locate a document's
    folder and reads/writes ``occupants`` to detect title collisions. Building
    the snapshot once per call site avoids a folder lookup per document.
    """

    folder_paths: dict[int, str] = field(default_factory=dict)
    """Maps ``Folder.id`` to that folder's absolute virtual path under ``/documents``."""

    occupants: dict[str, int] = field(default_factory=dict)
    """Maps a virtual document path to the ``Document.id`` currently holding it."""
|
|
|
|
|
|
async def _build_folder_paths(
    session: AsyncSession,
    search_space_id: int,
) -> dict[int, str]:
    """Map every folder in a search space to its absolute virtual path.

    Args:
        session: Async SQLAlchemy session used for the single folder query.
        search_space_id: Search space whose folders are loaded.

    Returns:
        ``Folder.id`` -> path under ``/documents``, built from the sanitized
        names of the folder and its ancestors.
    """
    stmt = select(Folder.id, Folder.name, Folder.parent_id).where(
        Folder.search_space_id == search_space_id
    )
    fetched = await session.execute(stmt)
    # id -> (name, parent_id) so ancestor chains are walked without more queries.
    folders = {row.id: (row.name, row.parent_id) for row in fetched.all()}

    resolved: dict[int, str] = {}
    for folder_id in folders:
        segments: list[str] = []
        seen: set[int] = set()
        current: int | None = folder_id
        # Walk leaf -> root. Stop at a missing parent, and also when an id
        # repeats so a parent_id cycle cannot loop forever.
        while current is not None and current in folders and current not in seen:
            seen.add(current)
            name, parent_id = folders[current]
            segments.append(safe_folder_segment(str(name)))
            current = parent_id
        segments.reverse()
        if segments:
            resolved[folder_id] = f"{DOCUMENTS_ROOT}/" + "/".join(segments)
        else:
            resolved[folder_id] = DOCUMENTS_ROOT
    return resolved
|
|
|
|
|
|
async def build_path_index(
    session: AsyncSession,
    search_space_id: int,
    *,
    populate_occupants: bool = True,
) -> PathIndex:
    """Build a :class:`PathIndex` for a search space.

    ``populate_occupants`` controls whether the occupancy map is pre-seeded
    from existing ``Document`` rows. Most callers want this so that
    :func:`doc_to_virtual_path` can detect collisions across the whole space;
    the persistence middleware sets this to ``False`` when it is iterating to
    decide where to place fresh documents.

    Args:
        session: Async SQLAlchemy session used for the folder/document queries.
        search_space_id: Search space to index.
        populate_occupants: Pre-seed ``occupants`` from existing documents.

    Returns:
        A :class:`PathIndex` with ``folder_paths`` filled in and ``occupants``
        either pre-seeded or empty.
    """
    folder_paths = await _build_folder_paths(session, search_space_id)
    occupants: dict[str, int] = {}
    if populate_occupants:
        rows = await session.execute(
            select(Document.id, Document.title, Document.folder_id)
            .where(Document.search_space_id == search_space_id)
            # Without an explicit order the database may return rows in any
            # order, so which of two same-titled documents keeps the
            # unsuffixed path could change between renders. Ordering by id
            # makes the lowest id the stable owner of the plain filename.
            .order_by(Document.id)
        )
        for row in rows.all():
            # Documents with no folder, or an unknown one, sit at the root.
            base = folder_paths.get(row.folder_id, DOCUMENTS_ROOT)
            filename = safe_filename(str(row.title or "untitled"))
            path = f"{base}/{filename}"
            if path in occupants and occupants[path] != row.id:
                # Plain path already claimed by another document: disambiguate
                # with this document's id.
                path = f"{base}/{_suffix_with_doc_id(filename, row.id)}"
            occupants[path] = row.id
    return PathIndex(folder_paths=folder_paths, occupants=occupants)
|
|
|
|
|
|
def doc_to_virtual_path(
    *,
    doc_id: int | None,
    title: str,
    folder_id: int | None,
    index: PathIndex,
) -> str:
    """Compute the canonical virtual path of a document.

    The plain ``<folder>/<sanitized title>.xml`` path is used unless
    ``index.occupants`` shows it held by a different document, in which case
    the filename is disambiguated with ``doc_id``.

    Side effect: when ``doc_id`` is not ``None`` the chosen path is recorded
    in ``index.occupants`` so later calls see it as taken.

    Args:
        doc_id: Id of the document, or ``None`` for one without an id yet.
        title: Document title; an empty title is treated as ``"untitled"``.
        folder_id: Containing folder id; ``None`` or an id missing from the
            index places the document directly under ``/documents``.
        index: Occupancy snapshot, mutated as described above.

    Returns:
        The virtual path assigned to the document.
    """
    folder_path = index.folder_paths.get(folder_id, DOCUMENTS_ROOT)
    filename = safe_filename(str(title or "untitled"))
    candidate = f"{folder_path}/{filename}"

    holder = index.occupants.get(candidate)
    taken_by_other = holder is not None and holder != doc_id
    if taken_by_other:
        candidate = f"{folder_path}/{_suffix_with_doc_id(filename, doc_id)}"

    if doc_id is None:
        # Nothing to record: an id-less document cannot own an occupancy slot.
        return candidate
    index.occupants[candidate] = doc_id
    return candidate
|
|
|
|
|
|
async def virtual_path_to_doc(
    session: AsyncSession,
    *,
    search_space_id: int,
    virtual_path: str,
) -> Document | None:
    """Resolve a virtual path back to a ``Document`` row.

    Resolution order:

    1. ``Document.unique_identifier_hash`` lookup (fast path for paths created
       by SurfSense itself — every NOTE write goes through this hash).
    2. If the basename carries a ``" (<doc_id>).xml"`` disambiguation suffix,
       try a direct id lookup constrained to the search space.
    3. Title-from-basename lookup inside the resolved folder.
    4. Scan of the resolved folder comparing ``safe_filename(title)`` with the
       basename, for titles that sanitizing altered.

    Args:
        session: Async SQLAlchemy session used for all lookups.
        search_space_id: Search space the document must belong to.
        virtual_path: Path equal to, or located under, ``/documents``.

    Returns:
        The matching ``Document``, or ``None`` when the path is empty, is not
        under ``/documents``, names a folder chain that does not exist, or
        matches no document.
    """
    if not virtual_path:
        return None
    # Require a real path boundary after the root: a bare ``startswith`` would
    # also accept sibling prefixes such as ``/documents_old/x.xml``.
    if virtual_path != DOCUMENTS_ROOT and not virtual_path.startswith(
        f"{DOCUMENTS_ROOT}/"
    ):
        return None

    # 1. Hash lookup on the exact path string.
    unique_hash = generate_unique_identifier_hash(
        DocumentType.NOTE,
        virtual_path,
        search_space_id,
    )
    result = await session.execute(
        select(Document).where(
            Document.search_space_id == search_space_id,
            Document.unique_identifier_hash == unique_hash,
        )
    )
    document = result.scalar_one_or_none()
    if document is not None:
        return document

    parts = [p for p in virtual_path[len(DOCUMENTS_ROOT) :].split("/") if p]
    if not parts:
        return None
    basename = parts[-1]
    folder_parts = parts[:-1]

    # 2. Direct id lookup when the basename ends in " (<digits>).xml".
    # NOTE(review): a title that itself ends in " (<digits>)" (for example
    # "Report (2024)") is indistinguishable from a disambiguation suffix here,
    # so an unrelated document with that id could be returned — confirm
    # whether the hit should be validated against the document's title.
    stem, suffix_doc_id = parse_doc_id_suffix(basename)
    if suffix_doc_id is not None:
        result = await session.execute(
            select(Document).where(
                Document.search_space_id == search_space_id,
                Document.id == suffix_doc_id,
            )
        )
        document = result.scalar_one_or_none()
        if document is not None:
            return document

    folder_id = await _resolve_folder_id(
        session, search_space_id=search_space_id, folder_parts=folder_parts
    )
    if folder_parts and folder_id is None:
        # The path names folders but the chain does not exist. ``None`` must
        # not be read as "root folder" here, otherwise a root-level document
        # with the same title would be returned for a path it does not live at.
        return None

    def in_resolved_folder(query):
        """Restrict ``query`` to the resolved folder (``NULL`` means root)."""
        if folder_id is None:
            return query.where(Document.folder_id.is_(None))
        return query.where(Document.folder_id == folder_id)

    # 3. Exact title match, also trying the stem with one more ".xml" removed.
    title_candidates: list[str] = [stem]
    if stem.endswith(".xml"):
        title_candidates.append(stem[:-4])

    for candidate in dict.fromkeys(title_candidates):
        if not candidate:
            continue
        query = in_resolved_folder(
            select(Document).where(
                Document.search_space_id == search_space_id,
                Document.title == candidate,
            )
        )
        result = await session.execute(query)
        document = result.scalars().first()
        if document is not None:
            return document

    # 4. Fallback: title-as-string lookup misses when the real DB title
    # contains characters that ``safe_filename`` lossily replaces (``:``,
    # ``/``, ``*``, etc.) — common for connector-imported docs (Google
    # Calendar/Drive etc.). The workspace tree shows the lossy filename, so
    # the agent passes that filename back here. Scan all documents in the
    # resolved folder and match by ``safe_filename(title)`` to recover the
    # original document.
    folder_scan = in_resolved_folder(
        select(Document).where(
            Document.search_space_id == search_space_id,
        )
    )
    result = await session.execute(folder_scan)
    for candidate_doc in result.scalars().all():
        encoded = safe_filename(str(candidate_doc.title or "untitled"))
        if encoded == basename:
            return candidate_doc
    return None
|
|
|
|
|
|
async def _resolve_folder_id(
    session: AsyncSession,
    *,
    search_space_id: int,
    folder_parts: list[str],
) -> int | None:
    """Look up the leaf folder id for a chain of folder names.

    Each segment is matched among the children of the previously matched
    folder (top-level folders for the first segment). An exact name match is
    tried first. If it misses, the siblings are scanned and compared by
    ``safe_folder_segment(name)``, because virtual paths are rendered from
    sanitized names and a folder whose stored name contains replaced
    characters (``:``, ``/``, ...) can never match by exact name.

    Args:
        session: Async SQLAlchemy session used for the lookups.
        search_space_id: Search space the folders must belong to.
        folder_parts: Folder names from the root down to the leaf.

    Returns:
        The leaf folder id, or ``None`` when ``folder_parts`` is empty or any
        segment cannot be matched.
    """
    if not folder_parts:
        return None
    parent_id: int | None = None
    for raw in folder_parts:
        name = safe_folder_segment(raw)
        siblings = select(Folder.id, Folder.name).where(
            Folder.search_space_id == search_space_id,
        )
        if parent_id is None:
            siblings = siblings.where(Folder.parent_id.is_(None))
        else:
            siblings = siblings.where(Folder.parent_id == parent_id)
        # Stable order so the same folder wins whenever several siblings match.
        siblings = siblings.order_by(Folder.id)

        result = await session.execute(siblings.where(Folder.name == name))
        row = result.first()
        if row is not None:
            parent_id = row[0]
            continue

        # Exact match failed: compare against the sanitized form of each
        # sibling's stored name, which is what the rendered path contains.
        matched_id: int | None = None
        result = await session.execute(siblings)
        for sibling in result.all():
            if safe_folder_segment(str(sibling.name)) == name:
                matched_id = sibling.id
                break
        if matched_id is None:
            return None
        parent_id = matched_id
    return parent_id
|
|
|
|
|
|
def parse_documents_path(virtual_path: str) -> tuple[list[str], str]:
    """Parse a ``/documents/...`` path into ``(folder_parts, document_title)``.

    The title has any ``.xml`` extension and trailing ``" (<doc_id>)"``
    disambiguation suffix stripped.

    Args:
        virtual_path: Path equal to, or located under, ``/documents``.

    Returns:
        ``(folder_parts, title)``. ``([], "")`` is returned when the path is
        empty, is not under ``/documents``, or has no segments after the root.
    """
    if not virtual_path:
        return [], ""
    # Require a real path boundary after the root: a bare ``startswith`` would
    # parse ``/documents_backup/a.xml`` as if it lived under ``/documents``.
    if virtual_path != DOCUMENTS_ROOT and not virtual_path.startswith(
        f"{DOCUMENTS_ROOT}/"
    ):
        return [], ""

    # Dropping empty segments also handles leading, trailing and doubled slashes.
    parts = [p for p in virtual_path[len(DOCUMENTS_ROOT) :].split("/") if p]
    if not parts:
        return [], ""

    *folder_parts, basename = parts
    title, _ = parse_doc_id_suffix(basename)
    # A basename such as "a.xml (3).xml" or "a.xml.xml" still carries an
    # extension after the suffix is removed; strip one more.
    if title.endswith(".xml"):
        title = title[:-4]
    return folder_parts, title
|
|
|
|
|
|
# Public API of this module. Underscore-prefixed helpers
# (``_suffix_with_doc_id``, ``_build_folder_paths``, ``_resolve_folder_id``)
# are not listed here.
__all__ = [
    "DOCUMENTS_ROOT",
    "PathIndex",
    "build_path_index",
    "doc_to_virtual_path",
    "parse_doc_id_suffix",
    "parse_documents_path",
    "safe_filename",
    "safe_folder_segment",
    "virtual_path_to_doc",
]
|