mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 01:06:23 +02:00
86 lines
2.3 KiB
Python
86 lines
2.3 KiB
Python
"""
|
|
Surfsense documentation indexer.
|
|
Indexes MDX documentation files at migration time.
|
|
"""
|
|
|
|
import hashlib
|
|
import logging
|
|
import re
|
|
from pathlib import Path
|
|
|
|
from app.config import config
|
|
from app.db import SurfsenseDocsChunk
|
|
|
|
logger = logging.getLogger(__name__)

# Location of the MDX docs: climb four directory levels from this file,
# then descend into surfsense_web/content/docs.
DOCS_DIR = (
    Path(__file__)
    .resolve()
    .parent.parent.parent.parent.joinpath("surfsense_web", "content", "docs")
)
|
|
|
|
|
|
def parse_mdx_frontmatter(content: str) -> tuple[str, str]:
    """
    Parse MDX file to extract frontmatter title and content.

    The frontmatter is the block delimited by ``---`` lines at the very
    start of the file. Only the ``title:`` key is read from it.

    Args:
        content: Raw MDX file content

    Returns:
        Tuple of (title, content_without_frontmatter). The title is
        "Untitled" when there is no frontmatter, no ``title:`` key, or the
        title value is empty. The returned content is stripped of leading
        and trailing whitespace.
    """
    # Match frontmatter between --- markers. The closing marker may be the
    # last line of the file, so accept end-of-string as well as a newline.
    frontmatter_pattern = r"^---\s*\n(.*?)\n---\s*(?:\n|\Z)"
    match = re.match(frontmatter_pattern, content, re.DOTALL)

    if not match:
        return "Untitled", content.strip()

    frontmatter = match.group(1)
    content_without_frontmatter = content[match.end():]

    # Extract title from frontmatter. Use [ \t]* rather than \s* so that an
    # empty "title:" line cannot swallow the newline and capture the next
    # frontmatter key as the title.
    title_match = re.search(r"^title:[ \t]*(.+)$", frontmatter, re.MULTILINE)
    title = title_match.group(1).strip() if title_match else ""

    # Remove quotes if present
    title = title.strip("\"'")

    # An empty value (e.g. title: "") falls back to the default title.
    return title or "Untitled", content_without_frontmatter.strip()
|
|
|
|
|
|
def get_all_mdx_files() -> list[Path]:
    """
    Collect every ``*.mdx`` file found recursively under ``DOCS_DIR``.

    Returns:
        List of Path objects, one per MDX file. If ``DOCS_DIR`` does not
        exist, a warning is logged and an empty list is returned.
    """
    if DOCS_DIR.exists():
        mdx_paths = list(DOCS_DIR.rglob("*.mdx"))
        return mdx_paths

    logger.warning(f"Docs directory not found: {DOCS_DIR}")
    return []
|
|
|
|
|
|
def generate_surfsense_docs_content_hash(content: str) -> str:
    """Return the hex SHA-256 digest of the UTF-8 encoded docs content."""
    hasher = hashlib.sha256()
    hasher.update(content.encode("utf-8"))
    return hasher.hexdigest()
|
|
|
|
|
|
def create_surfsense_docs_chunks(content: str) -> list[SurfsenseDocsChunk]:
    """
    Split documentation content into chunks and embed each one.

    The content is chunked with ``config.chunker_instance`` and each chunk's
    text is embedded with ``config.embedding_model_instance``.

    Args:
        content: Document content to chunk

    Returns:
        List of SurfsenseDocsChunk objects, each holding a chunk's text and
        its embedding, in the order the chunker produced them
    """
    docs_chunks: list[SurfsenseDocsChunk] = []
    for piece in config.chunker_instance.chunk(content):
        docs_chunks.append(
            SurfsenseDocsChunk(
                content=piece.text,
                embedding=config.embedding_model_instance.embed(piece.text),
            )
        )
    return docs_chunks
|
|
|