mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-29 19:35:20 +02:00
feat: create indexer module with MDX parsing
This commit is contained in:
parent
ba404cc151
commit
fff851ae3f
1 changed files with 64 additions and 0 deletions
64
surfsense_backend/app/tasks/surfsense_docs_indexer.py
Normal file
64
surfsense_backend/app/tasks/surfsense_docs_indexer.py
Normal file
|
|
@ -0,0 +1,64 @@
|
||||||
|
"""
|
||||||
|
Surfsense documentation indexer.
|
||||||
|
Indexes MDX documentation files at migration time.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)

# Location of the MDX docs: climb four directory levels from this file,
# then descend into surfsense_web/content/docs.
DOCS_DIR = Path(__file__).resolve().parent.parent.parent.parent.joinpath(
    "surfsense_web", "content", "docs"
)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_mdx_frontmatter(content: str) -> tuple[str, str]:
    """
    Parse MDX file to extract frontmatter title and content.

    The frontmatter is the block delimited by a ``---`` line at the very
    start of the file and the next line consisting solely of ``---``.
    The closing fence may be the last thing in the file (no trailing
    newline required) and the frontmatter may be empty.

    Args:
        content: Raw MDX file content

    Returns:
        Tuple of (title, content_without_frontmatter). The title falls
        back to "Untitled" when there is no frontmatter, no ``title:``
        key, or the title value is blank. The returned content is
        stripped of leading/trailing whitespace.
    """
    # Opening fence on the first line, lazily-matched body, then a closing
    # fence on its own line that ends with a newline OR the end of the file.
    frontmatter_pattern = r"^---[ \t]*\r?\n(.*?)^---[ \t]*(?:\r?\n|\Z)"
    match = re.match(frontmatter_pattern, content, re.DOTALL | re.MULTILINE)

    if not match:
        return "Untitled", content.strip()

    frontmatter = match.group(1)
    content_without_frontmatter = content[match.end():]

    # Use [ \t]* (not \s*) so an empty "title:" cannot swallow the newline
    # and capture the following key's line as the title.
    title_match = re.search(r"^title:[ \t]*(.*)$", frontmatter, re.MULTILINE)
    title = title_match.group(1).strip() if title_match else ""

    # Remove quotes only when they form a matching surrounding pair, so an
    # unquoted title that merely ends (or starts) with a quote is preserved.
    if len(title) >= 2 and title[0] == title[-1] and title[0] in "\"'":
        title = title[1:-1]

    return title or "Untitled", content_without_frontmatter.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_mdx_files() -> list[Path]:
    """
    Collect every MDX file found under the docs directory.

    The search is recursive. When the docs directory does not exist, a
    warning is logged and nothing is collected.

    Returns:
        List of Path objects, one per ``*.mdx`` match (empty when the
        docs directory is missing)
    """
    if DOCS_DIR.exists():
        return [*DOCS_DIR.rglob("*.mdx")]

    logger.warning(f"Docs directory not found: {DOCS_DIR}")
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def generate_surfsense_docs_content_hash(content: str) -> str:
    """Return the hex SHA-256 digest of the UTF-8 encoded docs content."""
    hasher = hashlib.sha256()
    hasher.update(content.encode("utf-8"))
    return hasher.hexdigest()
|
||||||
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue