From fff851ae3fe07d6b2ac388296c4705083dc131bf Mon Sep 17 00:00:00 2001
From: CREDO23 <thierrybakera12@gmail.com>
Date: Fri, 9 Jan 2026 15:42:30 +0200
Subject: [PATCH] feat: create indexer module with MDX parsing

---
 .../app/tasks/surfsense_docs_indexer.py       | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 surfsense_backend/app/tasks/surfsense_docs_indexer.py

diff --git a/surfsense_backend/app/tasks/surfsense_docs_indexer.py b/surfsense_backend/app/tasks/surfsense_docs_indexer.py
new file mode 100644
index 000000000..c5e846635
--- /dev/null
+++ b/surfsense_backend/app/tasks/surfsense_docs_indexer.py
@@ -0,0 +1,64 @@
+"""
+Surfsense documentation indexer.
+Indexes MDX documentation files at migration time.
+"""
+
+import hashlib
+import logging
+import re
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+# MDX docs live in the web app's content tree, four directory levels up from this module
+DOCS_DIR = Path(__file__).resolve().parent.parent.parent.parent.joinpath("surfsense_web", "content", "docs")
+
+
+def parse_mdx_frontmatter(content: str) -> tuple[str, str]:
+    """
+    Split an MDX document into its frontmatter title and its body.
+
+    Frontmatter is the block between ``---`` fence lines at the very start
+    of ``content``; only its ``title:`` entry is read.
+
+    Args:
+        content: Raw MDX file content
+
+    Returns:
+        Tuple of (title, content_without_frontmatter). The title is
+        "Untitled" when the frontmatter or a non-empty title is missing.
+    """
+    # The closing fence may be the last line of the file, hence (?:\n|\Z).
+    match = re.match(r"^---\s*\n(.*?)\n---\s*(?:\n|\Z)", content, re.DOTALL)
+    if not match:
+        return "Untitled", content.strip()
+
+    # [ \t]* (not \s*) keeps an empty ``title:`` from capturing the next line.
+    title_match = re.search(r"^title:[ \t]*(.+)$", match.group(1), re.MULTILINE)
+    title = title_match.group(1).strip() if title_match else ""
+
+    # Drop only a matching pair of surrounding quotes; inner quotes survive.
+    if len(title) >= 2 and title[0] == title[-1] and title[0] in "\"'":
+        title = title[1:-1]
+
+    return title or "Untitled", content[match.end():].strip()
+
+
+def get_all_mdx_files() -> list[Path]:
+    """
+    Recursively collect every path matching ``*.mdx`` under ``DOCS_DIR``.
+
+    Returns:
+        Paths in sorted (stable) order; empty, with a warning, if ``DOCS_DIR`` is not a directory
+    """
+    if not DOCS_DIR.is_dir():
+        logger.warning("Docs directory not found: %s", DOCS_DIR)
+        return []
+
+    return sorted(DOCS_DIR.rglob("*.mdx"))
+
+
+def generate_surfsense_docs_content_hash(content: str) -> str:
+    """Return the hexadecimal SHA-256 digest of ``content`` encoded as UTF-8."""
+    return hashlib.sha256(bytes(content, "utf-8")).hexdigest()
+