diff --git a/scripts/docker/entrypoint-allinone.sh b/scripts/docker/entrypoint-allinone.sh index 0888facf1..ef0ef28ed 100644 --- a/scripts/docker/entrypoint-allinone.sh +++ b/scripts/docker/entrypoint-allinone.sh @@ -145,36 +145,13 @@ run_migrations() { echo "✅ Database migrations complete" } -# ================================================ -# Seed Surfsense documentation -# ================================================ -seed_surfsense_docs() { - echo "📚 Seeding Surfsense documentation..." - - # Start PostgreSQL temporarily for seeding - su - postgres -c "/usr/lib/postgresql/14/bin/pg_ctl -D /data/postgres -l /tmp/postgres_seed.log start" - sleep 5 - - cd /app/backend - python scripts/seed_surfsense_docs.py || echo "⚠️ Docs seeding may have already been done" - - # Stop PostgreSQL - su - postgres -c "/usr/lib/postgresql/14/bin/pg_ctl -D /data/postgres stop" - - echo "✅ Surfsense documentation seeded" -} - # Run migrations on first start or when explicitly requested if [ ! -f /data/.migrations_run ] || [ "${FORCE_MIGRATIONS:-false}" = "true" ]; then run_migrations touch /data/.migrations_run fi -# Seed docs on first start or when explicitly requested -if [ ! -f /data/.docs_seeded ] || [ "${FORCE_SEED_DOCS:-false}" = "true" ]; then - seed_surfsense_docs - touch /data/.docs_seeded -fi +# Note: Surfsense docs seeding is now handled by FastAPI startup (app.py lifespan) # ================================================ # Environment Variables Info diff --git a/surfsense_backend/app/app.py b/surfsense_backend/app/app.py index 993961148..3ad9d89bc 100644 --- a/surfsense_backend/app/app.py +++ b/surfsense_backend/app/app.py @@ -13,6 +13,7 @@ from app.config import config from app.db import User, create_db_and_tables, get_async_session from app.routes import router as crud_router from app.schemas import UserCreate, UserRead, UserUpdate +from app.tasks.surfsense_docs_indexer import seed_surfsense_docs from app.users import SECRET, auth_backend, current_active_user, fastapi_users @@ -22,6 +23,8 @@ async def lifespan(app: FastAPI): await create_db_and_tables() # Setup LangGraph checkpointer tables for conversation persistence await setup_checkpointer_tables() + # Seed Surfsense documentation + await seed_surfsense_docs() yield # Cleanup: close checkpointer connection on shutdown await close_checkpointer() diff --git a/surfsense_backend/app/tasks/surfsense_docs_indexer.py b/surfsense_backend/app/tasks/surfsense_docs_indexer.py index 51a1c0938..f2c1e69ba 100644 --- a/surfsense_backend/app/tasks/surfsense_docs_indexer.py +++ b/surfsense_backend/app/tasks/surfsense_docs_indexer.py @@ -1,6 +1,6 @@ """ Surfsense documentation indexer. -Indexes MDX documentation files at migration time. +Indexes MDX documentation files at startup. """ import hashlib @@ -10,10 +10,11 @@ from datetime import UTC, datetime from pathlib import Path from sqlalchemy import select -from sqlalchemy.orm import Session, selectinload +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import selectinload from app.config import config -from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument +from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument, async_session_maker logger = logging.getLogger(__name__) @@ -89,12 +90,12 @@ def create_surfsense_docs_chunks(content: str) -> list[SurfsenseDocsChunk]: ] -def index_surfsense_docs(session: Session) -> tuple[int, int, int, int]: +async def index_surfsense_docs(session: AsyncSession) -> tuple[int, int, int, int]: """ Index all Surfsense documentation files. Args: - session: SQLAlchemy sync session + session: SQLAlchemy async session Returns: Tuple of (created, updated, skipped, deleted) counts @@ -105,7 +106,7 @@ def index_surfsense_docs(session: Session) -> tuple[int, int, int, int]: deleted = 0 # Get all existing docs from database - existing_docs_result = session.execute( + existing_docs_result = await session.execute( select(SurfsenseDocsDocument).options(selectinload(SurfsenseDocsDocument.chunks)) ) existing_docs = {doc.source: doc for doc in existing_docs_result.scalars().all()} @@ -178,11 +179,11 @@ def index_surfsense_docs(session: Session) -> tuple[int, int, int, int]: for source, doc in existing_docs.items(): if source not in processed_sources: logger.info(f"Deleting removed document: {source}") - session.delete(doc) + await session.delete(doc) deleted += 1 # Commit all changes - session.commit() + await session.commit() logger.info( f"Indexing complete: {created} created, {updated} updated, " @@ -191,3 +192,31 @@ def index_surfsense_docs(session: Session) -> tuple[int, int, int, int]: return created, updated, skipped, deleted + +async def seed_surfsense_docs() -> tuple[int, int, int, int]: + """ + Seed Surfsense documentation into the database. + + This function indexes all MDX files from the docs directory. + It handles creating, updating, and deleting docs based on content changes. + + Returns: + Tuple of (created, updated, skipped, deleted) counts + Returns (0, 0, 0, 0) if an error occurs + """ + logger.info("Starting Surfsense docs indexing...") + + try: + async with async_session_maker() as session: + created, updated, skipped, deleted = await index_surfsense_docs(session) + + logger.info( + f"Surfsense docs indexing complete: " + f"created={created}, updated={updated}, skipped={skipped}, deleted={deleted}" + ) + + return created, updated, skipped, deleted + + except Exception as e: + logger.error(f"Failed to seed Surfsense docs: {e}", exc_info=True) + return 0, 0, 0, 0 diff --git a/surfsense_backend/scripts/seed_surfsense_docs.py b/surfsense_backend/scripts/seed_surfsense_docs.py index 2e9eee649..d9536bf91 100644 --- a/surfsense_backend/scripts/seed_surfsense_docs.py +++ b/surfsense_backend/scripts/seed_surfsense_docs.py @@ -1,47 +1,40 @@ #!/usr/bin/env python """ Seed Surfsense documentation into the database. -Run this script after migrations to index MDX documentation files. + +CLI wrapper for the seed_surfsense_docs function. +Can be run manually for debugging or re-indexing. Usage: python scripts/seed_surfsense_docs.py """ +import asyncio import sys from pathlib import Path # Add the parent directory to the path so we can import app modules sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) -from sqlalchemy import create_engine -from sqlalchemy.orm import Session - -from app.config import config -from app.tasks.surfsense_docs_indexer import index_surfsense_docs +from app.tasks.surfsense_docs_indexer import seed_surfsense_docs def main(): - """Main entry point for seeding Surfsense docs.""" - print("Starting Surfsense docs seeding...") + """CLI entry point for seeding Surfsense docs.""" + print("=" * 50) + print(" Surfsense Documentation Seeding") + print("=" * 50) - # Create sync engine from database URL - # Convert async URL to sync if needed - database_url = config.DATABASE_URL - if database_url.startswith("postgresql+asyncpg://"): - database_url = database_url.replace("postgresql+asyncpg://", "postgresql://") + created, updated, skipped, deleted = asyncio.run(seed_surfsense_docs()) - engine = create_engine(database_url) - - with Session(engine) as session: - created, updated, skipped, deleted = index_surfsense_docs(session) - - print(f"\nSurfsense docs seeding complete:") - print(f" Created: {created}") - print(f" Updated: {updated}") - print(f" Skipped: {skipped}") - print(f" Deleted: {deleted}") + print() + print("Results:") + print(f" Created: {created}") + print(f" Updated: {updated}") + print(f" Skipped: {skipped}") + print(f" Deleted: {deleted}") + print("=" * 50) if __name__ == "__main__": main() -