refactor: async docs seeding in FastAPI lifespan

This commit is contained in:
CREDO23 2026-01-12 20:15:11 +02:00
parent f6621f9a9a
commit 96545056cd
4 changed files with 58 additions and 56 deletions

View file

@@ -145,36 +145,13 @@ run_migrations() {
echo "✅ Database migrations complete"
}
# ================================================
# Seed Surfsense documentation
# ================================================
# seed_surfsense_docs: index the Surfsense MDX documentation into the database.
# The seeding script needs PostgreSQL, so the bundled server is booted just
# for this step and shut down again afterwards.
seed_surfsense_docs() {
echo "📚 Seeding Surfsense documentation..."
# Start PostgreSQL temporarily for seeding (data dir /data/postgres,
# server log written to /tmp/postgres_seed.log)
su - postgres -c "/usr/lib/postgresql/14/bin/pg_ctl -D /data/postgres -l /tmp/postgres_seed.log start"
# Fixed wait for the server to accept connections — no pg_isready polling here
sleep 5
cd /app/backend
# Best-effort: a non-zero exit (e.g. docs already indexed) is tolerated
python scripts/seed_surfsense_docs.py || echo "⚠️ Docs seeding may have already been done"
# Stop PostgreSQL again; it was only needed for the seeding script
su - postgres -c "/usr/lib/postgresql/14/bin/pg_ctl -D /data/postgres stop"
echo "✅ Surfsense documentation seeded"
}
# Run migrations on first start or when explicitly requested
# (/data/.migrations_run is a marker file; FORCE_MIGRATIONS=true re-runs anyway)
if [ ! -f /data/.migrations_run ] || [ "${FORCE_MIGRATIONS:-false}" = "true" ]; then
run_migrations
touch /data/.migrations_run
fi
# Seed docs on first start or when explicitly requested
# (same marker-file pattern, overridable via FORCE_SEED_DOCS=true)
if [ ! -f /data/.docs_seeded ] || [ "${FORCE_SEED_DOCS:-false}" = "true" ]; then
seed_surfsense_docs
touch /data/.docs_seeded
fi
# Note: Surfsense docs seeding is now handled by FastAPI startup (app.py lifespan)
# ================================================
# Environment Variables Info

View file

@@ -13,6 +13,7 @@ from app.config import config
from app.db import User, create_db_and_tables, get_async_session
from app.routes import router as crud_router
from app.schemas import UserCreate, UserRead, UserUpdate
from app.tasks.surfsense_docs_indexer import seed_surfsense_docs
from app.users import SECRET, auth_backend, current_active_user, fastapi_users
@@ -22,6 +23,8 @@ async def lifespan(app: FastAPI):
await create_db_and_tables()
# Setup LangGraph checkpointer tables for conversation persistence
await setup_checkpointer_tables()
# Seed Surfsense documentation
await seed_surfsense_docs()
yield
# Cleanup: close checkpointer connection on shutdown
await close_checkpointer()

View file

@@ -1,6 +1,6 @@
"""
Surfsense documentation indexer.
Indexes MDX documentation files at migration time.
Indexes MDX documentation files at startup.
"""
import hashlib
@@ -10,10 +10,11 @@ from datetime import UTC, datetime
from pathlib import Path
from sqlalchemy import select
from sqlalchemy.orm import Session, selectinload
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload
from app.config import config
from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument
from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument, async_session_maker
logger = logging.getLogger(__name__)
@@ -89,12 +90,12 @@ def create_surfsense_docs_chunks(content: str) -> list[SurfsenseDocsChunk]:
]
def index_surfsense_docs(session: Session) -> tuple[int, int, int, int]:
async def index_surfsense_docs(session: AsyncSession) -> tuple[int, int, int, int]:
"""
Index all Surfsense documentation files.
Args:
session: SQLAlchemy sync session
session: SQLAlchemy async session
Returns:
Tuple of (created, updated, skipped, deleted) counts
@@ -105,7 +106,7 @@ def index_surfsense_docs(session: Session) -> tuple[int, int, int, int]:
deleted = 0
# Get all existing docs from database
existing_docs_result = session.execute(
existing_docs_result = await session.execute(
select(SurfsenseDocsDocument).options(selectinload(SurfsenseDocsDocument.chunks))
)
existing_docs = {doc.source: doc for doc in existing_docs_result.scalars().all()}
@@ -178,11 +179,11 @@ def index_surfsense_docs(session: Session) -> tuple[int, int, int, int]:
for source, doc in existing_docs.items():
if source not in processed_sources:
logger.info(f"Deleting removed document: {source}")
session.delete(doc)
await session.delete(doc)
deleted += 1
# Commit all changes
session.commit()
await session.commit()
logger.info(
f"Indexing complete: {created} created, {updated} updated, "
@@ -191,3 +192,31 @@ def index_surfsense_docs(session: Session) -> tuple[int, int, int, int]:
return created, updated, skipped, deleted
async def seed_surfsense_docs() -> tuple[int, int, int, int]:
    """
    Seed Surfsense documentation into the database.

    Indexes every MDX file from the docs directory, creating, updating,
    and deleting documents as content changes dictate.

    Returns:
        Tuple of (created, updated, skipped, deleted) counts.
        Returns (0, 0, 0, 0) if an error occurs.
    """
    logger.info("Starting Surfsense docs indexing...")
    try:
        # Open a short-lived async session just for the indexing pass.
        async with async_session_maker() as session:
            counts = await index_surfsense_docs(session)
            created, updated, skipped, deleted = counts
            logger.info(
                f"Surfsense docs indexing complete: "
                f"created={created}, updated={updated}, skipped={skipped}, deleted={deleted}"
            )
            return counts
    except Exception as e:
        # Best-effort at startup: log with traceback, never crash the app.
        logger.error(f"Failed to seed Surfsense docs: {e}", exc_info=True)
        return 0, 0, 0, 0

View file

@@ -1,47 +1,40 @@
#!/usr/bin/env python
"""
Seed Surfsense documentation into the database.
Run this script after migrations to index MDX documentation files.
CLI wrapper for the seed_surfsense_docs function.
Can be run manually for debugging or re-indexing.
Usage:
python scripts/seed_surfsense_docs.py
"""
import asyncio
import sys
from pathlib import Path
# Add the parent directory to the path so we can import app modules
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from app.config import config
from app.tasks.surfsense_docs_indexer import index_surfsense_docs
from app.tasks.surfsense_docs_indexer import seed_surfsense_docs
def main():
    """CLI entry point for seeding Surfsense docs.

    Prints a banner, drives the async seed_surfsense_docs() coroutine to
    completion with asyncio.run, then prints the indexing counts. The old
    sync-engine path (manual asyncpg->psycopg URL rewrite + create_engine)
    is gone: session handling now lives inside seed_surfsense_docs itself.
    """
    print("=" * 50)
    print(" Surfsense Documentation Seeding")
    print("=" * 50)

    # seed_surfsense_docs logs and returns (0, 0, 0, 0) on failure,
    # so this CLI never raises for indexing errors.
    created, updated, skipped, deleted = asyncio.run(seed_surfsense_docs())

    print()
    print("Results:")
    print(f" Created: {created}")
    print(f" Updated: {updated}")
    print(f" Skipped: {skipped}")
    print(f" Deleted: {deleted}")
    print("=" * 50)


if __name__ == "__main__":
    main()