mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 00:36:31 +02:00
refactor: async docs seeding in FastAPI lifespan
This commit is contained in:
parent
f6621f9a9a
commit
96545056cd
4 changed files with 58 additions and 56 deletions
|
|
@ -145,36 +145,13 @@ run_migrations() {
|
|||
echo "✅ Database migrations complete"
|
||||
}
|
||||
|
||||
# ================================================
|
||||
# Seed Surfsense documentation
|
||||
# ================================================
|
||||
# ================================================
# Seed Surfsense documentation
# ================================================
seed_surfsense_docs() {
    echo "📚 Seeding Surfsense documentation..."

    # Start PostgreSQL temporarily for seeding.
    # -w makes pg_ctl block until the server is actually ready to accept
    # connections, so no fixed "sleep 5" is needed (the old sleep was racy
    # on slow disks and wasted time on fast ones).
    su - postgres -c "/usr/lib/postgresql/14/bin/pg_ctl -D /data/postgres -l /tmp/postgres_seed.log -w start"

    cd /app/backend
    # Treat a seeding failure as a warning: the most common cause is that
    # the docs were already indexed on a previous start.
    python scripts/seed_surfsense_docs.py || echo "⚠️ Docs seeding may have already been done"

    # Stop the temporary PostgreSQL instance (-w waits for clean shutdown).
    su - postgres -c "/usr/lib/postgresql/14/bin/pg_ctl -D /data/postgres -w stop"

    echo "✅ Surfsense documentation seeded"
}
|
||||
|
||||
# Run migrations on first start or when explicitly requested
|
||||
if [ ! -f /data/.migrations_run ] || [ "${FORCE_MIGRATIONS:-false}" = "true" ]; then
|
||||
run_migrations
|
||||
touch /data/.migrations_run
|
||||
fi
|
||||
|
||||
# Seed docs on first start or when explicitly requested
|
||||
if [ ! -f /data/.docs_seeded ] || [ "${FORCE_SEED_DOCS:-false}" = "true" ]; then
|
||||
seed_surfsense_docs
|
||||
touch /data/.docs_seeded
|
||||
fi
|
||||
# Note: Surfsense docs seeding is now handled by FastAPI startup (app.py lifespan)
|
||||
|
||||
# ================================================
|
||||
# Environment Variables Info
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ from app.config import config
|
|||
from app.db import User, create_db_and_tables, get_async_session
|
||||
from app.routes import router as crud_router
|
||||
from app.schemas import UserCreate, UserRead, UserUpdate
|
||||
from app.tasks.surfsense_docs_indexer import seed_surfsense_docs
|
||||
from app.users import SECRET, auth_backend, current_active_user, fastapi_users
|
||||
|
||||
|
||||
|
|
@ -22,6 +23,8 @@ async def lifespan(app: FastAPI):
|
|||
await create_db_and_tables()
|
||||
# Setup LangGraph checkpointer tables for conversation persistence
|
||||
await setup_checkpointer_tables()
|
||||
# Seed Surfsense documentation
|
||||
await seed_surfsense_docs()
|
||||
yield
|
||||
# Cleanup: close checkpointer connection on shutdown
|
||||
await close_checkpointer()
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
"""
|
||||
Surfsense documentation indexer.
|
||||
Indexes MDX documentation files at migration time.
|
||||
Indexes MDX documentation files at startup.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
|
|
@ -10,10 +10,11 @@ from datetime import UTC, datetime
|
|||
from pathlib import Path
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session, selectinload
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.orm import selectinload
|
||||
|
||||
from app.config import config
|
||||
from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument
|
||||
from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument, async_session_maker
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -89,12 +90,12 @@ def create_surfsense_docs_chunks(content: str) -> list[SurfsenseDocsChunk]:
|
|||
]
|
||||
|
||||
|
||||
def index_surfsense_docs(session: Session) -> tuple[int, int, int, int]:
|
||||
async def index_surfsense_docs(session: AsyncSession) -> tuple[int, int, int, int]:
|
||||
"""
|
||||
Index all Surfsense documentation files.
|
||||
|
||||
Args:
|
||||
session: SQLAlchemy sync session
|
||||
session: SQLAlchemy async session
|
||||
|
||||
Returns:
|
||||
Tuple of (created, updated, skipped, deleted) counts
|
||||
|
|
@ -105,7 +106,7 @@ def index_surfsense_docs(session: Session) -> tuple[int, int, int, int]:
|
|||
deleted = 0
|
||||
|
||||
# Get all existing docs from database
|
||||
existing_docs_result = session.execute(
|
||||
existing_docs_result = await session.execute(
|
||||
select(SurfsenseDocsDocument).options(selectinload(SurfsenseDocsDocument.chunks))
|
||||
)
|
||||
existing_docs = {doc.source: doc for doc in existing_docs_result.scalars().all()}
|
||||
|
|
@ -178,11 +179,11 @@ def index_surfsense_docs(session: Session) -> tuple[int, int, int, int]:
|
|||
for source, doc in existing_docs.items():
|
||||
if source not in processed_sources:
|
||||
logger.info(f"Deleting removed document: {source}")
|
||||
session.delete(doc)
|
||||
await session.delete(doc)
|
||||
deleted += 1
|
||||
|
||||
# Commit all changes
|
||||
session.commit()
|
||||
await session.commit()
|
||||
|
||||
logger.info(
|
||||
f"Indexing complete: {created} created, {updated} updated, "
|
||||
|
|
@ -191,3 +192,31 @@ def index_surfsense_docs(session: Session) -> tuple[int, int, int, int]:
|
|||
|
||||
return created, updated, skipped, deleted
|
||||
|
||||
|
||||
async def seed_surfsense_docs() -> tuple[int, int, int, int]:
    """
    Seed Surfsense documentation into the database.

    Opens a fresh async session and delegates to ``index_surfsense_docs``,
    which creates, updates, and deletes documents based on content changes.
    Any exception is caught and logged so a seeding failure never takes
    down the caller (e.g. application startup).

    Returns:
        Tuple of (created, updated, skipped, deleted) counts.
        Returns (0, 0, 0, 0) if an error occurs.
    """
    logger.info("Starting Surfsense docs indexing...")

    try:
        async with async_session_maker() as session:
            counts = await index_surfsense_docs(session)

        created, updated, skipped, deleted = counts
        logger.info(
            f"Surfsense docs indexing complete: "
            f"created={created}, updated={updated}, skipped={skipped}, deleted={deleted}"
        )
        return created, updated, skipped, deleted

    except Exception as e:
        logger.error(f"Failed to seed Surfsense docs: {e}", exc_info=True)
        return 0, 0, 0, 0
|
||||
|
|
|
|||
|
|
@ -1,47 +1,40 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Seed Surfsense documentation into the database.
|
||||
Run this script after migrations to index MDX documentation files.
|
||||
|
||||
CLI wrapper for the seed_surfsense_docs function.
|
||||
Can be run manually for debugging or re-indexing.
|
||||
|
||||
Usage:
|
||||
python scripts/seed_surfsense_docs.py
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add the parent directory to the path so we can import app modules
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.config import config
|
||||
from app.tasks.surfsense_docs_indexer import index_surfsense_docs
|
||||
from app.tasks.surfsense_docs_indexer import seed_surfsense_docs
|
||||
|
||||
|
||||
def main():
    """CLI entry point for seeding Surfsense docs."""
    banner = "=" * 50
    print(banner)
    print(" Surfsense Documentation Seeding")
    print(banner)

    # The actual work lives in the shared async task; this script only
    # drives it from a synchronous CLI context.
    created, updated, skipped, deleted = asyncio.run(seed_surfsense_docs())

    print()
    print("Results:")
    print(f" Created: {created}")
    print(f" Updated: {updated}")
    print(f" Skipped: {skipped}")
    print(f" Deleted: {deleted}")
    print(banner)


if __name__ == "__main__":
    main()
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue