mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-10 20:35:17 +02:00
feat(migrations): add created_by_id column and backfill existing documents with progress indicator
This commit is contained in:
parent
b9be4b458b
commit
619f13513f
1 changed file with 70 additions and 10 deletions
|
|
@ -7,11 +7,14 @@ Create Date: 2026-02-02
|
|||
Changes:
|
||||
1. Add created_by_id column (UUID, nullable, foreign key to user.id)
|
||||
2. Create index on created_by_id for performance
|
||||
3. Backfill existing documents with search space owner's user_id
|
||||
3. Backfill existing documents with search space owner's user_id (with progress indicator)
|
||||
"""
|
||||
|
||||
import sys
|
||||
from collections.abc import Sequence
|
||||
|
||||
import sqlalchemy as sa
|
||||
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
|
|
@ -20,11 +23,15 @@ down_revision: str | None = "85"
|
|||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
# Batch size for backfill operation
|
||||
BATCH_SIZE = 5000
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
"""Add created_by_id column to documents and backfill with search space owner."""
|
||||
|
||||
# 1. Add created_by_id column (nullable for backward compatibility)
|
||||
print("Step 1/4: Adding created_by_id column...")
|
||||
op.execute(
|
||||
"""
|
||||
DO $$
|
||||
|
|
@ -39,17 +46,21 @@ def upgrade() -> None:
|
|||
END$$;
|
||||
"""
|
||||
)
|
||||
print(" Done: created_by_id column added.")
|
||||
|
||||
# 2. Create index on created_by_id for efficient queries
|
||||
print("Step 2/4: Creating index on created_by_id...")
|
||||
op.execute(
|
||||
"""
|
||||
CREATE INDEX IF NOT EXISTS ix_documents_created_by_id
|
||||
ON documents (created_by_id);
|
||||
"""
|
||||
)
|
||||
print(" Done: Index created.")
|
||||
|
||||
# 3. Add foreign key constraint with ON DELETE SET NULL
|
||||
# First check if constraint already exists
|
||||
print("Step 3/4: Adding foreign key constraint...")
|
||||
op.execute(
|
||||
"""
|
||||
DO $$
|
||||
|
|
@ -67,18 +78,67 @@ def upgrade() -> None:
|
|||
END$$;
|
||||
"""
|
||||
)
|
||||
print(" Done: Foreign key constraint added.")
|
||||
|
||||
# 4. Backfill existing documents with search space owner's user_id
|
||||
# This ensures all existing documents are associated with the search space owner
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE documents
|
||||
SET created_by_id = searchspaces.user_id
|
||||
FROM searchspaces
|
||||
WHERE documents.search_space_id = searchspaces.id
|
||||
AND documents.created_by_id IS NULL;
|
||||
"""
|
||||
# Process in batches with progress indicator
|
||||
print("Step 4/4: Backfilling created_by_id for existing documents...")
|
||||
|
||||
connection = op.get_bind()
|
||||
|
||||
# Get total count of documents that need backfilling
|
||||
result = connection.execute(
|
||||
sa.text("""
|
||||
SELECT COUNT(*) FROM documents WHERE created_by_id IS NULL
|
||||
""")
|
||||
)
|
||||
total_count = result.scalar()
|
||||
|
||||
if total_count == 0:
|
||||
print(" No documents need backfilling. Skipping.")
|
||||
return
|
||||
|
||||
print(f" Total documents to backfill: {total_count:,}")
|
||||
|
||||
processed = 0
|
||||
batch_num = 0
|
||||
|
||||
while processed < total_count:
|
||||
batch_num += 1
|
||||
|
||||
# Update a batch of documents using a subquery to limit the update
|
||||
# The subquery selects at most :batch_size ids of rows that still have a NULL created_by_id (batching is by primary key id, not ctid)
|
||||
result = connection.execute(
|
||||
sa.text("""
|
||||
UPDATE documents
|
||||
SET created_by_id = searchspaces.user_id
|
||||
FROM searchspaces
|
||||
WHERE documents.search_space_id = searchspaces.id
|
||||
AND documents.created_by_id IS NULL
|
||||
AND documents.id IN (
|
||||
SELECT d.id FROM documents d
|
||||
WHERE d.created_by_id IS NULL
|
||||
LIMIT :batch_size
|
||||
)
|
||||
"""),
|
||||
{"batch_size": BATCH_SIZE}
|
||||
)
|
||||
|
||||
rows_updated = result.rowcount
|
||||
if rows_updated == 0:
|
||||
# No more rows to update
|
||||
break
|
||||
|
||||
processed += rows_updated
|
||||
progress_pct = min(100.0, (processed / total_count) * 100)
|
||||
|
||||
# Print progress with carriage return for in-place update
|
||||
sys.stdout.write(f"\r Progress: {processed:,}/{total_count:,} documents ({progress_pct:.1f}%) - Batch {batch_num}")
|
||||
sys.stdout.flush()
|
||||
|
||||
# Final newline after progress
|
||||
print()
|
||||
print(f" Done: Backfilled {processed:,} documents.")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue