feat(migrations): add created_by_id column and backfill existing documents with progress indicator

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-02-02 17:32:34 -08:00
parent b9be4b458b
commit 619f13513f

View file

@ -7,11 +7,14 @@ Create Date: 2026-02-02
Changes:
1. Add created_by_id column (UUID, nullable, foreign key to user.id)
2. Create index on created_by_id for performance
3. Backfill existing documents with search space owner's user_id
3. Backfill existing documents with search space owner's user_id (with progress indicator)
"""
import sys
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
# revision identifiers, used by Alembic.
@ -20,11 +23,15 @@ down_revision: str | None = "85"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
# Batch size for backfill operation
BATCH_SIZE = 5000
def upgrade() -> None:
"""Add created_by_id column to documents and backfill with search space owner."""
# 1. Add created_by_id column (nullable for backward compatibility)
print("Step 1/4: Adding created_by_id column...")
op.execute(
"""
DO $$
@ -39,17 +46,21 @@ def upgrade() -> None:
END$$;
"""
)
print(" Done: created_by_id column added.")
# 2. Create index on created_by_id for efficient queries
print("Step 2/4: Creating index on created_by_id...")
op.execute(
"""
CREATE INDEX IF NOT EXISTS ix_documents_created_by_id
ON documents (created_by_id);
"""
)
print(" Done: Index created.")
# 3. Add foreign key constraint with ON DELETE SET NULL
# First check if constraint already exists
print("Step 3/4: Adding foreign key constraint...")
op.execute(
"""
DO $$
@ -67,18 +78,67 @@ def upgrade() -> None:
END$$;
"""
)
print(" Done: Foreign key constraint added.")
# 4. Backfill existing documents with search space owner's user_id
# This ensures all existing documents are associated with the search space owner
op.execute(
"""
UPDATE documents
SET created_by_id = searchspaces.user_id
FROM searchspaces
WHERE documents.search_space_id = searchspaces.id
AND documents.created_by_id IS NULL;
"""
# Process in batches with progress indicator
print("Step 4/4: Backfilling created_by_id for existing documents...")
connection = op.get_bind()
# Get total count of documents that need backfilling
result = connection.execute(
sa.text("""
SELECT COUNT(*) FROM documents WHERE created_by_id IS NULL
""")
)
total_count = result.scalar()
if total_count == 0:
print(" No documents need backfilling. Skipping.")
return
print(f" Total documents to backfill: {total_count:,}")
processed = 0
batch_num = 0
while processed < total_count:
batch_num += 1
# Update a batch of documents using a subquery to limit the update
# We use ctid (tuple identifier) for efficient batching in PostgreSQL
result = connection.execute(
sa.text("""
UPDATE documents
SET created_by_id = searchspaces.user_id
FROM searchspaces
WHERE documents.search_space_id = searchspaces.id
AND documents.created_by_id IS NULL
AND documents.id IN (
SELECT d.id FROM documents d
WHERE d.created_by_id IS NULL
LIMIT :batch_size
)
"""),
{"batch_size": BATCH_SIZE}
)
rows_updated = result.rowcount
if rows_updated == 0:
# No more rows to update
break
processed += rows_updated
progress_pct = min(100.0, (processed / total_count) * 100)
# Print progress with carriage return for in-place update
sys.stdout.write(f"\r Progress: {processed:,}/{total_count:,} documents ({progress_pct:.1f}%) - Batch {batch_num}")
sys.stdout.flush()
# Final newline after progress
print()
print(f" Done: Backfilled {processed:,} documents.")
def downgrade() -> None: