feat: Added content-based hashing to prevent duplicates and fix resync issues

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2025-05-28 23:52:00 -07:00
parent 38516e74f9
commit 5411bac8e0
17 changed files with 297 additions and 334 deletions

View file

@ -2,7 +2,6 @@
Revision ID: 1
Revises:
Create Date: 2023-10-27 10:00:00.000000
"""
from typing import Sequence, Union

View file

@ -2,7 +2,6 @@
Revision ID: 2
Revises: e55302644c51
Create Date: 2025-04-16 10:00:00.000000
"""
from typing import Sequence, Union

View file

@ -2,7 +2,6 @@
Revision ID: 3
Revises: 2
Create Date: 2025-04-16 10:05:00.059921
"""
from typing import Sequence, Union

View file

@ -2,7 +2,6 @@
Revision ID: 4
Revises: 3
Create Date: 2025-04-18 10:00:00.000000
"""
from typing import Sequence, Union

View file

@ -2,7 +2,6 @@
Revision ID: 5
Revises: 4
Create Date: 2023-06-10 00:00:00.000000
"""
from typing import Sequence, Union

View file

@ -2,7 +2,6 @@
Revision ID: 6
Revises: 5
Create Date: 2023-08-15 00:00:00.000000
"""
from typing import Sequence, Union

View file

@ -2,7 +2,6 @@
Revision ID: 7
Revises: 6
Create Date: 2023-08-15 01:00:00.000000
"""
from typing import Sequence, Union

View file

@ -0,0 +1,56 @@
"""Add content_hash column to documents table
Revision ID: 8
Revises: 7
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# Alembic revision identifiers: this migration is "8" and applies on top of "7".
revision: str = "8"
down_revision: Union[str, None] = "7"
# Neither a branch label nor a dependency on another revision is declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Add ``documents.content_hash``, backfill it, deduplicate, then constrain it.

    Steps, in order:

    1. Add the column as nullable so rows that already exist stay valid.
    2. Backfill each row with the hex-encoded SHA-256 of its ``content``
       converted to UTF-8.
    3. Delete every row except the lowest-``id`` row of each hash group.
    4. Make the column NOT NULL, then create a non-unique index and a
       unique constraint on it.
    """
    table = 'documents'
    column = 'content_hash'

    # The SQL text is assembled line by line; it uses encode(), sha256() and
    # convert_to(), so it presumably targets PostgreSQL -- TODO confirm.
    backfill_sql = (
        "\n"
        "UPDATE documents\n"
        "SET content_hash = encode(sha256(convert_to(content, 'UTF8')), 'hex')\n"
        "WHERE content_hash IS NULL\n"
    )
    # MIN(id) per hash is the surviving row, i.e. the lowest id is treated as
    # the oldest document for that content.
    dedupe_sql = (
        "\n"
        "DELETE FROM documents\n"
        "WHERE id NOT IN (\n"
        "SELECT MIN(id)\n"
        "FROM documents\n"
        "GROUP BY content_hash\n"
        ")\n"
    )

    # Nullable first: the column cannot be NOT NULL until it has been filled.
    op.add_column(table, sa.Column(column, sa.String(), nullable=True))
    op.execute(backfill_sql)

    # NOTE(review): this step is destructive; downgrade() only drops the
    # constraint, index and column and does not restore deleted rows.
    # NOTE(review): if any row has NULL content its hash stays NULL and the
    # NOT NULL change below would fail -- confirm content is never NULL.
    op.execute(dedupe_sql)

    op.alter_column(table, column, existing_type=sa.String(), nullable=False)

    # NOTE(review): a unique constraint is normally backed by its own unique
    # index, so the separate non-unique index may be redundant -- confirm
    # against the model before changing, since downgrade() drops both by name.
    op.create_index(op.f('ix_documents_content_hash'), table, [column], unique=False)
    op.create_unique_constraint(op.f('uq_documents_content_hash'), table, [column])
def downgrade() -> None:
    """Undo the schema change: drop the unique constraint, the index, then the column.

    Only the schema is reverted; rows removed by the duplicate cleanup in
    ``upgrade()`` are not recreated here.
    """
    table = 'documents'

    # The constraint and index are dropped by name before the column itself.
    op.drop_constraint(op.f('uq_documents_content_hash'), table, type_='unique')
    op.drop_index(op.f('ix_documents_content_hash'), table_name=table)

    op.drop_column(table, 'content_hash')

View file

@ -2,7 +2,6 @@
Revision ID: e55302644c51
Revises: 1
Create Date: 2025-04-13 19:56:00.059921
"""
from typing import Sequence, Union