mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-22 08:38:13 +02:00
feat: upload file and store embedding
This commit is contained in:
parent
cac25879bf
commit
ec1417da87
21 changed files with 2566 additions and 2 deletions
98
api/alembic/versions/dc33eef8dabe_add_document_tables.py
Normal file
98
api/alembic/versions/dc33eef8dabe_add_document_tables.py
Normal file
|
|
@ -0,0 +1,98 @@
|
|||
"""add document tables
|
||||
|
||||
Revision ID: dc33eef8dabe
|
||||
Revises: dcb0a27d98c6
|
||||
Create Date: 2026-01-16 13:40:17.808807
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
from pgvector.sqlalchemy import Vector
|
||||
|
||||
# Revision identifiers, used by Alembic to place this migration in the
# revision graph.
revision: str = "dc33eef8dabe"
# The migration this one applies on top of.
down_revision: Union[str, None] = "dcb0a27d98c6"
# No branch labels and no cross-branch dependencies are declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the knowledge-base document and chunk tables.

    Steps, in order:

    1. Ensure the ``vector`` extension exists (needed for the ``Vector``
       column and the ivfflat index below).
    2. Create the ``document_processing_status`` enum type.
    3. Create ``knowledge_base_documents`` and its indexes.
    4. Create ``knowledge_base_chunks`` (rows cascade-delete with their
       parent document and organization) and its indexes.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    # Enable pgvector extension
    op.execute('CREATE EXTENSION IF NOT EXISTS vector')

    # The enum type is created explicitly here; the column definition below
    # references it with create_type=False so create_table does not attempt
    # to create it a second time.
    sa.Enum('pending', 'processing', 'completed', 'failed', name='document_processing_status').create(op.get_bind())
    op.create_table(
        'knowledge_base_documents',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('document_uuid', sa.String(length=36), nullable=False),
        sa.Column('organization_id', sa.Integer(), nullable=False),
        sa.Column('filename', sa.String(length=500), nullable=False),
        sa.Column('file_size_bytes', sa.Integer(), nullable=True),
        sa.Column('file_hash', sa.String(length=64), nullable=True),
        sa.Column('mime_type', sa.String(length=100), nullable=True),
        sa.Column('source_url', sa.String(), nullable=True),
        sa.Column('total_chunks', sa.Integer(), nullable=False),
        sa.Column('processing_status', postgresql.ENUM('pending', 'processing', 'completed', 'failed', name='document_processing_status', create_type=False), server_default=sa.text("'pending'::document_processing_status"), nullable=False),
        sa.Column('processing_error', sa.Text(), nullable=True),
        sa.Column('docling_metadata', sa.JSON(), nullable=False),
        sa.Column('custom_metadata', sa.JSON(), nullable=False),
        sa.Column('created_by', sa.Integer(), nullable=False),
        sa.Column('created_at', sa.DateTime(timezone=True), nullable=True),
        sa.Column('updated_at', sa.DateTime(timezone=True), nullable=True),
        sa.Column('is_active', sa.Boolean(), nullable=False),
        sa.Column('archived_at', sa.DateTime(timezone=True), nullable=True),
        sa.ForeignKeyConstraint(['created_by'], ['users.id'], ),
        sa.ForeignKeyConstraint(['organization_id'], ['organizations.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id'),
    )
    op.create_index('ix_kb_documents_created_at', 'knowledge_base_documents', ['created_at'], unique=False)
    op.create_index('ix_kb_documents_organization_id', 'knowledge_base_documents', ['organization_id'], unique=False)
    op.create_index('ix_kb_documents_status', 'knowledge_base_documents', ['processing_status'], unique=False)
    # NOTE(review): document_uuid gets two indexes - this non-unique one and
    # the unique one right after it. Both are kept so the schema keeps
    # matching whatever model produced this autogenerated migration; confirm
    # whether the non-unique index is actually wanted.
    op.create_index('ix_kb_documents_uuid', 'knowledge_base_documents', ['document_uuid'], unique=False)
    op.create_index(op.f('ix_knowledge_base_documents_document_uuid'), 'knowledge_base_documents', ['document_uuid'], unique=True)
    op.create_table(
        'knowledge_base_chunks',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('document_id', sa.Integer(), nullable=False),
        sa.Column('organization_id', sa.Integer(), nullable=False),
        sa.Column('chunk_text', sa.Text(), nullable=False),
        sa.Column('contextualized_text', sa.Text(), nullable=True),
        sa.Column('chunk_index', sa.Integer(), nullable=False),
        sa.Column('chunk_metadata', sa.JSON(), nullable=False),
        sa.Column('embedding_model', sa.String(length=200), nullable=False),
        sa.Column('embedding_dimension', sa.Integer(), nullable=False),
        # The vector column is fixed at 384 dimensions regardless of the
        # value stored in embedding_dimension.
        sa.Column('embedding', Vector(384), nullable=True),
        sa.Column('token_count', sa.Integer(), nullable=True),
        sa.Column('created_at', sa.DateTime(timezone=True), nullable=True),
        sa.Column('updated_at', sa.DateTime(timezone=True), nullable=True),
        sa.ForeignKeyConstraint(['document_id'], ['knowledge_base_documents.id'], ondelete='CASCADE'),
        sa.ForeignKeyConstraint(['organization_id'], ['organizations.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id'),
    )
    op.create_index('ix_kb_chunks_chunk_index', 'knowledge_base_chunks', ['chunk_index'], unique=False)
    op.create_index('ix_kb_chunks_document_id', 'knowledge_base_chunks', ['document_id'], unique=False)
    # Approximate-nearest-neighbour index using cosine distance.
    # NOTE(review): this ivfflat index is built while the table is still
    # empty; pgvector's guidance is to build ivfflat indexes once data is
    # present, so a later REINDEX may be needed for good recall - confirm.
    op.create_index('ix_kb_chunks_embedding_ivfflat', 'knowledge_base_chunks', ['embedding'], unique=False, postgresql_using='ivfflat', postgresql_with={'lists': 100}, postgresql_ops={'embedding': 'vector_cosine_ops'})
    op.create_index('ix_kb_chunks_organization_id', 'knowledge_base_chunks', ['organization_id'], unique=False)
    # ### end Alembic commands ###
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Undo ``upgrade()``.

    Drops the chunk indexes and table first, then the document indexes and
    table, and finally the ``document_processing_status`` enum type. The
    ``vector`` extension is intentionally left installed.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    # Dialect options recorded for the ivfflat index; passed through to
    # drop_index exactly as the autogenerated command did.
    ivfflat_options = {
        'postgresql_using': 'ivfflat',
        'postgresql_with': {'lists': 100},
        'postgresql_ops': {'embedding': 'vector_cosine_ops'},
    }
    chunk_indexes = (
        ('ix_kb_chunks_organization_id', {}),
        ('ix_kb_chunks_embedding_ivfflat', ivfflat_options),
        ('ix_kb_chunks_document_id', {}),
        ('ix_kb_chunks_chunk_index', {}),
    )
    for index_name, dialect_kwargs in chunk_indexes:
        op.drop_index(index_name, table_name='knowledge_base_chunks', **dialect_kwargs)
    op.drop_table('knowledge_base_chunks')

    document_indexes = (
        op.f('ix_knowledge_base_documents_document_uuid'),
        'ix_kb_documents_uuid',
        'ix_kb_documents_status',
        'ix_kb_documents_organization_id',
        'ix_kb_documents_created_at',
    )
    for index_name in document_indexes:
        op.drop_index(index_name, table_name='knowledge_base_documents')
    op.drop_table('knowledge_base_documents')

    # The enum type can only go once no column references it any more.
    status_enum = sa.Enum('pending', 'processing', 'completed', 'failed', name='document_processing_status')
    status_enum.drop(op.get_bind())

    # Note: We don't drop the vector extension as it may be used by other tables
    # If you want to drop it, uncomment the following line:
    # op.execute('DROP EXTENSION IF EXISTS vector')
    # ### end Alembic commands ###
|
||||
Loading…
Add table
Add a link
Reference in a new issue