mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-16 08:25:18 +02:00
feat: knowledge base functionality for the voice agent (#120)
* feat: upload file and store embedding * feat: add documents in nodes * feat: add openai embedding service
This commit is contained in:
parent
e2fa4bbb98
commit
ef5b9e40a9
52 changed files with 4551 additions and 114 deletions
194
api/alembic/versions/dc33eef8dabe_add_document_tables.py
Normal file
194
api/alembic/versions/dc33eef8dabe_add_document_tables.py
Normal file
|
|
@ -0,0 +1,194 @@
|
|||
"""add document tables
|
||||
|
||||
Revision ID: dc33eef8dabe
|
||||
Revises: dcb0a27d98c6
|
||||
Create Date: 2026-01-16 13:40:17.808807
|
||||
|
||||
"""
|
||||
|
||||
from typing import Sequence, Union
|
||||
|
||||
import sqlalchemy as sa
|
||||
from alembic import op
|
||||
from pgvector.sqlalchemy import Vector
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "dc33eef8dabe"
|
||||
down_revision: Union[str, None] = "dcb0a27d98c6"
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the knowledge-base schema.

    Runs, in order: enables the ``vector`` extension, creates the
    ``document_processing_status`` enum, creates ``knowledge_base_documents``
    with its indexes, then creates ``knowledge_base_chunks`` with its indexes
    (including an ivfflat index over the ``embedding`` column).
    """
    # ### commands auto generated by Alembic - please adjust! ###
    status_labels = ("pending", "processing", "completed", "failed")
    status_type_name = "document_processing_status"
    documents = "knowledge_base_documents"
    chunks = "knowledge_base_chunks"

    # Enable pgvector extension
    op.execute("CREATE EXTENSION IF NOT EXISTS vector")

    # The enum type is created explicitly here, so the column definition
    # below references it with create_type=False.
    status_enum = sa.Enum(*status_labels, name=status_type_name)
    status_enum.create(op.get_bind())

    status_column = sa.Column(
        "processing_status",
        postgresql.ENUM(*status_labels, name=status_type_name, create_type=False),
        server_default=sa.text("'pending'::document_processing_status"),
        nullable=False,
    )

    op.create_table(
        documents,
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("document_uuid", sa.String(length=36), nullable=False),
        sa.Column("organization_id", sa.Integer(), nullable=False),
        sa.Column("filename", sa.String(length=500), nullable=False),
        sa.Column("file_size_bytes", sa.Integer(), nullable=True),
        sa.Column("file_hash", sa.String(length=64), nullable=True),
        sa.Column("mime_type", sa.String(length=100), nullable=True),
        sa.Column("source_url", sa.String(), nullable=True),
        sa.Column("total_chunks", sa.Integer(), nullable=False),
        status_column,
        sa.Column("processing_error", sa.Text(), nullable=True),
        sa.Column("docling_metadata", sa.JSON(), nullable=False),
        sa.Column("custom_metadata", sa.JSON(), nullable=False),
        sa.Column("created_by", sa.Integer(), nullable=False),
        sa.Column("created_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column("updated_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column("is_active", sa.Boolean(), nullable=False),
        sa.Column("archived_at", sa.DateTime(timezone=True), nullable=True),
        sa.ForeignKeyConstraint(["created_by"], ["users.id"]),
        sa.ForeignKeyConstraint(
            ["organization_id"], ["organizations.id"], ondelete="CASCADE"
        ),
        sa.PrimaryKeyConstraint("id"),
    )

    # Plain (non-unique) single-column indexes on the documents table.
    for index_name, column_name in (
        ("ix_kb_documents_created_at", "created_at"),
        ("ix_kb_documents_organization_id", "organization_id"),
        ("ix_kb_documents_status", "processing_status"),
        ("ix_kb_documents_uuid", "document_uuid"),
    ):
        op.create_index(index_name, documents, [column_name], unique=False)

    # Unique index on document_uuid.
    op.create_index(
        op.f("ix_knowledge_base_documents_document_uuid"),
        documents,
        ["document_uuid"],
        unique=True,
    )

    op.create_table(
        chunks,
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("document_id", sa.Integer(), nullable=False),
        sa.Column("organization_id", sa.Integer(), nullable=False),
        sa.Column("chunk_text", sa.Text(), nullable=False),
        sa.Column("contextualized_text", sa.Text(), nullable=True),
        sa.Column("chunk_index", sa.Integer(), nullable=False),
        sa.Column("chunk_metadata", sa.JSON(), nullable=False),
        sa.Column("embedding_model", sa.String(length=200), nullable=False),
        sa.Column("embedding_dimension", sa.Integer(), nullable=False),
        sa.Column("embedding", Vector(1536), nullable=True),
        sa.Column("token_count", sa.Integer(), nullable=True),
        sa.Column("created_at", sa.DateTime(timezone=True), nullable=True),
        sa.Column("updated_at", sa.DateTime(timezone=True), nullable=True),
        sa.ForeignKeyConstraint(
            ["document_id"], ["knowledge_base_documents.id"], ondelete="CASCADE"
        ),
        sa.ForeignKeyConstraint(
            ["organization_id"], ["organizations.id"], ondelete="CASCADE"
        ),
        sa.PrimaryKeyConstraint("id"),
    )

    # Plain (non-unique) single-column indexes on the chunks table.
    for index_name, column_name in (
        ("ix_kb_chunks_chunk_index", "chunk_index"),
        ("ix_kb_chunks_document_id", "document_id"),
    ):
        op.create_index(index_name, chunks, [column_name], unique=False)

    # ivfflat index over the embedding column, using the cosine operator
    # class and 100 lists.
    op.create_index(
        "ix_kb_chunks_embedding_ivfflat",
        chunks,
        ["embedding"],
        unique=False,
        postgresql_using="ivfflat",
        postgresql_with={"lists": 100},
        postgresql_ops={"embedding": "vector_cosine_ops"},
    )

    op.create_index(
        "ix_kb_chunks_organization_id",
        chunks,
        ["organization_id"],
        unique=False,
    )
    # ### end Alembic commands ###
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop the knowledge-base schema created by ``upgrade``.

    Drops the chunk indexes and the ``knowledge_base_chunks`` table, then the
    document indexes and the ``knowledge_base_documents`` table, and finally
    the ``document_processing_status`` enum. The ``vector`` extension is left
    installed.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    documents = "knowledge_base_documents"
    chunks = "knowledge_base_chunks"

    op.drop_index("ix_kb_chunks_organization_id", table_name=chunks)
    op.drop_index(
        "ix_kb_chunks_embedding_ivfflat",
        table_name=chunks,
        postgresql_using="ivfflat",
        postgresql_with={"lists": 100},
        postgresql_ops={"embedding": "vector_cosine_ops"},
    )
    for index_name in ("ix_kb_chunks_document_id", "ix_kb_chunks_chunk_index"):
        op.drop_index(index_name, table_name=chunks)
    op.drop_table(chunks)

    op.drop_index(
        op.f("ix_knowledge_base_documents_document_uuid"),
        table_name=documents,
    )
    for index_name in (
        "ix_kb_documents_uuid",
        "ix_kb_documents_status",
        "ix_kb_documents_organization_id",
        "ix_kb_documents_created_at",
    ):
        op.drop_index(index_name, table_name=documents)
    op.drop_table(documents)

    # The enum type is dropped last, after the table that uses it is gone.
    status_enum = sa.Enum(
        "pending",
        "processing",
        "completed",
        "failed",
        name="document_processing_status",
    )
    status_enum.drop(op.get_bind())

    # Note: We don't drop the vector extension as it may be used by other tables
    # If you want to drop it, uncomment the following line:
    # op.execute('DROP EXTENSION IF EXISTS vector')
    # ### end Alembic commands ###
|
||||
Loading…
Add table
Add a link
Reference in a new issue