feat: add openai embedding service

2026-06-13 08:15:21 +02:00 · 2026-01-17 13:36:26 +05:30 · 2026-01-17 13:36:26 +05:30 · 3f0e500fde
commit 3f0e500fde
parent eb41285204
39 changed files with 1902 additions and 339 deletions
--- a/api/db/knowledge_base_client.py
+++ b/api/db/knowledge_base_client.py
@ -332,6 +332,7 @@ class KnowledgeBaseClient(BaseDBClient):
        limit: int = 5,
        document_ids: Optional[List[int]] = None,
        document_uuids: Optional[List[str]] = None,
+        embedding_model: Optional[str] = None,
    ) -> List[dict]:
        """Search for similar chunks using vector similarity.

@ -344,6 +345,7 @@ class KnowledgeBaseClient(BaseDBClient):
            limit: Maximum number of results to return
            document_ids: Optional list of document IDs to filter by
            document_uuids: Optional list of document UUIDs to filter by
+            embedding_model: Optional embedding model name; when given, only chunks stored with that model are searched (keeps vector dimensions compatible)

        Returns:
            List of dictionaries with chunk data and similarity scores, ordered by similarity (highest first)
@ -359,23 +361,37 @@ class KnowledgeBaseClient(BaseDBClient):
                "c.organization_id = $2",
                "d.is_active = true",
            ]
-            params = [None, organization_id, limit]  # $1 will be embedding_str, $3 is limit
+            params = [
+                None,
+                organization_id,
+                limit,
+            ]  # $1 will be embedding_str, $3 is limit
            param_index = 4  # Next available parameter index

            # Add document_ids filter if provided
            if document_ids:
-                placeholders = ", ".join(f"${param_index + i}" for i in range(len(document_ids)))
+                placeholders = ", ".join(
+                    f"${param_index + i}" for i in range(len(document_ids))
+                )
                where_conditions.append(f"c.document_id IN ({placeholders})")
                params.extend(document_ids)
                param_index += len(document_ids)

            # Add document_uuids filter if provided
            if document_uuids:
-                placeholders = ", ".join(f"${param_index + i}" for i in range(len(document_uuids)))
+                placeholders = ", ".join(
+                    f"${param_index + i}" for i in range(len(document_uuids))
+                )
                where_conditions.append(f"d.document_uuid IN ({placeholders})")
                params.extend(document_uuids)
                param_index += len(document_uuids)

+            # Add embedding_model filter if provided (for dimension compatibility)
+            if embedding_model:
+                where_conditions.append(f"c.embedding_model = ${param_index}")
+                params.append(embedding_model)
+                param_index += 1
+
            # Build the complete SQL query
            where_clause = " AND ".join(where_conditions)
            query_sql = f"""
--- a/api/db/models.py
+++ b/api/db/models.py
@ -2,6 +2,7 @@ import uuid
 from datetime import UTC, datetime

 from loguru import logger
+from pgvector.sqlalchemy import Vector
 from sqlalchemy import (
    JSON,
    Boolean,
@ -19,7 +20,6 @@ from sqlalchemy import (
    and_,
    text,
 )
-from pgvector.sqlalchemy import Vector
 from sqlalchemy.orm import declarative_base, relationship

 from ..enums import (
@ -929,7 +929,13 @@ class KnowledgeBaseDocumentModel(Base):
    source_url = Column(String, nullable=True)  # If document was fetched from URL
    total_chunks = Column(Integer, nullable=False, default=0)
    processing_status = Column(
-        Enum("pending", "processing", "completed", "failed", name="document_processing_status"),
+        Enum(
+            "pending",
+            "processing",
+            "completed",
+            "failed",
+            name="document_processing_status",
+        ),
        nullable=False,
        default="pending",
        server_default=text("'pending'::document_processing_status"),
@ -937,7 +943,9 @@ class KnowledgeBaseDocumentModel(Base):
    processing_error = Column(Text, nullable=True)

    # Docling conversion metadata
-    docling_metadata = Column(JSON, nullable=False, default=dict)  # Store docling document metadata
+    docling_metadata = Column(
+        JSON, nullable=False, default=dict
+    )  # Store docling document metadata

    # Custom metadata (user-defined tags, categories, etc.)
    custom_metadata = Column(JSON, nullable=False, default=dict)
@ -1000,21 +1008,31 @@ class KnowledgeBaseChunkModel(Base):

    # Chunk content
    chunk_text = Column(Text, nullable=False)  # The actual chunk text
-    contextualized_text = Column(Text, nullable=True)  # Enriched text from chunker.contextualize()
+    contextualized_text = Column(
+        Text, nullable=True
+    )  # Enriched text from chunker.contextualize()

    # Chunk positioning and metadata
    chunk_index = Column(Integer, nullable=False)  # Position in document (0-based)

    # Docling chunk metadata
-    chunk_metadata = Column(JSON, nullable=False, default=dict)  # Store chunk.meta if available
+    chunk_metadata = Column(
+        JSON, nullable=False, default=dict
+    )  # Store chunk.meta if available

    # Embedding configuration
-    embedding_model = Column(String(200), nullable=False)  # e.g., "sentence-transformers/all-MiniLM-L6-v2"
-    embedding_dimension = Column(Integer, nullable=False)  # e.g., 384 for all-MiniLM-L6-v2
+    embedding_model = Column(
+        String(200), nullable=False
+    )  # e.g., "sentence-transformers/all-MiniLM-L6-v2"
+    embedding_dimension = Column(
+        Integer, nullable=False
+    )  # e.g., 384 for all-MiniLM-L6-v2

    # Vector embedding (pgvector column)
    # The dimension should match the embedding_dimension field
-    embedding = Column(Vector(384), nullable=True)  # Default to 384 for all-MiniLM-L6-v2
+    # Default: 1536 dimensions for OpenAI text-embedding-3-small
+    # SentenceTransformer (384-dim) is also supported; NOTE(review): a Vector(1536) column only accepts 1536-dim values, so confirm how 384-dim vectors are stored
+    embedding = Column(Vector(1536), nullable=True)

    # Token count (useful for chunking strategy analysis)
    token_count = Column(Integer, nullable=True)
@ -1036,6 +1054,9 @@ class KnowledgeBaseChunkModel(Base):
        Index("ix_kb_chunks_document_id", "document_id"),
        Index("ix_kb_chunks_organization_id", "organization_id"),
        Index("ix_kb_chunks_chunk_index", "chunk_index"),
+        Index(
+            "ix_kb_chunks_embedding_model", "embedding_model"
+        ),  # For filtering by model
        # Vector similarity search index (using IVFFlat or HNSW)
        # IVFFlat is good for datasets with 10k-1M vectors
        # HNSW is better for larger datasets but uses more memory