feat: add openai embedding service

This commit is contained in:
Abhishek Kumar 2026-01-17 13:36:26 +05:30
parent eb41285204
commit 3f0e500fde
39 changed files with 1902 additions and 339 deletions

View file

@ -332,6 +332,7 @@ class KnowledgeBaseClient(BaseDBClient):
limit: int = 5,
document_ids: Optional[List[int]] = None,
document_uuids: Optional[List[str]] = None,
embedding_model: Optional[str] = None,
) -> List[dict]:
"""Search for similar chunks using vector similarity.
@ -344,6 +345,7 @@ class KnowledgeBaseClient(BaseDBClient):
limit: Maximum number of results to return
document_ids: Optional list of document IDs to filter by
document_uuids: Optional list of document UUIDs to filter by
embedding_model: Optional embedding model to filter by (for dimension compatibility)
Returns:
List of dictionaries with chunk data and similarity scores, ordered by similarity (highest first)
@ -359,23 +361,37 @@ class KnowledgeBaseClient(BaseDBClient):
"c.organization_id = $2",
"d.is_active = true",
]
params = [None, organization_id, limit] # $1 will be embedding_str, $3 is limit
params = [
None,
organization_id,
limit,
] # $1 will be embedding_str, $3 is limit
param_index = 4 # Next available parameter index
# Add document_ids filter if provided
if document_ids:
placeholders = ", ".join(f"${param_index + i}" for i in range(len(document_ids)))
placeholders = ", ".join(
f"${param_index + i}" for i in range(len(document_ids))
)
where_conditions.append(f"c.document_id IN ({placeholders})")
params.extend(document_ids)
param_index += len(document_ids)
# Add document_uuids filter if provided
if document_uuids:
placeholders = ", ".join(f"${param_index + i}" for i in range(len(document_uuids)))
placeholders = ", ".join(
f"${param_index + i}" for i in range(len(document_uuids))
)
where_conditions.append(f"d.document_uuid IN ({placeholders})")
params.extend(document_uuids)
param_index += len(document_uuids)
# Add embedding_model filter if provided (for dimension compatibility)
if embedding_model:
where_conditions.append(f"c.embedding_model = ${param_index}")
params.append(embedding_model)
param_index += 1
# Build the complete SQL query
where_clause = " AND ".join(where_conditions)
query_sql = f"""

View file

@ -2,6 +2,7 @@ import uuid
from datetime import UTC, datetime
from loguru import logger
from pgvector.sqlalchemy import Vector
from sqlalchemy import (
JSON,
Boolean,
@ -19,7 +20,6 @@ from sqlalchemy import (
and_,
text,
)
from pgvector.sqlalchemy import Vector
from sqlalchemy.orm import declarative_base, relationship
from ..enums import (
@ -929,7 +929,13 @@ class KnowledgeBaseDocumentModel(Base):
source_url = Column(String, nullable=True) # If document was fetched from URL
total_chunks = Column(Integer, nullable=False, default=0)
processing_status = Column(
Enum("pending", "processing", "completed", "failed", name="document_processing_status"),
Enum(
"pending",
"processing",
"completed",
"failed",
name="document_processing_status",
),
nullable=False,
default="pending",
server_default=text("'pending'::document_processing_status"),
@ -937,7 +943,9 @@ class KnowledgeBaseDocumentModel(Base):
processing_error = Column(Text, nullable=True)
# Docling conversion metadata
docling_metadata = Column(JSON, nullable=False, default=dict) # Store docling document metadata
docling_metadata = Column(
JSON, nullable=False, default=dict
) # Store docling document metadata
# Custom metadata (user-defined tags, categories, etc.)
custom_metadata = Column(JSON, nullable=False, default=dict)
@ -1000,21 +1008,31 @@ class KnowledgeBaseChunkModel(Base):
# Chunk content
chunk_text = Column(Text, nullable=False) # The actual chunk text
contextualized_text = Column(Text, nullable=True) # Enriched text from chunker.contextualize()
contextualized_text = Column(
Text, nullable=True
) # Enriched text from chunker.contextualize()
# Chunk positioning and metadata
chunk_index = Column(Integer, nullable=False) # Position in document (0-based)
# Docling chunk metadata
chunk_metadata = Column(JSON, nullable=False, default=dict) # Store chunk.meta if available
chunk_metadata = Column(
JSON, nullable=False, default=dict
) # Store chunk.meta if available
# Embedding configuration
embedding_model = Column(String(200), nullable=False) # e.g., "sentence-transformers/all-MiniLM-L6-v2"
embedding_dimension = Column(Integer, nullable=False) # e.g., 384 for all-MiniLM-L6-v2
embedding_model = Column(
String(200), nullable=False
) # e.g., "sentence-transformers/all-MiniLM-L6-v2"
embedding_dimension = Column(
Integer, nullable=False
) # e.g., 384 for all-MiniLM-L6-v2
# Vector embedding (pgvector column)
# The dimension should match the embedding_dimension field
embedding = Column(Vector(384), nullable=True) # Default to 384 for all-MiniLM-L6-v2
# Default: 1536 dimensions for OpenAI text-embedding-3-small
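# SentenceTransformer (384-dim) is also supported.
# NOTE(review): a pgvector Vector(1536) column only accepts 1536-dimension values,
# so 384-dim embeddings cannot be stored in this column as-is — confirm how they
# are handled (e.g. a separate column/table, or padding to 1536).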
embedding = Column(Vector(1536), nullable=True)
# Token count (useful for chunking strategy analysis)
token_count = Column(Integer, nullable=True)
@ -1036,6 +1054,9 @@ class KnowledgeBaseChunkModel(Base):
Index("ix_kb_chunks_document_id", "document_id"),
Index("ix_kb_chunks_organization_id", "organization_id"),
Index("ix_kb_chunks_chunk_index", "chunk_index"),
Index(
"ix_kb_chunks_embedding_model", "embedding_model"
), # For filtering by model
# Vector similarity search index (using IVFFlat or HNSW)
# IVFFlat is good for datasets with 10k-1M vectors
# HNSW is better for larger datasets but uses more memory