mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-13 08:15:21 +02:00
feat: add openai embedding service
This commit is contained in:
parent
eb41285204
commit
3f0e500fde
39 changed files with 1902 additions and 339 deletions
|
|
@ -332,6 +332,7 @@ class KnowledgeBaseClient(BaseDBClient):
|
|||
limit: int = 5,
|
||||
document_ids: Optional[List[int]] = None,
|
||||
document_uuids: Optional[List[str]] = None,
|
||||
embedding_model: Optional[str] = None,
|
||||
) -> List[dict]:
|
||||
"""Search for similar chunks using vector similarity.
|
||||
|
||||
|
|
@ -344,6 +345,7 @@ class KnowledgeBaseClient(BaseDBClient):
|
|||
limit: Maximum number of results to return
|
||||
document_ids: Optional list of document IDs to filter by
|
||||
document_uuids: Optional list of document UUIDs to filter by
|
||||
embedding_model: Optional embedding model name; when provided, only chunks whose embedding_model equals it are searched (for dimension compatibility)
|
||||
|
||||
Returns:
|
||||
List of dictionaries with chunk data and similarity scores, ordered by similarity (highest first)
|
||||
|
|
@ -359,23 +361,37 @@ class KnowledgeBaseClient(BaseDBClient):
|
|||
"c.organization_id = $2",
|
||||
"d.is_active = true",
|
||||
]
|
||||
params = [None, organization_id, limit] # $1 will be embedding_str, $3 is limit
|
||||
params = [
|
||||
None,
|
||||
organization_id,
|
||||
limit,
|
||||
] # $1 will be embedding_str, $3 is limit
|
||||
param_index = 4 # Next available parameter index
|
||||
|
||||
# Add document_ids filter if provided
|
||||
if document_ids:
|
||||
placeholders = ", ".join(f"${param_index + i}" for i in range(len(document_ids)))
|
||||
placeholders = ", ".join(
|
||||
f"${param_index + i}" for i in range(len(document_ids))
|
||||
)
|
||||
where_conditions.append(f"c.document_id IN ({placeholders})")
|
||||
params.extend(document_ids)
|
||||
param_index += len(document_ids)
|
||||
|
||||
# Add document_uuids filter if provided
|
||||
if document_uuids:
|
||||
placeholders = ", ".join(f"${param_index + i}" for i in range(len(document_uuids)))
|
||||
placeholders = ", ".join(
|
||||
f"${param_index + i}" for i in range(len(document_uuids))
|
||||
)
|
||||
where_conditions.append(f"d.document_uuid IN ({placeholders})")
|
||||
params.extend(document_uuids)
|
||||
param_index += len(document_uuids)
|
||||
|
||||
# Add embedding_model filter if provided (for dimension compatibility)
|
||||
if embedding_model:
|
||||
where_conditions.append(f"c.embedding_model = ${param_index}")
|
||||
params.append(embedding_model)
|
||||
param_index += 1
|
||||
|
||||
# Build the complete SQL query
|
||||
where_clause = " AND ".join(where_conditions)
|
||||
query_sql = f"""
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ import uuid
|
|||
from datetime import UTC, datetime
|
||||
|
||||
from loguru import logger
|
||||
from pgvector.sqlalchemy import Vector
|
||||
from sqlalchemy import (
|
||||
JSON,
|
||||
Boolean,
|
||||
|
|
@ -19,7 +20,6 @@ from sqlalchemy import (
|
|||
and_,
|
||||
text,
|
||||
)
|
||||
from pgvector.sqlalchemy import Vector
|
||||
from sqlalchemy.orm import declarative_base, relationship
|
||||
|
||||
from ..enums import (
|
||||
|
|
@ -929,7 +929,13 @@ class KnowledgeBaseDocumentModel(Base):
|
|||
source_url = Column(String, nullable=True) # If document was fetched from URL
|
||||
total_chunks = Column(Integer, nullable=False, default=0)
|
||||
processing_status = Column(
|
||||
Enum("pending", "processing", "completed", "failed", name="document_processing_status"),
|
||||
Enum(
|
||||
"pending",
|
||||
"processing",
|
||||
"completed",
|
||||
"failed",
|
||||
name="document_processing_status",
|
||||
),
|
||||
nullable=False,
|
||||
default="pending",
|
||||
server_default=text("'pending'::document_processing_status"),
|
||||
|
|
@ -937,7 +943,9 @@ class KnowledgeBaseDocumentModel(Base):
|
|||
processing_error = Column(Text, nullable=True)
|
||||
|
||||
# Docling conversion metadata
|
||||
docling_metadata = Column(JSON, nullable=False, default=dict) # Store docling document metadata
|
||||
docling_metadata = Column(
|
||||
JSON, nullable=False, default=dict
|
||||
) # Store docling document metadata
|
||||
|
||||
# Custom metadata (user-defined tags, categories, etc.)
|
||||
custom_metadata = Column(JSON, nullable=False, default=dict)
|
||||
|
|
@ -1000,21 +1008,31 @@ class KnowledgeBaseChunkModel(Base):
|
|||
|
||||
# Chunk content
|
||||
chunk_text = Column(Text, nullable=False) # The actual chunk text
|
||||
contextualized_text = Column(Text, nullable=True) # Enriched text from chunker.contextualize()
|
||||
contextualized_text = Column(
|
||||
Text, nullable=True
|
||||
) # Enriched text from chunker.contextualize()
|
||||
|
||||
# Chunk positioning and metadata
|
||||
chunk_index = Column(Integer, nullable=False) # Position in document (0-based)
|
||||
|
||||
# Docling chunk metadata
|
||||
chunk_metadata = Column(JSON, nullable=False, default=dict) # Store chunk.meta if available
|
||||
chunk_metadata = Column(
|
||||
JSON, nullable=False, default=dict
|
||||
) # Store chunk.meta if available
|
||||
|
||||
# Embedding configuration
|
||||
embedding_model = Column(String(200), nullable=False) # e.g., "sentence-transformers/all-MiniLM-L6-v2"
|
||||
embedding_dimension = Column(Integer, nullable=False) # e.g., 384 for all-MiniLM-L6-v2
|
||||
embedding_model = Column(
|
||||
String(200), nullable=False
|
||||
) # e.g., "sentence-transformers/all-MiniLM-L6-v2"
|
||||
embedding_dimension = Column(
|
||||
Integer, nullable=False
|
||||
) # e.g., 384 for all-MiniLM-L6-v2
|
||||
|
||||
# Vector embedding (pgvector column)
|
||||
# The dimension should match the embedding_dimension field
|
||||
embedding = Column(Vector(384), nullable=True) # Default to 384 for all-MiniLM-L6-v2
|
||||
# Default: 1536 dimensions for OpenAI text-embedding-3-small
|
||||
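# SentenceTransformer (384-dim) is also supported, with its vectors stored as 384-dim.
# NOTE(review): the column below is declared Vector(1536) — confirm how 384-dim vectors fit that fixed dimension.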
|
||||
embedding = Column(Vector(1536), nullable=True)
|
||||
|
||||
# Token count (useful for chunking strategy analysis)
|
||||
token_count = Column(Integer, nullable=True)
|
||||
|
|
@ -1036,6 +1054,9 @@ class KnowledgeBaseChunkModel(Base):
|
|||
Index("ix_kb_chunks_document_id", "document_id"),
|
||||
Index("ix_kb_chunks_organization_id", "organization_id"),
|
||||
Index("ix_kb_chunks_chunk_index", "chunk_index"),
|
||||
Index(
|
||||
"ix_kb_chunks_embedding_model", "embedding_model"
|
||||
), # For filtering by model
|
||||
# Vector similarity search index (using IVFFlat or HNSW)
|
||||
# IVFFlat is good for datasets with 10k-1M vectors
|
||||
# HNSW is better for larger datasets but uses more memory
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue