feat: add full document mode in knowledge base

This commit is contained in:
Abhishek Kumar 2026-04-09 13:49:20 +05:30
parent c085398933
commit 87c8c5e2c8
26 changed files with 1144 additions and 351 deletions

View file

@ -27,6 +27,7 @@ class KnowledgeBaseClient(BaseDBClient):
custom_metadata: Optional[dict] = None,
docling_metadata: Optional[dict] = None,
document_uuid: Optional[str] = None,
retrieval_mode: str = "chunked",
) -> KnowledgeBaseDocumentModel:
"""Create a new knowledge base document record.
@ -58,6 +59,7 @@ class KnowledgeBaseClient(BaseDBClient):
docling_metadata=docling_metadata or {},
processing_status="pending",
total_chunks=0,
retrieval_mode=retrieval_mode,
)
# Use provided UUID or let the model generate one
@ -425,6 +427,55 @@ class KnowledgeBaseClient(BaseDBClient):
# Convert asyncpg records to dictionaries
return [dict(row) for row in rows]
async def update_document_full_text(
    self,
    document_id: int,
    full_text: str,
) -> None:
    """Store full document text for full_document retrieval mode.

    Looks the document up by primary key and, when it exists, writes
    ``full_text`` onto it and commits. When no document matches, nothing
    is written and a warning is logged so the missed update is visible;
    no exception is raised.

    Args:
        document_id: ID of the document
        full_text: The full extracted text content
    """
    async with self.async_session() as session:
        query = select(KnowledgeBaseDocumentModel).where(
            KnowledgeBaseDocumentModel.id == document_id
        )
        result = await session.execute(query)
        document = result.scalar_one_or_none()

        # Guard clause: surface the miss instead of silently doing nothing.
        if document is None:
            logger.warning(
                f"Cannot store full text: document {document_id} not found"
            )
            return

        document.full_text = full_text
        await session.commit()
        logger.info(
            f"Stored full text for document {document_id} ({len(full_text)} chars)"
        )
async def get_full_text_documents(
    self,
    organization_id: int,
    document_uuids: List[str],
) -> List[KnowledgeBaseDocumentModel]:
    """Get full_document mode documents by their UUIDs.

    Only rows that belong to ``organization_id``, are active, have
    ``processing_status == "completed"`` and ``retrieval_mode ==
    "full_document"`` are returned. The query does not filter on
    ``full_text``, so callers should still handle a document whose
    ``full_text`` is ``None``.

    Args:
        organization_id: Organization ID for scoping
        document_uuids: List of document UUIDs to fetch

    Returns:
        List of matching documents; empty when ``document_uuids`` is empty
        or nothing matches.
    """
    # An empty IN (...) can never match; skip the session and the round trip.
    if not document_uuids:
        return []

    async with self.async_session() as session:
        query = select(KnowledgeBaseDocumentModel).where(
            KnowledgeBaseDocumentModel.organization_id == organization_id,
            KnowledgeBaseDocumentModel.document_uuid.in_(document_uuids),
            KnowledgeBaseDocumentModel.retrieval_mode == "full_document",
            # `== True` builds the SQL comparison; it is not a Python truth test.
            KnowledgeBaseDocumentModel.is_active == True,  # noqa: E712
            KnowledgeBaseDocumentModel.processing_status == "completed",
        )
        result = await session.execute(query)
        return list(result.scalars().all())
async def delete_document(
self,
document_uuid: str,

View file

@ -940,6 +940,14 @@ class KnowledgeBaseDocumentModel(Base):
file_hash = Column(String(64), nullable=True) # SHA-256 hash for deduplication
mime_type = Column(String(100), nullable=True)
# Retrieval mode: "chunked" (vector search) or "full_document" (return full text)
retrieval_mode = Column(
String(20), nullable=False, default="chunked", server_default="chunked"
)
full_text = Column(
Text, nullable=True
) # Stored when retrieval_mode is "full_document"
# Processing metadata
source_url = Column(String, nullable=True) # If document was fetched from URL
total_chunks = Column(Integer, nullable=False, default=0)