mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-07 07:55:16 +02:00
feat: add full document mode in knowledge base
This commit is contained in:
parent
c085398933
commit
87c8c5e2c8
26 changed files with 1144 additions and 351 deletions
|
|
@ -27,6 +27,7 @@ class KnowledgeBaseClient(BaseDBClient):
|
|||
custom_metadata: Optional[dict] = None,
|
||||
docling_metadata: Optional[dict] = None,
|
||||
document_uuid: Optional[str] = None,
|
||||
retrieval_mode: str = "chunked",
|
||||
) -> KnowledgeBaseDocumentModel:
|
||||
"""Create a new knowledge base document record.
|
||||
|
||||
|
|
@ -58,6 +59,7 @@ class KnowledgeBaseClient(BaseDBClient):
|
|||
docling_metadata=docling_metadata or {},
|
||||
processing_status="pending",
|
||||
total_chunks=0,
|
||||
retrieval_mode=retrieval_mode,
|
||||
)
|
||||
|
||||
# Use provided UUID or let the model generate one
|
||||
|
|
@ -425,6 +427,55 @@ class KnowledgeBaseClient(BaseDBClient):
|
|||
# Convert asyncpg records to dictionaries
|
||||
return [dict(row) for row in rows]
|
||||
|
||||
async def update_document_full_text(
    self,
    document_id: int,
    full_text: str,
) -> None:
    """Store full document text for full_document retrieval mode.

    Looks the document up by its ID and, when found, overwrites its
    ``full_text`` column and commits. When no document matches, nothing is
    written and a warning is logged so the missed update is visible.

    Args:
        document_id: ID of the document
        full_text: The full extracted text content
    """
    async with self.async_session() as session:
        query = select(KnowledgeBaseDocumentModel).where(
            KnowledgeBaseDocumentModel.id == document_id
        )
        result = await session.execute(query)
        document = result.scalar_one_or_none()

        if document is None:
            # Previously this path was silent, which made a failed store
            # indistinguishable from a successful one in the logs.
            logger.warning(
                f"Cannot store full text: document {document_id} not found"
            )
            return

        document.full_text = full_text
        await session.commit()
        logger.info(
            f"Stored full text for document {document_id} ({len(full_text)} chars)"
        )
|
||||
|
||||
async def get_full_text_documents(
    self,
    organization_id: int,
    document_uuids: List[str],
) -> List[KnowledgeBaseDocumentModel]:
    """Get full_document mode documents by their UUIDs.

    Args:
        organization_id: Organization ID for scoping
        document_uuids: List of document UUIDs to fetch

    Returns:
        List of active documents in the organization whose UUID is in
        ``document_uuids``, whose retrieval_mode is 'full_document' and
        whose processing_status is 'completed'. The query does not filter
        on ``full_text``, so callers should tolerate it being None.
        An empty ``document_uuids`` yields an empty list without querying.
    """
    # Nothing can match an empty UUID list; skip the session and round-trip.
    if not document_uuids:
        return []

    async with self.async_session() as session:
        query = select(KnowledgeBaseDocumentModel).where(
            KnowledgeBaseDocumentModel.organization_id == organization_id,
            KnowledgeBaseDocumentModel.document_uuid.in_(document_uuids),
            KnowledgeBaseDocumentModel.retrieval_mode == "full_document",
            KnowledgeBaseDocumentModel.is_active.is_(True),
            KnowledgeBaseDocumentModel.processing_status == "completed",
        )
        result = await session.execute(query)
        return list(result.scalars().all())
|
||||
|
||||
async def delete_document(
|
||||
self,
|
||||
document_uuid: str,
|
||||
|
|
|
|||
|
|
@ -940,6 +940,14 @@ class KnowledgeBaseDocumentModel(Base):
|
|||
file_hash = Column(String(64), nullable=True) # SHA-256 hash for deduplication
|
||||
mime_type = Column(String(100), nullable=True)
|
||||
|
||||
# Retrieval mode: "chunked" (vector search) or "full_document" (return full text)
|
||||
retrieval_mode = Column(
|
||||
String(20), nullable=False, default="chunked", server_default="chunked"
|
||||
)
|
||||
full_text = Column(
|
||||
Text, nullable=True
|
||||
) # Stored when retrieval_mode is "full_document"
|
||||
|
||||
# Processing metadata
|
||||
source_url = Column(String, nullable=True) # If document was fetched from URL
|
||||
total_chunks = Column(Integer, nullable=False, default=0)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue