Store chunk IDs instead of document chunk text in vector store (#665)

- Schema - ChunkEmbeddings now uses chunk_id: str instead of chunk: bytes
- Schema - DocumentEmbeddingsResponse now returns chunk_ids: list[str]
  instead of chunks
- Translators - Updated to serialize/deserialize chunk_id
- Clients - DocumentEmbeddingsClient.query() returns chunk_ids
- SDK/API - flow.py, socket_client.py, bulk_client.py updated
- Document embeddings service - Stores chunk_id (document ID) instead
  of chunk text
- Storage writers - Qdrant, Milvus, Pinecone store chunk_id in payload
- Query services - Return chunk_id from vector store searches
- Gateway dispatchers - Serialize chunk_id in API responses
- Document RAG - Added librarian client to fetch chunk content from
  Garage using chunk_ids
- CLI tools - Updated all three tools:
  - invoke_document_embeddings.py - displays chunk_ids, removed
    max_chunk_length
  - save_doc_embeds.py - exports chunk_id
  - load_doc_embeds.py - imports chunk_id
This commit is contained in:
cybermaggedon 2026-03-07 23:10:45 +00:00 committed by GitHub
parent be358efe67
commit 24bbe94136
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
24 changed files with 331 additions and 91 deletions

View file

@ -144,15 +144,15 @@ class DocumentEmbeddingsTranslator(SendTranslator):
def to_pulsar(self, data: Dict[str, Any]) -> DocumentEmbeddings:
metadata = data.get("metadata", {})
chunks = [
ChunkEmbeddings(
chunk=chunk["chunk"].encode("utf-8") if isinstance(chunk["chunk"], str) else chunk["chunk"],
chunk_id=chunk["chunk_id"],
vectors=chunk["vectors"]
)
for chunk in data.get("chunks", [])
]
from ...schema import Metadata
return DocumentEmbeddings(
metadata=Metadata(
@ -168,7 +168,7 @@ class DocumentEmbeddingsTranslator(SendTranslator):
result = {
"chunks": [
{
"chunk": chunk.chunk.decode("utf-8") if isinstance(chunk.chunk, bytes) else chunk.chunk,
"chunk_id": chunk.chunk_id,
"vectors": chunk.vectors
}
for chunk in obj.chunks

View file

@ -36,13 +36,10 @@ class DocumentEmbeddingsResponseTranslator(MessageTranslator):
def from_pulsar(self, obj: DocumentEmbeddingsResponse) -> Dict[str, Any]:
result = {}
if obj.chunks is not None:
result["chunks"] = [
chunk.decode("utf-8") if isinstance(chunk, bytes) else chunk
for chunk in obj.chunks
]
if obj.chunk_ids is not None:
result["chunk_ids"] = list(obj.chunk_ids)
return result
def from_response_with_completion(self, obj: DocumentEmbeddingsResponse) -> Tuple[Dict[str, Any], bool]: