Document chunks not stored in vector store (#665)

- Schema - ChunkEmbeddings now uses chunk_id: str instead of chunk: bytes
- Schema - DocumentEmbeddingsResponse now returns chunk_ids: list[str]
  instead of chunks
- Translators - Updated to serialize/deserialize chunk_id
- Clients - DocumentEmbeddingsClient.query() returns chunk_ids
- SDK/API - flow.py, socket_client.py, bulk_client.py updated
- Document embeddings service - Stores chunk_id (document ID) instead
  of chunk text
- Storage writers - Qdrant, Milvus, Pinecone store chunk_id in payload
- Query services - Return chunk_id from vector store searches
- Gateway dispatchers - Serialize chunk_id in API responses
- Document RAG - Added librarian client to fetch chunk content from
  Garage using chunk_ids
- CLI tools - Updated all three tools:
  - invoke_document_embeddings.py - displays chunk_ids, removed
    max_chunk_length
  - save_doc_embeds.py - exports chunk_id
  - load_doc_embeds.py - imports chunk_id
This commit is contained in:
cybermaggedon 2026-03-07 23:10:45 +00:00 committed by GitHub
parent be358efe67
commit 24bbe94136
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
24 changed files with 331 additions and 91 deletions

View file

@@ -322,8 +322,8 @@ class BulkClient:
# Generate document embeddings to import
def doc_embedding_generator():
- yield {"id": "doc1-chunk1", "embedding": [0.1, 0.2, ...]}
- yield {"id": "doc1-chunk2", "embedding": [0.3, 0.4, ...]}
+ yield {"chunk_id": "doc1/p0/c0", "embedding": [0.1, 0.2, ...]}
+ yield {"chunk_id": "doc1/p0/c1", "embedding": [0.3, 0.4, ...]}
# ... more embeddings
bulk.import_document_embeddings(
@@ -363,9 +363,9 @@ class BulkClient:
# Export and process document embeddings
for embedding in bulk.export_document_embeddings(flow="default"):
- doc_id = embedding.get("id")
+ chunk_id = embedding.get("chunk_id")
vector = embedding.get("embedding")
- print(f"{doc_id}: {len(vector)} dimensions")
+ print(f"{chunk_id}: {len(vector)} dimensions")
```
"""
async_gen = self._export_document_embeddings_async(flow)

View file

@@ -634,7 +634,7 @@ class FlowInstance:
limit: Maximum number of results (default: 10)
Returns:
- dict: Query results with similar document chunks
+ dict: Query results with chunk_ids of matching document chunks
Example:
```python
@@ -645,6 +645,7 @@ class FlowInstance:
collection="research-papers",
limit=5
)
+ # results contains {"chunk_ids": ["doc1/p0/c0", "doc2/p1/c3", ...]}
```
"""

View file

@@ -682,7 +682,7 @@ class SocketFlowInstance:
**kwargs: Additional parameters passed to the service
Returns:
- dict: Query results with similar document chunks
+ dict: Query results with chunk_ids of matching document chunks
Example:
```python
@@ -695,6 +695,7 @@ class SocketFlowInstance:
collection="research-papers",
limit=5
)
+ # results contains {"chunk_ids": ["doc1/p0/c0", ...]}
```
"""
# First convert text to embeddings vectors