Document chunks not stored in vector store (#665)

- Schema - ChunkEmbeddings now uses chunk_id: str instead of chunk: bytes
- Schema - DocumentEmbeddingsResponse now returns chunk_ids: list[str]
  instead of chunks
- Translators - Updated to serialize/deserialize chunk_id
- Clients - DocumentEmbeddingsClient.query() returns chunk_ids
- SDK/API - flow.py, socket_client.py, bulk_client.py updated
- Document embeddings service - Stores chunk_id (document ID) instead
  of chunk text
- Storage writers - Qdrant, Milvus, Pinecone store chunk_id in payload
- Query services - Return chunk_id from vector store searches
- Gateway dispatchers - Serialize chunk_id in API responses
- Document RAG - Added librarian client to fetch chunk content from
  Garage using chunk_ids
- CLI tools - Updated all three tools:
  - invoke_document_embeddings.py - displays chunk_ids, removed
    max_chunk_length
  - save_doc_embeds.py - exports chunk_id
  - load_doc_embeds.py - imports chunk_id
This commit is contained in:
cybermaggedon 2026-03-07 23:10:45 +00:00 committed by GitHub
parent be358efe67
commit 24bbe94136
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
24 changed files with 331 additions and 91 deletions

View file

@@ -1,6 +1,6 @@
"""
Queries document chunks by text similarity using vector embeddings.
Returns a list of matching document chunks, truncated to the specified length.
Returns a list of matching chunk IDs.
"""
import argparse
@@ -10,13 +10,7 @@ from trustgraph.api import Api
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
def truncate_chunk(chunk, max_length):
    """Return chunk as-is when it fits in max_length; otherwise cut it and append an ellipsis."""
    if len(chunk) > max_length:
        return chunk[:max_length] + "..."
    return chunk
def query(url, flow_id, query_text, user, collection, limit, max_chunk_length, token=None):
def query(url, flow_id, query_text, user, collection, limit, token=None):
# Create API client
api = Api(url=url, token=token)
@@ -32,10 +26,12 @@ def query(url, flow_id, query_text, user, collection, limit, max_chunk_length, t
limit=limit
)
chunks = result.get("chunks", [])
for i, chunk in enumerate(chunks, 1):
truncated = truncate_chunk(chunk, max_chunk_length)
print(f"{i}. {truncated}")
chunk_ids = result.get("chunk_ids", [])
if not chunk_ids:
print("No matching chunks found.")
else:
for i, chunk_id in enumerate(chunk_ids, 1):
print(f"{i}. {chunk_id}")
finally:
# Clean up socket connection
@@ -85,13 +81,6 @@ def main():
help='Maximum number of results (default: 10)',
)
parser.add_argument(
'--max-chunk-length',
type=int,
default=200,
help='Truncate chunks to N characters (default: 200)',
)
parser.add_argument(
'query',
nargs=1,
@@ -109,7 +98,6 @@ def main():
user=args.user,
collection=args.collection,
limit=args.limit,
max_chunk_length=args.max_chunk_length,
token=args.token,
)

View file

@@ -44,14 +44,14 @@ async def load_de(running, queue, url):
msg = {
"metadata": {
"id": msg["m"]["i"],
"id": msg["m"]["i"],
"metadata": msg["m"]["m"],
"user": msg["m"]["u"],
"collection": msg["m"]["c"],
},
"chunks": [
{
"chunk": chunk["c"],
"chunk_id": chunk["c"],
"vectors": chunk["v"],
}
for chunk in msg["c"]

View file

@@ -50,14 +50,14 @@ async def fetch_de(running, queue, user, collection, url):
"de",
{
"m": {
"i": data["metadata"]["id"],
"i": data["metadata"]["id"],
"m": data["metadata"]["metadata"],
"u": data["metadata"]["user"],
"c": data["metadata"]["collection"],
},
"c": [
{
"c": chunk["chunk"],
"c": chunk["chunk_id"],
"v": chunk["vectors"],
}
for chunk in data["chunks"]