Document chunks not stored in vector store (#665)

- Schema - ChunkEmbeddings now uses chunk_id: str instead of chunk: bytes
- Schema - DocumentEmbeddingsResponse now returns chunk_ids: list[str]
  instead of chunks
- Translators - Updated to serialize/deserialize chunk_id
- Clients - DocumentEmbeddingsClient.query() returns chunk_ids
- SDK/API - flow.py, socket_client.py, bulk_client.py updated
- Document embeddings service - Stores chunk_id (document ID) instead
  of chunk text
- Storage writers - Qdrant, Milvus, Pinecone store chunk_id in payload
- Query services - Return chunk_id from vector store searches
- Gateway dispatchers - Serialize chunk_id in API responses
- Document RAG - Added librarian client to fetch chunk content from
  Garage using chunk_ids
- CLI tools - Updated all three tools:
  - invoke_document_embeddings.py - displays chunk_ids, removed
    max_chunk_length
  - save_doc_embeds.py - exports chunk_id
  - load_doc_embeds.py - imports chunk_id
This commit is contained in:
cybermaggedon 2026-03-07 23:10:45 +00:00 committed by GitHub
parent be358efe67
commit 24bbe94136
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
24 changed files with 331 additions and 91 deletions

View file

@@ -1,6 +1,6 @@
"""
Queries document chunks by text similarity using vector embeddings.
Returns a list of matching document chunks, truncated to the specified length.
Returns a list of matching chunk IDs.
"""
import argparse
@@ -10,13 +10,7 @@ from trustgraph.api import Api
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
def truncate_chunk(chunk, max_length):
    """Return chunk as-is when it fits in max_length; otherwise cut it and append an ellipsis."""
    if len(chunk) > max_length:
        return chunk[:max_length] + "..."
    return chunk
def query(url, flow_id, query_text, user, collection, limit, max_chunk_length, token=None):
def query(url, flow_id, query_text, user, collection, limit, token=None):
# Create API client
api = Api(url=url, token=token)
@@ -32,10 +26,12 @@ def query(url, flow_id, query_text, user, collection, limit, max_chunk_length, t
limit=limit
)
chunks = result.get("chunks", [])
for i, chunk in enumerate(chunks, 1):
truncated = truncate_chunk(chunk, max_chunk_length)
print(f"{i}. {truncated}")
chunk_ids = result.get("chunk_ids", [])
if not chunk_ids:
print("No matching chunks found.")
else:
for i, chunk_id in enumerate(chunk_ids, 1):
print(f"{i}. {chunk_id}")
finally:
# Clean up socket connection
@@ -85,13 +81,6 @@ def main():
help='Maximum number of results (default: 10)',
)
parser.add_argument(
'--max-chunk-length',
type=int,
default=200,
help='Truncate chunks to N characters (default: 200)',
)
parser.add_argument(
'query',
nargs=1,
@@ -109,7 +98,6 @@ def main():
user=args.user,
collection=args.collection,
limit=args.limit,
max_chunk_length=args.max_chunk_length,
token=args.token,
)

View file

@@ -44,14 +44,14 @@ async def load_de(running, queue, url):
msg = {
"metadata": {
"id": msg["m"]["i"],
"id": msg["m"]["i"],
"metadata": msg["m"]["m"],
"user": msg["m"]["u"],
"collection": msg["m"]["c"],
},
"chunks": [
{
"chunk": chunk["c"],
"chunk_id": chunk["c"],
"vectors": chunk["v"],
}
for chunk in msg["c"]

View file

@@ -50,14 +50,14 @@ async def fetch_de(running, queue, user, collection, url):
"de",
{
"m": {
"i": data["metadata"]["id"],
"i": data["metadata"]["id"],
"m": data["metadata"]["metadata"],
"u": data["metadata"]["user"],
"c": data["metadata"]["collection"],
},
"c": [
{
"c": chunk["chunk"],
"c": chunk["chunk_id"],
"v": chunk["vectors"],
}
for chunk in data["chunks"]