mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 08:26:21 +02:00
Store chunk IDs instead of document chunk text in the vector store (#665)
- Schema - ChunkEmbeddings now uses chunk_id: str instead of chunk: bytes
- Schema - DocumentEmbeddingsResponse now returns chunk_ids: list[str]
instead of chunks
- Translators - Updated to serialize/deserialize chunk_id
- Clients - DocumentEmbeddingsClient.query() returns chunk_ids
- SDK/API - flow.py, socket_client.py, bulk_client.py updated
- Document embeddings service - Stores chunk_id (document ID) instead
of chunk text
- Storage writers - Qdrant, Milvus, Pinecone store chunk_id in payload
- Query services - Return chunk_id from vector store searches
- Gateway dispatchers - Serialize chunk_id in API responses
- Document RAG - Added librarian client to fetch chunk content from
Garage using chunk_ids
- CLI tools - Updated all three tools:
  - invoke_document_embeddings.py - displays chunk_ids, removed
    max_chunk_length
  - save_doc_embeds.py - exports chunk_id
  - load_doc_embeds.py - imports chunk_id
This commit is contained in:
parent
be358efe67
commit
24bbe94136
24 changed files with 331 additions and 91 deletions
|
|
@ -1,6 +1,6 @@
|
|||
"""
|
||||
Queries document chunks by text similarity using vector embeddings.
|
||||
Returns a list of matching document chunks, truncated to the specified length.
|
||||
Returns a list of matching chunk IDs.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
|
@ -10,13 +10,7 @@ from trustgraph.api import Api
|
|||
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
|
||||
default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
|
||||
|
||||
def truncate_chunk(chunk, max_length):
|
||||
"""Truncate a chunk to max_length characters, adding ellipsis if needed."""
|
||||
if len(chunk) <= max_length:
|
||||
return chunk
|
||||
return chunk[:max_length] + "..."
|
||||
|
||||
def query(url, flow_id, query_text, user, collection, limit, max_chunk_length, token=None):
|
||||
def query(url, flow_id, query_text, user, collection, limit, token=None):
|
||||
|
||||
# Create API client
|
||||
api = Api(url=url, token=token)
|
||||
|
|
@ -32,10 +26,12 @@ def query(url, flow_id, query_text, user, collection, limit, max_chunk_length, t
|
|||
limit=limit
|
||||
)
|
||||
|
||||
chunks = result.get("chunks", [])
|
||||
for i, chunk in enumerate(chunks, 1):
|
||||
truncated = truncate_chunk(chunk, max_chunk_length)
|
||||
print(f"{i}. {truncated}")
|
||||
chunk_ids = result.get("chunk_ids", [])
|
||||
if not chunk_ids:
|
||||
print("No matching chunks found.")
|
||||
else:
|
||||
for i, chunk_id in enumerate(chunk_ids, 1):
|
||||
print(f"{i}. {chunk_id}")
|
||||
|
||||
finally:
|
||||
# Clean up socket connection
|
||||
|
|
@ -85,13 +81,6 @@ def main():
|
|||
help='Maximum number of results (default: 10)',
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--max-chunk-length',
|
||||
type=int,
|
||||
default=200,
|
||||
help='Truncate chunks to N characters (default: 200)',
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'query',
|
||||
nargs=1,
|
||||
|
|
@ -109,7 +98,6 @@ def main():
|
|||
user=args.user,
|
||||
collection=args.collection,
|
||||
limit=args.limit,
|
||||
max_chunk_length=args.max_chunk_length,
|
||||
token=args.token,
|
||||
)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue