mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-05-02 03:42:36 +02:00
Document chunks not stored in vector store (#665)
- Schema - ChunkEmbeddings now uses chunk_id: str instead of chunk: bytes
- Schema - DocumentEmbeddingsResponse now returns chunk_ids: list[str]
instead of chunks
- Translators - Updated to serialize/deserialize chunk_id
- Clients - DocumentEmbeddingsClient.query() returns chunk_ids
- SDK/API - flow.py, socket_client.py, bulk_client.py updated
- Document embeddings service - Stores chunk_id (document ID) instead
of chunk text
- Storage writers - Qdrant, Milvus, Pinecone store chunk_id in payload
- Query services - Return chunk_id from vector store searches
- Gateway dispatchers - Serialize chunk_id in API responses
- Document RAG - Added librarian client to fetch chunk content from
Garage using chunk_ids
- CLI tools - Updated all three tools:
- invoke_document_embeddings.py - displays chunk_ids, removed
max_chunk_length
- save_doc_embeds.py - exports chunk_id
- load_doc_embeds.py - imports chunk_id
This commit is contained in:
parent
be358efe67
commit
24bbe94136
24 changed files with 331 additions and 91 deletions
|
|
@ -84,14 +84,14 @@ class DocVectors:
|
|||
dim=dimension,
|
||||
)
|
||||
|
||||
doc_field = FieldSchema(
|
||||
name="doc",
|
||||
chunk_id_field = FieldSchema(
|
||||
name="chunk_id",
|
||||
dtype=DataType.VARCHAR,
|
||||
max_length=65535,
|
||||
)
|
||||
|
||||
schema = CollectionSchema(
|
||||
fields = [pkey_field, vec_field, doc_field],
|
||||
fields = [pkey_field, vec_field, chunk_id_field],
|
||||
description = "Document embedding schema",
|
||||
)
|
||||
|
||||
|
|
@ -119,17 +119,17 @@ class DocVectors:
|
|||
self.collections[(dimension, user, collection)] = collection_name
|
||||
logger.info(f"Created Milvus collection {collection_name} with dimension {dimension}")
|
||||
|
||||
def insert(self, embeds, doc, user, collection):
|
||||
def insert(self, embeds, chunk_id, user, collection):
|
||||
|
||||
dim = len(embeds)
|
||||
|
||||
if (dim, user, collection) not in self.collections:
|
||||
self.init_collection(dim, user, collection)
|
||||
|
||||
|
||||
data = [
|
||||
{
|
||||
"vector": embeds,
|
||||
"doc": doc,
|
||||
"chunk_id": chunk_id,
|
||||
}
|
||||
]
|
||||
|
||||
|
|
@ -138,7 +138,7 @@ class DocVectors:
|
|||
data=data
|
||||
)
|
||||
|
||||
def search(self, embeds, user, collection, fields=["doc"], limit=10):
|
||||
def search(self, embeds, user, collection, fields=["chunk_id"], limit=10):
|
||||
|
||||
dim = len(embeds)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue