mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-05-18 03:45:12 +02:00
Document chunks not stored in vector store (#665)
- Schema - ChunkEmbeddings now uses chunk_id: str instead of chunk: bytes
- Schema - DocumentEmbeddingsResponse now returns chunk_ids: list[str]
instead of chunks
- Translators - Updated to serialize/deserialize chunk_id
- Clients - DocumentEmbeddingsClient.query() returns chunk_ids
- SDK/API - flow.py, socket_client.py, bulk_client.py updated
- Document embeddings service - Stores chunk_id (document ID) instead
of chunk text
- Storage writers - Qdrant, Milvus, Pinecone store chunk_id in payload
- Query services - Return chunk_id from vector store searches
- Gateway dispatchers - Serialize chunk_id in API responses
- Document RAG - Added librarian client to fetch chunk content from
Garage using chunk_ids
- CLI tools - Updated all three tools:
  - invoke_document_embeddings.py - displays chunk_ids, removed
    max_chunk_length
  - save_doc_embeds.py - exports chunk_id
  - load_doc_embeds.py - imports chunk_id
This commit is contained in:
parent
be358efe67
commit
24bbe94136
24 changed files with 331 additions and 91 deletions
|
|
@@ -27,7 +27,7 @@ class GraphEmbeddings:
|
|||
|
||||
@dataclass
|
||||
class ChunkEmbeddings:
|
||||
-    chunk: bytes = b""
|
||||
+    chunk_id: str = ""
|
||||
vectors: list[list[float]] = field(default_factory=list)
|
||||
|
||||
# This is a 'batching' mechanism for the above data
|
||||
|
|
|
|||
|
|
@@ -52,7 +52,7 @@ class DocumentEmbeddingsRequest:
|
|||
@dataclass
|
||||
class DocumentEmbeddingsResponse:
|
||||
error: Error | None = None
|
||||
-    chunks: list[str] = field(default_factory=list)
|
||||
+    chunk_ids: list[str] = field(default_factory=list)
|
||||
|
||||
document_embeddings_request_queue = topic(
|
||||
"document-embeddings-request", qos='q0', tenant='trustgraph', namespace='flow'
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue