mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-05-29 17:25:15 +02:00
Document chunks not stored in vector store (#665)
- Schema - ChunkEmbeddings now uses chunk_id: str instead of chunk: bytes
- Schema - DocumentEmbeddingsResponse now returns chunk_ids: list[str]
instead of chunks
- Translators - Updated to serialize/deserialize chunk_id
- Clients - DocumentEmbeddingsClient.query() returns chunk_ids
- SDK/API - flow.py, socket_client.py, bulk_client.py updated
- Document embeddings service - Stores chunk_id (document ID) instead
of chunk text
- Storage writers - Qdrant, Milvus, Pinecone store chunk_id in payload
- Query services - Return chunk_id from vector store searches
- Gateway dispatchers - Serialize chunk_id in API responses
- Document RAG - Added librarian client to fetch chunk content from
Garage using chunk_ids
- CLI tools - Updated all three tools:
  - invoke_document_embeddings.py - displays chunk_ids, removed max_chunk_length
  - save_doc_embeds.py - exports chunk_id
  - load_doc_embeds.py - imports chunk_id
This commit is contained in:
parent
be358efe67
commit
24bbe94136
24 changed files with 331 additions and 91 deletions
|
|
@ -322,8 +322,8 @@ class BulkClient:
|
|||
|
||||
# Generate document embeddings to import
|
||||
def doc_embedding_generator():
|
||||
yield {"id": "doc1-chunk1", "embedding": [0.1, 0.2, ...]}
|
||||
yield {"id": "doc1-chunk2", "embedding": [0.3, 0.4, ...]}
|
||||
yield {"chunk_id": "doc1/p0/c0", "embedding": [0.1, 0.2, ...]}
|
||||
yield {"chunk_id": "doc1/p0/c1", "embedding": [0.3, 0.4, ...]}
|
||||
# ... more embeddings
|
||||
|
||||
bulk.import_document_embeddings(
|
||||
|
|
@ -363,9 +363,9 @@ class BulkClient:
|
|||
|
||||
# Export and process document embeddings
|
||||
for embedding in bulk.export_document_embeddings(flow="default"):
|
||||
doc_id = embedding.get("id")
|
||||
chunk_id = embedding.get("chunk_id")
|
||||
vector = embedding.get("embedding")
|
||||
print(f"{doc_id}: {len(vector)} dimensions")
|
||||
print(f"{chunk_id}: {len(vector)} dimensions")
|
||||
```
|
||||
"""
|
||||
async_gen = self._export_document_embeddings_async(flow)
|
||||
|
|
|
|||
|
|
@ -634,7 +634,7 @@ class FlowInstance:
|
|||
limit: Maximum number of results (default: 10)
|
||||
|
||||
Returns:
|
||||
dict: Query results with similar document chunks
|
||||
dict: Query results with chunk_ids of matching document chunks
|
||||
|
||||
Example:
|
||||
```python
|
||||
|
|
@ -645,6 +645,7 @@ class FlowInstance:
|
|||
collection="research-papers",
|
||||
limit=5
|
||||
)
|
||||
# results contains {"chunk_ids": ["doc1/p0/c0", "doc2/p1/c3", ...]}
|
||||
```
|
||||
"""
|
||||
|
||||
|
|
|
|||
|
|
@ -682,7 +682,7 @@ class SocketFlowInstance:
|
|||
**kwargs: Additional parameters passed to the service
|
||||
|
||||
Returns:
|
||||
dict: Query results with similar document chunks
|
||||
dict: Query results with chunk_ids of matching document chunks
|
||||
|
||||
Example:
|
||||
```python
|
||||
|
|
@ -695,6 +695,7 @@ class SocketFlowInstance:
|
|||
collection="research-papers",
|
||||
limit=5
|
||||
)
|
||||
# results contains {"chunk_ids": ["doc1/p0/c0", ...]}
|
||||
```
|
||||
"""
|
||||
# First convert text to embeddings vectors
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ class DocumentEmbeddingsClient(RequestResponse):
|
|||
if resp.error:
|
||||
raise RuntimeError(resp.error.message)
|
||||
|
||||
return resp.chunks
|
||||
return resp.chunk_ids
|
||||
|
||||
class DocumentEmbeddingsClientSpec(RequestResponseSpec):
|
||||
def __init__(
|
||||
|
|
|
|||
|
|
@ -57,7 +57,7 @@ class DocumentEmbeddingsQueryService(FlowProcessor):
|
|||
docs = await self.query_document_embeddings(request)
|
||||
|
||||
logger.debug("Sending document embeddings query response...")
|
||||
r = DocumentEmbeddingsResponse(chunks=docs, error=None)
|
||||
r = DocumentEmbeddingsResponse(chunk_ids=docs, error=None)
|
||||
await flow("response").send(r, properties={"id": id})
|
||||
|
||||
logger.debug("Document embeddings query request completed")
|
||||
|
|
@ -73,7 +73,7 @@ class DocumentEmbeddingsQueryService(FlowProcessor):
|
|||
type = "document-embeddings-query-error",
|
||||
message = str(e),
|
||||
),
|
||||
chunks=None,
|
||||
chunk_ids=[],
|
||||
)
|
||||
|
||||
await flow("response").send(r, properties={"id": id})
|
||||
|
|
|
|||
|
|
@ -144,15 +144,15 @@ class DocumentEmbeddingsTranslator(SendTranslator):
|
|||
|
||||
def to_pulsar(self, data: Dict[str, Any]) -> DocumentEmbeddings:
|
||||
metadata = data.get("metadata", {})
|
||||
|
||||
|
||||
chunks = [
|
||||
ChunkEmbeddings(
|
||||
chunk=chunk["chunk"].encode("utf-8") if isinstance(chunk["chunk"], str) else chunk["chunk"],
|
||||
chunk_id=chunk["chunk_id"],
|
||||
vectors=chunk["vectors"]
|
||||
)
|
||||
for chunk in data.get("chunks", [])
|
||||
]
|
||||
|
||||
|
||||
from ...schema import Metadata
|
||||
return DocumentEmbeddings(
|
||||
metadata=Metadata(
|
||||
|
|
@ -168,7 +168,7 @@ class DocumentEmbeddingsTranslator(SendTranslator):
|
|||
result = {
|
||||
"chunks": [
|
||||
{
|
||||
"chunk": chunk.chunk.decode("utf-8") if isinstance(chunk.chunk, bytes) else chunk.chunk,
|
||||
"chunk_id": chunk.chunk_id,
|
||||
"vectors": chunk.vectors
|
||||
}
|
||||
for chunk in obj.chunks
|
||||
|
|
|
|||
|
|
@ -36,13 +36,10 @@ class DocumentEmbeddingsResponseTranslator(MessageTranslator):
|
|||
|
||||
def from_pulsar(self, obj: DocumentEmbeddingsResponse) -> Dict[str, Any]:
|
||||
result = {}
|
||||
|
||||
if obj.chunks is not None:
|
||||
result["chunks"] = [
|
||||
chunk.decode("utf-8") if isinstance(chunk, bytes) else chunk
|
||||
for chunk in obj.chunks
|
||||
]
|
||||
|
||||
|
||||
if obj.chunk_ids is not None:
|
||||
result["chunk_ids"] = list(obj.chunk_ids)
|
||||
|
||||
return result
|
||||
|
||||
def from_response_with_completion(self, obj: DocumentEmbeddingsResponse) -> Tuple[Dict[str, Any], bool]:
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ class GraphEmbeddings:
|
|||
|
||||
@dataclass
|
||||
class ChunkEmbeddings:
|
||||
chunk: bytes = b""
|
||||
chunk_id: str = ""
|
||||
vectors: list[list[float]] = field(default_factory=list)
|
||||
|
||||
# This is a 'batching' mechanism for the above data
|
||||
|
|
|
|||
|
|
@ -52,7 +52,7 @@ class DocumentEmbeddingsRequest:
|
|||
@dataclass
|
||||
class DocumentEmbeddingsResponse:
|
||||
error: Error | None = None
|
||||
chunks: list[str] = field(default_factory=list)
|
||||
chunk_ids: list[str] = field(default_factory=list)
|
||||
|
||||
document_embeddings_request_queue = topic(
|
||||
"document-embeddings-request", qos='q0', tenant='trustgraph', namespace='flow'
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue