trustgraph/trustgraph-flow/trustgraph/embeddings/document_embeddings/embeddings.py
cybermaggedon 24bbe94136
Document chunks not stored in vector store (#665)
- Schema - ChunkEmbeddings now uses chunk_id: str instead of chunk: bytes
- Schema - DocumentEmbeddingsResponse now returns chunk_ids: list[str]
  instead of chunks
- Translators - Updated to serialize/deserialize chunk_id
- Clients - DocumentEmbeddingsClient.query() returns chunk_ids
- SDK/API - flow.py, socket_client.py, bulk_client.py updated
- Document embeddings service - Stores chunk_id (document ID) instead
  of chunk text
- Storage writers - Qdrant, Milvus, Pinecone store chunk_id in payload
- Query services - Return chunk_id from vector store searches
- Gateway dispatchers - Serialize chunk_id in API responses
- Document RAG - Added librarian client to fetch chunk content from
  Garage using chunk_ids
- CLI tools - Updated all three tools:
  - invoke_document_embeddings.py - displays chunk_ids, removed
    max_chunk_length
  - save_doc_embeds.py - exports chunk_id
  - load_doc_embeds.py - imports chunk_id
2026-03-07 23:10:45 +00:00

101 lines
2.3 KiB
Python
Executable file

"""
Document embeddings, calls the embeddings service to get embeddings for a
chunk of text. Input is a chunk of text plus metadata.
Output is the chunk ID plus its embedding vectors.
"""
from ... schema import Chunk, ChunkEmbeddings, DocumentEmbeddings
from ... schema import EmbeddingsRequest, EmbeddingsResponse
from ... base import FlowProcessor, RequestResponseSpec, ConsumerSpec
from ... base import ProducerSpec
import logging
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Identifier handed to Processor.launch() by run().
default_ident = "document-embeddings"
class Processor(FlowProcessor):
    """Flow processor that produces document embeddings for chunks.

    Consumes ``Chunk`` messages on "input", asks the embeddings service for
    vectors via the "embeddings-request" / "embeddings-response" pair, and
    publishes a ``DocumentEmbeddings`` message on "output".
    """

    def __init__(self, **params):
        """Initialise the base processor and register the flow specifications.

        Args:
            **params: Keyword arguments forwarded to ``FlowProcessor``. The
                "id" key is always present in the forwarded arguments; its
                value is ``None`` when the caller did not supply one.
        """
        ident = params.get("id")

        super().__init__(**{**params, "id": ident})

        # Incoming chunks are dispatched to on_message().
        chunk_consumer = ConsumerSpec(
            name="input",
            schema=Chunk,
            handler=self.on_message,
        )
        self.register_specification(chunk_consumer)

        # Request/response channel used to obtain the vectors.
        embeddings_rpc = RequestResponseSpec(
            request_name="embeddings-request",
            request_schema=EmbeddingsRequest,
            response_name="embeddings-response",
            response_schema=EmbeddingsResponse,
        )
        self.register_specification(embeddings_rpc)

        # Outgoing document embeddings.
        embeddings_producer = ProducerSpec(
            name="output",
            schema=DocumentEmbeddings,
        )
        self.register_specification(embeddings_producer)

    async def on_message(self, msg, consumer, flow):
        """Embed a single chunk and publish the result.

        Args:
            msg: Incoming message; ``msg.value()`` yields the chunk object
                whose ``chunk``, ``document_id`` and ``metadata`` attributes
                are read here.
            consumer: Not used by this handler.
            flow: Callable that maps a specification name to the object used
                to talk to it (``.request(...)`` / ``.send(...)``).

        Raises:
            Exception: Any failure while requesting the embeddings or sending
                the output is logged and then re-raised.
        """
        v = msg.value()

        logger.info(f"Indexing {v.metadata.id}...")

        try:
            embeddings_service = flow("embeddings-request")
            response = await embeddings_service.request(
                EmbeddingsRequest(text=v.chunk)
            )

            vectors = response.vectors

            # Only the chunk's ID travels with the vectors, not its text.
            chunk_embeddings = ChunkEmbeddings(
                chunk_id=v.document_id,
                vectors=vectors,
            )

            document_embeddings = DocumentEmbeddings(
                metadata=v.metadata,
                chunks=[chunk_embeddings],
            )

            await flow("output").send(document_embeddings)

        except Exception as e:
            logger.exception("Exception occurred")

            # Propagate the failure; intended to trigger a retry, which is
            # handled outside this class.
            raise e

        logger.info("Done.")

    @staticmethod
    def add_args(parser):
        """Register arguments on ``parser``.

        Delegates to ``FlowProcessor.add_args``; this class adds none of its
        own.
        """
        FlowProcessor.add_args(parser)
def run():
    """Invoke ``Processor.launch`` for this module.

    Passes the default identifier and this module's docstring; the docstring
    is presumably used as the program description (confirm against
    ``FlowProcessor.launch``).
    """
    Processor.launch(default_ident, __doc__)