trustgraph/trustgraph-flow/trustgraph/query/doc_embeddings/qdrant/service.py
cybermaggedon 24bbe94136
Document chunks not stored in vector store (#665)
- Schema - ChunkEmbeddings now uses chunk_id: str instead of chunk: bytes
- Schema - DocumentEmbeddingsResponse now returns chunk_ids: list[str]
  instead of chunks
- Translators - Updated to serialize/deserialize chunk_id
- Clients - DocumentEmbeddingsClient.query() returns chunk_ids
- SDK/API - flow.py, socket_client.py, bulk_client.py updated
- Document embeddings service - Stores chunk_id (document ID) instead
  of chunk text
- Storage writers - Qdrant, Milvus, Pinecone store chunk_id in payload
- Query services - Return chunk_id from vector store searches
- Gateway dispatchers - Serialize chunk_id in API responses
- Document RAG - Added librarian client to fetch chunk content from
  Garage using chunk_ids
- CLI tools - Updated all three tools:
  - invoke_document_embeddings.py - displays chunk_ids, removed
    max_chunk_length
  - save_doc_embeds.py - exports chunk_id
  - load_doc_embeds.py - imports chunk_id
2026-03-07 23:10:45 +00:00

123 lines
3.6 KiB
Python
Executable file

"""
Document embeddings query service. Input is vector, output is an array
of chunk_ids
"""
import logging
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from qdrant_client.models import Distance, VectorParams
from .... schema import DocumentEmbeddingsResponse
from .... schema import Error
from .... base import DocumentEmbeddingsQueryService
# Module logger
logger = logging.getLogger(__name__)
default_ident = "doc-embeddings-query"
default_store_uri = 'http://localhost:6333'
class Processor(DocumentEmbeddingsQueryService):
def __init__(self, **params):
store_uri = params.get("store_uri", default_store_uri)
#optional api key
api_key = params.get("api_key", None)
super(Processor, self).__init__(
**params | {
"store_uri": store_uri,
"api_key": api_key,
}
)
self.qdrant = QdrantClient(url=store_uri, api_key=api_key)
self.last_collection = None
def ensure_collection_exists(self, collection, dim):
"""Ensure collection exists, create if it doesn't"""
if collection != self.last_collection:
if not self.qdrant.collection_exists(collection):
try:
self.qdrant.create_collection(
collection_name=collection,
vectors_config=VectorParams(
size=dim, distance=Distance.COSINE
),
)
logger.info(f"Created collection: {collection}")
except Exception as e:
logger.error(f"Qdrant collection creation failed: {e}")
raise e
self.last_collection = collection
def collection_exists(self, collection):
"""Check if collection exists (no implicit creation)"""
return self.qdrant.collection_exists(collection)
def collection_exists(self, collection):
"""Check if collection exists (no implicit creation)"""
return self.qdrant.collection_exists(collection)
async def query_document_embeddings(self, msg):
try:
chunk_ids = []
for vec in msg.vectors:
# Use dimension suffix in collection name
dim = len(vec)
collection = f"d_{msg.user}_{msg.collection}_{dim}"
# Check if collection exists - return empty if not
if not self.collection_exists(collection):
logger.info(f"Collection {collection} does not exist, returning empty results")
continue
search_result = self.qdrant.query_points(
collection_name=collection,
query=vec,
limit=msg.limit,
with_payload=True,
).points
for r in search_result:
chunk_id = r.payload["chunk_id"]
chunk_ids.append(chunk_id)
return chunk_ids
except Exception as e:
logger.error(f"Exception querying document embeddings: {e}", exc_info=True)
raise e
@staticmethod
def add_args(parser):
DocumentEmbeddingsQueryService.add_args(parser)
parser.add_argument(
'-t', '--store-uri',
default=default_store_uri,
help=f'Qdrant store URI (default: {default_store_uri})'
)
parser.add_argument(
'-k', '--api-key',
default=None,
help=f'API key for qdrant (default: None)'
)
def run():
Processor.launch(default_ident, __doc__)