Fix hard coded vector size (#555)

* Fixed hard-coded embeddings store size

* Vector store lazily creates collections, using a separate collection for
  each embedding dimension.

* Added tech spec for vector store lifecycle

* Fixed some tests for the new spec
This commit is contained in:
cybermaggedon 2025-11-10 16:56:51 +00:00 committed by GitHub
parent 05b9063fea
commit 6129bb68c1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 793 additions and 572 deletions

View file

@ -47,39 +47,6 @@ class Processor(DocumentEmbeddingsQueryService):
}
)
self.last_index_name = None
def ensure_index_exists(self, index_name, dim):
"""Ensure index exists, create if it doesn't"""
# NOTE(review): indentation was lost in this rendering, so the exact nesting
# of the statements below cannot be confirmed from here - check the repository.
# Skip all checks when the name matches the one recorded by the previous call.
if index_name != self.last_index_name:
# Only create the index when Pinecone reports that it is missing.
if not self.pinecone.has_index(index_name):
try:
# Serverless index, cosine metric, sized to the caller-supplied dimension.
self.pinecone.create_index(
name=index_name,
dimension=dim,
metric="cosine",
spec=ServerlessSpec(
cloud="aws",
region="us-east-1",
)
)
logger.info(f"Created index: {index_name}")
# Wait for index to be ready
# Function-local import; time is only used by the polling loop below.
import time
# Poll the index status up to 1000 times, sleeping one second between polls.
for i in range(0, 1000):
if self.pinecone.describe_index(index_name).status["ready"]:
break
time.sleep(1)
# Re-check after the loop: raise if the index is still not reported ready.
if not self.pinecone.describe_index(index_name).status["ready"]:
raise RuntimeError("Gave up waiting for index creation")
except Exception as e:
# Log the failure and re-raise the same exception to the caller.
logger.error(f"Pinecone index creation failed: {e}")
raise e
# Record the name so a repeat call with the same name skips the checks above.
self.last_index_name = index_name
async def query_document_embeddings(self, msg):
try:
@ -94,11 +61,13 @@ class Processor(DocumentEmbeddingsQueryService):
dim = len(vec)
index_name = (
"d-" + msg.user + "-" + msg.collection
)
# Use dimension suffix in index name
index_name = f"d-{msg.user}-{msg.collection}-{dim}"
self.ensure_index_exists(index_name, dim)
# Check if index exists - skip if not
if not self.pinecone.has_index(index_name):
logger.info(f"Index {index_name} does not exist, skipping this vector")
continue
index = self.pinecone.Index(index_name)

View file

@ -38,28 +38,6 @@ class Processor(DocumentEmbeddingsQueryService):
)
self.qdrant = QdrantClient(url=store_uri, api_key=api_key)
self.last_collection = None
def ensure_collection_exists(self, collection, dim):
"""Ensure collection exists, create if it doesn't"""
# NOTE(review): indentation was lost in this rendering, so the exact nesting
# of the statements below cannot be confirmed from here - check the repository.
# Skip all checks when the name matches the one recorded by the previous call.
if collection != self.last_collection:
# Only create the collection when Qdrant reports that it is missing.
if not self.qdrant.collection_exists(collection):
try:
# Vector size comes from the caller-supplied dim; distance is cosine.
self.qdrant.create_collection(
collection_name=collection,
vectors_config=VectorParams(
size=dim, distance=Distance.COSINE
),
)
logger.info(f"Created collection: {collection}")
except Exception as e:
# Log the failure and re-raise the same exception to the caller.
logger.error(f"Qdrant collection creation failed: {e}")
raise e
# Record the name so a repeat call with the same name skips the checks above.
self.last_collection = collection
def collection_exists(self, collection):
"""Check if collection exists (no implicit creation)"""
# Delegates to the Qdrant client and returns its result unchanged.
return self.qdrant.collection_exists(collection)
def collection_exists(self, collection):
"""Check if collection exists (no implicit creation)"""
@ -71,16 +49,17 @@ class Processor(DocumentEmbeddingsQueryService):
chunks = []
collection = (
"d_" + msg.user + "_" + msg.collection
)
# Check if collection exists - return empty if not
if not self.collection_exists(collection):
logger.info(f"Collection {collection} does not exist, returning empty results")
return []
for vec in msg.vectors:
# Use dimension suffix in collection name
dim = len(vec)
collection = f"d_{msg.user}_{msg.collection}_{dim}"
# Check if collection exists - skip this vector if not
if not self.collection_exists(collection):
logger.info(f"Collection {collection} does not exist, returning empty results")
continue
search_result = self.qdrant.query_points(
collection_name=collection,
query=vec,

View file

@ -49,39 +49,6 @@ class Processor(GraphEmbeddingsQueryService):
}
)
self.last_index_name = None
def ensure_index_exists(self, index_name, dim):
"""Ensure index exists, create if it doesn't"""
# NOTE(review): indentation was lost in this rendering, so the exact nesting
# of the statements below cannot be confirmed from here - check the repository.
# Skip all checks when the name matches the one recorded by the previous call.
if index_name != self.last_index_name:
# Only create the index when Pinecone reports that it is missing.
if not self.pinecone.has_index(index_name):
try:
# Serverless index, cosine metric, sized to the caller-supplied dimension.
self.pinecone.create_index(
name=index_name,
dimension=dim,
metric="cosine",
spec=ServerlessSpec(
cloud="aws",
region="us-east-1",
)
)
logger.info(f"Created index: {index_name}")
# Wait for index to be ready
# Function-local import; time is only used by the polling loop below.
import time
# Poll the index status up to 1000 times, sleeping one second between polls.
for i in range(0, 1000):
if self.pinecone.describe_index(index_name).status["ready"]:
break
time.sleep(1)
# Re-check after the loop: raise if the index is still not reported ready.
if not self.pinecone.describe_index(index_name).status["ready"]:
raise RuntimeError("Gave up waiting for index creation")
except Exception as e:
# Log the failure and re-raise the same exception to the caller.
logger.error(f"Pinecone index creation failed: {e}")
raise e
# Record the name so a repeat call with the same name skips the checks above.
self.last_index_name = index_name
def create_value(self, ent):
if ent.startswith("http://") or ent.startswith("https://"):
return Value(value=ent, is_uri=True)
@ -103,11 +70,13 @@ class Processor(GraphEmbeddingsQueryService):
dim = len(vec)
index_name = (
"t-" + msg.user + "-" + msg.collection
)
# Use dimension suffix in index name
index_name = f"t-{msg.user}-{msg.collection}-{dim}"
self.ensure_index_exists(index_name, dim)
# Check if index exists - skip if not
if not self.pinecone.has_index(index_name):
logger.info(f"Index {index_name} does not exist, skipping this vector")
continue
index = self.pinecone.Index(index_name)

View file

@ -38,28 +38,6 @@ class Processor(GraphEmbeddingsQueryService):
)
self.qdrant = QdrantClient(url=store_uri, api_key=api_key)
self.last_collection = None
def ensure_collection_exists(self, collection, dim):
"""Ensure collection exists, create if it doesn't"""
# NOTE(review): indentation was lost in this rendering, so the exact nesting
# of the statements below cannot be confirmed from here - check the repository.
# Skip all checks when the name matches the one recorded by the previous call.
if collection != self.last_collection:
# Only create the collection when Qdrant reports that it is missing.
if not self.qdrant.collection_exists(collection):
try:
# Vector size comes from the caller-supplied dim; distance is cosine.
self.qdrant.create_collection(
collection_name=collection,
vectors_config=VectorParams(
size=dim, distance=Distance.COSINE
),
)
logger.info(f"Created collection: {collection}")
except Exception as e:
# Log the failure and re-raise the same exception to the caller.
logger.error(f"Qdrant collection creation failed: {e}")
raise e
# Record the name so a repeat call with the same name skips the checks above.
self.last_collection = collection
def collection_exists(self, collection):
"""Check if collection exists (no implicit creation)"""
# Delegates to the Qdrant client and returns its result unchanged.
return self.qdrant.collection_exists(collection)
def collection_exists(self, collection):
"""Check if collection exists (no implicit creation)"""
@ -78,17 +56,17 @@ class Processor(GraphEmbeddingsQueryService):
entity_set = set()
entities = []
collection = (
"t_" + msg.user + "_" + msg.collection
)
# Check if collection exists - return empty if not
if not self.collection_exists(collection):
logger.info(f"Collection {collection} does not exist, returning empty results")
return []
for vec in msg.vectors:
# Use dimension suffix in collection name
dim = len(vec)
collection = f"t_{msg.user}_{msg.collection}_{dim}"
# Check if collection exists - skip this vector if not
if not self.collection_exists(collection):
logger.info(f"Collection {collection} does not exist, skipping this vector")
continue
# Heuristic hack, get (2*limit), so that we have more chance
# of getting (limit) entities
search_result = self.qdrant.query_points(