Fix hard coded vector size (#555)

* Fixed hard-coded embeddings store size

* Vector store lazy-creates collections, different collections for
  different dimension lengths.

* Added tech spec for vector store lifecycle

* Fixed some tests for the new spec
This commit is contained in:
cybermaggedon 2025-11-10 16:56:51 +00:00 committed by GitHub
parent 05b9063fea
commit 6129bb68c1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 793 additions and 572 deletions

View file

@ -134,8 +134,8 @@ class TestPineconeDocEmbeddingsStorageProcessor:
with patch('uuid.uuid4', side_effect=['id1', 'id2']):
await processor.store_document_embeddings(message)
# Verify index name and operations
expected_index_name = "d-test_user-test_collection"
# Verify index name and operations (with dimension suffix)
expected_index_name = "d-test_user-test_collection-3" # 3 dimensions
processor.pinecone.Index.assert_called_with(expected_index_name)
# Verify upsert was called for each vector
@ -179,7 +179,7 @@ class TestPineconeDocEmbeddingsStorageProcessor:
@pytest.mark.asyncio
async def test_store_document_embeddings_index_validation(self, processor):
"""Test that writing to non-existent index raises ValueError"""
"""Test that writing to non-existent index creates it lazily"""
message = MagicMock()
message.metadata = MagicMock()
message.metadata.user = 'test_user'
@ -191,12 +191,24 @@ class TestPineconeDocEmbeddingsStorageProcessor:
)
message.chunks = [chunk]
# Mock index doesn't exist
# Mock index doesn't exist initially
processor.pinecone.has_index.return_value = False
mock_index = MagicMock()
processor.pinecone.Index.return_value = mock_index
with pytest.raises(ValueError, match="Collection .* does not exist"):
with patch('uuid.uuid4', return_value='test-id'):
await processor.store_document_embeddings(message)
# Verify index was created with correct dimension
expected_index_name = "d-test_user-test_collection-3" # 3 dimensions
processor.pinecone.create_index.assert_called_once()
create_call = processor.pinecone.create_index.call_args
assert create_call[1]['name'] == expected_index_name
assert create_call[1]['dimension'] == 3
# Verify upsert was still called
mock_index.upsert.assert_called_once()
@pytest.mark.asyncio
async def test_store_document_embeddings_empty_chunk(self, processor):
"""Test storing document embeddings with empty chunk (should be skipped)"""
@ -345,7 +357,7 @@ class TestPineconeDocEmbeddingsStorageProcessor:
@pytest.mark.asyncio
async def test_store_document_embeddings_validation_before_creation(self, processor):
"""Test that validation error occurs before creation attempts"""
"""Test that lazy creation happens when index doesn't exist"""
message = MagicMock()
message.metadata = MagicMock()
message.metadata.user = 'test_user'
@ -359,13 +371,18 @@ class TestPineconeDocEmbeddingsStorageProcessor:
# Mock index doesn't exist
processor.pinecone.has_index.return_value = False
mock_index = MagicMock()
processor.pinecone.Index.return_value = mock_index
with pytest.raises(ValueError, match="Collection .* does not exist"):
with patch('uuid.uuid4', return_value='test-id'):
await processor.store_document_embeddings(message)
# Verify index was created
processor.pinecone.create_index.assert_called_once()
@pytest.mark.asyncio
async def test_store_document_embeddings_validates_before_timeout(self, processor):
"""Test that validation error occurs before timeout checks"""
"""Test that lazy creation works correctly"""
message = MagicMock()
message.metadata = MagicMock()
message.metadata.user = 'test_user'
@ -379,10 +396,16 @@ class TestPineconeDocEmbeddingsStorageProcessor:
# Mock index doesn't exist
processor.pinecone.has_index.return_value = False
mock_index = MagicMock()
processor.pinecone.Index.return_value = mock_index
with pytest.raises(ValueError, match="Collection .* does not exist"):
with patch('uuid.uuid4', return_value='test-id'):
await processor.store_document_embeddings(message)
# Verify index was created and used
processor.pinecone.create_index.assert_called_once()
mock_index.upsert.assert_called_once()
@pytest.mark.asyncio
async def test_store_document_embeddings_unicode_content(self, processor):
"""Test storing document embeddings with Unicode content"""

View file

@ -103,8 +103,8 @@ class TestQdrantDocEmbeddingsStorage(IsolatedAsyncioTestCase):
await processor.store_document_embeddings(mock_message)
# Assert
# Verify collection existence was checked
expected_collection = 'd_test_user_test_collection'
# Verify collection existence was checked (with dimension suffix)
expected_collection = 'd_test_user_test_collection_3' # 3 dimensions in vector [0.1, 0.2, 0.3]
mock_qdrant_instance.collection_exists.assert_called_once_with(expected_collection)
# Verify upsert was called
@ -112,7 +112,7 @@ class TestQdrantDocEmbeddingsStorage(IsolatedAsyncioTestCase):
# Verify upsert parameters
upsert_call_args = mock_qdrant_instance.upsert.call_args
assert upsert_call_args[1]['collection_name'] == expected_collection
assert upsert_call_args[1]['collection_name'] == 'd_test_user_test_collection_3'
assert len(upsert_call_args[1]['points']) == 1
point = upsert_call_args[1]['points'][0]
@ -272,18 +272,21 @@ class TestQdrantDocEmbeddingsStorage(IsolatedAsyncioTestCase):
# Assert
# Should not call upsert for empty chunks
mock_qdrant_instance.upsert.assert_not_called()
# But collection_exists should be called for validation
mock_qdrant_instance.collection_exists.assert_called_once()
# collection_exists should NOT be called since we return early for empty chunks
mock_qdrant_instance.collection_exists.assert_not_called()
@patch('trustgraph.storage.doc_embeddings.qdrant.write.QdrantClient')
@patch('trustgraph.storage.doc_embeddings.qdrant.write.uuid')
@patch('trustgraph.base.DocumentEmbeddingsStoreService.__init__')
async def test_collection_creation_when_not_exists(self, mock_base_init, mock_qdrant_client):
"""Test that writing to non-existent collection raises ValueError"""
async def test_collection_creation_when_not_exists(self, mock_base_init, mock_uuid, mock_qdrant_client):
"""Test that writing to non-existent collection creates it lazily"""
# Arrange
mock_base_init.return_value = None
mock_qdrant_instance = MagicMock()
mock_qdrant_instance.collection_exists.return_value = False # Collection doesn't exist
mock_qdrant_client.return_value = mock_qdrant_instance
mock_uuid.uuid4.return_value = MagicMock()
mock_uuid.uuid4.return_value.__str__ = MagicMock(return_value='test-uuid')
config = {
'store_uri': 'http://localhost:6333',
@ -305,19 +308,36 @@ class TestQdrantDocEmbeddingsStorage(IsolatedAsyncioTestCase):
mock_message.chunks = [mock_chunk]
# Act & Assert
with pytest.raises(ValueError, match="Collection .* does not exist"):
await processor.store_document_embeddings(mock_message)
# Act
await processor.store_document_embeddings(mock_message)
# Assert - collection should be lazily created
expected_collection = 'd_new_user_new_collection_5' # 5 dimensions
mock_qdrant_instance.collection_exists.assert_called_once_with(expected_collection)
mock_qdrant_instance.create_collection.assert_called_once()
# Verify create_collection was called with correct parameters
create_call = mock_qdrant_instance.create_collection.call_args
assert create_call[1]['collection_name'] == expected_collection
assert create_call[1]['vectors_config'].size == 5
# Verify upsert was still called
mock_qdrant_instance.upsert.assert_called_once()
@patch('trustgraph.storage.doc_embeddings.qdrant.write.QdrantClient')
@patch('trustgraph.storage.doc_embeddings.qdrant.write.uuid')
@patch('trustgraph.base.DocumentEmbeddingsStoreService.__init__')
async def test_collection_creation_exception(self, mock_base_init, mock_qdrant_client):
"""Test that validation error occurs before connection errors"""
async def test_collection_creation_exception(self, mock_base_init, mock_uuid, mock_qdrant_client):
"""Test that collection creation errors are propagated"""
# Arrange
mock_base_init.return_value = None
mock_qdrant_instance = MagicMock()
mock_qdrant_instance.collection_exists.return_value = False # Collection doesn't exist
# Simulate creation failure
mock_qdrant_instance.create_collection.side_effect = Exception("Connection error")
mock_qdrant_client.return_value = mock_qdrant_instance
mock_uuid.uuid4.return_value = MagicMock()
mock_uuid.uuid4.return_value.__str__ = MagicMock(return_value='test-uuid')
config = {
'store_uri': 'http://localhost:6333',
@ -339,8 +359,8 @@ class TestQdrantDocEmbeddingsStorage(IsolatedAsyncioTestCase):
mock_message.chunks = [mock_chunk]
# Act & Assert
with pytest.raises(ValueError, match="Collection .* does not exist"):
# Act & Assert - should propagate the creation error
with pytest.raises(Exception, match="Connection error"):
await processor.store_document_embeddings(mock_message)
@patch('trustgraph.storage.doc_embeddings.qdrant.write.QdrantClient')
@ -398,7 +418,7 @@ class TestQdrantDocEmbeddingsStorage(IsolatedAsyncioTestCase):
await processor.store_document_embeddings(mock_message2)
# Assert
expected_collection = 'd_cache_user_cache_collection'
expected_collection = 'd_cache_user_cache_collection_3' # 3 dimensions
# Verify collection existence is checked on each write
mock_qdrant_instance.collection_exists.assert_called_once_with(expected_collection)
@ -407,15 +427,18 @@ class TestQdrantDocEmbeddingsStorage(IsolatedAsyncioTestCase):
mock_qdrant_instance.upsert.assert_called_once()
@patch('trustgraph.storage.doc_embeddings.qdrant.write.QdrantClient')
@patch('trustgraph.storage.doc_embeddings.qdrant.write.uuid')
@patch('trustgraph.base.DocumentEmbeddingsStoreService.__init__')
async def test_different_dimensions_different_collections(self, mock_base_init, mock_qdrant_client):
async def test_different_dimensions_different_collections(self, mock_base_init, mock_uuid, mock_qdrant_client):
"""Test that different vector dimensions create different collections"""
# Arrange
mock_base_init.return_value = None
mock_qdrant_instance = MagicMock()
mock_qdrant_instance.collection_exists.return_value = True
mock_qdrant_client.return_value = mock_qdrant_instance
mock_uuid.uuid4.return_value = MagicMock()
mock_uuid.uuid4.return_value.__str__ = MagicMock(return_value='test-uuid')
config = {
'store_uri': 'http://localhost:6333',
'api_key': 'test-api-key',
@ -424,35 +447,39 @@ class TestQdrantDocEmbeddingsStorage(IsolatedAsyncioTestCase):
}
processor = Processor(**config)
# Create mock message with different dimension vectors
mock_message = MagicMock()
mock_message.metadata.user = 'dim_user'
mock_message.metadata.collection = 'dim_collection'
mock_chunk = MagicMock()
mock_chunk.chunk.decode.return_value = 'dimension test chunk'
mock_chunk.vectors = [
[0.1, 0.2], # 2 dimensions
[0.3, 0.4, 0.5] # 3 dimensions
]
mock_message.chunks = [mock_chunk]
# Act
await processor.store_document_embeddings(mock_message)
# Assert
# Should check existence of the same collection (dimensions no longer create separate collections)
expected_collection = 'd_dim_user_dim_collection'
mock_qdrant_instance.collection_exists.assert_called_once_with(expected_collection)
# Should check existence of DIFFERENT collections for each dimension
assert mock_qdrant_instance.collection_exists.call_count == 2
# Should upsert to the same collection for both vectors
# Verify the two different collection names were checked
collection_exists_calls = [call[0][0] for call in mock_qdrant_instance.collection_exists.call_args_list]
assert 'd_dim_user_dim_collection_2' in collection_exists_calls # 2-dim vector
assert 'd_dim_user_dim_collection_3' in collection_exists_calls # 3-dim vector
# Should upsert to different collections for each vector
assert mock_qdrant_instance.upsert.call_count == 2
upsert_calls = mock_qdrant_instance.upsert.call_args_list
assert upsert_calls[0][1]['collection_name'] == expected_collection
assert upsert_calls[1][1]['collection_name'] == expected_collection
assert upsert_calls[0][1]['collection_name'] == 'd_dim_user_dim_collection_2'
assert upsert_calls[1][1]['collection_name'] == 'd_dim_user_dim_collection_3'
@patch('trustgraph.storage.doc_embeddings.qdrant.write.QdrantClient')
@patch('trustgraph.base.DocumentEmbeddingsStoreService.__init__')

View file

@ -134,8 +134,8 @@ class TestPineconeGraphEmbeddingsStorageProcessor:
with patch('uuid.uuid4', side_effect=['id1', 'id2']):
await processor.store_graph_embeddings(message)
# Verify index name and operations
expected_index_name = "t-test_user-test_collection"
# Verify index name and operations (with dimension suffix)
expected_index_name = "t-test_user-test_collection-3" # 3 dimensions
processor.pinecone.Index.assert_called_with(expected_index_name)
# Verify upsert was called for each vector
@ -179,7 +179,7 @@ class TestPineconeGraphEmbeddingsStorageProcessor:
@pytest.mark.asyncio
async def test_store_graph_embeddings_index_validation(self, processor):
"""Test that writing to non-existent index raises ValueError"""
"""Test that writing to non-existent index creates it lazily"""
message = MagicMock()
message.metadata = MagicMock()
message.metadata.user = 'test_user'
@ -193,10 +193,22 @@ class TestPineconeGraphEmbeddingsStorageProcessor:
# Mock index doesn't exist
processor.pinecone.has_index.return_value = False
mock_index = MagicMock()
processor.pinecone.Index.return_value = mock_index
with pytest.raises(ValueError, match="Collection .* does not exist"):
with patch('uuid.uuid4', return_value='test-id'):
await processor.store_graph_embeddings(message)
# Verify index was created with correct dimension
expected_index_name = "t-test_user-test_collection-3" # 3 dimensions
processor.pinecone.create_index.assert_called_once()
create_call = processor.pinecone.create_index.call_args
assert create_call[1]['name'] == expected_index_name
assert create_call[1]['dimension'] == 3
# Verify upsert was still called
mock_index.upsert.assert_called_once()
@pytest.mark.asyncio
async def test_store_graph_embeddings_empty_entity_value(self, processor):
"""Test storing graph embeddings with empty entity value (should be skipped)"""
@ -267,11 +279,16 @@ class TestPineconeGraphEmbeddingsStorageProcessor:
with patch('uuid.uuid4', side_effect=['id1', 'id2', 'id3']):
await processor.store_graph_embeddings(message)
# Verify same index was used for all dimensions
expected_index_name = 't-test_user-test_collection'
processor.pinecone.Index.assert_called_with(expected_index_name)
# Verify different indexes were used for different dimensions
index_calls = processor.pinecone.Index.call_args_list
assert len(index_calls) == 3
# Extract index names from calls
index_names = [call[0][0] for call in index_calls]
assert 't-test_user-test_collection-2' in index_names # 2D vector
assert 't-test_user-test_collection-4' in index_names # 4D vector
assert 't-test_user-test_collection-3' in index_names # 3D vector
# Verify all vectors were upserted to the same index
# Verify all vectors were upserted (to their respective indexes)
assert mock_index.upsert.call_count == 3
@pytest.mark.asyncio
@ -316,7 +333,7 @@ class TestPineconeGraphEmbeddingsStorageProcessor:
@pytest.mark.asyncio
async def test_store_graph_embeddings_validation_before_creation(self, processor):
"""Test that validation error occurs before any creation attempts"""
"""Test that lazy creation happens when index doesn't exist"""
message = MagicMock()
message.metadata = MagicMock()
message.metadata.user = 'test_user'
@ -330,13 +347,18 @@ class TestPineconeGraphEmbeddingsStorageProcessor:
# Mock index doesn't exist
processor.pinecone.has_index.return_value = False
mock_index = MagicMock()
processor.pinecone.Index.return_value = mock_index
with pytest.raises(ValueError, match="Collection .* does not exist"):
with patch('uuid.uuid4', return_value='test-id'):
await processor.store_graph_embeddings(message)
# Verify index was created
processor.pinecone.create_index.assert_called_once()
@pytest.mark.asyncio
async def test_store_graph_embeddings_validates_before_timeout(self, processor):
"""Test that validation error occurs before timeout checks"""
"""Test that lazy creation works correctly"""
message = MagicMock()
message.metadata = MagicMock()
message.metadata.user = 'test_user'
@ -350,10 +372,16 @@ class TestPineconeGraphEmbeddingsStorageProcessor:
# Mock index doesn't exist
processor.pinecone.has_index.return_value = False
mock_index = MagicMock()
processor.pinecone.Index.return_value = mock_index
with pytest.raises(ValueError, match="Collection .* does not exist"):
with patch('uuid.uuid4', return_value='test-id'):
await processor.store_graph_embeddings(message)
# Verify index was created and used
processor.pinecone.create_index.assert_called_once()
mock_index.upsert.assert_called_once()
def test_add_args_method(self):
"""Test that add_args properly configures argument parser"""
from argparse import ArgumentParser

View file

@ -44,29 +44,6 @@ class TestQdrantGraphEmbeddingsStorage(IsolatedAsyncioTestCase):
assert hasattr(processor, 'qdrant')
assert processor.qdrant == mock_qdrant_instance
@patch('trustgraph.storage.graph_embeddings.qdrant.write.QdrantClient')
@patch('trustgraph.base.GraphEmbeddingsStoreService.__init__')
async def test_get_collection_validates_existence(self, mock_base_init, mock_qdrant_client):
"""Test get_collection validates that collection exists"""
# Arrange
mock_base_init.return_value = None
mock_qdrant_instance = MagicMock()
mock_qdrant_instance.collection_exists.return_value = False
mock_qdrant_client.return_value = mock_qdrant_instance
config = {
'store_uri': 'http://localhost:6333',
'api_key': 'test-api-key',
'taskgroup': AsyncMock(),
'id': 'test-qdrant-processor'
}
processor = Processor(**config)
# Act & Assert
with pytest.raises(ValueError, match="Collection .* does not exist"):
processor.get_collection(user='test_user', collection='test_collection')
@patch('trustgraph.storage.graph_embeddings.qdrant.write.QdrantClient')
@patch('trustgraph.storage.graph_embeddings.qdrant.write.uuid')
@patch('trustgraph.base.GraphEmbeddingsStoreService.__init__')
@ -103,114 +80,22 @@ class TestQdrantGraphEmbeddingsStorage(IsolatedAsyncioTestCase):
await processor.store_graph_embeddings(mock_message)
# Assert
# Verify collection existence was checked
expected_collection = 't_test_user_test_collection'
# Verify collection existence was checked (with dimension suffix)
expected_collection = 't_test_user_test_collection_3' # 3 dimensions in vector [0.1, 0.2, 0.3]
mock_qdrant_instance.collection_exists.assert_called_once_with(expected_collection)
# Verify upsert was called
mock_qdrant_instance.upsert.assert_called_once()
# Verify upsert parameters
upsert_call_args = mock_qdrant_instance.upsert.call_args
assert upsert_call_args[1]['collection_name'] == expected_collection
assert upsert_call_args[1]['collection_name'] == 't_test_user_test_collection_3'
assert len(upsert_call_args[1]['points']) == 1
point = upsert_call_args[1]['points'][0]
assert point.vector == [0.1, 0.2, 0.3]
assert point.payload['entity'] == 'test_entity'
@patch('trustgraph.storage.graph_embeddings.qdrant.write.QdrantClient')
@patch('trustgraph.base.GraphEmbeddingsStoreService.__init__')
async def test_get_collection_uses_existing_collection(self, mock_base_init, mock_qdrant_client):
"""Test get_collection uses existing collection without creating new one"""
# Arrange
mock_base_init.return_value = None
mock_qdrant_instance = MagicMock()
mock_qdrant_instance.collection_exists.return_value = True # Collection exists
mock_qdrant_client.return_value = mock_qdrant_instance
config = {
'store_uri': 'http://localhost:6333',
'api_key': 'test-api-key',
'taskgroup': AsyncMock(),
'id': 'test-qdrant-processor'
}
processor = Processor(**config)
# Act
collection_name = processor.get_collection(user='existing_user', collection='existing_collection')
# Assert
expected_name = 't_existing_user_existing_collection'
assert collection_name == expected_name
# Verify collection existence check was performed
mock_qdrant_instance.collection_exists.assert_called_once_with(expected_name)
# Verify create_collection was NOT called
mock_qdrant_instance.create_collection.assert_not_called()
@patch('trustgraph.storage.graph_embeddings.qdrant.write.QdrantClient')
@patch('trustgraph.base.GraphEmbeddingsStoreService.__init__')
async def test_get_collection_validates_on_each_call(self, mock_base_init, mock_qdrant_client):
"""Test get_collection validates collection existence on each call"""
# Arrange
mock_base_init.return_value = None
mock_qdrant_instance = MagicMock()
mock_qdrant_instance.collection_exists.return_value = True
mock_qdrant_client.return_value = mock_qdrant_instance
config = {
'store_uri': 'http://localhost:6333',
'api_key': 'test-api-key',
'taskgroup': AsyncMock(),
'id': 'test-qdrant-processor'
}
processor = Processor(**config)
# First call
collection_name1 = processor.get_collection(user='cache_user', collection='cache_collection')
# Reset mock to track second call
mock_qdrant_instance.reset_mock()
mock_qdrant_instance.collection_exists.return_value = True
# Act - Second call with same parameters
collection_name2 = processor.get_collection(user='cache_user', collection='cache_collection')
# Assert
expected_name = 't_cache_user_cache_collection'
assert collection_name1 == expected_name
assert collection_name2 == expected_name
# Verify collection existence check happens on each call
mock_qdrant_instance.collection_exists.assert_called_once_with(expected_name)
mock_qdrant_instance.create_collection.assert_not_called()
@patch('trustgraph.storage.graph_embeddings.qdrant.write.QdrantClient')
@patch('trustgraph.base.GraphEmbeddingsStoreService.__init__')
async def test_get_collection_creation_exception(self, mock_base_init, mock_qdrant_client):
"""Test get_collection raises ValueError when collection doesn't exist"""
# Arrange
mock_base_init.return_value = None
mock_qdrant_instance = MagicMock()
mock_qdrant_instance.collection_exists.return_value = False
mock_qdrant_client.return_value = mock_qdrant_instance
config = {
'store_uri': 'http://localhost:6333',
'api_key': 'test-api-key',
'taskgroup': AsyncMock(),
'id': 'test-qdrant-processor'
}
processor = Processor(**config)
# Act & Assert
with pytest.raises(ValueError, match="Collection .* does not exist"):
processor.get_collection(user='error_user', collection='error_collection')
@patch('trustgraph.storage.graph_embeddings.qdrant.write.QdrantClient')
@patch('trustgraph.storage.graph_embeddings.qdrant.write.uuid')
@patch('trustgraph.base.GraphEmbeddingsStoreService.__init__')