mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-27 17:36:23 +02:00
Collection management (#520)
* Tech spec * Refactored Cassandra knowledge graph for single table * Collection management, librarian services to manage metadata and collection deletion
This commit is contained in:
parent
48016d8fb2
commit
13ff7d765d
48 changed files with 2941 additions and 425 deletions
|
|
@ -13,163 +13,146 @@ class TestMilvusCollectionNaming:
|
|||
"""Test basic collection name creation"""
|
||||
result = make_safe_collection_name(
|
||||
user="test_user",
|
||||
collection="test_collection",
|
||||
dimension=384,
|
||||
collection="test_collection",
|
||||
prefix="doc"
|
||||
)
|
||||
assert result == "doc_test_user_test_collection_384"
|
||||
assert result == "doc_test_user_test_collection"
|
||||
|
||||
def test_make_safe_collection_name_with_special_characters(self):
|
||||
"""Test collection name creation with special characters that need sanitization"""
|
||||
result = make_safe_collection_name(
|
||||
user="user@domain.com",
|
||||
collection="test-collection.v2",
|
||||
dimension=768,
|
||||
prefix="entity"
|
||||
)
|
||||
assert result == "entity_user_domain_com_test_collection_v2_768"
|
||||
assert result == "entity_user_domain_com_test_collection_v2"
|
||||
|
||||
def test_make_safe_collection_name_with_unicode(self):
|
||||
"""Test collection name creation with Unicode characters"""
|
||||
result = make_safe_collection_name(
|
||||
user="测试用户",
|
||||
collection="colección_española",
|
||||
dimension=512,
|
||||
collection="colección_española",
|
||||
prefix="doc"
|
||||
)
|
||||
assert result == "doc_default_colecci_n_espa_ola_512"
|
||||
assert result == "doc_default_colecci_n_espa_ola"
|
||||
|
||||
def test_make_safe_collection_name_with_spaces(self):
|
||||
"""Test collection name creation with spaces"""
|
||||
result = make_safe_collection_name(
|
||||
user="test user",
|
||||
collection="my test collection",
|
||||
dimension=256,
|
||||
prefix="entity"
|
||||
)
|
||||
assert result == "entity_test_user_my_test_collection_256"
|
||||
assert result == "entity_test_user_my_test_collection"
|
||||
|
||||
def test_make_safe_collection_name_with_multiple_consecutive_special_chars(self):
|
||||
"""Test collection name creation with multiple consecutive special characters"""
|
||||
result = make_safe_collection_name(
|
||||
user="user@@@domain!!!",
|
||||
collection="test---collection...v2",
|
||||
dimension=384,
|
||||
prefix="doc"
|
||||
prefix="doc"
|
||||
)
|
||||
assert result == "doc_user_domain_test_collection_v2_384"
|
||||
assert result == "doc_user_domain_test_collection_v2"
|
||||
|
||||
def test_make_safe_collection_name_with_leading_trailing_underscores(self):
|
||||
"""Test collection name creation with leading/trailing special characters"""
|
||||
result = make_safe_collection_name(
|
||||
user="__test_user__",
|
||||
collection="@@test_collection##",
|
||||
dimension=128,
|
||||
prefix="entity"
|
||||
)
|
||||
assert result == "entity_test_user_test_collection_128"
|
||||
assert result == "entity_test_user_test_collection"
|
||||
|
||||
def test_make_safe_collection_name_empty_user(self):
|
||||
"""Test collection name creation with empty user (should fallback to 'default')"""
|
||||
result = make_safe_collection_name(
|
||||
user="",
|
||||
collection="test_collection",
|
||||
dimension=384,
|
||||
prefix="doc"
|
||||
)
|
||||
assert result == "doc_default_test_collection_384"
|
||||
assert result == "doc_default_test_collection"
|
||||
|
||||
def test_make_safe_collection_name_empty_collection(self):
|
||||
"""Test collection name creation with empty collection (should fallback to 'default')"""
|
||||
result = make_safe_collection_name(
|
||||
user="test_user",
|
||||
collection="",
|
||||
dimension=384,
|
||||
prefix="doc"
|
||||
)
|
||||
assert result == "doc_test_user_default_384"
|
||||
assert result == "doc_test_user_default"
|
||||
|
||||
def test_make_safe_collection_name_both_empty(self):
|
||||
"""Test collection name creation with both user and collection empty"""
|
||||
result = make_safe_collection_name(
|
||||
user="",
|
||||
collection="",
|
||||
dimension=384,
|
||||
prefix="doc"
|
||||
)
|
||||
assert result == "doc_default_default_384"
|
||||
assert result == "doc_default_default"
|
||||
|
||||
def test_make_safe_collection_name_only_special_characters(self):
|
||||
"""Test collection name creation with only special characters (should fallback to 'default')"""
|
||||
result = make_safe_collection_name(
|
||||
user="@@@!!!",
|
||||
collection="---###",
|
||||
dimension=512,
|
||||
prefix="entity"
|
||||
)
|
||||
assert result == "entity_default_default_512"
|
||||
assert result == "entity_default_default"
|
||||
|
||||
def test_make_safe_collection_name_whitespace_only(self):
|
||||
"""Test collection name creation with whitespace-only strings"""
|
||||
result = make_safe_collection_name(
|
||||
user=" \n\t ",
|
||||
collection=" \r\n ",
|
||||
dimension=256,
|
||||
prefix="doc"
|
||||
)
|
||||
assert result == "doc_default_default_256"
|
||||
assert result == "doc_default_default"
|
||||
|
||||
def test_make_safe_collection_name_mixed_valid_invalid_chars(self):
|
||||
"""Test collection name creation with mixed valid and invalid characters"""
|
||||
result = make_safe_collection_name(
|
||||
user="user123@test",
|
||||
collection="coll_2023.v1",
|
||||
dimension=384,
|
||||
prefix="entity"
|
||||
)
|
||||
assert result == "entity_user123_test_coll_2023_v1_384"
|
||||
assert result == "entity_user123_test_coll_2023_v1"
|
||||
|
||||
def test_make_safe_collection_name_different_prefixes(self):
|
||||
"""Test collection name creation with different prefixes"""
|
||||
user = "test_user"
|
||||
collection = "test_collection"
|
||||
dimension = 384
|
||||
|
||||
doc_result = make_safe_collection_name(user, collection, dimension, "doc")
|
||||
entity_result = make_safe_collection_name(user, collection, dimension, "entity")
|
||||
custom_result = make_safe_collection_name(user, collection, dimension, "custom")
|
||||
|
||||
assert doc_result == "doc_test_user_test_collection_384"
|
||||
assert entity_result == "entity_test_user_test_collection_384"
|
||||
assert custom_result == "custom_test_user_test_collection_384"
|
||||
|
||||
doc_result = make_safe_collection_name(user, collection, "doc")
|
||||
entity_result = make_safe_collection_name(user, collection, "entity")
|
||||
custom_result = make_safe_collection_name(user, collection, "custom")
|
||||
|
||||
assert doc_result == "doc_test_user_test_collection"
|
||||
assert entity_result == "entity_test_user_test_collection"
|
||||
assert custom_result == "custom_test_user_test_collection"
|
||||
|
||||
def test_make_safe_collection_name_different_dimensions(self):
|
||||
"""Test collection name creation with different dimensions"""
|
||||
"""Test collection name creation - dimension handling no longer part of function"""
|
||||
user = "test_user"
|
||||
collection = "test_collection"
|
||||
prefix = "doc"
|
||||
|
||||
result_128 = make_safe_collection_name(user, collection, 128, prefix)
|
||||
result_384 = make_safe_collection_name(user, collection, 384, prefix)
|
||||
result_768 = make_safe_collection_name(user, collection, 768, prefix)
|
||||
|
||||
assert result_128 == "doc_test_user_test_collection_128"
|
||||
assert result_384 == "doc_test_user_test_collection_384"
|
||||
assert result_768 == "doc_test_user_test_collection_768"
|
||||
|
||||
# With new API, dimensions are handled separately, function always returns same result
|
||||
result = make_safe_collection_name(user, collection, prefix)
|
||||
|
||||
assert result == "doc_test_user_test_collection"
|
||||
|
||||
def test_make_safe_collection_name_long_names(self):
|
||||
"""Test collection name creation with very long user/collection names"""
|
||||
long_user = "a" * 100
|
||||
long_collection = "b" * 100
|
||||
|
||||
|
||||
result = make_safe_collection_name(
|
||||
user=long_user,
|
||||
collection=long_collection,
|
||||
dimension=384,
|
||||
prefix="doc"
|
||||
)
|
||||
|
||||
expected = f"doc_{long_user}_{long_collection}_384"
|
||||
|
||||
expected = f"doc_{long_user}_{long_collection}"
|
||||
assert result == expected
|
||||
assert len(result) > 200 # Verify it handles long names
|
||||
|
||||
|
|
@ -178,20 +161,18 @@ class TestMilvusCollectionNaming:
|
|||
result = make_safe_collection_name(
|
||||
user="user123",
|
||||
collection="collection456",
|
||||
dimension=384,
|
||||
prefix="doc"
|
||||
)
|
||||
assert result == "doc_user123_collection456_384"
|
||||
assert result == "doc_user123_collection456"
|
||||
|
||||
def test_make_safe_collection_name_case_sensitivity(self):
|
||||
"""Test that collection name creation preserves case"""
|
||||
result = make_safe_collection_name(
|
||||
user="TestUser",
|
||||
collection="TestCollection",
|
||||
dimension=384,
|
||||
prefix="Doc"
|
||||
)
|
||||
assert result == "Doc_TestUser_TestCollection_384"
|
||||
assert result == "Doc_TestUser_TestCollection"
|
||||
|
||||
def test_make_safe_collection_name_realistic_examples(self):
|
||||
"""Test collection name creation with realistic user/collection combinations"""
|
||||
|
|
@ -202,30 +183,27 @@ class TestMilvusCollectionNaming:
|
|||
("user_123", "test_collection", "user_123", "test_collection"),
|
||||
("αβγ-user", "测试集合", "user", "default"),
|
||||
]
|
||||
|
||||
|
||||
for user, collection, expected_user, expected_collection in test_cases:
|
||||
result = make_safe_collection_name(user, collection, 384, "doc")
|
||||
assert result == f"doc_{expected_user}_{expected_collection}_384"
|
||||
result = make_safe_collection_name(user, collection, "doc")
|
||||
assert result == f"doc_{expected_user}_{expected_collection}"
|
||||
|
||||
def test_make_safe_collection_name_matches_qdrant_pattern(self):
|
||||
"""Test that Milvus collection names follow similar pattern to Qdrant"""
|
||||
"""Test that Milvus collection names follow similar pattern to Qdrant (but without dimension in name)"""
|
||||
# Qdrant uses: "d_{user}_{collection}_{dimension}" and "t_{user}_{collection}_{dimension}"
|
||||
# Milvus should use: "{prefix}_{safe_user}_{safe_collection}_{dimension}"
|
||||
|
||||
# New Milvus API uses: "{prefix}_{safe_user}_{safe_collection}" (dimension handled separately)
|
||||
|
||||
user = "test.user@domain.com"
|
||||
collection = "test-collection.v2"
|
||||
dimension = 384
|
||||
|
||||
doc_result = make_safe_collection_name(user, collection, dimension, "doc")
|
||||
entity_result = make_safe_collection_name(user, collection, dimension, "entity")
|
||||
|
||||
# Should follow the pattern but with sanitized names
|
||||
assert doc_result == "doc_test_user_domain_com_test_collection_v2_384"
|
||||
assert entity_result == "entity_test_user_domain_com_test_collection_v2_384"
|
||||
|
||||
# Verify structure matches expected pattern (may have more underscores due to sanitization)
|
||||
# The important thing is that it follows prefix_user_collection_dimension structure
|
||||
|
||||
doc_result = make_safe_collection_name(user, collection, "doc")
|
||||
entity_result = make_safe_collection_name(user, collection, "entity")
|
||||
|
||||
# Should follow the pattern but with sanitized names and no dimension
|
||||
assert doc_result == "doc_test_user_domain_com_test_collection_v2"
|
||||
assert entity_result == "entity_test_user_domain_com_test_collection_v2"
|
||||
|
||||
# Verify structure matches expected pattern
|
||||
assert doc_result.startswith("doc_")
|
||||
assert doc_result.endswith("_384")
|
||||
assert entity_result.startswith("entity_")
|
||||
assert entity_result.endswith("_384")
|
||||
# Dimension is no longer part of the collection name
|
||||
|
|
@ -32,7 +32,7 @@ class TestMilvusUserCollectionIntegration:
|
|||
doc_vectors.insert(vector, "test document", user, collection)
|
||||
|
||||
expected_collection_name = make_safe_collection_name(
|
||||
user, collection, len(vector), "doc"
|
||||
user, collection, "doc"
|
||||
)
|
||||
|
||||
# Verify collection was created with correct name
|
||||
|
|
@ -58,7 +58,7 @@ class TestMilvusUserCollectionIntegration:
|
|||
entity_vectors.insert(vector, "test entity", user, collection)
|
||||
|
||||
expected_collection_name = make_safe_collection_name(
|
||||
user, collection, len(vector), "entity"
|
||||
user, collection, "entity"
|
||||
)
|
||||
|
||||
# Verify collection was created with correct name
|
||||
|
|
@ -89,7 +89,7 @@ class TestMilvusUserCollectionIntegration:
|
|||
result = doc_vectors.search(vector, user, collection, limit=5)
|
||||
|
||||
# Verify search was called with correct collection name
|
||||
expected_collection_name = make_safe_collection_name(user, collection, 3, "doc")
|
||||
expected_collection_name = make_safe_collection_name(user, collection, "doc")
|
||||
mock_client.search.assert_called_once()
|
||||
search_call = mock_client.search.call_args
|
||||
assert search_call[1]["collection_name"] == expected_collection_name
|
||||
|
|
@ -118,7 +118,7 @@ class TestMilvusUserCollectionIntegration:
|
|||
result = entity_vectors.search(vector, user, collection, limit=5)
|
||||
|
||||
# Verify search was called with correct collection name
|
||||
expected_collection_name = make_safe_collection_name(user, collection, 3, "entity")
|
||||
expected_collection_name = make_safe_collection_name(user, collection, "entity")
|
||||
mock_client.search.assert_called_once()
|
||||
search_call = mock_client.search.call_args
|
||||
assert search_call[1]["collection_name"] == expected_collection_name
|
||||
|
|
@ -142,9 +142,9 @@ class TestMilvusUserCollectionIntegration:
|
|||
|
||||
collection_names = set(doc_vectors.collections.values())
|
||||
expected_names = {
|
||||
"doc_user1_collection1_3",
|
||||
"doc_user2_collection2_3",
|
||||
"doc_user1_collection2_3"
|
||||
"doc_user1_collection1",
|
||||
"doc_user2_collection2",
|
||||
"doc_user1_collection2"
|
||||
}
|
||||
assert collection_names == expected_names
|
||||
|
||||
|
|
@ -167,9 +167,9 @@ class TestMilvusUserCollectionIntegration:
|
|||
|
||||
collection_names = set(entity_vectors.collections.values())
|
||||
expected_names = {
|
||||
"entity_user1_collection1_3",
|
||||
"entity_user2_collection2_3",
|
||||
"entity_user1_collection2_3"
|
||||
"entity_user1_collection1",
|
||||
"entity_user2_collection2",
|
||||
"entity_user1_collection2"
|
||||
}
|
||||
assert collection_names == expected_names
|
||||
|
||||
|
|
@ -194,10 +194,13 @@ class TestMilvusUserCollectionIntegration:
|
|||
|
||||
collection_names = set(doc_vectors.collections.values())
|
||||
expected_names = {
|
||||
"doc_test_user_test_collection_3", # 3D
|
||||
"doc_test_user_test_collection_4", # 4D
|
||||
"doc_test_user_test_collection_2" # 2D
|
||||
"doc_test_user_test_collection", # Same name for all dimensions
|
||||
"doc_test_user_test_collection", # now stored per dimension in key
|
||||
"doc_test_user_test_collection" # but collection name is the same
|
||||
}
|
||||
# Note: Now all dimensions use the same collection name, they are differentiated by the key
|
||||
assert len(collection_names) == 1 # Only one unique collection name
|
||||
assert "doc_test_user_test_collection" in collection_names
|
||||
assert collection_names == expected_names
|
||||
|
||||
@patch('trustgraph.direct.milvus_doc_embeddings.MilvusClient')
|
||||
|
|
@ -220,7 +223,7 @@ class TestMilvusUserCollectionIntegration:
|
|||
# Verify only one collection was created
|
||||
assert len(doc_vectors.collections) == 1
|
||||
|
||||
expected_collection_name = "doc_test_user_test_collection_3"
|
||||
expected_collection_name = "doc_test_user_test_collection"
|
||||
assert doc_vectors.collections[(3, user, collection)] == expected_collection_name
|
||||
|
||||
@patch('trustgraph.direct.milvus_doc_embeddings.MilvusClient')
|
||||
|
|
@ -233,10 +236,10 @@ class TestMilvusUserCollectionIntegration:
|
|||
|
||||
# Test various special character combinations
|
||||
test_cases = [
|
||||
("user@domain.com", "test-collection.v1", "doc_user_domain_com_test_collection_v1_3"),
|
||||
("user_123", "collection_456", "doc_user_123_collection_456_3"),
|
||||
("user with spaces", "collection with spaces", "doc_user_with_spaces_collection_with_spaces_3"),
|
||||
("user@@@test", "collection---test", "doc_user_test_collection_test_3"),
|
||||
("user@domain.com", "test-collection.v1", "doc_user_domain_com_test_collection_v1"),
|
||||
("user_123", "collection_456", "doc_user_123_collection_456"),
|
||||
("user with spaces", "collection with spaces", "doc_user_with_spaces_collection_with_spaces"),
|
||||
("user@@@test", "collection---test", "doc_user_test_collection_test"),
|
||||
]
|
||||
|
||||
vector = [0.1, 0.2, 0.3]
|
||||
|
|
@ -250,24 +253,24 @@ class TestMilvusUserCollectionIntegration:
|
|||
def test_collection_name_backward_compatibility(self):
|
||||
"""Test that new collection names don't conflict with old pattern"""
|
||||
# Old pattern was: {prefix}_{dimension}
|
||||
# New pattern is: {prefix}_{safe_user}_{safe_collection}_{dimension}
|
||||
|
||||
# New pattern is: {prefix}_{safe_user}_{safe_collection}
|
||||
|
||||
# The new pattern should never generate names that match the old pattern
|
||||
old_pattern_examples = ["doc_384", "entity_768", "doc_512"]
|
||||
|
||||
|
||||
test_cases = [
|
||||
("user", "collection", 384, "doc"),
|
||||
("test", "test", 768, "entity"),
|
||||
("a", "b", 512, "doc"),
|
||||
("user", "collection", "doc"),
|
||||
("test", "test", "entity"),
|
||||
("a", "b", "doc"),
|
||||
]
|
||||
|
||||
for user, collection, dimension, prefix in test_cases:
|
||||
new_name = make_safe_collection_name(user, collection, dimension, prefix)
|
||||
|
||||
# New names should have at least 4 underscores (prefix_user_collection_dimension)
|
||||
|
||||
for user, collection, prefix in test_cases:
|
||||
new_name = make_safe_collection_name(user, collection, prefix)
|
||||
|
||||
# New names should have at least 2 underscores (prefix_user_collection)
|
||||
# Old names had only 1 underscore (prefix_dimension)
|
||||
assert new_name.count('_') >= 3, f"New name {new_name} doesn't have enough underscores"
|
||||
|
||||
assert new_name.count('_') >= 2, f"New name {new_name} doesn't have enough underscores"
|
||||
|
||||
# New names should not match old pattern
|
||||
assert new_name not in old_pattern_examples, f"New name {new_name} conflicts with old pattern"
|
||||
|
||||
|
|
@ -286,23 +289,23 @@ class TestMilvusUserCollectionIntegration:
|
|||
dimension = 384
|
||||
|
||||
# Generate collection names
|
||||
doc_name1 = make_safe_collection_name(user1, collection1, dimension, "doc")
|
||||
doc_name2 = make_safe_collection_name(user2, collection2, dimension, "doc")
|
||||
|
||||
entity_name1 = make_safe_collection_name(user1, collection1, dimension, "entity")
|
||||
entity_name2 = make_safe_collection_name(user2, collection2, dimension, "entity")
|
||||
doc_name1 = make_safe_collection_name(user1, collection1, "doc")
|
||||
doc_name2 = make_safe_collection_name(user2, collection2, "doc")
|
||||
|
||||
entity_name1 = make_safe_collection_name(user1, collection1, "entity")
|
||||
entity_name2 = make_safe_collection_name(user2, collection2, "entity")
|
||||
|
||||
# Verify complete isolation
|
||||
assert doc_name1 != doc_name2, "Document collections should be isolated"
|
||||
assert entity_name1 != entity_name2, "Entity collections should be isolated"
|
||||
|
||||
# Verify names match expected pattern from Qdrant
|
||||
# Verify names match expected pattern from new API
|
||||
# Qdrant uses: d_{user}_{collection}_{dimension}, t_{user}_{collection}_{dimension}
|
||||
# Milvus uses: doc_{safe_user}_{safe_collection}_{dimension}, entity_{safe_user}_{safe_collection}_{dimension}
|
||||
assert doc_name1 == "doc_my_user_test_coll_1_384"
|
||||
assert doc_name2 == "doc_other_user_production_data_384"
|
||||
assert entity_name1 == "entity_my_user_test_coll_1_384"
|
||||
assert entity_name2 == "entity_other_user_production_data_384"
|
||||
# New Milvus API uses: doc_{safe_user}_{safe_collection}, entity_{safe_user}_{safe_collection}
|
||||
assert doc_name1 == "doc_my_user_test_coll_1"
|
||||
assert doc_name2 == "doc_other_user_production_data"
|
||||
assert entity_name1 == "entity_my_user_test_coll_1"
|
||||
assert entity_name2 == "entity_other_user_production_data"
|
||||
|
||||
# This test would have FAILED with the old implementation that used:
|
||||
# - doc_384 for all document embeddings (no user/collection differentiation)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue