Release 1.4 -> master (#524)

Catch up
This commit is contained in:
cybermaggedon 2025-09-20 16:00:37 +01:00 committed by GitHub
parent a8e437fc7f
commit 6c7af8789d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
216 changed files with 31360 additions and 1611 deletions

View file

@@ -0,0 +1,209 @@
"""
Unit tests for Milvus collection name sanitization functionality
"""
import pytest
from trustgraph.direct.milvus_doc_embeddings import make_safe_collection_name
class TestMilvusCollectionNaming:
    """Checks how make_safe_collection_name builds and sanitizes Milvus names."""

    def test_make_safe_collection_name_basic(self):
        """Already-safe inputs are joined as prefix_user_collection."""
        expected = "doc_test_user_test_collection"
        actual = make_safe_collection_name(
            user="test_user",
            collection="test_collection",
            prefix="doc",
        )
        assert actual == expected

    def test_make_safe_collection_name_with_special_characters(self):
        """Characters such as '@', '.' and '-' are expected to become underscores."""
        expected = "entity_user_domain_com_test_collection_v2"
        actual = make_safe_collection_name(
            user="user@domain.com",
            collection="test-collection.v2",
            prefix="entity",
        )
        assert actual == expected

    def test_make_safe_collection_name_with_unicode(self):
        """Non-ASCII input is sanitized; an all-non-ASCII user becomes 'default'."""
        expected = "doc_default_colecci_n_espa_ola"
        actual = make_safe_collection_name(
            user="测试用户",
            collection="colección_española",
            prefix="doc",
        )
        assert actual == expected

    def test_make_safe_collection_name_with_spaces(self):
        """Spaces inside user/collection are expected to become underscores."""
        expected = "entity_test_user_my_test_collection"
        actual = make_safe_collection_name(
            user="test user",
            collection="my test collection",
            prefix="entity",
        )
        assert actual == expected

    def test_make_safe_collection_name_with_multiple_consecutive_special_chars(self):
        """Runs of special characters collapse to one underscore; trailing runs vanish."""
        expected = "doc_user_domain_test_collection_v2"
        actual = make_safe_collection_name(
            user="user@@@domain!!!",
            collection="test---collection...v2",
            prefix="doc",
        )
        assert actual == expected

    def test_make_safe_collection_name_with_leading_trailing_underscores(self):
        """Leading/trailing underscores and special characters are stripped."""
        expected = "entity_test_user_test_collection"
        actual = make_safe_collection_name(
            user="__test_user__",
            collection="@@test_collection##",
            prefix="entity",
        )
        assert actual == expected

    def test_make_safe_collection_name_empty_user(self):
        """An empty user is expected to fall back to 'default'."""
        expected = "doc_default_test_collection"
        actual = make_safe_collection_name(
            user="",
            collection="test_collection",
            prefix="doc",
        )
        assert actual == expected

    def test_make_safe_collection_name_empty_collection(self):
        """An empty collection is expected to fall back to 'default'."""
        expected = "doc_test_user_default"
        actual = make_safe_collection_name(
            user="test_user",
            collection="",
            prefix="doc",
        )
        assert actual == expected

    def test_make_safe_collection_name_both_empty(self):
        """Empty user and collection both fall back to 'default'."""
        expected = "doc_default_default"
        actual = make_safe_collection_name(user="", collection="", prefix="doc")
        assert actual == expected

    def test_make_safe_collection_name_only_special_characters(self):
        """Inputs made solely of special characters fall back to 'default'."""
        expected = "entity_default_default"
        actual = make_safe_collection_name(
            user="@@@!!!",
            collection="---###",
            prefix="entity",
        )
        assert actual == expected

    def test_make_safe_collection_name_whitespace_only(self):
        """Whitespace-only inputs fall back to 'default'."""
        expected = "doc_default_default"
        actual = make_safe_collection_name(
            user=" \n\t ",
            collection=" \r\n ",
            prefix="doc",
        )
        assert actual == expected

    def test_make_safe_collection_name_mixed_valid_invalid_chars(self):
        """Valid characters survive while invalid ones are replaced."""
        expected = "entity_user123_test_coll_2023_v1"
        actual = make_safe_collection_name(
            user="user123@test",
            collection="coll_2023.v1",
            prefix="entity",
        )
        assert actual == expected

    def test_make_safe_collection_name_different_prefixes(self):
        """The prefix argument is used verbatim as the leading name segment."""
        expected_by_prefix = {
            "doc": "doc_test_user_test_collection",
            "entity": "entity_test_user_test_collection",
            "custom": "custom_test_user_test_collection",
        }
        for prefix, expected in expected_by_prefix.items():
            actual = make_safe_collection_name(
                "test_user", "test_collection", prefix
            )
            assert actual == expected

    def test_make_safe_collection_name_different_dimensions(self):
        """The vector dimension is not an input, so it cannot affect the name."""
        # Dimensions are handled separately by the new API; the function only
        # takes user, collection and prefix and always yields the same name.
        actual = make_safe_collection_name("test_user", "test_collection", "doc")
        assert actual == "doc_test_user_test_collection"

    def test_make_safe_collection_name_long_names(self):
        """Very long user/collection values are passed through untruncated."""
        user_part = "a" * 100
        collection_part = "b" * 100
        actual = make_safe_collection_name(
            user=user_part,
            collection=collection_part,
            prefix="doc",
        )
        assert actual == f"doc_{user_part}_{collection_part}"
        # The combined name exceeds 200 characters, i.e. nothing was cut off.
        assert len(actual) > 200

    def test_make_safe_collection_name_numeric_values(self):
        """Digits inside user/collection values are preserved."""
        expected = "doc_user123_collection456"
        actual = make_safe_collection_name(
            user="user123",
            collection="collection456",
            prefix="doc",
        )
        assert actual == expected

    def test_make_safe_collection_name_case_sensitivity(self):
        """Upper/lower case is preserved in every segment, prefix included."""
        expected = "Doc_TestUser_TestCollection"
        actual = make_safe_collection_name(
            user="TestUser",
            collection="TestCollection",
            prefix="Doc",
        )
        assert actual == expected

    def test_make_safe_collection_name_realistic_examples(self):
        """Table of realistic user/collection pairs and their sanitized parts."""
        # Each row: (user, collection, sanitized user, sanitized collection)
        scenarios = [
            ("john.doe", "production-2024", "john_doe", "production_2024"),
            ("team@company.com", "ml_models.v1", "team_company_com", "ml_models_v1"),
            ("user_123", "test_collection", "user_123", "test_collection"),
            ("αβγ-user", "测试集合", "user", "default"),
        ]
        for raw_user, raw_collection, safe_user, safe_collection in scenarios:
            expected = f"doc_{safe_user}_{safe_collection}"
            assert make_safe_collection_name(raw_user, raw_collection, "doc") == expected

    def test_make_safe_collection_name_matches_qdrant_pattern(self):
        """Names follow a Qdrant-like layout, minus the dimension suffix."""
        # Qdrant uses: "d_{user}_{collection}_{dimension}" and
        # "t_{user}_{collection}_{dimension}".
        # New Milvus API uses: "{prefix}_{safe_user}_{safe_collection}"
        # (the dimension is handled separately and is not part of the name).
        raw_user = "test.user@domain.com"
        raw_collection = "test-collection.v2"
        expected_by_prefix = {
            "doc": "doc_test_user_domain_com_test_collection_v2",
            "entity": "entity_test_user_domain_com_test_collection_v2",
        }
        for prefix, expected in expected_by_prefix.items():
            actual = make_safe_collection_name(raw_user, raw_collection, prefix)
            # Sanitized names, no dimension component.
            assert actual == expected
            # The structure starts with the prefix followed by an underscore.
            assert actual.startswith(prefix + "_")

View file

@@ -0,0 +1,312 @@
"""
Integration tests for Milvus user/collection functionality
Tests the complete flow of the new user/collection parameter handling
"""
import pytest
from unittest.mock import MagicMock, patch
from trustgraph.direct.milvus_doc_embeddings import DocVectors, make_safe_collection_name
from trustgraph.direct.milvus_graph_embeddings import EntityVectors
class TestMilvusUserCollectionIntegration:
    """Test cases for Milvus user/collection integration functionality.

    Every test that builds a DocVectors or EntityVectors instance patches the
    module's ``MilvusClient`` name, so the store under test talks to a
    MagicMock rather than a real client.
    """

    @patch('trustgraph.direct.milvus_doc_embeddings.MilvusClient')
    def test_doc_vectors_collection_creation_with_user_collection(self, mock_milvus_client):
        """DocVectors.insert records a sanitized name per (dimension, user, collection)."""
        mock_milvus_client.return_value = MagicMock()

        doc_vectors = DocVectors(uri="http://test:19530", prefix="doc")

        # (user, collection, vector) combinations, including one needing sanitization
        test_cases = [
            ("user1", "collection1", [0.1, 0.2, 0.3]),
            ("user2", "collection2", [0.1, 0.2, 0.3, 0.4]),
            ("user@domain.com", "test-collection.v1", [0.1, 0.2, 0.3]),
        ]

        for user, collection, vector in test_cases:
            doc_vectors.insert(vector, "test document", user, collection)

            expected_collection_name = make_safe_collection_name(
                user, collection, "doc"
            )

            # The cache is keyed by the raw user/collection plus the dimension
            key = (len(vector), user, collection)
            assert key in doc_vectors.collections
            assert doc_vectors.collections[key] == expected_collection_name

    @patch('trustgraph.direct.milvus_graph_embeddings.MilvusClient')
    def test_entity_vectors_collection_creation_with_user_collection(self, mock_milvus_client):
        """EntityVectors.insert records a sanitized name per (dimension, user, collection)."""
        mock_milvus_client.return_value = MagicMock()

        entity_vectors = EntityVectors(uri="http://test:19530", prefix="entity")

        # (user, collection, vector) combinations, including one needing sanitization
        test_cases = [
            ("user1", "collection1", [0.1, 0.2, 0.3]),
            ("user2", "collection2", [0.1, 0.2, 0.3, 0.4]),
            ("user@domain.com", "test-collection.v1", [0.1, 0.2, 0.3]),
        ]

        for user, collection, vector in test_cases:
            entity_vectors.insert(vector, "test entity", user, collection)

            expected_collection_name = make_safe_collection_name(
                user, collection, "entity"
            )

            # The cache is keyed by the raw user/collection plus the dimension
            key = (len(vector), user, collection)
            assert key in entity_vectors.collections
            assert entity_vectors.collections[key] == expected_collection_name

    @patch('trustgraph.direct.milvus_doc_embeddings.MilvusClient')
    def test_doc_vectors_search_uses_correct_collection(self, mock_milvus_client):
        """DocVectors.search queries the collection named for the user/collection."""
        mock_client = MagicMock()
        mock_milvus_client.return_value = mock_client

        # Canned search results returned by the mocked client
        mock_client.search.return_value = [
            {"entity": {"doc": "test document"}}
        ]

        doc_vectors = DocVectors(uri="http://test:19530", prefix="doc")

        # Insert first so the collection exists before searching
        vector = [0.1, 0.2, 0.3]
        user = "test_user"
        collection = "test_collection"
        doc_vectors.insert(vector, "test doc", user, collection)

        # The return value is not inspected; only the client call matters here
        doc_vectors.search(vector, user, collection, limit=5)

        expected_collection_name = make_safe_collection_name(user, collection, "doc")
        mock_client.search.assert_called_once()
        _, search_kwargs = mock_client.search.call_args
        assert search_kwargs["collection_name"] == expected_collection_name

    @patch('trustgraph.direct.milvus_graph_embeddings.MilvusClient')
    def test_entity_vectors_search_uses_correct_collection(self, mock_milvus_client):
        """EntityVectors.search queries the collection named for the user/collection."""
        mock_client = MagicMock()
        mock_milvus_client.return_value = mock_client

        # Canned search results returned by the mocked client
        mock_client.search.return_value = [
            {"entity": {"entity": "test entity"}}
        ]

        entity_vectors = EntityVectors(uri="http://test:19530", prefix="entity")

        # Insert first so the collection exists before searching
        vector = [0.1, 0.2, 0.3]
        user = "test_user"
        collection = "test_collection"
        entity_vectors.insert(vector, "test entity", user, collection)

        # The return value is not inspected; only the client call matters here
        entity_vectors.search(vector, user, collection, limit=5)

        expected_collection_name = make_safe_collection_name(user, collection, "entity")
        mock_client.search.assert_called_once()
        _, search_kwargs = mock_client.search.call_args
        assert search_kwargs["collection_name"] == expected_collection_name

    @patch('trustgraph.direct.milvus_doc_embeddings.MilvusClient')
    def test_doc_vectors_collection_isolation(self, mock_milvus_client):
        """Different user/collection combinations map to different collection names."""
        mock_milvus_client.return_value = MagicMock()

        doc_vectors = DocVectors(uri="http://test:19530", prefix="doc")

        # Same vector, three distinct user/collection pairs
        vector = [0.1, 0.2, 0.3]
        doc_vectors.insert(vector, "user1 doc", "user1", "collection1")
        doc_vectors.insert(vector, "user2 doc", "user2", "collection2")
        doc_vectors.insert(vector, "user1 doc2", "user1", "collection2")

        # Three cache entries, each with its own name
        assert len(doc_vectors.collections) == 3

        expected_names = {
            "doc_user1_collection1",
            "doc_user2_collection2",
            "doc_user1_collection2",
        }
        assert set(doc_vectors.collections.values()) == expected_names

    @patch('trustgraph.direct.milvus_graph_embeddings.MilvusClient')
    def test_entity_vectors_collection_isolation(self, mock_milvus_client):
        """Different user/collection combinations map to different collection names."""
        mock_milvus_client.return_value = MagicMock()

        entity_vectors = EntityVectors(uri="http://test:19530", prefix="entity")

        # Same vector, three distinct user/collection pairs
        vector = [0.1, 0.2, 0.3]
        entity_vectors.insert(vector, "user1 entity", "user1", "collection1")
        entity_vectors.insert(vector, "user2 entity", "user2", "collection2")
        entity_vectors.insert(vector, "user1 entity2", "user1", "collection2")

        # Three cache entries, each with its own name
        assert len(entity_vectors.collections) == 3

        expected_names = {
            "entity_user1_collection1",
            "entity_user2_collection2",
            "entity_user1_collection2",
        }
        assert set(entity_vectors.collections.values()) == expected_names

    @patch('trustgraph.direct.milvus_doc_embeddings.MilvusClient')
    def test_doc_vectors_dimension_isolation(self, mock_milvus_client):
        """Different dimensions get separate cache entries that share one name.

        The dimension is part of the ``collections`` key but not of the
        generated collection name, so three vector sizes for the same
        user/collection yield three entries that all map to the same name.
        """
        mock_milvus_client.return_value = MagicMock()

        doc_vectors = DocVectors(uri="http://test:19530", prefix="doc")

        user = "test_user"
        collection = "test_collection"

        # Insert vectors with different dimensions
        doc_vectors.insert([0.1, 0.2, 0.3], "3D doc", user, collection)  # 3D
        doc_vectors.insert([0.1, 0.2, 0.3, 0.4], "4D doc", user, collection)  # 4D
        doc_vectors.insert([0.1, 0.2], "2D doc", user, collection)  # 2D

        # One cache entry per dimension ...
        assert len(doc_vectors.collections) == 3

        # ... but only one unique collection name across all of them
        collection_names = set(doc_vectors.collections.values())
        assert collection_names == {"doc_test_user_test_collection"}

    @patch('trustgraph.direct.milvus_doc_embeddings.MilvusClient')
    def test_doc_vectors_collection_reuse(self, mock_milvus_client):
        """Repeated inserts for the same user/collection/dimension reuse one entry."""
        mock_milvus_client.return_value = MagicMock()

        doc_vectors = DocVectors(uri="http://test:19530", prefix="doc")

        user = "test_user"
        collection = "test_collection"
        vector = [0.1, 0.2, 0.3]

        # Insert multiple documents with same user/collection/dimension
        for text in ("doc1", "doc2", "doc3"):
            doc_vectors.insert(vector, text, user, collection)

        # Verify only one collection was recorded
        assert len(doc_vectors.collections) == 1

        expected_collection_name = "doc_test_user_test_collection"
        assert doc_vectors.collections[(3, user, collection)] == expected_collection_name

    @patch('trustgraph.direct.milvus_doc_embeddings.MilvusClient')
    def test_doc_vectors_special_characters_handling(self, mock_milvus_client):
        """Special characters in user/collection are sanitized in the stored name."""
        mock_milvus_client.return_value = MagicMock()

        # (user, collection, expected collection name)
        test_cases = [
            ("user@domain.com", "test-collection.v1", "doc_user_domain_com_test_collection_v1"),
            ("user_123", "collection_456", "doc_user_123_collection_456"),
            ("user with spaces", "collection with spaces", "doc_user_with_spaces_collection_with_spaces"),
            ("user@@@test", "collection---test", "doc_user_test_collection_test"),
        ]

        vector = [0.1, 0.2, 0.3]

        for user, collection, expected_name in test_cases:
            # A fresh store per case keeps each lookup independent of the others
            doc_vectors_instance = DocVectors(uri="http://test:19530", prefix="doc")
            doc_vectors_instance.insert(vector, "test doc", user, collection)

            assert doc_vectors_instance.collections[(3, user, collection)] == expected_name

    def test_collection_name_backward_compatibility(self):
        """New collection names must not collide with the old naming pattern."""
        # Old pattern was: {prefix}_{dimension}
        # New pattern is: {prefix}_{safe_user}_{safe_collection}
        # The new pattern should never generate names that match the old pattern
        old_pattern_examples = ["doc_384", "entity_768", "doc_512"]

        test_cases = [
            ("user", "collection", "doc"),
            ("test", "test", "entity"),
            ("a", "b", "doc"),
        ]

        for user, collection, prefix in test_cases:
            new_name = make_safe_collection_name(user, collection, prefix)

            # New names should have at least 2 underscores (prefix_user_collection)
            # Old names had only 1 underscore (prefix_dimension)
            assert new_name.count('_') >= 2, f"New name {new_name} doesn't have enough underscores"

            # New names should not match old pattern
            assert new_name not in old_pattern_examples, f"New name {new_name} conflicts with old pattern"

    def test_user_collection_isolation_regression(self):
        """
        Regression test to ensure user/collection parameters prevent data mixing.

        This test guards against the bug where all users shared the same Milvus
        collections, causing data contamination between users/collections.
        """
        # Test the specific case that was broken before the fix
        user1, collection1 = "my_user", "test_coll_1"
        user2, collection2 = "other_user", "production_data"

        # Generate collection names
        doc_name1 = make_safe_collection_name(user1, collection1, "doc")
        doc_name2 = make_safe_collection_name(user2, collection2, "doc")
        entity_name1 = make_safe_collection_name(user1, collection1, "entity")
        entity_name2 = make_safe_collection_name(user2, collection2, "entity")

        # Verify complete isolation
        assert doc_name1 != doc_name2, "Document collections should be isolated"
        assert entity_name1 != entity_name2, "Entity collections should be isolated"

        # Verify names match expected pattern from new API
        # Qdrant uses: d_{user}_{collection}_{dimension}, t_{user}_{collection}_{dimension}
        # New Milvus API uses: doc_{safe_user}_{safe_collection}, entity_{safe_user}_{safe_collection}
        assert doc_name1 == "doc_my_user_test_coll_1"
        assert doc_name2 == "doc_other_user_production_data"
        assert entity_name1 == "entity_my_user_test_coll_1"
        assert entity_name2 == "entity_other_user_production_data"

        # This test would have FAILED with the old implementation that used:
        # - doc_384 for all document embeddings (no user/collection differentiation)
        # - entity_384 for all graph embeddings (no user/collection differentiation)