Extending test coverage (#434)

* Contract tests

* Testing embeddings

* Agent unit tests

* Knowledge pipeline tests

* Turn on contract tests
This commit is contained in:
cybermaggedon 2025-07-14 17:54:04 +01:00 committed by GitHub
parent 2f7fddd206
commit 4daa54abaf
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
23 changed files with 6303 additions and 44 deletions

View file

@ -0,0 +1,10 @@
"""
Unit tests for embeddings services
Testing Strategy:
- Mock external embedding libraries (FastEmbed, Ollama client)
- Test core business logic for text embedding generation
- Test error handling and edge cases
- Test vector dimension consistency
- Test batch processing logic
"""

View file

@ -0,0 +1,114 @@
"""
Shared fixtures for embeddings unit tests
"""
import pytest
import numpy as np
from unittest.mock import Mock, AsyncMock, MagicMock
from trustgraph.schema import EmbeddingsRequest, EmbeddingsResponse, Error
@pytest.fixture
def sample_text():
    """A short piece of input text used in embedding-generation tests."""
    return "This is a sample text for embedding generation."
@pytest.fixture
def sample_embedding_vector():
    """A fixed 10-dimensional vector standing in for real embedding output."""
    vector = [0.1, 0.2, -0.3, 0.4, -0.5, 0.6, 0.7, -0.8, 0.9, -1.0]
    return vector
@pytest.fixture
def sample_batch_embeddings():
    """Three fixed 5-dimensional vectors simulating a batched embedding result."""
    batch = [
        [0.1, 0.2, -0.3, 0.4, -0.5],
        [0.6, 0.7, -0.8, 0.9, -1.0],
        [-0.1, -0.2, 0.3, -0.4, 0.5],
    ]
    return batch
@pytest.fixture
def sample_embeddings_request():
    """Sample EmbeddingsRequest for testing"""
    # Minimal request: only the text field is populated; other schema
    # fields (if any) are left to their defaults.
    return EmbeddingsRequest(
        text="Test text for embedding"
    )
@pytest.fixture
def sample_embeddings_response(sample_embedding_vector):
    """Sample successful EmbeddingsResponse"""
    # Success path: no error, vectors populated.
    # NOTE(review): `vectors` is given a single flat vector here although the
    # field name is plural — confirm whether the schema expects a list of
    # vectors (i.e. [[...]]) instead.
    return EmbeddingsResponse(
        error=None,
        vectors=sample_embedding_vector
    )
@pytest.fixture
def sample_error_response():
    """Sample error EmbeddingsResponse"""
    # Error path: structured Error populated, vectors explicitly absent.
    return EmbeddingsResponse(
        error=Error(type="embedding-error", message="Model not found"),
        vectors=None
    )
@pytest.fixture
def mock_message():
    """Mock Pulsar message whose properties() call yields a fixed id map."""
    msg = Mock()
    msg.properties.return_value = {"id": "test-message-123"}
    return msg
@pytest.fixture
def mock_flow():
    """Mock flow for producer/consumer testing"""
    flow = Mock()
    # Calling the flow (e.g. flow("name")) yields a Mock whose .send is
    # awaitable, so `await flow(...).send(...)` works in tests.
    flow.return_value.send = AsyncMock()
    # Direct producer-dict access path: `await flow.producer["response"].send(...)`.
    flow.producer = {"response": Mock()}
    flow.producer["response"].send = AsyncMock()
    return flow
@pytest.fixture
def mock_consumer():
    """Async mock standing in for a Pulsar consumer."""
    consumer = AsyncMock()
    return consumer
@pytest.fixture
def mock_producer():
    """Async mock standing in for a Pulsar producer."""
    producer = AsyncMock()
    return producer
@pytest.fixture
def mock_fastembed_embedding():
    """Mock of FastEmbed's TextEmbedding; embed() yields one numpy vector."""
    fastembed = Mock()
    fastembed.embed.return_value = [np.array([0.1, 0.2, -0.3, 0.4, -0.5])]
    return fastembed
@pytest.fixture
def mock_ollama_client():
    """Mock Ollama client; embed() returns an object carrying `embeddings`."""
    client = Mock()
    client.embed.return_value = Mock(embeddings=[0.1, 0.2, -0.3, 0.4, -0.5])
    return client
@pytest.fixture
def embedding_test_params():
    """Common keyword parameters for constructing embedding processors."""
    params = {
        "model": "test-model",
        "concurrency": 1,
        "id": "test-embeddings",
    }
    return params

View file

@ -0,0 +1,278 @@
"""
Unit tests for embedding business logic
Tests the core embedding functionality without external dependencies,
focusing on data processing, validation, and business rules.
"""
import pytest
import numpy as np
from unittest.mock import Mock, patch
class TestEmbeddingBusinessLogic:
    """Test embedding business logic and data processing.

    Pure-logic tests: vector validation, dimension checks, batching,
    numpy conversion, response shaping, and similarity math. No external
    embedding services are touched.
    """

    def test_embedding_vector_validation(self):
        """Test validation of embedding vectors"""
        # Arrange
        valid_vectors = [
            [0.1, 0.2, 0.3],
            [-0.5, 0.0, 0.8],
            [],  # Empty vector
            [1.0] * 1536  # Large vector
        ]
        invalid_vectors = [
            None,
            "not a vector",
            [1, 2, "string"],
            [[1, 2], [3, 4]]  # Nested
        ]

        # Act & Assert
        def is_valid_vector(vec):
            # Valid embeddings are flat lists of numbers only.
            if not isinstance(vec, list):
                return False
            return all(isinstance(x, (int, float)) for x in vec)

        for vec in valid_vectors:
            assert is_valid_vector(vec), f"Should be valid: {vec}"
        for vec in invalid_vectors:
            assert not is_valid_vector(vec), f"Should be invalid: {vec}"

    def test_dimension_consistency_check(self):
        """Test dimension consistency validation"""
        # Arrange
        same_dimension_vectors = [
            [0.1, 0.2, 0.3, 0.4, 0.5],
            [0.6, 0.7, 0.8, 0.9, 1.0],
            [-0.1, -0.2, -0.3, -0.4, -0.5]
        ]
        mixed_dimension_vectors = [
            [0.1, 0.2, 0.3],
            [0.4, 0.5, 0.6, 0.7],
            [0.8, 0.9]
        ]

        # Act
        def check_dimension_consistency(vectors):
            # An empty batch is trivially consistent.
            if not vectors:
                return True
            expected_dim = len(vectors[0])
            return all(len(vec) == expected_dim for vec in vectors)

        # Assert
        assert check_dimension_consistency(same_dimension_vectors)
        assert not check_dimension_consistency(mixed_dimension_vectors)

    def test_text_preprocessing_logic(self):
        """Test text preprocessing for embeddings"""
        # Arrange: (input, expected) pairs; preprocessing is the identity here.
        test_cases = [
            ("Simple text", "Simple text"),
            ("", ""),
            ("Text with\nnewlines", "Text with\nnewlines"),
            ("Unicode: 世界 🌍", "Unicode: 世界 🌍"),
            (" Whitespace ", " Whitespace ")
        ]

        # Act & Assert
        for input_text, expected in test_cases:
            # Simple preprocessing (identity in this case)
            processed = str(input_text) if input_text is not None else ""
            assert processed == expected

    def test_batch_processing_logic(self):
        """Test batch processing logic for multiple texts"""
        # Arrange
        texts = ["Text 1", "Text 2", "Text 3"]

        def mock_embed_single(text):
            # Simulate embedding generation based on text length
            return [len(text) / 10.0] * 5

        # Act
        results = []
        for text in texts:
            embedding = mock_embed_single(text)
            results.append((text, embedding))

        # Assert: order preserved, fixed width, values track text length.
        assert len(results) == len(texts)
        for i, (original_text, embedding) in enumerate(results):
            assert original_text == texts[i]
            assert len(embedding) == 5
            expected_value = len(texts[i]) / 10.0
            assert all(abs(val - expected_value) < 0.001 for val in embedding)

    def test_numpy_array_conversion_logic(self):
        """Test numpy array to list conversion"""
        # Arrange
        test_arrays = [
            np.array([1, 2, 3], dtype=np.int32),
            np.array([1.0, 2.0, 3.0], dtype=np.float64),
            np.array([0.1, 0.2, 0.3], dtype=np.float32)
        ]

        # Act
        converted = []
        for arr in test_arrays:
            result = arr.tolist()
            converted.append(result)

        # Assert
        assert converted[0] == [1, 2, 3]
        assert converted[1] == [1.0, 2.0, 3.0]
        # Float32 might have precision differences, so check approximately
        assert len(converted[2]) == 3
        assert all(isinstance(x, float) for x in converted[2])

    def test_error_response_generation(self):
        """Test error response generation logic"""
        # Arrange
        error_scenarios = [
            ("model_not_found", "Model 'xyz' not found"),
            ("connection_error", "Failed to connect to service"),
            ("rate_limit", "Rate limit exceeded"),
            ("invalid_input", "Invalid input format")
        ]

        # Act & Assert: error responses carry the error and no vectors.
        for error_type, error_message in error_scenarios:
            error_response = {
                "error": {
                    "type": error_type,
                    "message": error_message
                },
                "vectors": None
            }
            assert error_response["error"]["type"] == error_type
            assert error_response["error"]["message"] == error_message
            assert error_response["vectors"] is None

    def test_success_response_generation(self):
        """Test success response generation logic"""
        # Arrange
        test_vectors = [0.1, 0.2, 0.3, 0.4, 0.5]

        # Act
        success_response = {
            "error": None,
            "vectors": test_vectors
        }

        # Assert
        assert success_response["error"] is None
        assert success_response["vectors"] == test_vectors
        assert len(success_response["vectors"]) == 5

    def test_model_parameter_handling(self):
        """Test model parameter validation and handling"""
        # Arrange
        valid_models = {
            "ollama": ["mxbai-embed-large", "nomic-embed-text"],
            "fastembed": ["sentence-transformers/all-MiniLM-L6-v2", "BAAI/bge-small-en-v1.5"]
        }

        # Act & Assert
        for provider, models in valid_models.items():
            for model in models:
                assert isinstance(model, str)
                assert len(model) > 0
                # FastEmbed model ids look like org/name or dashed names.
                if provider == "fastembed":
                    assert "/" in model or "-" in model

    def test_concurrent_processing_simulation(self):
        """Test concurrent processing simulation"""
        # Arrange
        import asyncio

        async def mock_async_embed(text, delay=0.001):
            await asyncio.sleep(delay)
            return [ord(text[0]) / 255.0] if text else [0.0]

        # Act
        async def run_concurrent():
            texts = ["A", "B", "C", "D", "E"]
            tasks = [mock_async_embed(text) for text in texts]
            results = await asyncio.gather(*tasks)
            return list(zip(texts, results))

        # Run test
        results = asyncio.run(run_concurrent())

        # Assert: gather preserves submission order regardless of timing.
        assert len(results) == 5
        for i, (text, embedding) in enumerate(results):
            expected_char = chr(ord('A') + i)
            assert text == expected_char
            expected_value = ord(expected_char) / 255.0
            assert abs(embedding[0] - expected_value) < 0.001

    def test_empty_and_edge_cases(self):
        """Test empty inputs and edge cases"""
        # Arrange
        edge_cases = [
            ("", "empty string"),
            (" ", "single space"),
            ("a", "single character"),
            ("A" * 10000, "very long string"),
            # Fixed: use real control characters; the previous literal
            # "\\n\\t\\r" contained backslash characters, not the special
            # characters the description claims.
            ("\n\t\r", "special characters"),
            ("混合English中文", "mixed languages")
        ]

        # Act & Assert
        for text, description in edge_cases:
            # Basic validation that text can be processed
            assert isinstance(text, str), f"Failed for {description}"
            assert len(text) >= 0, f"Failed for {description}"
            # Simulate embedding generation would work
            mock_embedding = [len(text) % 10] * 3
            assert len(mock_embedding) == 3, f"Failed for {description}"

    def test_vector_normalization_logic(self):
        """Test vector normalization calculations"""
        # Arrange
        test_vectors = [
            [3.0, 4.0],  # Should normalize to [0.6, 0.8]
            [1.0, 0.0],  # Should normalize to [1.0, 0.0]
            [0.0, 0.0],  # Zero vector edge case
        ]

        # Act & Assert
        for vector in test_vectors:
            magnitude = sum(x**2 for x in vector) ** 0.5
            if magnitude > 0:
                normalized = [x / magnitude for x in vector]
                # Check unit length (approximately)
                norm_magnitude = sum(x**2 for x in normalized) ** 0.5
                assert abs(norm_magnitude - 1.0) < 0.0001
            else:
                # Zero vector case
                assert all(x == 0 for x in vector)

    def test_cosine_similarity_calculation(self):
        """Test cosine similarity computation"""
        # Arrange: (v1, v2, expected similarity)
        vector_pairs = [
            ([1, 0], [0, 1], 0.0),  # Orthogonal
            ([1, 0], [1, 0], 1.0),  # Identical
            ([1, 1], [-1, -1], -1.0),  # Opposite
        ]

        # Act & Assert
        def cosine_similarity(v1, v2):
            dot = sum(a * b for a, b in zip(v1, v2))
            mag1 = sum(x**2 for x in v1) ** 0.5
            mag2 = sum(x**2 for x in v2) ** 0.5
            # Guard against zero-magnitude vectors.
            return dot / (mag1 * mag2) if mag1 * mag2 > 0 else 0

        for v1, v2, expected in vector_pairs:
            similarity = cosine_similarity(v1, v2)
            assert abs(similarity - expected) < 0.0001

View file

@ -0,0 +1,340 @@
"""
Unit tests for embedding utilities and common functionality
Tests dimension consistency, batch processing, error handling patterns,
and other utilities common across embedding services.
"""
import pytest
from unittest.mock import patch, Mock, AsyncMock
import numpy as np
from trustgraph.schema import EmbeddingsRequest, EmbeddingsResponse, Error
from trustgraph.exceptions import TooManyRequests
class MockEmbeddingProcessor:
    """Minimal stand-in for an embedding processor used by the tests below.

    An optional callable supplied as ``embedding_function`` computes the
    embedding; when absent, a fixed default vector is returned.
    """

    def __init__(self, embedding_function=None, **params):
        # Callable used to produce embeddings; falsy selects the default.
        self.embedding_function = embedding_function
        # Model name, mirroring real processor configuration.
        self.model = params.get('model', 'test-model')

    async def on_embeddings(self, text):
        """Return an embedding for ``text`` (delegates, or uses a default)."""
        fn = self.embedding_function
        if not fn:
            return [0.1, 0.2, 0.3, 0.4, 0.5]  # Default test embedding
        return fn(text)
class TestEmbeddingDimensionConsistency:
    """Test cases for embedding dimension consistency"""

    async def test_consistent_dimensions_single_processor(self):
        """Test that a single processor returns consistent dimensions"""
        # Arrange: an embedder that always emits a 128-dim vector.
        dimension = 128
        processor = MockEmbeddingProcessor(
            embedding_function=lambda _text: [0.1] * dimension
        )

        # Act
        test_texts = ["Text 1", "Text 2", "Text 3", "Text 4", "Text 5"]
        results = [await processor.on_embeddings(t) for t in test_texts]

        # Assert: every embedding has the configured dimension...
        for result in results:
            assert len(result) == dimension, f"Expected dimension {dimension}, got {len(result)}"
        # ...and therefore all dimensions agree with the first.
        first_dim = len(results[0])
        for i, result in enumerate(results[1:], 1):
            assert len(result) == first_dim, f"Dimension mismatch at index {i}"

    async def test_dimension_consistency_across_text_lengths(self):
        """Test dimension consistency across varying text lengths"""
        # Arrange: output dimension must not depend on text length.
        dimension = 384
        processor = MockEmbeddingProcessor(
            embedding_function=lambda _text: [0.1] * dimension
        )

        # Act - Test various text lengths
        test_texts = [
            "",  # Empty text
            "Hi",  # Very short
            "This is a medium length sentence for testing.",  # Medium
            "This is a very long text that should still produce embeddings of consistent dimension regardless of the input text length and content." * 10  # Very long
        ]
        results = [await processor.on_embeddings(t) for t in test_texts]

        # Assert
        for i, result in enumerate(results):
            assert len(result) == dimension, f"Text length {len(test_texts[i])} produced wrong dimension"

    def test_dimension_validation_different_models(self):
        """Test dimension validation for different model configurations"""
        # Arrange: expected dimensions per model configuration.
        models_and_dims = {
            "small-model": 128,
            "medium-model": 384,
            "large-model": 1536,
        }

        # Act & Assert
        for model_name, expected_dim in models_and_dims.items():
            test_vector = [0.1] * expected_dim
            assert len(test_vector) == expected_dim, f"Model {model_name} dimension mismatch"
class TestEmbeddingBatchProcessing:
    """Test cases for batch processing logic"""

    async def test_sequential_processing_maintains_order(self):
        """Test that sequential processing maintains text order"""
        # Arrange: embedding encodes the first character so order is checkable.
        def mock_embedding(text):
            return [ord(text[0]) / 255.0] if text else [0.0]  # Normalize to [0,1]

        processor = MockEmbeddingProcessor(embedding_function=mock_embedding)

        # Act
        test_texts = ["A", "B", "C", "D", "E"]
        results = [(text, await processor.on_embeddings(text)) for text in test_texts]

        # Assert
        for i, (original_text, embedding) in enumerate(results):
            assert original_text == test_texts[i]
            expected_value = ord(test_texts[i][0]) / 255.0
            assert abs(embedding[0] - expected_value) < 0.001

    async def test_batch_processing_throughput(self):
        """Test batch processing capabilities"""
        # Arrange: record every call so throughput can be verified.
        calls = []

        def mock_embedding(text):
            calls.append(text)
            return [0.1, 0.2, 0.3]

        processor = MockEmbeddingProcessor(embedding_function=mock_embedding)

        # Act - Process multiple texts
        batch_size = 10
        test_texts = [f"Text {i}" for i in range(batch_size)]
        results = [await processor.on_embeddings(text) for text in test_texts]

        # Assert
        assert len(calls) == batch_size
        assert len(results) == batch_size
        assert all(result == [0.1, 0.2, 0.3] for result in results)

    async def test_concurrent_processing_simulation(self):
        """Test concurrent processing behavior simulation"""
        # Arrange
        import asyncio
        import time

        processing_times = []

        def mock_embedding(text):
            processing_times.append(time.time())
            return [len(text) / 100.0]  # Encoding text length

        processor = MockEmbeddingProcessor(embedding_function=mock_embedding)

        # Act - Simulate concurrent processing
        test_texts = [f"Text {i}" for i in range(5)]
        results = await asyncio.gather(
            *(processor.on_embeddings(text) for text in test_texts)
        )

        # Assert: results should correspond to text lengths, in order.
        assert len(results) == 5
        assert len(processing_times) == 5
        for i, result in enumerate(results):
            expected_value = len(test_texts[i]) / 100.0
            assert abs(result[0] - expected_value) < 0.001
class TestEmbeddingErrorHandling:
    """Test cases for error handling in embedding services"""
    # NOTE(review): these async tests carry no @pytest.mark.asyncio marker —
    # presumably pytest-asyncio runs in auto mode; confirm via project config.
    async def test_embedding_function_error_handling(self):
        """Test error handling in embedding function"""
        # Arrange: an embedder that always raises.
        def failing_embedding(text):
            raise Exception("Embedding model failed")
        processor = MockEmbeddingProcessor(embedding_function=failing_embedding)
        # Act & Assert: the exception propagates out of on_embeddings.
        with pytest.raises(Exception, match="Embedding model failed"):
            await processor.on_embeddings("Test text")
    async def test_rate_limit_exception_propagation(self):
        """Test that rate limit exceptions are properly propagated"""
        # Arrange: simulate the backend signalling rate limiting.
        def rate_limited_embedding(text):
            raise TooManyRequests("Rate limit exceeded")
        processor = MockEmbeddingProcessor(embedding_function=rate_limited_embedding)
        # Act & Assert: TooManyRequests must not be swallowed or re-wrapped.
        with pytest.raises(TooManyRequests, match="Rate limit exceeded"):
            await processor.on_embeddings("Test text")
    async def test_none_result_handling(self):
        """Test handling when embedding function returns None"""
        # Arrange
        def none_embedding(text):
            return None
        processor = MockEmbeddingProcessor(embedding_function=none_embedding)
        # Act
        result = await processor.on_embeddings("Test text")
        # Assert: the mock processor passes None through unvalidated.
        assert result is None
    async def test_invalid_embedding_format_handling(self):
        """Test handling of invalid embedding formats"""
        # Arrange
        def invalid_embedding(text):
            return "not a list"  # Invalid format
        processor = MockEmbeddingProcessor(embedding_function=invalid_embedding)
        # Act
        result = await processor.on_embeddings("Test text")
        # Assert: no format validation happens at this layer.
        assert result == "not a list"  # Returns what the function provides
class TestEmbeddingUtilities:
    """Test cases for embedding utility functions and helpers"""

    def test_vector_normalization_simulation(self):
        """Test vector normalization logic simulation"""
        # Arrange
        test_vectors = [
            [1.0, 2.0, 3.0],
            [0.5, -0.5, 1.0],
            [10.0, 20.0, 30.0]
        ]

        # Act - simulate L2 normalization of each vector.
        normalized_vectors = []
        for vector in test_vectors:
            magnitude = sum(x**2 for x in vector) ** 0.5
            normalized_vectors.append(
                [x / magnitude for x in vector] if magnitude > 0 else vector
            )

        # Assert - each result must have (approximately) unit length.
        for normalized in normalized_vectors:
            magnitude = sum(x**2 for x in normalized) ** 0.5
            assert abs(magnitude - 1.0) < 0.0001, "Vector should be unit length"

    def test_cosine_similarity_calculation(self):
        """Test cosine similarity calculation between embeddings"""
        # Similarity helper with a zero-magnitude guard.
        def cosine_similarity(v1, v2):
            dot_product = sum(a * b for a, b in zip(v1, v2))
            mag1 = sum(x**2 for x in v1) ** 0.5
            mag2 = sum(x**2 for x in v2) ** 0.5
            return dot_product / (mag1 * mag2) if mag1 * mag2 > 0 else 0

        # Arrange
        vector1 = [1.0, 0.0, 0.0]
        vector2 = [0.0, 1.0, 0.0]
        vector3 = [1.0, 0.0, 0.0]  # Same as vector1

        # Act
        sim_12 = cosine_similarity(vector1, vector2)
        sim_13 = cosine_similarity(vector1, vector3)

        # Assert
        assert abs(sim_12 - 0.0) < 0.0001, "Orthogonal vectors should have 0 similarity"
        assert abs(sim_13 - 1.0) < 0.0001, "Identical vectors should have 1.0 similarity"

    def test_embedding_validation_helpers(self):
        """Test embedding validation helper functions"""
        # Valid embeddings are flat lists of numbers.
        def is_valid_embedding(embedding):
            return isinstance(embedding, list) and all(
                isinstance(x, (int, float)) for x in embedding
            )

        # Arrange
        valid_embeddings = [
            [0.1, 0.2, 0.3],
            [1.0, -1.0, 0.0],
            []  # Empty embedding
        ]
        invalid_embeddings = [
            None,
            "not a list",
            [1, 2, "three"],  # Mixed types
            [[1, 2], [3, 4]]  # Nested lists
        ]

        # Act & Assert
        for embedding in valid_embeddings:
            assert is_valid_embedding(embedding), f"Should be valid: {embedding}"
        for embedding in invalid_embeddings:
            assert not is_valid_embedding(embedding), f"Should be invalid: {embedding}"

    async def test_embedding_metadata_handling(self):
        """Test handling of embedding metadata and properties"""
        # Arrange: an embedder that produces vectors plus metadata.
        def metadata_embedding(text):
            return {
                "vectors": [0.1, 0.2, 0.3],
                "model": "test-model",
                "dimension": 3,
                "text_length": len(text)
            }

        # Processor variant that strips metadata before returning.
        class MetadataProcessor(MockEmbeddingProcessor):
            async def on_embeddings(self, text):
                result = metadata_embedding(text)
                return result["vectors"]  # Return only vectors for compatibility

        processor = MetadataProcessor()

        # Act
        result = await processor.on_embeddings("Test text with metadata")

        # Assert
        assert isinstance(result, list)
        assert len(result) == 3
        assert result == [0.1, 0.2, 0.3]