mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 16:36:21 +02:00
Merge 2.0 to master (#651)
This commit is contained in:
parent
3666ece2c5
commit
b9d7bf9a8b
212 changed files with 13940 additions and 6180 deletions
|
|
@ -10,7 +10,7 @@ import pytest
|
|||
from unittest.mock import Mock, patch, MagicMock
|
||||
|
||||
from trustgraph.storage.triples.cassandra.write import Processor as TriplesWriter
|
||||
from trustgraph.storage.objects.cassandra.write import Processor as ObjectsWriter
|
||||
from trustgraph.storage.rows.cassandra.write import Processor as RowsWriter
|
||||
from trustgraph.query.triples.cassandra.service import Processor as TriplesQuery
|
||||
from trustgraph.storage.knowledge.store import Processor as KgStore
|
||||
|
||||
|
|
@ -81,10 +81,10 @@ class TestTriplesWriterConfiguration:
|
|||
assert processor.cassandra_password is None
|
||||
|
||||
|
||||
class TestObjectsWriterConfiguration:
|
||||
class TestRowsWriterConfiguration:
|
||||
"""Test Cassandra configuration in objects writer processor."""
|
||||
|
||||
@patch('trustgraph.storage.objects.cassandra.write.Cluster')
|
||||
@patch('trustgraph.storage.rows.cassandra.write.Cluster')
|
||||
def test_environment_variable_configuration(self, mock_cluster):
|
||||
"""Test processor picks up configuration from environment variables."""
|
||||
env_vars = {
|
||||
|
|
@ -97,13 +97,13 @@ class TestObjectsWriterConfiguration:
|
|||
mock_cluster.return_value = mock_cluster_instance
|
||||
|
||||
with patch.dict(os.environ, env_vars, clear=True):
|
||||
processor = ObjectsWriter(taskgroup=MagicMock())
|
||||
processor = RowsWriter(taskgroup=MagicMock())
|
||||
|
||||
assert processor.cassandra_host == ['obj-env-host1', 'obj-env-host2']
|
||||
assert processor.cassandra_username == 'obj-env-user'
|
||||
assert processor.cassandra_password == 'obj-env-pass'
|
||||
|
||||
@patch('trustgraph.storage.objects.cassandra.write.Cluster')
|
||||
@patch('trustgraph.storage.rows.cassandra.write.Cluster')
|
||||
def test_cassandra_connection_with_hosts_list(self, mock_cluster):
|
||||
"""Test that Cassandra connection uses hosts list correctly."""
|
||||
env_vars = {
|
||||
|
|
@ -118,7 +118,7 @@ class TestObjectsWriterConfiguration:
|
|||
mock_cluster.return_value = mock_cluster_instance
|
||||
|
||||
with patch.dict(os.environ, env_vars, clear=True):
|
||||
processor = ObjectsWriter(taskgroup=MagicMock())
|
||||
processor = RowsWriter(taskgroup=MagicMock())
|
||||
processor.connect_cassandra()
|
||||
|
||||
# Verify cluster was called with hosts list
|
||||
|
|
@ -129,8 +129,8 @@ class TestObjectsWriterConfiguration:
|
|||
assert 'contact_points' in call_args.kwargs
|
||||
assert call_args.kwargs['contact_points'] == ['conn-host1', 'conn-host2', 'conn-host3']
|
||||
|
||||
@patch('trustgraph.storage.objects.cassandra.write.Cluster')
|
||||
@patch('trustgraph.storage.objects.cassandra.write.PlainTextAuthProvider')
|
||||
@patch('trustgraph.storage.rows.cassandra.write.Cluster')
|
||||
@patch('trustgraph.storage.rows.cassandra.write.PlainTextAuthProvider')
|
||||
def test_authentication_configuration(self, mock_auth_provider, mock_cluster):
|
||||
"""Test authentication is configured when credentials are provided."""
|
||||
env_vars = {
|
||||
|
|
@ -145,7 +145,7 @@ class TestObjectsWriterConfiguration:
|
|||
mock_cluster.return_value = mock_cluster_instance
|
||||
|
||||
with patch.dict(os.environ, env_vars, clear=True):
|
||||
processor = ObjectsWriter(taskgroup=MagicMock())
|
||||
processor = RowsWriter(taskgroup=MagicMock())
|
||||
processor.connect_cassandra()
|
||||
|
||||
# Verify auth provider was created with correct credentials
|
||||
|
|
@ -302,10 +302,10 @@ class TestCommandLineArgumentHandling:
|
|||
def test_objects_writer_add_args(self):
|
||||
"""Test that objects writer adds standard Cassandra arguments."""
|
||||
import argparse
|
||||
from trustgraph.storage.objects.cassandra.write import Processor as ObjectsWriter
|
||||
from trustgraph.storage.rows.cassandra.write import Processor as RowsWriter
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
ObjectsWriter.add_args(parser)
|
||||
RowsWriter.add_args(parser)
|
||||
|
||||
# Parse empty args to check that arguments exist
|
||||
args = parser.parse_args([])
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ import pytest
|
|||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from trustgraph.storage.graph_embeddings.milvus.write import Processor
|
||||
from trustgraph.schema import Value, EntityEmbeddings
|
||||
from trustgraph.schema import Term, EntityEmbeddings, IRI, LITERAL
|
||||
|
||||
|
||||
class TestMilvusGraphEmbeddingsStorageProcessor:
|
||||
|
|
@ -22,11 +22,11 @@ class TestMilvusGraphEmbeddingsStorageProcessor:
|
|||
|
||||
# Create test entities with embeddings
|
||||
entity1 = EntityEmbeddings(
|
||||
entity=Value(value='http://example.com/entity1', is_uri=True),
|
||||
entity=Term(type=IRI, iri='http://example.com/entity1'),
|
||||
vectors=[[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]
|
||||
)
|
||||
entity2 = EntityEmbeddings(
|
||||
entity=Value(value='literal entity', is_uri=False),
|
||||
entity=Term(type=LITERAL, value='literal entity'),
|
||||
vectors=[[0.7, 0.8, 0.9]]
|
||||
)
|
||||
message.entities = [entity1, entity2]
|
||||
|
|
@ -84,7 +84,7 @@ class TestMilvusGraphEmbeddingsStorageProcessor:
|
|||
message.metadata.collection = 'test_collection'
|
||||
|
||||
entity = EntityEmbeddings(
|
||||
entity=Value(value='http://example.com/entity', is_uri=True),
|
||||
entity=Term(type=IRI, iri='http://example.com/entity'),
|
||||
vectors=[[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]
|
||||
)
|
||||
message.entities = [entity]
|
||||
|
|
@ -136,7 +136,7 @@ class TestMilvusGraphEmbeddingsStorageProcessor:
|
|||
message.metadata.collection = 'test_collection'
|
||||
|
||||
entity = EntityEmbeddings(
|
||||
entity=Value(value='', is_uri=False),
|
||||
entity=Term(type=LITERAL, value=''),
|
||||
vectors=[[0.1, 0.2, 0.3]]
|
||||
)
|
||||
message.entities = [entity]
|
||||
|
|
@ -155,7 +155,7 @@ class TestMilvusGraphEmbeddingsStorageProcessor:
|
|||
message.metadata.collection = 'test_collection'
|
||||
|
||||
entity = EntityEmbeddings(
|
||||
entity=Value(value=None, is_uri=False),
|
||||
entity=Term(type=LITERAL, value=None),
|
||||
vectors=[[0.1, 0.2, 0.3]]
|
||||
)
|
||||
message.entities = [entity]
|
||||
|
|
@ -174,15 +174,15 @@ class TestMilvusGraphEmbeddingsStorageProcessor:
|
|||
message.metadata.collection = 'test_collection'
|
||||
|
||||
valid_entity = EntityEmbeddings(
|
||||
entity=Value(value='http://example.com/valid', is_uri=True),
|
||||
entity=Term(type=IRI, iri='http://example.com/valid'),
|
||||
vectors=[[0.1, 0.2, 0.3]]
|
||||
)
|
||||
empty_entity = EntityEmbeddings(
|
||||
entity=Value(value='', is_uri=False),
|
||||
entity=Term(type=LITERAL, value=''),
|
||||
vectors=[[0.4, 0.5, 0.6]]
|
||||
)
|
||||
none_entity = EntityEmbeddings(
|
||||
entity=Value(value=None, is_uri=False),
|
||||
entity=Term(type=LITERAL, value=None),
|
||||
vectors=[[0.7, 0.8, 0.9]]
|
||||
)
|
||||
message.entities = [valid_entity, empty_entity, none_entity]
|
||||
|
|
@ -217,7 +217,7 @@ class TestMilvusGraphEmbeddingsStorageProcessor:
|
|||
message.metadata.collection = 'test_collection'
|
||||
|
||||
entity = EntityEmbeddings(
|
||||
entity=Value(value='http://example.com/entity', is_uri=True),
|
||||
entity=Term(type=IRI, iri='http://example.com/entity'),
|
||||
vectors=[]
|
||||
)
|
||||
message.entities = [entity]
|
||||
|
|
@ -236,7 +236,7 @@ class TestMilvusGraphEmbeddingsStorageProcessor:
|
|||
message.metadata.collection = 'test_collection'
|
||||
|
||||
entity = EntityEmbeddings(
|
||||
entity=Value(value='http://example.com/entity', is_uri=True),
|
||||
entity=Term(type=IRI, iri='http://example.com/entity'),
|
||||
vectors=[
|
||||
[0.1, 0.2], # 2D vector
|
||||
[0.3, 0.4, 0.5, 0.6], # 4D vector
|
||||
|
|
@ -269,11 +269,11 @@ class TestMilvusGraphEmbeddingsStorageProcessor:
|
|||
message.metadata.collection = 'test_collection'
|
||||
|
||||
uri_entity = EntityEmbeddings(
|
||||
entity=Value(value='http://example.com/uri_entity', is_uri=True),
|
||||
entity=Term(type=IRI, iri='http://example.com/uri_entity'),
|
||||
vectors=[[0.1, 0.2, 0.3]]
|
||||
)
|
||||
literal_entity = EntityEmbeddings(
|
||||
entity=Value(value='literal entity text', is_uri=False),
|
||||
entity=Term(type=LITERAL, value='literal entity text'),
|
||||
vectors=[[0.4, 0.5, 0.6]]
|
||||
)
|
||||
message.entities = [uri_entity, literal_entity]
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ from unittest import IsolatedAsyncioTestCase
|
|||
|
||||
# Import the service under test
|
||||
from trustgraph.storage.graph_embeddings.qdrant.write import Processor
|
||||
from trustgraph.schema import IRI, LITERAL
|
||||
|
||||
|
||||
class TestQdrantGraphEmbeddingsStorage(IsolatedAsyncioTestCase):
|
||||
|
|
@ -67,7 +68,8 @@ class TestQdrantGraphEmbeddingsStorage(IsolatedAsyncioTestCase):
|
|||
mock_message.metadata.collection = 'test_collection'
|
||||
|
||||
mock_entity = MagicMock()
|
||||
mock_entity.entity.value = 'test_entity'
|
||||
mock_entity.entity.type = IRI
|
||||
mock_entity.entity.iri = 'test_entity'
|
||||
mock_entity.vectors = [[0.1, 0.2, 0.3]] # Single vector with 3 dimensions
|
||||
|
||||
mock_message.entities = [mock_entity]
|
||||
|
|
@ -120,11 +122,13 @@ class TestQdrantGraphEmbeddingsStorage(IsolatedAsyncioTestCase):
|
|||
mock_message.metadata.collection = 'multi_collection'
|
||||
|
||||
mock_entity1 = MagicMock()
|
||||
mock_entity1.entity.value = 'entity_one'
|
||||
mock_entity1.entity.type = IRI
|
||||
mock_entity1.entity.iri = 'entity_one'
|
||||
mock_entity1.vectors = [[0.1, 0.2]]
|
||||
|
||||
|
||||
mock_entity2 = MagicMock()
|
||||
mock_entity2.entity.value = 'entity_two'
|
||||
mock_entity2.entity.type = IRI
|
||||
mock_entity2.entity.iri = 'entity_two'
|
||||
mock_entity2.vectors = [[0.3, 0.4]]
|
||||
|
||||
mock_message.entities = [mock_entity1, mock_entity2]
|
||||
|
|
@ -179,7 +183,8 @@ class TestQdrantGraphEmbeddingsStorage(IsolatedAsyncioTestCase):
|
|||
mock_message.metadata.collection = 'vector_collection'
|
||||
|
||||
mock_entity = MagicMock()
|
||||
mock_entity.entity.value = 'multi_vector_entity'
|
||||
mock_entity.entity.type = IRI
|
||||
mock_entity.entity.iri = 'multi_vector_entity'
|
||||
mock_entity.vectors = [
|
||||
[0.1, 0.2, 0.3],
|
||||
[0.4, 0.5, 0.6],
|
||||
|
|
@ -231,11 +236,12 @@ class TestQdrantGraphEmbeddingsStorage(IsolatedAsyncioTestCase):
|
|||
mock_message.metadata.collection = 'empty_collection'
|
||||
|
||||
mock_entity_empty = MagicMock()
|
||||
mock_entity_empty.entity.type = LITERAL
|
||||
mock_entity_empty.entity.value = "" # Empty string
|
||||
mock_entity_empty.vectors = [[0.1, 0.2]]
|
||||
|
||||
|
||||
mock_entity_none = MagicMock()
|
||||
mock_entity_none.entity.value = None # None value
|
||||
mock_entity_none.entity = None # None entity
|
||||
mock_entity_none.vectors = [[0.3, 0.4]]
|
||||
|
||||
mock_message.entities = [mock_entity_empty, mock_entity_none]
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ from unittest.mock import MagicMock, patch, call
|
|||
|
||||
from trustgraph.storage.triples.neo4j.write import Processor as StorageProcessor
|
||||
from trustgraph.query.triples.neo4j.service import Processor as QueryProcessor
|
||||
from trustgraph.schema import Triples, Triple, Value, Metadata
|
||||
from trustgraph.schema import Triples, Triple, Term, Metadata, IRI, LITERAL
|
||||
from trustgraph.schema import TriplesQueryRequest
|
||||
|
||||
|
||||
|
|
@ -60,9 +60,9 @@ class TestNeo4jUserCollectionIsolation:
|
|||
)
|
||||
|
||||
triple = Triple(
|
||||
s=Value(value="http://example.com/subject", is_uri=True),
|
||||
p=Value(value="http://example.com/predicate", is_uri=True),
|
||||
o=Value(value="literal_value", is_uri=False)
|
||||
s=Term(type=IRI, iri="http://example.com/subject"),
|
||||
p=Term(type=IRI, iri="http://example.com/predicate"),
|
||||
o=Term(type=LITERAL, value="literal_value")
|
||||
)
|
||||
|
||||
message = Triples(
|
||||
|
|
@ -128,9 +128,9 @@ class TestNeo4jUserCollectionIsolation:
|
|||
metadata = Metadata(id="test-id")
|
||||
|
||||
triple = Triple(
|
||||
s=Value(value="http://example.com/subject", is_uri=True),
|
||||
p=Value(value="http://example.com/predicate", is_uri=True),
|
||||
o=Value(value="http://example.com/object", is_uri=True)
|
||||
s=Term(type=IRI, iri="http://example.com/subject"),
|
||||
p=Term(type=IRI, iri="http://example.com/predicate"),
|
||||
o=Term(type=IRI, iri="http://example.com/object")
|
||||
)
|
||||
|
||||
message = Triples(
|
||||
|
|
@ -170,8 +170,8 @@ class TestNeo4jUserCollectionIsolation:
|
|||
query = TriplesQueryRequest(
|
||||
user="test_user",
|
||||
collection="test_collection",
|
||||
s=Value(value="http://example.com/subject", is_uri=True),
|
||||
p=Value(value="http://example.com/predicate", is_uri=True),
|
||||
s=Term(type=IRI, iri="http://example.com/subject"),
|
||||
p=Term(type=IRI, iri="http://example.com/predicate"),
|
||||
o=None
|
||||
)
|
||||
|
||||
|
|
@ -254,9 +254,9 @@ class TestNeo4jUserCollectionIsolation:
|
|||
metadata=Metadata(user="user1", collection="coll1"),
|
||||
triples=[
|
||||
Triple(
|
||||
s=Value(value="http://example.com/user1/subject", is_uri=True),
|
||||
p=Value(value="http://example.com/predicate", is_uri=True),
|
||||
o=Value(value="user1_data", is_uri=False)
|
||||
s=Term(type=IRI, iri="http://example.com/user1/subject"),
|
||||
p=Term(type=IRI, iri="http://example.com/predicate"),
|
||||
o=Term(type=LITERAL, value="user1_data")
|
||||
)
|
||||
]
|
||||
)
|
||||
|
|
@ -265,9 +265,9 @@ class TestNeo4jUserCollectionIsolation:
|
|||
metadata=Metadata(user="user2", collection="coll2"),
|
||||
triples=[
|
||||
Triple(
|
||||
s=Value(value="http://example.com/user2/subject", is_uri=True),
|
||||
p=Value(value="http://example.com/predicate", is_uri=True),
|
||||
o=Value(value="user2_data", is_uri=False)
|
||||
s=Term(type=IRI, iri="http://example.com/user2/subject"),
|
||||
p=Term(type=IRI, iri="http://example.com/predicate"),
|
||||
o=Term(type=LITERAL, value="user2_data")
|
||||
)
|
||||
]
|
||||
)
|
||||
|
|
@ -429,9 +429,9 @@ class TestNeo4jUserCollectionRegression:
|
|||
metadata=Metadata(user="user1", collection="coll1"),
|
||||
triples=[
|
||||
Triple(
|
||||
s=Value(value=shared_uri, is_uri=True),
|
||||
p=Value(value="http://example.com/p", is_uri=True),
|
||||
o=Value(value="user1_value", is_uri=False)
|
||||
s=Term(type=IRI, iri=shared_uri),
|
||||
p=Term(type=IRI, iri="http://example.com/p"),
|
||||
o=Term(type=LITERAL, value="user1_value")
|
||||
)
|
||||
]
|
||||
)
|
||||
|
|
@ -440,9 +440,9 @@ class TestNeo4jUserCollectionRegression:
|
|||
metadata=Metadata(user="user2", collection="coll2"),
|
||||
triples=[
|
||||
Triple(
|
||||
s=Value(value=shared_uri, is_uri=True),
|
||||
p=Value(value="http://example.com/p", is_uri=True),
|
||||
o=Value(value="user2_value", is_uri=False)
|
||||
s=Term(type=IRI, iri=shared_uri),
|
||||
p=Term(type=IRI, iri="http://example.com/p"),
|
||||
o=Term(type=LITERAL, value="user2_value")
|
||||
)
|
||||
]
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,533 +0,0 @@
|
|||
"""
|
||||
Unit tests for Cassandra Object Storage Processor
|
||||
|
||||
Tests the business logic of the object storage processor including:
|
||||
- Schema configuration handling
|
||||
- Type conversions
|
||||
- Name sanitization
|
||||
- Table structure generation
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, AsyncMock, patch
|
||||
import json
|
||||
|
||||
from trustgraph.storage.objects.cassandra.write import Processor
|
||||
from trustgraph.schema import ExtractedObject, Metadata, RowSchema, Field
|
||||
|
||||
|
||||
class TestObjectsCassandraStorageLogic:
|
||||
"""Test business logic without FlowProcessor dependencies"""
|
||||
|
||||
def test_sanitize_name(self):
|
||||
"""Test name sanitization for Cassandra compatibility"""
|
||||
processor = MagicMock()
|
||||
processor.sanitize_name = Processor.sanitize_name.__get__(processor, Processor)
|
||||
|
||||
# Test various name patterns (back to original logic)
|
||||
assert processor.sanitize_name("simple_name") == "simple_name"
|
||||
assert processor.sanitize_name("Name-With-Dashes") == "name_with_dashes"
|
||||
assert processor.sanitize_name("name.with.dots") == "name_with_dots"
|
||||
assert processor.sanitize_name("123_starts_with_number") == "o_123_starts_with_number"
|
||||
assert processor.sanitize_name("name with spaces") == "name_with_spaces"
|
||||
assert processor.sanitize_name("special!@#$%^chars") == "special______chars"
|
||||
|
||||
def test_get_cassandra_type(self):
|
||||
"""Test field type conversion to Cassandra types"""
|
||||
processor = MagicMock()
|
||||
processor.get_cassandra_type = Processor.get_cassandra_type.__get__(processor, Processor)
|
||||
|
||||
# Basic type mappings
|
||||
assert processor.get_cassandra_type("string") == "text"
|
||||
assert processor.get_cassandra_type("boolean") == "boolean"
|
||||
assert processor.get_cassandra_type("timestamp") == "timestamp"
|
||||
assert processor.get_cassandra_type("uuid") == "uuid"
|
||||
|
||||
# Integer types with size hints
|
||||
assert processor.get_cassandra_type("integer", size=2) == "int"
|
||||
assert processor.get_cassandra_type("integer", size=8) == "bigint"
|
||||
|
||||
# Float types with size hints
|
||||
assert processor.get_cassandra_type("float", size=2) == "float"
|
||||
assert processor.get_cassandra_type("float", size=8) == "double"
|
||||
|
||||
# Unknown type defaults to text
|
||||
assert processor.get_cassandra_type("unknown_type") == "text"
|
||||
|
||||
def test_convert_value(self):
|
||||
"""Test value conversion for different field types"""
|
||||
processor = MagicMock()
|
||||
processor.convert_value = Processor.convert_value.__get__(processor, Processor)
|
||||
|
||||
# Integer conversions
|
||||
assert processor.convert_value("123", "integer") == 123
|
||||
assert processor.convert_value(123.5, "integer") == 123
|
||||
assert processor.convert_value(None, "integer") is None
|
||||
|
||||
# Float conversions
|
||||
assert processor.convert_value("123.45", "float") == 123.45
|
||||
assert processor.convert_value(123, "float") == 123.0
|
||||
|
||||
# Boolean conversions
|
||||
assert processor.convert_value("true", "boolean") is True
|
||||
assert processor.convert_value("false", "boolean") is False
|
||||
assert processor.convert_value("1", "boolean") is True
|
||||
assert processor.convert_value("0", "boolean") is False
|
||||
assert processor.convert_value("yes", "boolean") is True
|
||||
assert processor.convert_value("no", "boolean") is False
|
||||
|
||||
# String conversions
|
||||
assert processor.convert_value(123, "string") == "123"
|
||||
assert processor.convert_value(True, "string") == "True"
|
||||
|
||||
def test_table_creation_cql_generation(self):
|
||||
"""Test CQL generation for table creation"""
|
||||
processor = MagicMock()
|
||||
processor.schemas = {}
|
||||
processor.known_keyspaces = set()
|
||||
processor.known_tables = {}
|
||||
processor.session = MagicMock()
|
||||
processor.sanitize_name = Processor.sanitize_name.__get__(processor, Processor)
|
||||
processor.sanitize_table = Processor.sanitize_table.__get__(processor, Processor)
|
||||
processor.get_cassandra_type = Processor.get_cassandra_type.__get__(processor, Processor)
|
||||
def mock_ensure_keyspace(keyspace):
|
||||
processor.known_keyspaces.add(keyspace)
|
||||
processor.known_tables[keyspace] = set()
|
||||
processor.ensure_keyspace = mock_ensure_keyspace
|
||||
processor.ensure_table = Processor.ensure_table.__get__(processor, Processor)
|
||||
|
||||
# Create test schema
|
||||
schema = RowSchema(
|
||||
name="customer_records",
|
||||
description="Test customer schema",
|
||||
fields=[
|
||||
Field(
|
||||
name="customer_id",
|
||||
type="string",
|
||||
size=50,
|
||||
primary=True,
|
||||
required=True,
|
||||
indexed=False
|
||||
),
|
||||
Field(
|
||||
name="email",
|
||||
type="string",
|
||||
size=100,
|
||||
required=True,
|
||||
indexed=True
|
||||
),
|
||||
Field(
|
||||
name="age",
|
||||
type="integer",
|
||||
size=4,
|
||||
required=False,
|
||||
indexed=False
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
# Call ensure_table
|
||||
processor.ensure_table("test_user", "customer_records", schema)
|
||||
|
||||
# Verify keyspace was ensured (check that it was added to known_keyspaces)
|
||||
assert "test_user" in processor.known_keyspaces
|
||||
|
||||
# Check the CQL that was executed (first call should be table creation)
|
||||
all_calls = processor.session.execute.call_args_list
|
||||
table_creation_cql = all_calls[0][0][0] # First call
|
||||
|
||||
# Verify table structure (keyspace uses sanitize_name, table uses sanitize_table)
|
||||
assert "CREATE TABLE IF NOT EXISTS test_user.o_customer_records" in table_creation_cql
|
||||
assert "collection text" in table_creation_cql
|
||||
assert "customer_id text" in table_creation_cql
|
||||
assert "email text" in table_creation_cql
|
||||
assert "age int" in table_creation_cql
|
||||
assert "PRIMARY KEY ((collection, customer_id))" in table_creation_cql
|
||||
|
||||
def test_table_creation_without_primary_key(self):
|
||||
"""Test table creation when no primary key is defined"""
|
||||
processor = MagicMock()
|
||||
processor.schemas = {}
|
||||
processor.known_keyspaces = set()
|
||||
processor.known_tables = {}
|
||||
processor.session = MagicMock()
|
||||
processor.sanitize_name = Processor.sanitize_name.__get__(processor, Processor)
|
||||
processor.sanitize_table = Processor.sanitize_table.__get__(processor, Processor)
|
||||
processor.get_cassandra_type = Processor.get_cassandra_type.__get__(processor, Processor)
|
||||
def mock_ensure_keyspace(keyspace):
|
||||
processor.known_keyspaces.add(keyspace)
|
||||
processor.known_tables[keyspace] = set()
|
||||
processor.ensure_keyspace = mock_ensure_keyspace
|
||||
processor.ensure_table = Processor.ensure_table.__get__(processor, Processor)
|
||||
|
||||
# Create schema without primary key
|
||||
schema = RowSchema(
|
||||
name="events",
|
||||
description="Event log",
|
||||
fields=[
|
||||
Field(name="event_type", type="string", size=50),
|
||||
Field(name="timestamp", type="timestamp", size=0)
|
||||
]
|
||||
)
|
||||
|
||||
# Call ensure_table
|
||||
processor.ensure_table("test_user", "events", schema)
|
||||
|
||||
# Check the CQL includes synthetic_id (field names don't get o_ prefix)
|
||||
executed_cql = processor.session.execute.call_args[0][0]
|
||||
assert "synthetic_id uuid" in executed_cql
|
||||
assert "PRIMARY KEY ((collection, synthetic_id))" in executed_cql
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_schema_config_parsing(self):
|
||||
"""Test parsing of schema configurations"""
|
||||
processor = MagicMock()
|
||||
processor.schemas = {}
|
||||
processor.config_key = "schema"
|
||||
processor.on_schema_config = Processor.on_schema_config.__get__(processor, Processor)
|
||||
|
||||
# Create test configuration
|
||||
config = {
|
||||
"schema": {
|
||||
"customer_records": json.dumps({
|
||||
"name": "customer_records",
|
||||
"description": "Customer data",
|
||||
"fields": [
|
||||
{
|
||||
"name": "id",
|
||||
"type": "string",
|
||||
"primary_key": True,
|
||||
"required": True
|
||||
},
|
||||
{
|
||||
"name": "name",
|
||||
"type": "string",
|
||||
"required": True
|
||||
},
|
||||
{
|
||||
"name": "balance",
|
||||
"type": "float",
|
||||
"size": 8
|
||||
}
|
||||
]
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
# Process configuration
|
||||
await processor.on_schema_config(config, version=1)
|
||||
|
||||
# Verify schema was loaded
|
||||
assert "customer_records" in processor.schemas
|
||||
schema = processor.schemas["customer_records"]
|
||||
assert schema.name == "customer_records"
|
||||
assert len(schema.fields) == 3
|
||||
|
||||
# Check field properties
|
||||
id_field = schema.fields[0]
|
||||
assert id_field.name == "id"
|
||||
assert id_field.type == "string"
|
||||
assert id_field.primary is True
|
||||
# Note: Field.required always returns False due to Pulsar schema limitations
|
||||
# The actual required value is tracked during schema parsing
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_object_processing_logic(self):
|
||||
"""Test the logic for processing ExtractedObject"""
|
||||
processor = MagicMock()
|
||||
processor.schemas = {
|
||||
"test_schema": RowSchema(
|
||||
name="test_schema",
|
||||
description="Test",
|
||||
fields=[
|
||||
Field(name="id", type="string", size=50, primary=True),
|
||||
Field(name="value", type="integer", size=4)
|
||||
]
|
||||
)
|
||||
}
|
||||
processor.ensure_table = MagicMock()
|
||||
processor.sanitize_name = Processor.sanitize_name.__get__(processor, Processor)
|
||||
processor.sanitize_table = Processor.sanitize_table.__get__(processor, Processor)
|
||||
processor.convert_value = Processor.convert_value.__get__(processor, Processor)
|
||||
processor.session = MagicMock()
|
||||
processor.on_object = Processor.on_object.__get__(processor, Processor)
|
||||
processor.known_keyspaces = {"test_user"} # Pre-populate to skip validation query
|
||||
processor.known_tables = {"test_user": set()} # Pre-populate
|
||||
|
||||
# Create test object
|
||||
test_obj = ExtractedObject(
|
||||
metadata=Metadata(
|
||||
id="test-001",
|
||||
user="test_user",
|
||||
collection="test_collection",
|
||||
metadata=[]
|
||||
),
|
||||
schema_name="test_schema",
|
||||
values=[{"id": "123", "value": "456"}],
|
||||
confidence=0.9,
|
||||
source_span="test source"
|
||||
)
|
||||
|
||||
# Create mock message
|
||||
msg = MagicMock()
|
||||
msg.value.return_value = test_obj
|
||||
|
||||
# Process object
|
||||
await processor.on_object(msg, None, None)
|
||||
|
||||
# Verify table was ensured
|
||||
processor.ensure_table.assert_called_once_with("test_user", "test_schema", processor.schemas["test_schema"])
|
||||
|
||||
# Verify insert was executed (keyspace normal, table with o_ prefix)
|
||||
processor.session.execute.assert_called_once()
|
||||
insert_cql = processor.session.execute.call_args[0][0]
|
||||
values = processor.session.execute.call_args[0][1]
|
||||
|
||||
assert "INSERT INTO test_user.o_test_schema" in insert_cql
|
||||
assert "collection" in insert_cql
|
||||
assert values[0] == "test_collection" # collection value
|
||||
assert values[1] == "123" # id value (from values[0])
|
||||
assert values[2] == 456 # converted integer value (from values[0])
|
||||
|
||||
def test_secondary_index_creation(self):
|
||||
"""Test that secondary indexes are created for indexed fields"""
|
||||
processor = MagicMock()
|
||||
processor.schemas = {}
|
||||
processor.known_keyspaces = {"test_user"} # Pre-populate to skip validation query
|
||||
processor.known_tables = {"test_user": set()} # Pre-populate
|
||||
processor.session = MagicMock()
|
||||
processor.sanitize_name = Processor.sanitize_name.__get__(processor, Processor)
|
||||
processor.sanitize_table = Processor.sanitize_table.__get__(processor, Processor)
|
||||
processor.get_cassandra_type = Processor.get_cassandra_type.__get__(processor, Processor)
|
||||
def mock_ensure_keyspace(keyspace):
|
||||
processor.known_keyspaces.add(keyspace)
|
||||
if keyspace not in processor.known_tables:
|
||||
processor.known_tables[keyspace] = set()
|
||||
processor.ensure_keyspace = mock_ensure_keyspace
|
||||
processor.ensure_table = Processor.ensure_table.__get__(processor, Processor)
|
||||
|
||||
# Create schema with indexed field
|
||||
schema = RowSchema(
|
||||
name="products",
|
||||
description="Product catalog",
|
||||
fields=[
|
||||
Field(name="product_id", type="string", size=50, primary=True),
|
||||
Field(name="category", type="string", size=30, indexed=True),
|
||||
Field(name="price", type="float", size=8, indexed=True)
|
||||
]
|
||||
)
|
||||
|
||||
# Call ensure_table
|
||||
processor.ensure_table("test_user", "products", schema)
|
||||
|
||||
# Should have 3 calls: create table + 2 indexes
|
||||
assert processor.session.execute.call_count == 3
|
||||
|
||||
# Check index creation calls (table has o_ prefix, fields don't)
|
||||
calls = processor.session.execute.call_args_list
|
||||
index_calls = [call[0][0] for call in calls if "CREATE INDEX" in call[0][0]]
|
||||
assert len(index_calls) == 2
|
||||
assert any("o_products_category_idx" in call for call in index_calls)
|
||||
assert any("o_products_price_idx" in call for call in index_calls)
|
||||
|
||||
|
||||
class TestObjectsCassandraStorageBatchLogic:
|
||||
"""Test batch processing logic in Cassandra storage"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_batch_object_processing_logic(self):
|
||||
"""Test processing of batch ExtractedObjects"""
|
||||
processor = MagicMock()
|
||||
processor.schemas = {
|
||||
"batch_schema": RowSchema(
|
||||
name="batch_schema",
|
||||
description="Test batch schema",
|
||||
fields=[
|
||||
Field(name="id", type="string", size=50, primary=True),
|
||||
Field(name="name", type="string", size=100),
|
||||
Field(name="value", type="integer", size=4)
|
||||
]
|
||||
)
|
||||
}
|
||||
processor.known_keyspaces = {"test_user"} # Pre-populate to skip validation query
|
||||
processor.ensure_table = MagicMock()
|
||||
processor.sanitize_name = Processor.sanitize_name.__get__(processor, Processor)
|
||||
processor.sanitize_table = Processor.sanitize_table.__get__(processor, Processor)
|
||||
processor.convert_value = Processor.convert_value.__get__(processor, Processor)
|
||||
processor.session = MagicMock()
|
||||
processor.on_object = Processor.on_object.__get__(processor, Processor)
|
||||
|
||||
# Create batch object with multiple values
|
||||
batch_obj = ExtractedObject(
|
||||
metadata=Metadata(
|
||||
id="batch-001",
|
||||
user="test_user",
|
||||
collection="batch_collection",
|
||||
metadata=[]
|
||||
),
|
||||
schema_name="batch_schema",
|
||||
values=[
|
||||
{"id": "001", "name": "First", "value": "100"},
|
||||
{"id": "002", "name": "Second", "value": "200"},
|
||||
{"id": "003", "name": "Third", "value": "300"}
|
||||
],
|
||||
confidence=0.95,
|
||||
source_span="batch source"
|
||||
)
|
||||
|
||||
# Create mock message
|
||||
msg = MagicMock()
|
||||
msg.value.return_value = batch_obj
|
||||
|
||||
# Process batch object
|
||||
await processor.on_object(msg, None, None)
|
||||
|
||||
# Verify table was ensured once
|
||||
processor.ensure_table.assert_called_once_with("test_user", "batch_schema", processor.schemas["batch_schema"])
|
||||
|
||||
# Verify 3 separate insert calls (one per batch item)
|
||||
assert processor.session.execute.call_count == 3
|
||||
|
||||
# Check each insert call
|
||||
calls = processor.session.execute.call_args_list
|
||||
for i, call in enumerate(calls):
|
||||
insert_cql = call[0][0]
|
||||
values = call[0][1]
|
||||
|
||||
assert "INSERT INTO test_user.o_batch_schema" in insert_cql
|
||||
assert "collection" in insert_cql
|
||||
|
||||
# Check values for each batch item
|
||||
assert values[0] == "batch_collection" # collection
|
||||
assert values[1] == f"00{i+1}" # id from batch item i
|
||||
assert values[2] == f"First" if i == 0 else f"Second" if i == 1 else f"Third" # name
|
||||
assert values[3] == (i+1) * 100 # converted integer value
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_empty_batch_processing_logic(self):
|
||||
"""Test processing of empty batch ExtractedObjects"""
|
||||
processor = MagicMock()
|
||||
processor.schemas = {
|
||||
"empty_schema": RowSchema(
|
||||
name="empty_schema",
|
||||
fields=[Field(name="id", type="string", size=50, primary=True)]
|
||||
)
|
||||
}
|
||||
processor.ensure_table = MagicMock()
|
||||
processor.sanitize_name = Processor.sanitize_name.__get__(processor, Processor)
|
||||
processor.sanitize_table = Processor.sanitize_table.__get__(processor, Processor)
|
||||
processor.convert_value = Processor.convert_value.__get__(processor, Processor)
|
||||
processor.session = MagicMock()
|
||||
processor.on_object = Processor.on_object.__get__(processor, Processor)
|
||||
processor.known_keyspaces = {"test_user"} # Pre-populate to skip validation query
|
||||
processor.known_tables = {"test_user": set()} # Pre-populate
|
||||
|
||||
# Create empty batch object
|
||||
empty_batch_obj = ExtractedObject(
|
||||
metadata=Metadata(
|
||||
id="empty-001",
|
||||
user="test_user",
|
||||
collection="empty_collection",
|
||||
metadata=[]
|
||||
),
|
||||
schema_name="empty_schema",
|
||||
values=[], # Empty batch
|
||||
confidence=1.0,
|
||||
source_span="empty source"
|
||||
)
|
||||
|
||||
msg = MagicMock()
|
||||
msg.value.return_value = empty_batch_obj
|
||||
|
||||
# Process empty batch object
|
||||
await processor.on_object(msg, None, None)
|
||||
|
||||
# Verify table was ensured
|
||||
processor.ensure_table.assert_called_once()
|
||||
|
||||
# Verify no insert calls for empty batch
|
||||
processor.session.execute.assert_not_called()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_single_item_batch_processing_logic(self):
|
||||
"""Test processing of single-item batch (backward compatibility)"""
|
||||
processor = MagicMock()
|
||||
processor.schemas = {
|
||||
"single_schema": RowSchema(
|
||||
name="single_schema",
|
||||
fields=[
|
||||
Field(name="id", type="string", size=50, primary=True),
|
||||
Field(name="data", type="string", size=100)
|
||||
]
|
||||
)
|
||||
}
|
||||
processor.ensure_table = MagicMock()
|
||||
processor.sanitize_name = Processor.sanitize_name.__get__(processor, Processor)
|
||||
processor.sanitize_table = Processor.sanitize_table.__get__(processor, Processor)
|
||||
processor.convert_value = Processor.convert_value.__get__(processor, Processor)
|
||||
processor.session = MagicMock()
|
||||
processor.on_object = Processor.on_object.__get__(processor, Processor)
|
||||
processor.known_keyspaces = {"test_user"} # Pre-populate to skip validation query
|
||||
processor.known_tables = {"test_user": set()} # Pre-populate
|
||||
|
||||
# Create single-item batch object (backward compatibility case)
|
||||
single_batch_obj = ExtractedObject(
|
||||
metadata=Metadata(
|
||||
id="single-001",
|
||||
user="test_user",
|
||||
collection="single_collection",
|
||||
metadata=[]
|
||||
),
|
||||
schema_name="single_schema",
|
||||
values=[{"id": "single-1", "data": "single data"}], # Array with one item
|
||||
confidence=0.8,
|
||||
source_span="single source"
|
||||
)
|
||||
|
||||
msg = MagicMock()
|
||||
msg.value.return_value = single_batch_obj
|
||||
|
||||
# Process single-item batch object
|
||||
await processor.on_object(msg, None, None)
|
||||
|
||||
# Verify table was ensured
|
||||
processor.ensure_table.assert_called_once()
|
||||
|
||||
# Verify exactly one insert call
|
||||
processor.session.execute.assert_called_once()
|
||||
|
||||
insert_cql = processor.session.execute.call_args[0][0]
|
||||
values = processor.session.execute.call_args[0][1]
|
||||
|
||||
assert "INSERT INTO test_user.o_single_schema" in insert_cql
|
||||
assert values[0] == "single_collection" # collection
|
||||
assert values[1] == "single-1" # id value
|
||||
assert values[2] == "single data" # data value
|
||||
|
||||
def test_batch_value_conversion_logic(self):
|
||||
"""Test value conversion works correctly for batch items"""
|
||||
processor = MagicMock()
|
||||
processor.convert_value = Processor.convert_value.__get__(processor, Processor)
|
||||
|
||||
# Test various conversion scenarios that would occur in batch processing
|
||||
test_cases = [
|
||||
# Integer conversions for batch items
|
||||
("123", "integer", 123),
|
||||
("456", "integer", 456),
|
||||
("789", "integer", 789),
|
||||
# Float conversions for batch items
|
||||
("12.5", "float", 12.5),
|
||||
("34.7", "float", 34.7),
|
||||
# Boolean conversions for batch items
|
||||
("true", "boolean", True),
|
||||
("false", "boolean", False),
|
||||
("1", "boolean", True),
|
||||
("0", "boolean", False),
|
||||
# String conversions for batch items
|
||||
(123, "string", "123"),
|
||||
(45.6, "string", "45.6"),
|
||||
]
|
||||
|
||||
for input_val, field_type, expected_output in test_cases:
|
||||
result = processor.convert_value(input_val, field_type)
|
||||
assert result == expected_output, f"Failed for {input_val} -> {field_type}: got {result}, expected {expected_output}"
|
||||
435
tests/unit/test_storage/test_row_embeddings_qdrant_storage.py
Normal file
435
tests/unit/test_storage/test_row_embeddings_qdrant_storage.py
Normal file
|
|
@ -0,0 +1,435 @@
|
|||
"""
|
||||
Unit tests for trustgraph.storage.row_embeddings.qdrant.write
|
||||
Tests the Stage 2 processor that stores pre-computed row embeddings in Qdrant.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
from unittest import IsolatedAsyncioTestCase
|
||||
|
||||
|
||||
class TestQdrantRowEmbeddingsStorage(IsolatedAsyncioTestCase):
    """Behavioural tests for the Stage 2 Qdrant row-embeddings writer."""

    @patch('trustgraph.storage.row_embeddings.qdrant.write.QdrantClient')
    async def test_processor_initialization_basic(self, client_cls):
        """An explicit store URI and API key are passed through to QdrantClient."""
        from trustgraph.storage.row_embeddings.qdrant.write import Processor

        fake_client = MagicMock()
        client_cls.return_value = fake_client

        processor = Processor(
            store_uri='http://localhost:6333',
            api_key='test-api-key',
            taskgroup=AsyncMock(),
            id='test-qdrant-processor',
        )

        client_cls.assert_called_once_with(
            url='http://localhost:6333', api_key='test-api-key'
        )
        assert hasattr(processor, 'qdrant')
        assert processor.qdrant == fake_client

    @patch('trustgraph.storage.row_embeddings.qdrant.write.QdrantClient')
    async def test_processor_initialization_with_defaults(self, client_cls):
        """Omitted URI/key fall back to localhost:6333 and no API key."""
        from trustgraph.storage.row_embeddings.qdrant.write import Processor

        client_cls.return_value = MagicMock()

        Processor(taskgroup=AsyncMock(), id='test-qdrant-processor')

        client_cls.assert_called_once_with(
            url='http://localhost:6333', api_key=None
        )

    @patch('trustgraph.storage.row_embeddings.qdrant.write.QdrantClient')
    async def test_sanitize_name(self, client_cls):
        """Names are lower-cased, punctuation becomes underscores, and names
        not starting with a letter gain an 'r_' prefix."""
        from trustgraph.storage.row_embeddings.qdrant.write import Processor

        client_cls.return_value = MagicMock()
        processor = Processor(taskgroup=AsyncMock(), id='test-processor')

        # Basic character substitution and case folding.
        assert processor.sanitize_name("simple") == "simple"
        assert processor.sanitize_name("with-dash") == "with_dash"
        assert processor.sanitize_name("with.dot") == "with_dot"
        assert processor.sanitize_name("UPPERCASE") == "uppercase"

        # Names that do not begin with a letter are prefixed.
        assert processor.sanitize_name("123start") == "r_123start"
        assert processor.sanitize_name("_underscore") == "r__underscore"

    @patch('trustgraph.storage.row_embeddings.qdrant.write.QdrantClient')
    async def test_get_collection_name(self, client_cls):
        """Collection names combine user, collection, schema and dimension."""
        from trustgraph.storage.row_embeddings.qdrant.write import Processor

        client_cls.return_value = MagicMock()
        processor = Processor(taskgroup=AsyncMock(), id='test-processor')

        name = processor.get_collection_name(
            user="test_user",
            collection="test_collection",
            schema_name="customer_data",
            dimension=384,
        )

        assert name == "rows_test_user_test_collection_customer_data_384"

    @patch('trustgraph.storage.row_embeddings.qdrant.write.QdrantClient')
    async def test_ensure_collection_creates_new(self, client_cls):
        """A missing Qdrant collection is created, then remembered in the cache."""
        from trustgraph.storage.row_embeddings.qdrant.write import Processor

        fake_client = MagicMock()
        fake_client.collection_exists.return_value = False
        client_cls.return_value = fake_client

        processor = Processor(taskgroup=AsyncMock(), id='test-processor')

        processor.ensure_collection("test_collection", 384)

        fake_client.collection_exists.assert_called_once_with("test_collection")
        fake_client.create_collection.assert_called_once()
        # Created collections are cached to avoid repeat existence checks.
        assert "test_collection" in processor.created_collections

    @patch('trustgraph.storage.row_embeddings.qdrant.write.QdrantClient')
    async def test_ensure_collection_skips_existing(self, client_cls):
        """An already-existing collection is never recreated."""
        from trustgraph.storage.row_embeddings.qdrant.write import Processor

        fake_client = MagicMock()
        fake_client.collection_exists.return_value = True
        client_cls.return_value = fake_client

        processor = Processor(taskgroup=AsyncMock(), id='test-processor')

        processor.ensure_collection("existing_collection", 384)

        fake_client.collection_exists.assert_called_once()
        fake_client.create_collection.assert_not_called()

    @patch('trustgraph.storage.row_embeddings.qdrant.write.QdrantClient')
    async def test_ensure_collection_uses_cache(self, client_cls):
        """A cached collection short-circuits both the check and the create."""
        from trustgraph.storage.row_embeddings.qdrant.write import Processor

        fake_client = MagicMock()
        client_cls.return_value = fake_client

        processor = Processor(taskgroup=AsyncMock(), id='test-processor')
        processor.created_collections.add("cached_collection")

        processor.ensure_collection("cached_collection", 384)

        # Neither existence check nor creation should touch Qdrant.
        fake_client.collection_exists.assert_not_called()
        fake_client.create_collection.assert_not_called()

    @patch('trustgraph.storage.row_embeddings.qdrant.write.QdrantClient')
    @patch('trustgraph.storage.row_embeddings.qdrant.write.uuid')
    async def test_on_embeddings_basic(self, uuid_mod, client_cls):
        """A single embedding with one vector yields one upsert whose
        collection name, vector and payload match the message."""
        from trustgraph.storage.row_embeddings.qdrant.write import Processor
        from trustgraph.schema import RowEmbeddings, RowIndexEmbedding, Metadata

        fake_client = MagicMock()
        fake_client.collection_exists.return_value = True
        client_cls.return_value = fake_client
        uuid_mod.uuid4.return_value = 'test-uuid-123'

        processor = Processor(taskgroup=AsyncMock(), id='test-processor')
        processor.known_collections[('test_user', 'test_collection')] = {}

        metadata = MagicMock()
        metadata.user = 'test_user'
        metadata.collection = 'test_collection'
        metadata.id = 'doc-123'

        message = RowEmbeddings(
            metadata=metadata,
            schema_name='customers',
            embeddings=[
                RowIndexEmbedding(
                    index_name='customer_id',
                    index_value=['CUST001'],
                    text='CUST001',
                    vectors=[[0.1, 0.2, 0.3]],
                )
            ],
        )

        msg = MagicMock()
        msg.value.return_value = message

        await processor.on_embeddings(msg, MagicMock(), MagicMock())

        fake_client.upsert.assert_called_once()

        kwargs = fake_client.upsert.call_args[1]
        # Trailing "_3" is the dimension, taken from the vector length.
        assert kwargs['collection_name'] == 'rows_test_user_test_collection_customers_3'

        point = kwargs['points'][0]
        assert point.vector == [0.1, 0.2, 0.3]
        assert point.payload['index_name'] == 'customer_id'
        assert point.payload['index_value'] == ['CUST001']
        assert point.payload['text'] == 'CUST001'

    @patch('trustgraph.storage.row_embeddings.qdrant.write.QdrantClient')
    @patch('trustgraph.storage.row_embeddings.qdrant.write.uuid')
    async def test_on_embeddings_multiple_vectors(self, uuid_mod, client_cls):
        """Every vector in an embedding becomes its own upsert call."""
        from trustgraph.storage.row_embeddings.qdrant.write import Processor
        from trustgraph.schema import RowEmbeddings, RowIndexEmbedding

        fake_client = MagicMock()
        fake_client.collection_exists.return_value = True
        client_cls.return_value = fake_client
        uuid_mod.uuid4.return_value = 'test-uuid'

        processor = Processor(taskgroup=AsyncMock(), id='test-processor')
        processor.known_collections[('test_user', 'test_collection')] = {}

        metadata = MagicMock()
        metadata.user = 'test_user'
        metadata.collection = 'test_collection'
        metadata.id = 'doc-123'

        message = RowEmbeddings(
            metadata=metadata,
            schema_name='people',
            embeddings=[
                RowIndexEmbedding(
                    index_name='name',
                    index_value=['John Doe'],
                    text='John Doe',
                    vectors=[[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]],
                )
            ],
        )

        msg = MagicMock()
        msg.value.return_value = message

        await processor.on_embeddings(msg, MagicMock(), MagicMock())

        # One upsert per vector.
        assert fake_client.upsert.call_count == 3

    @patch('trustgraph.storage.row_embeddings.qdrant.write.QdrantClient')
    async def test_on_embeddings_skips_empty_vectors(self, client_cls):
        """Embeddings that carry no vectors produce no upserts at all."""
        from trustgraph.storage.row_embeddings.qdrant.write import Processor
        from trustgraph.schema import RowEmbeddings, RowIndexEmbedding

        fake_client = MagicMock()
        client_cls.return_value = fake_client

        processor = Processor(taskgroup=AsyncMock(), id='test-processor')
        processor.known_collections[('test_user', 'test_collection')] = {}

        metadata = MagicMock()
        metadata.user = 'test_user'
        metadata.collection = 'test_collection'
        metadata.id = 'doc-123'

        message = RowEmbeddings(
            metadata=metadata,
            schema_name='items',
            embeddings=[
                RowIndexEmbedding(
                    index_name='id',
                    index_value=['123'],
                    text='123',
                    vectors=[],  # nothing to store
                )
            ],
        )

        msg = MagicMock()
        msg.value.return_value = message

        await processor.on_embeddings(msg, MagicMock(), MagicMock())

        fake_client.upsert.assert_not_called()

    @patch('trustgraph.storage.row_embeddings.qdrant.write.QdrantClient')
    async def test_on_embeddings_drops_unknown_collection(self, client_cls):
        """Messages addressed to an unregistered user/collection are dropped."""
        from trustgraph.storage.row_embeddings.qdrant.write import Processor
        from trustgraph.schema import RowEmbeddings, RowIndexEmbedding

        fake_client = MagicMock()
        client_cls.return_value = fake_client

        processor = Processor(taskgroup=AsyncMock(), id='test-processor')
        # Deliberately register no collections.

        metadata = MagicMock()
        metadata.user = 'unknown_user'
        metadata.collection = 'unknown_collection'
        metadata.id = 'doc-123'

        message = RowEmbeddings(
            metadata=metadata,
            schema_name='items',
            embeddings=[
                RowIndexEmbedding(
                    index_name='id',
                    index_value=['123'],
                    text='123',
                    vectors=[[0.1, 0.2]],
                )
            ],
        )

        msg = MagicMock()
        msg.value.return_value = message

        await processor.on_embeddings(msg, MagicMock(), MagicMock())

        fake_client.upsert.assert_not_called()

    @patch('trustgraph.storage.row_embeddings.qdrant.write.QdrantClient')
    async def test_delete_collection(self, client_cls):
        """delete_collection drops every Qdrant collection belonging to the
        user/collection pair and evicts any cached names."""
        from trustgraph.storage.row_embeddings.qdrant.write import Processor

        fake_client = MagicMock()

        matching_a = MagicMock()
        matching_a.name = 'rows_test_user_test_collection_schema1_384'
        matching_b = MagicMock()
        matching_b.name = 'rows_test_user_test_collection_schema2_384'
        unrelated = MagicMock()
        unrelated.name = 'rows_other_user_other_collection_schema_384'

        listing = MagicMock()
        listing.collections = [matching_a, matching_b, unrelated]
        fake_client.get_collections.return_value = listing

        client_cls.return_value = fake_client

        processor = Processor(taskgroup=AsyncMock(), id='test-processor')
        processor.created_collections.add('rows_test_user_test_collection_schema1_384')

        await processor.delete_collection('test_user', 'test_collection')

        # Only the two matching collections are dropped; the unrelated one survives.
        assert fake_client.delete_collection.call_count == 2

        # The cached entry for a deleted collection must be evicted.
        assert 'rows_test_user_test_collection_schema1_384' not in processor.created_collections

    @patch('trustgraph.storage.row_embeddings.qdrant.write.QdrantClient')
    async def test_delete_collection_schema(self, client_cls):
        """delete_collection_schema drops only the named schema's collection."""
        from trustgraph.storage.row_embeddings.qdrant.write import Processor

        fake_client = MagicMock()

        customers = MagicMock()
        customers.name = 'rows_test_user_test_collection_customers_384'
        orders = MagicMock()
        orders.name = 'rows_test_user_test_collection_orders_384'

        listing = MagicMock()
        listing.collections = [customers, orders]
        fake_client.get_collections.return_value = listing

        client_cls.return_value = fake_client

        processor = Processor(taskgroup=AsyncMock(), id='test-processor')

        await processor.delete_collection_schema(
            'test_user', 'test_collection', 'customers'
        )

        # Only the customers-schema collection is deleted.
        fake_client.delete_collection.assert_called_once()
        deleted = fake_client.delete_collection.call_args[0][0]
        assert deleted == 'rows_test_user_test_collection_customers_384'
||||
|
||||
# Allow running this test module directly; pytest collects this file.
if __name__ == '__main__':
    pytest.main([__file__])
|
||||
474
tests/unit/test_storage/test_rows_cassandra_storage.py
Normal file
474
tests/unit/test_storage/test_rows_cassandra_storage.py
Normal file
|
|
@ -0,0 +1,474 @@
|
|||
"""
|
||||
Unit tests for Cassandra Row Storage Processor (Unified Table Implementation)
|
||||
|
||||
Tests the business logic of the row storage processor including:
|
||||
- Schema configuration handling
|
||||
- Name sanitization
|
||||
- Unified table structure
|
||||
- Index management
|
||||
- Row storage with multi-index support
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, AsyncMock, patch
|
||||
import json
|
||||
|
||||
from trustgraph.storage.rows.cassandra.write import Processor
|
||||
from trustgraph.schema import ExtractedObject, Metadata, RowSchema, Field
|
||||
|
||||
|
||||
class TestRowsCassandraStorageLogic:
    """Business-logic tests for the unified-table Cassandra rows writer:
    name sanitization, index discovery, index-value construction, schema
    config parsing, and multi-index row storage."""

    def test_sanitize_name(self):
        """Names are lower-cased, non-alphanumerics become underscores, and
        names not starting with a letter gain an 'r_' prefix."""
        processor = MagicMock()
        processor.sanitize_name = Processor.sanitize_name.__get__(processor, Processor)

        assert processor.sanitize_name("simple_name") == "simple_name"
        assert processor.sanitize_name("Name-With-Dashes") == "name_with_dashes"
        assert processor.sanitize_name("name.with.dots") == "name_with_dots"
        assert processor.sanitize_name("123_starts_with_number") == "r_123_starts_with_number"
        assert processor.sanitize_name("name with spaces") == "name_with_spaces"
        assert processor.sanitize_name("special!@#$%^chars") == "special______chars"
        assert processor.sanitize_name("UPPERCASE") == "uppercase"
        assert processor.sanitize_name("CamelCase") == "camelcase"
        assert processor.sanitize_name("_underscore_start") == "r__underscore_start"

    def test_get_index_names(self):
        """Primary-key and indexed fields yield index names; plain fields do not."""
        processor = MagicMock()
        processor.get_index_names = Processor.get_index_names.__get__(processor, Processor)

        schema = RowSchema(
            name="test_schema",
            description="Test",
            fields=[
                Field(name="id", type="string", primary=True),
                Field(name="category", type="string", indexed=True),
                Field(name="name", type="string"),  # neither primary nor indexed
                Field(name="status", type="string", indexed=True),
            ],
        )

        names = processor.get_index_names(schema)

        assert "id" in names
        assert "category" in names
        assert "status" in names
        assert "name" not in names
        assert len(names) == 3

    def test_get_index_names_no_indexes(self):
        """A schema with no primary or indexed fields produces no index names."""
        processor = MagicMock()
        processor.get_index_names = Processor.get_index_names.__get__(processor, Processor)

        schema = RowSchema(
            name="no_index_schema",
            fields=[
                Field(name="data1", type="string"),
                Field(name="data2", type="string"),
            ],
        )

        assert len(processor.get_index_names(schema)) == 0

    def test_build_index_value(self):
        """Single-field indexes yield a one-element list; a missing field
        falls back to an empty string rather than raising."""
        processor = MagicMock()
        processor.build_index_value = Processor.build_index_value.__get__(processor, Processor)

        row = {"id": "123", "category": "electronics", "name": "Widget"}

        assert processor.build_index_value(row, "id") == ["123"]
        assert processor.build_index_value(row, "category") == ["electronics"]
        # Absent fields map to "".
        assert processor.build_index_value(row, "missing") == [""]

    def test_build_index_value_composite(self):
        """Comma-separated index names build a multi-element value list."""
        processor = MagicMock()
        processor.build_index_value = Processor.build_index_value.__get__(processor, Processor)

        row = {"region": "us-west", "category": "electronics", "id": "123"}

        assert processor.build_index_value(row, "region,category") == ["us-west", "electronics"]

    @pytest.mark.asyncio
    async def test_schema_config_parsing(self):
        """on_schema_config must turn JSON schema config into RowSchema objects,
        mapping 'primary_key' onto Field.primary."""
        processor = MagicMock()
        processor.schemas = {}
        processor.config_key = "schema"
        processor.registered_partitions = set()
        processor.on_schema_config = Processor.on_schema_config.__get__(processor, Processor)

        config = {
            "schema": {
                "customer_records": json.dumps({
                    "name": "customer_records",
                    "description": "Customer data",
                    "fields": [
                        {
                            "name": "id",
                            "type": "string",
                            "primary_key": True,
                            "required": True,
                        },
                        {
                            "name": "name",
                            "type": "string",
                            "required": True,
                        },
                        {
                            "name": "category",
                            "type": "string",
                            "indexed": True,
                        },
                    ],
                })
            }
        }

        await processor.on_schema_config(config, version=1)

        assert "customer_records" in processor.schemas
        schema = processor.schemas["customer_records"]
        assert schema.name == "customer_records"
        assert len(schema.fields) == 3

        # JSON "primary_key" is carried over as the Field.primary flag.
        id_field = schema.fields[0]
        assert id_field.name == "id"
        assert id_field.type == "string"
        assert id_field.primary is True

    @pytest.mark.asyncio
    async def test_object_processing_stores_data_map(self):
        """A stored row lands in the unified `rows` table with its data held
        as a map<text, text>."""
        processor = MagicMock()
        processor.schemas = {
            "test_schema": RowSchema(
                name="test_schema",
                description="Test",
                fields=[
                    Field(name="id", type="string", size=50, primary=True),
                    Field(name="value", type="string", size=100),
                ],
            )
        }
        processor.tables_initialized = {"test_user"}
        processor.registered_partitions = set()
        processor.session = MagicMock()
        # Bind the real helpers onto the mock so on_object runs genuine logic.
        for name in ("sanitize_name", "get_index_names", "build_index_value", "on_object"):
            setattr(processor, name, getattr(Processor, name).__get__(processor, Processor))
        processor.ensure_tables = MagicMock()
        processor.register_partitions = MagicMock()
        processor.collection_exists = MagicMock(return_value=True)

        obj = ExtractedObject(
            metadata=Metadata(
                id="test-001",
                user="test_user",
                collection="test_collection",
                metadata=[],
            ),
            schema_name="test_schema",
            values=[{"id": "123", "value": "test_data"}],
            confidence=0.9,
            source_span="test source",
        )

        msg = MagicMock()
        msg.value.return_value = obj

        await processor.on_object(msg, None, None)

        processor.session.execute.assert_called()
        cql = processor.session.execute.call_args[0][0]
        bound = processor.session.execute.call_args[0][1]

        # The unified rows table is used, not a per-schema table.
        assert "INSERT INTO test_user.rows" in cql

        # Bound values: (collection, schema_name, index_name, index_value, data, source)
        assert bound[0] == "test_collection"
        assert bound[1] == "test_schema"
        assert bound[2] == "id"                                # primary-key index
        assert bound[3] == ["123"]                             # index value as list
        assert bound[4] == {"id": "123", "value": "test_data"} # data map
        assert bound[5] == ""                                  # source

    @pytest.mark.asyncio
    async def test_object_processing_multiple_indexes(self):
        """One insert is issued per indexed field (primary key plus each
        indexed column)."""
        processor = MagicMock()
        processor.schemas = {
            "multi_index_schema": RowSchema(
                name="multi_index_schema",
                fields=[
                    Field(name="id", type="string", primary=True),
                    Field(name="category", type="string", indexed=True),
                    Field(name="status", type="string", indexed=True),
                ],
            )
        }
        processor.tables_initialized = {"test_user"}
        processor.registered_partitions = set()
        processor.session = MagicMock()
        for name in ("sanitize_name", "get_index_names", "build_index_value", "on_object"):
            setattr(processor, name, getattr(Processor, name).__get__(processor, Processor))
        processor.ensure_tables = MagicMock()
        processor.register_partitions = MagicMock()
        processor.collection_exists = MagicMock(return_value=True)

        obj = ExtractedObject(
            metadata=Metadata(
                id="test-001",
                user="test_user",
                collection="test_collection",
                metadata=[],
            ),
            schema_name="multi_index_schema",
            values=[{"id": "123", "category": "electronics", "status": "active"}],
            confidence=0.9,
            source_span="",
        )

        msg = MagicMock()
        msg.value.return_value = obj

        await processor.on_object(msg, None, None)

        # id, category and status each get their own copy of the row.
        assert processor.session.execute.call_count == 3

        # index_name is the 3rd bound value in each insert.
        seen = {call[0][1][2] for call in processor.session.execute.call_args_list}
        assert seen == {"id", "category", "status"}
|
||||
|
||||
class TestRowsCassandraStorageBatchLogic:
|
||||
"""Test batch processing logic for unified table implementation"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_batch_object_processing(self):
|
||||
"""Test processing of batch ExtractedObjects"""
|
||||
processor = MagicMock()
|
||||
processor.schemas = {
|
||||
"batch_schema": RowSchema(
|
||||
name="batch_schema",
|
||||
fields=[
|
||||
Field(name="id", type="string", primary=True),
|
||||
Field(name="name", type="string")
|
||||
]
|
||||
)
|
||||
}
|
||||
processor.tables_initialized = {"test_user"}
|
||||
processor.registered_partitions = set()
|
||||
processor.session = MagicMock()
|
||||
processor.sanitize_name = Processor.sanitize_name.__get__(processor, Processor)
|
||||
processor.get_index_names = Processor.get_index_names.__get__(processor, Processor)
|
||||
processor.build_index_value = Processor.build_index_value.__get__(processor, Processor)
|
||||
processor.ensure_tables = MagicMock()
|
||||
processor.register_partitions = MagicMock()
|
||||
processor.collection_exists = MagicMock(return_value=True)
|
||||
processor.on_object = Processor.on_object.__get__(processor, Processor)
|
||||
|
||||
# Create batch object with multiple values
|
||||
batch_obj = ExtractedObject(
|
||||
metadata=Metadata(
|
||||
id="batch-001",
|
||||
user="test_user",
|
||||
collection="batch_collection",
|
||||
metadata=[]
|
||||
),
|
||||
schema_name="batch_schema",
|
||||
values=[
|
||||
{"id": "001", "name": "First"},
|
||||
{"id": "002", "name": "Second"},
|
||||
{"id": "003", "name": "Third"}
|
||||
],
|
||||
confidence=0.95,
|
||||
source_span=""
|
||||
)
|
||||
|
||||
msg = MagicMock()
|
||||
msg.value.return_value = batch_obj
|
||||
|
||||
await processor.on_object(msg, None, None)
|
||||
|
||||
# Should have 3 inserts (one per row, one index per row since only primary key)
|
||||
assert processor.session.execute.call_count == 3
|
||||
|
||||
# Check each insert has different id
|
||||
ids_inserted = set()
|
||||
for call in processor.session.execute.call_args_list:
|
||||
values = call[0][1]
|
||||
ids_inserted.add(tuple(values[3])) # index_value is 4th value
|
||||
|
||||
assert ids_inserted == {("001",), ("002",), ("003",)}
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_empty_batch_processing(self):
|
||||
"""Test processing of empty batch ExtractedObjects"""
|
||||
processor = MagicMock()
|
||||
processor.schemas = {
|
||||
"empty_schema": RowSchema(
|
||||
name="empty_schema",
|
||||
fields=[Field(name="id", type="string", primary=True)]
|
||||
)
|
||||
}
|
||||
processor.tables_initialized = {"test_user"}
|
||||
processor.registered_partitions = set()
|
||||
processor.session = MagicMock()
|
||||
processor.sanitize_name = Processor.sanitize_name.__get__(processor, Processor)
|
||||
processor.get_index_names = Processor.get_index_names.__get__(processor, Processor)
|
||||
processor.build_index_value = Processor.build_index_value.__get__(processor, Processor)
|
||||
processor.ensure_tables = MagicMock()
|
||||
processor.register_partitions = MagicMock()
|
||||
processor.collection_exists = MagicMock(return_value=True)
|
||||
processor.on_object = Processor.on_object.__get__(processor, Processor)
|
||||
|
||||
# Create empty batch object
|
||||
empty_batch_obj = ExtractedObject(
|
||||
metadata=Metadata(
|
||||
id="empty-001",
|
||||
user="test_user",
|
||||
collection="empty_collection",
|
||||
metadata=[]
|
||||
),
|
||||
schema_name="empty_schema",
|
||||
values=[], # Empty batch
|
||||
confidence=1.0,
|
||||
source_span=""
|
||||
)
|
||||
|
||||
msg = MagicMock()
|
||||
msg.value.return_value = empty_batch_obj
|
||||
|
||||
await processor.on_object(msg, None, None)
|
||||
|
||||
# Verify no insert calls for empty batch
|
||||
processor.session.execute.assert_not_called()
|
||||
|
||||
|
||||
class TestUnifiedTableStructure:
|
||||
"""Test the unified rows table structure"""
|
||||
|
||||
def test_ensure_tables_creates_unified_structure(self):
|
||||
"""Test that ensure_tables creates the unified rows table"""
|
||||
processor = MagicMock()
|
||||
processor.known_keyspaces = {"test_user"}
|
||||
processor.tables_initialized = set()
|
||||
processor.session = MagicMock()
|
||||
processor.sanitize_name = Processor.sanitize_name.__get__(processor, Processor)
|
||||
processor.ensure_keyspace = MagicMock()
|
||||
processor.ensure_tables = Processor.ensure_tables.__get__(processor, Processor)
|
||||
|
||||
processor.ensure_tables("test_user")
|
||||
|
||||
# Should have 2 calls: create rows table + create row_partitions table
|
||||
assert processor.session.execute.call_count == 2
|
||||
|
||||
# Check rows table creation
|
||||
rows_cql = processor.session.execute.call_args_list[0][0][0]
|
||||
assert "CREATE TABLE IF NOT EXISTS test_user.rows" in rows_cql
|
||||
assert "collection text" in rows_cql
|
||||
assert "schema_name text" in rows_cql
|
||||
assert "index_name text" in rows_cql
|
||||
assert "index_value frozen<list<text>>" in rows_cql
|
||||
assert "data map<text, text>" in rows_cql
|
||||
assert "source text" in rows_cql
|
||||
assert "PRIMARY KEY ((collection, schema_name, index_name), index_value)" in rows_cql
|
||||
|
||||
# Check row_partitions table creation
|
||||
partitions_cql = processor.session.execute.call_args_list[1][0][0]
|
||||
assert "CREATE TABLE IF NOT EXISTS test_user.row_partitions" in partitions_cql
|
||||
assert "PRIMARY KEY ((collection), schema_name, index_name)" in partitions_cql
|
||||
|
||||
# Verify keyspace added to initialized set
|
||||
assert "test_user" in processor.tables_initialized
|
||||
|
||||
def test_ensure_tables_idempotent(self):
|
||||
"""Test that ensure_tables is idempotent"""
|
||||
processor = MagicMock()
|
||||
processor.tables_initialized = {"test_user"} # Already initialized
|
||||
processor.session = MagicMock()
|
||||
processor.ensure_tables = Processor.ensure_tables.__get__(processor, Processor)
|
||||
|
||||
processor.ensure_tables("test_user")
|
||||
|
||||
# Should not execute any CQL since already initialized
|
||||
processor.session.execute.assert_not_called()
|
||||
|
||||
|
||||
class TestPartitionRegistration:
|
||||
"""Test partition registration for tracking what's stored"""
|
||||
|
||||
def test_register_partitions(self):
|
||||
"""Test registering partitions for a collection/schema pair"""
|
||||
processor = MagicMock()
|
||||
processor.registered_partitions = set()
|
||||
processor.session = MagicMock()
|
||||
processor.schemas = {
|
||||
"test_schema": RowSchema(
|
||||
name="test_schema",
|
||||
fields=[
|
||||
Field(name="id", type="string", primary=True),
|
||||
Field(name="category", type="string", indexed=True)
|
||||
]
|
||||
)
|
||||
}
|
||||
processor.sanitize_name = Processor.sanitize_name.__get__(processor, Processor)
|
||||
processor.get_index_names = Processor.get_index_names.__get__(processor, Processor)
|
||||
processor.register_partitions = Processor.register_partitions.__get__(processor, Processor)
|
||||
|
||||
processor.register_partitions("test_user", "test_collection", "test_schema")
|
||||
|
||||
# Should have 2 inserts (one per index: id, category)
|
||||
assert processor.session.execute.call_count == 2
|
||||
|
||||
# Verify cache was updated
|
||||
assert ("test_collection", "test_schema") in processor.registered_partitions
|
||||
|
||||
def test_register_partitions_idempotent(self):
|
||||
"""Test that partition registration is idempotent"""
|
||||
processor = MagicMock()
|
||||
processor.registered_partitions = {("test_collection", "test_schema")} # Already registered
|
||||
processor.session = MagicMock()
|
||||
processor.register_partitions = Processor.register_partitions.__get__(processor, Processor)
|
||||
|
||||
processor.register_partitions("test_user", "test_collection", "test_schema")
|
||||
|
||||
# Should not execute any CQL since already registered
|
||||
processor.session.execute.assert_not_called()
|
||||
|
|
@ -6,7 +6,8 @@ import pytest
|
|||
from unittest.mock import MagicMock, patch, AsyncMock
|
||||
|
||||
from trustgraph.storage.triples.cassandra.write import Processor
|
||||
from trustgraph.schema import Value, Triple
|
||||
from trustgraph.schema import Triple, LITERAL, IRI
|
||||
from trustgraph.direct.cassandra_kg import DEFAULT_GRAPH
|
||||
|
||||
|
||||
class TestCassandraStorageProcessor:
|
||||
|
|
@ -86,29 +87,29 @@ class TestCassandraStorageProcessor:
|
|||
assert processor.cassandra_username == 'new-user' # Only cassandra_* params work
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch('trustgraph.storage.triples.cassandra.write.KnowledgeGraph')
|
||||
async def test_table_switching_with_auth(self, mock_trustgraph):
|
||||
@patch('trustgraph.storage.triples.cassandra.write.EntityCentricKnowledgeGraph')
|
||||
async def test_table_switching_with_auth(self, mock_kg_class):
|
||||
"""Test table switching logic when authentication is provided"""
|
||||
taskgroup_mock = MagicMock()
|
||||
mock_tg_instance = MagicMock()
|
||||
mock_trustgraph.return_value = mock_tg_instance
|
||||
|
||||
mock_kg_class.return_value = mock_tg_instance
|
||||
|
||||
processor = Processor(
|
||||
taskgroup=taskgroup_mock,
|
||||
cassandra_username='testuser',
|
||||
cassandra_password='testpass'
|
||||
)
|
||||
|
||||
|
||||
# Create mock message
|
||||
mock_message = MagicMock()
|
||||
mock_message.metadata.user = 'user1'
|
||||
mock_message.metadata.collection = 'collection1'
|
||||
mock_message.triples = []
|
||||
|
||||
|
||||
await processor.store_triples(mock_message)
|
||||
|
||||
|
||||
# Verify KnowledgeGraph was called with auth parameters
|
||||
mock_trustgraph.assert_called_once_with(
|
||||
mock_kg_class.assert_called_once_with(
|
||||
hosts=['cassandra'], # Updated default
|
||||
keyspace='user1',
|
||||
username='testuser',
|
||||
|
|
@ -117,128 +118,150 @@ class TestCassandraStorageProcessor:
|
|||
assert processor.table == 'user1'
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch('trustgraph.storage.triples.cassandra.write.KnowledgeGraph')
|
||||
async def test_table_switching_without_auth(self, mock_trustgraph):
|
||||
@patch('trustgraph.storage.triples.cassandra.write.EntityCentricKnowledgeGraph')
|
||||
async def test_table_switching_without_auth(self, mock_kg_class):
|
||||
"""Test table switching logic when no authentication is provided"""
|
||||
taskgroup_mock = MagicMock()
|
||||
mock_tg_instance = MagicMock()
|
||||
mock_trustgraph.return_value = mock_tg_instance
|
||||
|
||||
mock_kg_class.return_value = mock_tg_instance
|
||||
|
||||
processor = Processor(taskgroup=taskgroup_mock)
|
||||
|
||||
|
||||
# Create mock message
|
||||
mock_message = MagicMock()
|
||||
mock_message.metadata.user = 'user2'
|
||||
mock_message.metadata.collection = 'collection2'
|
||||
mock_message.triples = []
|
||||
|
||||
|
||||
await processor.store_triples(mock_message)
|
||||
|
||||
|
||||
# Verify KnowledgeGraph was called without auth parameters
|
||||
mock_trustgraph.assert_called_once_with(
|
||||
mock_kg_class.assert_called_once_with(
|
||||
hosts=['cassandra'], # Updated default
|
||||
keyspace='user2'
|
||||
)
|
||||
assert processor.table == 'user2'
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch('trustgraph.storage.triples.cassandra.write.KnowledgeGraph')
|
||||
async def test_table_reuse_when_same(self, mock_trustgraph):
|
||||
@patch('trustgraph.storage.triples.cassandra.write.EntityCentricKnowledgeGraph')
|
||||
async def test_table_reuse_when_same(self, mock_kg_class):
|
||||
"""Test that TrustGraph is not recreated when table hasn't changed"""
|
||||
taskgroup_mock = MagicMock()
|
||||
mock_tg_instance = MagicMock()
|
||||
mock_trustgraph.return_value = mock_tg_instance
|
||||
|
||||
mock_kg_class.return_value = mock_tg_instance
|
||||
|
||||
processor = Processor(taskgroup=taskgroup_mock)
|
||||
|
||||
|
||||
# Create mock message
|
||||
mock_message = MagicMock()
|
||||
mock_message.metadata.user = 'user1'
|
||||
mock_message.metadata.collection = 'collection1'
|
||||
mock_message.triples = []
|
||||
|
||||
|
||||
# First call should create TrustGraph
|
||||
await processor.store_triples(mock_message)
|
||||
assert mock_trustgraph.call_count == 1
|
||||
|
||||
assert mock_kg_class.call_count == 1
|
||||
|
||||
# Second call with same table should reuse TrustGraph
|
||||
await processor.store_triples(mock_message)
|
||||
assert mock_trustgraph.call_count == 1 # Should not increase
|
||||
assert mock_kg_class.call_count == 1 # Should not increase
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch('trustgraph.storage.triples.cassandra.write.KnowledgeGraph')
|
||||
async def test_triple_insertion(self, mock_trustgraph):
|
||||
@patch('trustgraph.storage.triples.cassandra.write.EntityCentricKnowledgeGraph')
|
||||
async def test_triple_insertion(self, mock_kg_class):
|
||||
"""Test that triples are properly inserted into Cassandra"""
|
||||
taskgroup_mock = MagicMock()
|
||||
mock_tg_instance = MagicMock()
|
||||
mock_trustgraph.return_value = mock_tg_instance
|
||||
|
||||
mock_kg_class.return_value = mock_tg_instance
|
||||
|
||||
processor = Processor(taskgroup=taskgroup_mock)
|
||||
|
||||
# Create mock triples
|
||||
|
||||
# Create mock triples with proper Term structure
|
||||
triple1 = MagicMock()
|
||||
triple1.s.type = LITERAL
|
||||
triple1.s.value = 'subject1'
|
||||
triple1.s.datatype = ''
|
||||
triple1.s.language = ''
|
||||
triple1.p.type = LITERAL
|
||||
triple1.p.value = 'predicate1'
|
||||
triple1.o.type = LITERAL
|
||||
triple1.o.value = 'object1'
|
||||
|
||||
triple1.o.datatype = ''
|
||||
triple1.o.language = ''
|
||||
triple1.g = None
|
||||
|
||||
triple2 = MagicMock()
|
||||
triple2.s.type = LITERAL
|
||||
triple2.s.value = 'subject2'
|
||||
triple2.s.datatype = ''
|
||||
triple2.s.language = ''
|
||||
triple2.p.type = LITERAL
|
||||
triple2.p.value = 'predicate2'
|
||||
triple2.o.type = LITERAL
|
||||
triple2.o.value = 'object2'
|
||||
|
||||
triple2.o.datatype = ''
|
||||
triple2.o.language = ''
|
||||
triple2.g = None
|
||||
|
||||
# Create mock message
|
||||
mock_message = MagicMock()
|
||||
mock_message.metadata.user = 'user1'
|
||||
mock_message.metadata.collection = 'collection1'
|
||||
mock_message.triples = [triple1, triple2]
|
||||
|
||||
|
||||
await processor.store_triples(mock_message)
|
||||
|
||||
# Verify both triples were inserted
|
||||
|
||||
# Verify both triples were inserted (with g=, otype=, dtype=, lang= parameters)
|
||||
assert mock_tg_instance.insert.call_count == 2
|
||||
mock_tg_instance.insert.assert_any_call('collection1', 'subject1', 'predicate1', 'object1')
|
||||
mock_tg_instance.insert.assert_any_call('collection1', 'subject2', 'predicate2', 'object2')
|
||||
mock_tg_instance.insert.assert_any_call(
|
||||
'collection1', 'subject1', 'predicate1', 'object1',
|
||||
g=DEFAULT_GRAPH, otype='l', dtype='', lang=''
|
||||
)
|
||||
mock_tg_instance.insert.assert_any_call(
|
||||
'collection1', 'subject2', 'predicate2', 'object2',
|
||||
g=DEFAULT_GRAPH, otype='l', dtype='', lang=''
|
||||
)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch('trustgraph.storage.triples.cassandra.write.KnowledgeGraph')
|
||||
async def test_triple_insertion_with_empty_list(self, mock_trustgraph):
|
||||
@patch('trustgraph.storage.triples.cassandra.write.EntityCentricKnowledgeGraph')
|
||||
async def test_triple_insertion_with_empty_list(self, mock_kg_class):
|
||||
"""Test behavior when message has no triples"""
|
||||
taskgroup_mock = MagicMock()
|
||||
mock_tg_instance = MagicMock()
|
||||
mock_trustgraph.return_value = mock_tg_instance
|
||||
|
||||
mock_kg_class.return_value = mock_tg_instance
|
||||
|
||||
processor = Processor(taskgroup=taskgroup_mock)
|
||||
|
||||
|
||||
# Create mock message with empty triples
|
||||
mock_message = MagicMock()
|
||||
mock_message.metadata.user = 'user1'
|
||||
mock_message.metadata.collection = 'collection1'
|
||||
mock_message.triples = []
|
||||
|
||||
|
||||
await processor.store_triples(mock_message)
|
||||
|
||||
|
||||
# Verify no triples were inserted
|
||||
mock_tg_instance.insert.assert_not_called()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch('trustgraph.storage.triples.cassandra.write.KnowledgeGraph')
|
||||
@patch('trustgraph.storage.triples.cassandra.write.EntityCentricKnowledgeGraph')
|
||||
@patch('trustgraph.storage.triples.cassandra.write.time.sleep')
|
||||
async def test_exception_handling_with_retry(self, mock_sleep, mock_trustgraph):
|
||||
async def test_exception_handling_with_retry(self, mock_sleep, mock_kg_class):
|
||||
"""Test exception handling during TrustGraph creation"""
|
||||
taskgroup_mock = MagicMock()
|
||||
mock_trustgraph.side_effect = Exception("Connection failed")
|
||||
|
||||
mock_kg_class.side_effect = Exception("Connection failed")
|
||||
|
||||
processor = Processor(taskgroup=taskgroup_mock)
|
||||
|
||||
|
||||
# Create mock message
|
||||
mock_message = MagicMock()
|
||||
mock_message.metadata.user = 'user1'
|
||||
mock_message.metadata.collection = 'collection1'
|
||||
mock_message.triples = []
|
||||
|
||||
|
||||
with pytest.raises(Exception, match="Connection failed"):
|
||||
await processor.store_triples(mock_message)
|
||||
|
||||
|
||||
# Verify sleep was called before re-raising
|
||||
mock_sleep.assert_called_once_with(1)
|
||||
|
||||
|
|
@ -326,92 +349,104 @@ class TestCassandraStorageProcessor:
|
|||
mock_launch.assert_called_once_with(default_ident, '\nGraph writer. Input is graph edge. Writes edges to Cassandra graph.\n')
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch('trustgraph.storage.triples.cassandra.write.KnowledgeGraph')
|
||||
async def test_store_triples_table_switching_between_different_tables(self, mock_trustgraph):
|
||||
@patch('trustgraph.storage.triples.cassandra.write.EntityCentricKnowledgeGraph')
|
||||
async def test_store_triples_table_switching_between_different_tables(self, mock_kg_class):
|
||||
"""Test table switching when different tables are used in sequence"""
|
||||
taskgroup_mock = MagicMock()
|
||||
mock_tg_instance1 = MagicMock()
|
||||
mock_tg_instance2 = MagicMock()
|
||||
mock_trustgraph.side_effect = [mock_tg_instance1, mock_tg_instance2]
|
||||
|
||||
mock_kg_class.side_effect = [mock_tg_instance1, mock_tg_instance2]
|
||||
|
||||
processor = Processor(taskgroup=taskgroup_mock)
|
||||
|
||||
|
||||
# First message with table1
|
||||
mock_message1 = MagicMock()
|
||||
mock_message1.metadata.user = 'user1'
|
||||
mock_message1.metadata.collection = 'collection1'
|
||||
mock_message1.triples = []
|
||||
|
||||
|
||||
await processor.store_triples(mock_message1)
|
||||
assert processor.table == 'user1'
|
||||
assert processor.tg == mock_tg_instance1
|
||||
|
||||
|
||||
# Second message with different table
|
||||
mock_message2 = MagicMock()
|
||||
mock_message2.metadata.user = 'user2'
|
||||
mock_message2.metadata.collection = 'collection2'
|
||||
mock_message2.triples = []
|
||||
|
||||
|
||||
await processor.store_triples(mock_message2)
|
||||
assert processor.table == 'user2'
|
||||
assert processor.tg == mock_tg_instance2
|
||||
|
||||
|
||||
# Verify TrustGraph was created twice for different tables
|
||||
assert mock_trustgraph.call_count == 2
|
||||
assert mock_kg_class.call_count == 2
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch('trustgraph.storage.triples.cassandra.write.KnowledgeGraph')
|
||||
async def test_store_triples_with_special_characters_in_values(self, mock_trustgraph):
|
||||
@patch('trustgraph.storage.triples.cassandra.write.EntityCentricKnowledgeGraph')
|
||||
async def test_store_triples_with_special_characters_in_values(self, mock_kg_class):
|
||||
"""Test storing triples with special characters and unicode"""
|
||||
taskgroup_mock = MagicMock()
|
||||
mock_tg_instance = MagicMock()
|
||||
mock_trustgraph.return_value = mock_tg_instance
|
||||
|
||||
mock_kg_class.return_value = mock_tg_instance
|
||||
|
||||
processor = Processor(taskgroup=taskgroup_mock)
|
||||
|
||||
# Create triple with special characters
|
||||
|
||||
# Create triple with special characters and proper Term structure
|
||||
triple = MagicMock()
|
||||
triple.s.type = LITERAL
|
||||
triple.s.value = 'subject with spaces & symbols'
|
||||
triple.s.datatype = ''
|
||||
triple.s.language = ''
|
||||
triple.p.type = LITERAL
|
||||
triple.p.value = 'predicate:with/colons'
|
||||
triple.o.type = LITERAL
|
||||
triple.o.value = 'object with "quotes" and unicode: ñáéíóú'
|
||||
|
||||
triple.o.datatype = ''
|
||||
triple.o.language = ''
|
||||
triple.g = None
|
||||
|
||||
mock_message = MagicMock()
|
||||
mock_message.metadata.user = 'test_user'
|
||||
mock_message.metadata.collection = 'test_collection'
|
||||
mock_message.triples = [triple]
|
||||
|
||||
|
||||
await processor.store_triples(mock_message)
|
||||
|
||||
|
||||
# Verify the triple was inserted with special characters preserved
|
||||
mock_tg_instance.insert.assert_called_once_with(
|
||||
'test_collection',
|
||||
'subject with spaces & symbols',
|
||||
'predicate:with/colons',
|
||||
'object with "quotes" and unicode: ñáéíóú'
|
||||
'object with "quotes" and unicode: ñáéíóú',
|
||||
g=DEFAULT_GRAPH,
|
||||
otype='l',
|
||||
dtype='',
|
||||
lang=''
|
||||
)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch('trustgraph.storage.triples.cassandra.write.KnowledgeGraph')
|
||||
async def test_store_triples_preserves_old_table_on_exception(self, mock_trustgraph):
|
||||
@patch('trustgraph.storage.triples.cassandra.write.EntityCentricKnowledgeGraph')
|
||||
async def test_store_triples_preserves_old_table_on_exception(self, mock_kg_class):
|
||||
"""Test that table remains unchanged when TrustGraph creation fails"""
|
||||
taskgroup_mock = MagicMock()
|
||||
|
||||
|
||||
processor = Processor(taskgroup=taskgroup_mock)
|
||||
|
||||
|
||||
# Set an initial table
|
||||
processor.table = ('old_user', 'old_collection')
|
||||
|
||||
|
||||
# Mock TrustGraph to raise exception
|
||||
mock_trustgraph.side_effect = Exception("Connection failed")
|
||||
|
||||
mock_kg_class.side_effect = Exception("Connection failed")
|
||||
|
||||
mock_message = MagicMock()
|
||||
mock_message.metadata.user = 'new_user'
|
||||
mock_message.metadata.collection = 'new_collection'
|
||||
mock_message.triples = []
|
||||
|
||||
|
||||
with pytest.raises(Exception, match="Connection failed"):
|
||||
await processor.store_triples(mock_message)
|
||||
|
||||
|
||||
# Table should remain unchanged since self.table = table happens after try/except
|
||||
assert processor.table == ('old_user', 'old_collection')
|
||||
# TrustGraph should be set to None though
|
||||
|
|
@ -422,12 +457,12 @@ class TestCassandraPerformanceOptimizations:
|
|||
"""Test cases for multi-table performance optimizations"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch('trustgraph.storage.triples.cassandra.write.KnowledgeGraph')
|
||||
async def test_legacy_mode_uses_single_table(self, mock_trustgraph):
|
||||
@patch('trustgraph.storage.triples.cassandra.write.EntityCentricKnowledgeGraph')
|
||||
async def test_legacy_mode_uses_single_table(self, mock_kg_class):
|
||||
"""Test that legacy mode still works with single table"""
|
||||
taskgroup_mock = MagicMock()
|
||||
mock_tg_instance = MagicMock()
|
||||
mock_trustgraph.return_value = mock_tg_instance
|
||||
mock_kg_class.return_value = mock_tg_instance
|
||||
|
||||
with patch.dict('os.environ', {'CASSANDRA_USE_LEGACY': 'true'}):
|
||||
processor = Processor(taskgroup=taskgroup_mock)
|
||||
|
|
@ -440,16 +475,15 @@ class TestCassandraPerformanceOptimizations:
|
|||
await processor.store_triples(mock_message)
|
||||
|
||||
# Verify KnowledgeGraph instance uses legacy mode
|
||||
kg_instance = mock_trustgraph.return_value
|
||||
assert kg_instance is not None
|
||||
assert mock_tg_instance is not None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch('trustgraph.storage.triples.cassandra.write.KnowledgeGraph')
|
||||
async def test_optimized_mode_uses_multi_table(self, mock_trustgraph):
|
||||
@patch('trustgraph.storage.triples.cassandra.write.EntityCentricKnowledgeGraph')
|
||||
async def test_optimized_mode_uses_multi_table(self, mock_kg_class):
|
||||
"""Test that optimized mode uses multi-table schema"""
|
||||
taskgroup_mock = MagicMock()
|
||||
mock_tg_instance = MagicMock()
|
||||
mock_trustgraph.return_value = mock_tg_instance
|
||||
mock_kg_class.return_value = mock_tg_instance
|
||||
|
||||
with patch.dict('os.environ', {'CASSANDRA_USE_LEGACY': 'false'}):
|
||||
processor = Processor(taskgroup=taskgroup_mock)
|
||||
|
|
@ -462,24 +496,31 @@ class TestCassandraPerformanceOptimizations:
|
|||
await processor.store_triples(mock_message)
|
||||
|
||||
# Verify KnowledgeGraph instance is in optimized mode
|
||||
kg_instance = mock_trustgraph.return_value
|
||||
assert kg_instance is not None
|
||||
assert mock_tg_instance is not None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch('trustgraph.storage.triples.cassandra.write.KnowledgeGraph')
|
||||
async def test_batch_write_consistency(self, mock_trustgraph):
|
||||
@patch('trustgraph.storage.triples.cassandra.write.EntityCentricKnowledgeGraph')
|
||||
async def test_batch_write_consistency(self, mock_kg_class):
|
||||
"""Test that all tables stay consistent during batch writes"""
|
||||
taskgroup_mock = MagicMock()
|
||||
mock_tg_instance = MagicMock()
|
||||
mock_trustgraph.return_value = mock_tg_instance
|
||||
mock_kg_class.return_value = mock_tg_instance
|
||||
|
||||
processor = Processor(taskgroup=taskgroup_mock)
|
||||
|
||||
# Create test triple
|
||||
# Create test triple with proper Term structure
|
||||
triple = MagicMock()
|
||||
triple.s.type = LITERAL
|
||||
triple.s.value = 'test_subject'
|
||||
triple.s.datatype = ''
|
||||
triple.s.language = ''
|
||||
triple.p.type = LITERAL
|
||||
triple.p.value = 'test_predicate'
|
||||
triple.o.type = LITERAL
|
||||
triple.o.value = 'test_object'
|
||||
triple.o.datatype = ''
|
||||
triple.o.language = ''
|
||||
triple.g = None
|
||||
|
||||
mock_message = MagicMock()
|
||||
mock_message.metadata.user = 'user1'
|
||||
|
|
@ -490,7 +531,8 @@ class TestCassandraPerformanceOptimizations:
|
|||
|
||||
# Verify insert was called for the triple (implementation details tested in KnowledgeGraph)
|
||||
mock_tg_instance.insert.assert_called_once_with(
|
||||
'collection1', 'test_subject', 'test_predicate', 'test_object'
|
||||
'collection1', 'test_subject', 'test_predicate', 'test_object',
|
||||
g=DEFAULT_GRAPH, otype='l', dtype='', lang=''
|
||||
)
|
||||
|
||||
def test_environment_variable_controls_mode(self):
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ import pytest
|
|||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from trustgraph.storage.triples.falkordb.write import Processor
|
||||
from trustgraph.schema import Value, Triple
|
||||
from trustgraph.schema import Term, Triple, IRI, LITERAL
|
||||
|
||||
|
||||
class TestFalkorDBStorageProcessor:
|
||||
|
|
@ -22,9 +22,9 @@ class TestFalkorDBStorageProcessor:
|
|||
|
||||
# Create a test triple
|
||||
triple = Triple(
|
||||
s=Value(value='http://example.com/subject', is_uri=True),
|
||||
p=Value(value='http://example.com/predicate', is_uri=True),
|
||||
o=Value(value='literal object', is_uri=False)
|
||||
s=Term(type=IRI, iri='http://example.com/subject'),
|
||||
p=Term(type=IRI, iri='http://example.com/predicate'),
|
||||
o=Term(type=LITERAL, value='literal object')
|
||||
)
|
||||
message.triples = [triple]
|
||||
|
||||
|
|
@ -183,9 +183,9 @@ class TestFalkorDBStorageProcessor:
|
|||
message.metadata.collection = 'test_collection'
|
||||
|
||||
triple = Triple(
|
||||
s=Value(value='http://example.com/subject', is_uri=True),
|
||||
p=Value(value='http://example.com/predicate', is_uri=True),
|
||||
o=Value(value='http://example.com/object', is_uri=True)
|
||||
s=Term(type=IRI, iri='http://example.com/subject'),
|
||||
p=Term(type=IRI, iri='http://example.com/predicate'),
|
||||
o=Term(type=IRI, iri='http://example.com/object')
|
||||
)
|
||||
message.triples = [triple]
|
||||
|
||||
|
|
@ -269,14 +269,14 @@ class TestFalkorDBStorageProcessor:
|
|||
message.metadata.collection = 'test_collection'
|
||||
|
||||
triple1 = Triple(
|
||||
s=Value(value='http://example.com/subject1', is_uri=True),
|
||||
p=Value(value='http://example.com/predicate1', is_uri=True),
|
||||
o=Value(value='literal object1', is_uri=False)
|
||||
s=Term(type=IRI, iri='http://example.com/subject1'),
|
||||
p=Term(type=IRI, iri='http://example.com/predicate1'),
|
||||
o=Term(type=LITERAL, value='literal object1')
|
||||
)
|
||||
triple2 = Triple(
|
||||
s=Value(value='http://example.com/subject2', is_uri=True),
|
||||
p=Value(value='http://example.com/predicate2', is_uri=True),
|
||||
o=Value(value='http://example.com/object2', is_uri=True)
|
||||
s=Term(type=IRI, iri='http://example.com/subject2'),
|
||||
p=Term(type=IRI, iri='http://example.com/predicate2'),
|
||||
o=Term(type=IRI, iri='http://example.com/object2')
|
||||
)
|
||||
message.triples = [triple1, triple2]
|
||||
|
||||
|
|
@ -337,14 +337,14 @@ class TestFalkorDBStorageProcessor:
|
|||
message.metadata.collection = 'test_collection'
|
||||
|
||||
triple1 = Triple(
|
||||
s=Value(value='http://example.com/subject1', is_uri=True),
|
||||
p=Value(value='http://example.com/predicate1', is_uri=True),
|
||||
o=Value(value='literal object', is_uri=False)
|
||||
s=Term(type=IRI, iri='http://example.com/subject1'),
|
||||
p=Term(type=IRI, iri='http://example.com/predicate1'),
|
||||
o=Term(type=LITERAL, value='literal object')
|
||||
)
|
||||
triple2 = Triple(
|
||||
s=Value(value='http://example.com/subject2', is_uri=True),
|
||||
p=Value(value='http://example.com/predicate2', is_uri=True),
|
||||
o=Value(value='http://example.com/object2', is_uri=True)
|
||||
s=Term(type=IRI, iri='http://example.com/subject2'),
|
||||
p=Term(type=IRI, iri='http://example.com/predicate2'),
|
||||
o=Term(type=IRI, iri='http://example.com/object2')
|
||||
)
|
||||
message.triples = [triple1, triple2]
|
||||
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ import pytest
|
|||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from trustgraph.storage.triples.memgraph.write import Processor
|
||||
from trustgraph.schema import Value, Triple
|
||||
from trustgraph.schema import Term, Triple, IRI, LITERAL
|
||||
|
||||
|
||||
class TestMemgraphStorageProcessor:
|
||||
|
|
@ -22,9 +22,9 @@ class TestMemgraphStorageProcessor:
|
|||
|
||||
# Create a test triple
|
||||
triple = Triple(
|
||||
s=Value(value='http://example.com/subject', is_uri=True),
|
||||
p=Value(value='http://example.com/predicate', is_uri=True),
|
||||
o=Value(value='literal object', is_uri=False)
|
||||
s=Term(type=IRI, iri='http://example.com/subject'),
|
||||
p=Term(type=IRI, iri='http://example.com/predicate'),
|
||||
o=Term(type=LITERAL, value='literal object')
|
||||
)
|
||||
message.triples = [triple]
|
||||
|
||||
|
|
@ -231,9 +231,9 @@ class TestMemgraphStorageProcessor:
|
|||
mock_tx = MagicMock()
|
||||
|
||||
triple = Triple(
|
||||
s=Value(value='http://example.com/subject', is_uri=True),
|
||||
p=Value(value='http://example.com/predicate', is_uri=True),
|
||||
o=Value(value='http://example.com/object', is_uri=True)
|
||||
s=Term(type=IRI, iri='http://example.com/subject'),
|
||||
p=Term(type=IRI, iri='http://example.com/predicate'),
|
||||
o=Term(type=IRI, iri='http://example.com/object')
|
||||
)
|
||||
|
||||
processor.create_triple(mock_tx, triple, "test_user", "test_collection")
|
||||
|
|
@ -265,9 +265,9 @@ class TestMemgraphStorageProcessor:
|
|||
mock_tx = MagicMock()
|
||||
|
||||
triple = Triple(
|
||||
s=Value(value='http://example.com/subject', is_uri=True),
|
||||
p=Value(value='http://example.com/predicate', is_uri=True),
|
||||
o=Value(value='literal object', is_uri=False)
|
||||
s=Term(type=IRI, iri='http://example.com/subject'),
|
||||
p=Term(type=IRI, iri='http://example.com/predicate'),
|
||||
o=Term(type=LITERAL, value='literal object')
|
||||
)
|
||||
|
||||
processor.create_triple(mock_tx, triple, "test_user", "test_collection")
|
||||
|
|
@ -347,14 +347,14 @@ class TestMemgraphStorageProcessor:
|
|||
message.metadata.collection = 'test_collection'
|
||||
|
||||
triple1 = Triple(
|
||||
s=Value(value='http://example.com/subject1', is_uri=True),
|
||||
p=Value(value='http://example.com/predicate1', is_uri=True),
|
||||
o=Value(value='literal object1', is_uri=False)
|
||||
s=Term(type=IRI, iri='http://example.com/subject1'),
|
||||
p=Term(type=IRI, iri='http://example.com/predicate1'),
|
||||
o=Term(type=LITERAL, value='literal object1')
|
||||
)
|
||||
triple2 = Triple(
|
||||
s=Value(value='http://example.com/subject2', is_uri=True),
|
||||
p=Value(value='http://example.com/predicate2', is_uri=True),
|
||||
o=Value(value='http://example.com/object2', is_uri=True)
|
||||
s=Term(type=IRI, iri='http://example.com/subject2'),
|
||||
p=Term(type=IRI, iri='http://example.com/predicate2'),
|
||||
o=Term(type=IRI, iri='http://example.com/object2')
|
||||
)
|
||||
message.triples = [triple1, triple2]
|
||||
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ import pytest
|
|||
from unittest.mock import MagicMock, patch, AsyncMock
|
||||
|
||||
from trustgraph.storage.triples.neo4j.write import Processor
|
||||
from trustgraph.schema import IRI, LITERAL
|
||||
|
||||
|
||||
class TestNeo4jStorageProcessor:
|
||||
|
|
@ -257,10 +258,12 @@ class TestNeo4jStorageProcessor:
|
|||
|
||||
# Create mock triple with URI object
|
||||
triple = MagicMock()
|
||||
triple.s.value = "http://example.com/subject"
|
||||
triple.p.value = "http://example.com/predicate"
|
||||
triple.o.value = "http://example.com/object"
|
||||
triple.o.is_uri = True
|
||||
triple.s.type = IRI
|
||||
triple.s.iri = "http://example.com/subject"
|
||||
triple.p.type = IRI
|
||||
triple.p.iri = "http://example.com/predicate"
|
||||
triple.o.type = IRI
|
||||
triple.o.iri = "http://example.com/object"
|
||||
|
||||
# Create mock message with metadata
|
||||
mock_message = MagicMock()
|
||||
|
|
@ -327,10 +330,12 @@ class TestNeo4jStorageProcessor:
|
|||
|
||||
# Create mock triple with literal object
|
||||
triple = MagicMock()
|
||||
triple.s.value = "http://example.com/subject"
|
||||
triple.p.value = "http://example.com/predicate"
|
||||
triple.s.type = IRI
|
||||
triple.s.iri = "http://example.com/subject"
|
||||
triple.p.type = IRI
|
||||
triple.p.iri = "http://example.com/predicate"
|
||||
triple.o.type = LITERAL
|
||||
triple.o.value = "literal value"
|
||||
triple.o.is_uri = False
|
||||
|
||||
# Create mock message with metadata
|
||||
mock_message = MagicMock()
|
||||
|
|
@ -398,16 +403,20 @@ class TestNeo4jStorageProcessor:
|
|||
|
||||
# Create mock triples
|
||||
triple1 = MagicMock()
|
||||
triple1.s.value = "http://example.com/subject1"
|
||||
triple1.p.value = "http://example.com/predicate1"
|
||||
triple1.o.value = "http://example.com/object1"
|
||||
triple1.o.is_uri = True
|
||||
|
||||
triple1.s.type = IRI
|
||||
triple1.s.iri = "http://example.com/subject1"
|
||||
triple1.p.type = IRI
|
||||
triple1.p.iri = "http://example.com/predicate1"
|
||||
triple1.o.type = IRI
|
||||
triple1.o.iri = "http://example.com/object1"
|
||||
|
||||
triple2 = MagicMock()
|
||||
triple2.s.value = "http://example.com/subject2"
|
||||
triple2.p.value = "http://example.com/predicate2"
|
||||
triple2.s.type = IRI
|
||||
triple2.s.iri = "http://example.com/subject2"
|
||||
triple2.p.type = IRI
|
||||
triple2.p.iri = "http://example.com/predicate2"
|
||||
triple2.o.type = LITERAL
|
||||
triple2.o.value = "literal value"
|
||||
triple2.o.is_uri = False
|
||||
|
||||
# Create mock message with metadata
|
||||
mock_message = MagicMock()
|
||||
|
|
@ -550,10 +559,12 @@ class TestNeo4jStorageProcessor:
|
|||
|
||||
# Create triple with special characters
|
||||
triple = MagicMock()
|
||||
triple.s.value = "http://example.com/subject with spaces"
|
||||
triple.p.value = "http://example.com/predicate:with/symbols"
|
||||
triple.s.type = IRI
|
||||
triple.s.iri = "http://example.com/subject with spaces"
|
||||
triple.p.type = IRI
|
||||
triple.p.iri = "http://example.com/predicate:with/symbols"
|
||||
triple.o.type = LITERAL
|
||||
triple.o.value = 'literal with "quotes" and unicode: ñáéíóú'
|
||||
triple.o.is_uri = False
|
||||
|
||||
mock_message = MagicMock()
|
||||
mock_message.triples = [triple]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue