trustgraph/tests/unit/test_knowledge_graph/test_triple_construction.py

445 lines
17 KiB
Python
Raw Normal View History

"""
Unit tests for triple construction logic
Tests the core business logic for constructing RDF triples from extracted
entities and relationships, including URI generation, Term object creation,
and triple validation.
"""
import pytest
from unittest.mock import Mock
from .conftest import Triple, Triples, Term, Metadata, IRI, LITERAL
import re
import hashlib
class TestTripleConstructionLogic:
"""Test cases for triple construction business logic"""
def test_uri_generation_from_text(self):
"""Test URI generation from entity text"""
# Arrange
def generate_uri(text, entity_type, base_uri="http://trustgraph.ai/kg"):
# Normalize text for URI
normalized = text.lower()
normalized = re.sub(r'[^\w\s-]', '', normalized) # Remove special chars
normalized = re.sub(r'\s+', '-', normalized.strip()) # Replace spaces with hyphens
# Map entity types to namespaces
type_mappings = {
"PERSON": "person",
"ORG": "org",
"PLACE": "place",
"PRODUCT": "product"
}
namespace = type_mappings.get(entity_type, "entity")
return f"{base_uri}/{namespace}/{normalized}"
test_cases = [
("John Smith", "PERSON", "http://trustgraph.ai/kg/person/john-smith"),
("OpenAI Inc.", "ORG", "http://trustgraph.ai/kg/org/openai-inc"),
("San Francisco", "PLACE", "http://trustgraph.ai/kg/place/san-francisco"),
("GPT-4", "PRODUCT", "http://trustgraph.ai/kg/product/gpt-4")
]
# Act & Assert
for text, entity_type, expected_uri in test_cases:
generated_uri = generate_uri(text, entity_type)
assert generated_uri == expected_uri, f"URI generation failed for '{text}'"
def test_term_object_creation(self):
    """Test creation of Term objects for subjects, predicates, and objects.

    URI inputs must produce a Term of type IRI holding the text in ``iri``.
    All other inputs must produce a Term of type LITERAL holding the text in
    ``value`` together with the datatype that was requested.
    """
    # Arrange
    def create_term_object(text, is_uri, datatype=""):
        if is_uri:
            return Term(type=IRI, iri=text)
        # An empty datatype string is passed on to Term as None.
        return Term(type=LITERAL, value=text, datatype=datatype or None)

    test_cases = [
        ("http://trustgraph.ai/kg/person/john-smith", True, ""),
        ("John Smith", False, "string"),
        ("42", False, "integer"),
        ("http://schema.org/worksFor", True, "")
    ]

    # Act & Assert
    for value_text, is_uri, datatype in test_cases:
        term_obj = create_term_object(value_text, is_uri, datatype)
        assert isinstance(term_obj, Term)
        if is_uri:
            assert term_obj.type == IRI
            assert term_obj.iri == value_text
        else:
            assert term_obj.type == LITERAL
            assert term_obj.value == value_text
            # The datatype is handed to the helper, so verify it survives;
            # without this check a dropped datatype would go unnoticed.
            assert term_obj.datatype == (datatype or None)
def test_triple_construction_from_relationship(self):
    """Build a Triple from a relationship dict and check each component.

    The local helper files the subject under ``person`` and the object under
    ``org``, and maps known predicate names to schema.org URIs; predicates
    without a mapping get a URI under the local base instead.
    """
    # Arrange
    relationship = {
        "subject": "John Smith",
        "predicate": "works_for",
        "object": "OpenAI",
        "subject_type": "PERSON",
        "object_type": "ORG"
    }
    schema_org_predicates = {
        "works_for": "http://schema.org/worksFor",
        "located_in": "http://schema.org/location",
        "developed": "http://schema.org/creator"
    }

    def construct_triple(relationship, uri_base="http://trustgraph.ai/kg"):
        subject_slug = relationship['subject'].lower().replace(' ', '-')
        object_slug = relationship['object'].lower().replace(' ', '-')
        predicate_name = relationship["predicate"]
        # Unmapped predicates fall back to a URI under the local base.
        predicate_uri = schema_org_predicates.get(
            predicate_name, f"{uri_base}/predicate/{predicate_name}"
        )
        return Triple(
            s=Term(type=IRI, iri=f"{uri_base}/person/{subject_slug}"),
            p=Term(type=IRI, iri=predicate_uri),
            o=Term(type=IRI, iri=f"{uri_base}/org/{object_slug}")
        )

    # Act
    triple = construct_triple(relationship)

    # Assert
    assert isinstance(triple, Triple)
    expected_iris = (
        (triple.s, "http://trustgraph.ai/kg/person/john-smith"),
        (triple.p, "http://schema.org/worksFor"),
        (triple.o, "http://trustgraph.ai/kg/org/openai"),
    )
    for term, expected_iri in expected_iris:
        assert term.iri == expected_iri
        assert term.type == IRI
def test_literal_value_handling(self):
    """Check that objects become literal or IRI terms as requested.

    Literal objects must keep their text in ``value`` and, for the
    predicates with a known datatype, carry that datatype. URI objects must
    become IRI terms holding the URI in ``iri``.
    """
    # Arrange
    # The first field of each row is a human-readable subject label; the
    # loop below ignores it and always uses the same full subject URI.
    test_data = [
        ("John Smith", "name", "John Smith", False),  # Literal name
        ("John Smith", "age", "30", False),  # Literal age
        ("John Smith", "email", "john@example.com", False),  # Literal email
        ("John Smith", "worksFor", "http://trustgraph.ai/kg/org/openai", True)  # URI reference
    ]
    predicate_uris = {
        "name": "http://schema.org/name",
        "age": "http://schema.org/age",
        "email": "http://schema.org/email",
        "worksFor": "http://schema.org/worksFor"
    }
    # Predicates whose literal objects get an explicit datatype.
    literal_datatypes = {"age": "integer", "name": "string", "email": "string"}

    def create_triple_with_literal(subject_uri, predicate, object_value, object_is_uri):
        subject_term = Term(type=IRI, iri=subject_uri)
        predicate_term = Term(
            type=IRI,
            iri=predicate_uris.get(predicate, f"http://trustgraph.ai/kg/predicate/{predicate}")
        )
        if object_is_uri:
            object_term = Term(type=IRI, iri=object_value)
        else:
            # Predicates without a known datatype get datatype=None.
            object_term = Term(
                type=LITERAL,
                value=object_value,
                datatype=literal_datatypes.get(predicate)
            )
        return Triple(s=subject_term, p=predicate_term, o=object_term)

    subject_full_uri = "http://trustgraph.ai/kg/person/john-smith"

    # Act & Assert
    for _subject_label, predicate, object_value, object_is_uri in test_data:
        triple = create_triple_with_literal(subject_full_uri, predicate, object_value, object_is_uri)
        if object_is_uri:
            assert triple.o.type == IRI
            assert triple.o.iri == object_value
            continue
        assert triple.o.type == LITERAL
        assert triple.o.value == object_value
        if predicate in literal_datatypes:
            assert triple.o.datatype == literal_datatypes[predicate]
def test_namespace_management(self):
"""Test namespace prefix management and expansion"""
# Arrange
namespaces = {
"tg": "http://trustgraph.ai/kg/",
"schema": "http://schema.org/",
"rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
"rdfs": "http://www.w3.org/2000/01/rdf-schema#"
}
def expand_prefixed_uri(prefixed_uri, namespaces):
if ":" not in prefixed_uri:
return prefixed_uri
prefix, local_name = prefixed_uri.split(":", 1)
if prefix in namespaces:
return namespaces[prefix] + local_name
return prefixed_uri
def create_prefixed_uri(full_uri, namespaces):
for prefix, namespace_uri in namespaces.items():
if full_uri.startswith(namespace_uri):
local_name = full_uri[len(namespace_uri):]
return f"{prefix}:{local_name}"
return full_uri
# Act & Assert
test_cases = [
("tg:person/john-smith", "http://trustgraph.ai/kg/person/john-smith"),
("schema:worksFor", "http://schema.org/worksFor"),
("rdf:type", "http://www.w3.org/1999/02/22-rdf-syntax-ns#type")
]
for prefixed, expanded in test_cases:
# Test expansion
result = expand_prefixed_uri(prefixed, namespaces)
assert result == expanded
# Test compression
compressed = create_prefixed_uri(expanded, namespaces)
assert compressed == prefixed
def test_triple_validation(self):
    """Test the triple validation rules and the errors they report.

    The local validator checks that subject, predicate and object are
    present and non-empty, that every IRI term matches an http(s) URI
    pattern, and that the predicate is an IRI. Each invalid triple is paired
    with the error it must trigger, so a rule firing for the wrong reason is
    caught as well.
    """
    # Arrange
    uri_pattern = re.compile(r'^https?://[^\s/$.?#].[^\s]*$')

    def get_term_value(term):
        """Extract value from a Term: ``iri`` for IRIs, ``value`` otherwise."""
        return term.iri if term.type == IRI else term.value

    def validate_triple(triple):
        """Return ``(is_valid, errors)`` for *triple*."""
        errors = []
        components = (
            ("subject", triple.s),
            ("predicate", triple.p),
            ("object", triple.o),
        )

        # Check required components
        for label, term in components:
            if not term or not get_term_value(term):
                errors.append(f"Missing or empty {label}")

        # Check URI validity for URI values. A missing term was already
        # reported above; skip it here instead of dereferencing None.
        for label, term in components:
            if term and term.type == IRI and not uri_pattern.match(term.iri or ""):
                errors.append(f"Invalid {label} URI format")

        # Predicates should typically be URIs
        if triple.p and triple.p.type != IRI:
            errors.append("Predicate should be a URI")

        return len(errors) == 0, errors

    # Test valid triple
    valid_triple = Triple(
        s=Term(type=IRI, iri="http://trustgraph.ai/kg/person/john"),
        p=Term(type=IRI, iri="http://schema.org/name"),
        o=Term(type=LITERAL, value="John Smith", datatype="string")
    )

    # Test invalid triples, each with the error it is expected to raise
    invalid_cases = [
        (
            Triple(s=Term(type=IRI, iri=""),
                   p=Term(type=IRI, iri="http://schema.org/name"),
                   o=Term(type=LITERAL, value="John")),
            "Missing or empty subject",  # Empty subject
        ),
        (
            Triple(s=Term(type=IRI, iri="http://trustgraph.ai/kg/person/john"),
                   p=Term(type=LITERAL, value="name"),
                   o=Term(type=LITERAL, value="John")),
            "Predicate should be a URI",  # Non-URI predicate
        ),
        (
            Triple(s=Term(type=IRI, iri="invalid-uri"),
                   p=Term(type=IRI, iri="http://schema.org/name"),
                   o=Term(type=LITERAL, value="John")),
            "Invalid subject URI format",  # Invalid URI format
        ),
        (
            # A Mock stands in for a triple whose subject is absent; the
            # validator only reads the s/p/o attributes.
            Mock(s=None,
                 p=Term(type=IRI, iri="http://schema.org/name"),
                 o=Term(type=LITERAL, value="John")),
            "Missing or empty subject",  # Missing subject
        ),
    ]

    # Act & Assert
    is_valid, errors = validate_triple(valid_triple)
    assert is_valid, f"Valid triple failed validation: {errors}"

    for invalid_triple, expected_error in invalid_cases:
        is_valid, errors = validate_triple(invalid_triple)
        assert not is_valid, f"Invalid triple passed validation: {invalid_triple}"
        assert expected_error in errors
def test_batch_triple_construction(self):
    """Test constructing multiple triples from entity/relationship data.

    One rdf:type triple is produced per entity and one triple per
    relationship. Relationship endpoints reuse the URI minted for the
    matching entity, so the relationship triples connect to the typed
    entities instead of pointing at unrelated ``/entity/`` URIs.
    """
    # Arrange
    entities = [
        {"text": "John Smith", "type": "PERSON"},
        {"text": "OpenAI", "type": "ORG"},
        {"text": "San Francisco", "type": "PLACE"}
    ]
    relationships = [
        {"subject": "John Smith", "predicate": "works_for", "object": "OpenAI"},
        {"subject": "OpenAI", "predicate": "located_in", "object": "San Francisco"}
    ]
    rdf_type_uri = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"

    def slugify(text):
        return text.lower().replace(' ', '-')

    def construct_triple_batch(entities, relationships):
        triples = []
        # Entity text -> URI, so relationships can refer to the same node.
        # The first entity with a given text wins.
        entity_uris = {}

        # Create type triples for entities
        for entity in entities:
            entity_uri = f"http://trustgraph.ai/kg/{entity['type'].lower()}/{slugify(entity['text'])}"
            entity_uris.setdefault(entity['text'], entity_uri)
            triples.append(Triple(
                s=Term(type=IRI, iri=entity_uri),
                p=Term(type=IRI, iri=rdf_type_uri),
                o=Term(type=IRI, iri=f"http://trustgraph.ai/kg/type/{entity['type']}")
            ))

        def resolve(name):
            # Names with no matching entity keep the generic namespace.
            return entity_uris.get(name, f"http://trustgraph.ai/kg/entity/{slugify(name)}")

        # Create relationship triples
        for rel in relationships:
            predicate_uri = f"http://schema.org/{rel['predicate'].replace('_', '')}"
            triples.append(Triple(
                s=Term(type=IRI, iri=resolve(rel['subject'])),
                p=Term(type=IRI, iri=predicate_uri),
                o=Term(type=IRI, iri=resolve(rel['object']))
            ))
        return triples

    # Act
    triples = construct_triple_batch(entities, relationships)

    # Assert
    assert len(triples) == len(entities) + len(relationships)  # Type triples + relationship triples

    # Check that all triples are valid Triple objects
    for triple in triples:
        assert isinstance(triple, Triple)
        assert triple.s.iri != ""
        assert triple.p.iri != ""
        assert triple.o.iri != ""

    # Type triples come first; every relationship endpoint must be one of
    # the entities that received a type triple.
    typed_entity_uris = {triple.s.iri for triple in triples[:len(entities)]}
    for triple in triples[len(entities):]:
        assert triple.s.iri in typed_entity_uris
        assert triple.o.iri in typed_entity_uris
def test_triples_batch_object_creation(self):
    """Wrap two triples and a Metadata record in a Triples batch.

    Verifies that the batch exposes the metadata fields it was given and
    that the embedded triples and their terms keep their types.
    """
    # Arrange
    john_uri = "http://trustgraph.ai/kg/person/john"
    name_triple = Triple(
        s=Term(type=IRI, iri=john_uri),
        p=Term(type=IRI, iri="http://schema.org/name"),
        o=Term(type=LITERAL, value="John Smith", datatype="string")
    )
    employer_triple = Triple(
        s=Term(type=IRI, iri=john_uri),
        p=Term(type=IRI, iri="http://schema.org/worksFor"),
        o=Term(type=IRI, iri="http://trustgraph.ai/kg/org/openai")
    )
    metadata = Metadata(
        id="test-doc-123",
        user="test_user",
        collection="test_collection",
    )

    # Act
    triples_batch = Triples(metadata=metadata, triples=[name_triple, employer_triple])

    # Assert
    assert isinstance(triples_batch, Triples)
    batch_metadata = triples_batch.metadata
    assert batch_metadata.id == "test-doc-123"
    assert batch_metadata.user == "test_user"
    assert batch_metadata.collection == "test_collection"
    assert len(triples_batch.triples) == 2

    # Check that triples are properly embedded
    for embedded in triples_batch.triples:
        assert isinstance(embedded, Triple)
        for term in (embedded.s, embedded.p, embedded.o):
            assert isinstance(term, Term)
def test_uri_collision_handling(self):
"""Test handling of URI collisions and duplicate detection"""
# Arrange
entities = [
{"text": "John Smith", "type": "PERSON", "context": "Engineer at OpenAI"},
{"text": "John Smith", "type": "PERSON", "context": "Professor at Stanford"},
{"text": "Apple Inc.", "type": "ORG", "context": "Technology company"},
{"text": "Apple", "type": "PRODUCT", "context": "Fruit"}
]
def generate_unique_uri(entity, existing_uris):
base_text = entity["text"].lower().replace(" ", "-")
entity_type = entity["type"].lower()
base_uri = f"http://trustgraph.ai/kg/{entity_type}/{base_text}"
# If URI doesn't exist, use it
if base_uri not in existing_uris:
return base_uri
# Generate hash from context to create unique identifier
context = entity.get("context", "")
context_hash = hashlib.md5(context.encode()).hexdigest()[:8]
unique_uri = f"{base_uri}-{context_hash}"
return unique_uri
# Act
generated_uris = []
existing_uris = set()
for entity in entities:
uri = generate_unique_uri(entity, existing_uris)
generated_uris.append(uri)
existing_uris.add(uri)
# Assert
# All URIs should be unique
assert len(generated_uris) == len(set(generated_uris))
# Both John Smith entities should have different URIs
john_smith_uris = [uri for uri in generated_uris if "john-smith" in uri]
assert len(john_smith_uris) == 2
assert john_smith_uris[0] != john_smith_uris[1]
# Apple entities should have different URIs due to different types
apple_uris = [uri for uri in generated_uris if "apple" in uri]
assert len(apple_uris) == 2
assert apple_uris[0] != apple_uris[1]