""" Unit tests for triple construction logic Tests the core business logic for constructing RDF triples from extracted entities and relationships, including URI generation, Term object creation, and triple validation. """ import pytest from unittest.mock import Mock from .conftest import Triple, Triples, Term, Metadata, IRI, LITERAL import re import hashlib class TestTripleConstructionLogic: """Test cases for triple construction business logic""" def test_uri_generation_from_text(self): """Test URI generation from entity text""" # Arrange def generate_uri(text, entity_type, base_uri="http://trustgraph.ai/kg"): # Normalize text for URI normalized = text.lower() normalized = re.sub(r'[^\w\s-]', '', normalized) # Remove special chars normalized = re.sub(r'\s+', '-', normalized.strip()) # Replace spaces with hyphens # Map entity types to namespaces type_mappings = { "PERSON": "person", "ORG": "org", "PLACE": "place", "PRODUCT": "product" } namespace = type_mappings.get(entity_type, "entity") return f"{base_uri}/{namespace}/{normalized}" test_cases = [ ("John Smith", "PERSON", "http://trustgraph.ai/kg/person/john-smith"), ("OpenAI Inc.", "ORG", "http://trustgraph.ai/kg/org/openai-inc"), ("San Francisco", "PLACE", "http://trustgraph.ai/kg/place/san-francisco"), ("GPT-4", "PRODUCT", "http://trustgraph.ai/kg/product/gpt-4") ] # Act & Assert for text, entity_type, expected_uri in test_cases: generated_uri = generate_uri(text, entity_type) assert generated_uri == expected_uri, f"URI generation failed for '{text}'" def test_term_object_creation(self): """Test creation of Term objects for subjects, predicates, and objects""" # Arrange def create_term_object(text, is_uri, datatype=""): if is_uri: return Term(type=IRI, iri=text) else: return Term(type=LITERAL, value=text, datatype=datatype if datatype else None) test_cases = [ ("http://trustgraph.ai/kg/person/john-smith", True, ""), ("John Smith", False, "string"), ("42", False, "integer"), ("http://schema.org/worksFor", True, "") ] # Act & Assert for value_text, is_uri, datatype in test_cases: term_obj = create_term_object(value_text, is_uri, datatype) assert isinstance(term_obj, Term) if is_uri: assert term_obj.type == IRI assert term_obj.iri == value_text else: assert term_obj.type == LITERAL assert term_obj.value == value_text def test_triple_construction_from_relationship(self): """Test constructing Triple objects from relationships""" # Arrange relationship = { "subject": "John Smith", "predicate": "works_for", "object": "OpenAI", "subject_type": "PERSON", "object_type": "ORG" } def construct_triple(relationship, uri_base="http://trustgraph.ai/kg"): # Generate URIs subject_uri = f"{uri_base}/person/{relationship['subject'].lower().replace(' ', '-')}" object_uri = f"{uri_base}/org/{relationship['object'].lower().replace(' ', '-')}" # Map predicate to schema.org URI predicate_mappings = { "works_for": "http://schema.org/worksFor", "located_in": "http://schema.org/location", "developed": "http://schema.org/creator" } predicate_uri = predicate_mappings.get(relationship["predicate"], f"{uri_base}/predicate/{relationship['predicate']}") # Create Term objects subject_term = Term(type=IRI, iri=subject_uri) predicate_term = Term(type=IRI, iri=predicate_uri) object_term = Term(type=IRI, iri=object_uri) # Create Triple return Triple( s=subject_term, p=predicate_term, o=object_term ) # Act triple = construct_triple(relationship) # Assert assert isinstance(triple, Triple) assert triple.s.iri == "http://trustgraph.ai/kg/person/john-smith" assert triple.s.type == IRI assert triple.p.iri == "http://schema.org/worksFor" assert triple.p.type == IRI assert triple.o.iri == "http://trustgraph.ai/kg/org/openai" assert triple.o.type == IRI def test_literal_value_handling(self): """Test handling of literal values vs URI values""" # Arrange test_data = [ ("John Smith", "name", "John Smith", False), # Literal name ("John Smith", "age", "30", False), # Literal age ("John Smith", "email", "john@example.com", False), # Literal email ("John Smith", "worksFor", "http://trustgraph.ai/kg/org/openai", True) # URI reference ] def create_triple_with_literal(subject_uri, predicate, object_value, object_is_uri): subject_term = Term(type=IRI, iri=subject_uri) # Determine predicate URI predicate_mappings = { "name": "http://schema.org/name", "age": "http://schema.org/age", "email": "http://schema.org/email", "worksFor": "http://schema.org/worksFor" } predicate_uri = predicate_mappings.get(predicate, f"http://trustgraph.ai/kg/predicate/{predicate}") predicate_term = Term(type=IRI, iri=predicate_uri) # Create object term with appropriate type if object_is_uri: object_term = Term(type=IRI, iri=object_value) else: datatype = None if predicate == "age": datatype = "integer" elif predicate in ["name", "email"]: datatype = "string" object_term = Term(type=LITERAL, value=object_value, datatype=datatype) return Triple(s=subject_term, p=predicate_term, o=object_term) # Act & Assert for subject_uri, predicate, object_value, object_is_uri in test_data: subject_full_uri = "http://trustgraph.ai/kg/person/john-smith" triple = create_triple_with_literal(subject_full_uri, predicate, object_value, object_is_uri) if object_is_uri: assert triple.o.type == IRI assert triple.o.iri == object_value else: assert triple.o.type == LITERAL assert triple.o.value == object_value if predicate == "age": assert triple.o.datatype == "integer" elif predicate in ["name", "email"]: assert triple.o.datatype == "string" def test_namespace_management(self): """Test namespace prefix management and expansion""" # Arrange namespaces = { "tg": "http://trustgraph.ai/kg/", "schema": "http://schema.org/", "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", "rdfs": "http://www.w3.org/2000/01/rdf-schema#" } def expand_prefixed_uri(prefixed_uri, namespaces): if ":" not in prefixed_uri: return prefixed_uri prefix, local_name = prefixed_uri.split(":", 1) if prefix in namespaces: return namespaces[prefix] + local_name return prefixed_uri def create_prefixed_uri(full_uri, namespaces): for prefix, namespace_uri in namespaces.items(): if full_uri.startswith(namespace_uri): local_name = full_uri[len(namespace_uri):] return f"{prefix}:{local_name}" return full_uri # Act & Assert test_cases = [ ("tg:person/john-smith", "http://trustgraph.ai/kg/person/john-smith"), ("schema:worksFor", "http://schema.org/worksFor"), ("rdf:type", "http://www.w3.org/1999/02/22-rdf-syntax-ns#type") ] for prefixed, expanded in test_cases: # Test expansion result = expand_prefixed_uri(prefixed, namespaces) assert result == expanded # Test compression compressed = create_prefixed_uri(expanded, namespaces) assert compressed == prefixed def test_triple_validation(self): """Test triple validation rules""" # Arrange def get_term_value(term): """Extract value from a Term""" if term.type == IRI: return term.iri else: return term.value def validate_triple(triple): errors = [] # Check required components s_val = get_term_value(triple.s) if triple.s else None p_val = get_term_value(triple.p) if triple.p else None o_val = get_term_value(triple.o) if triple.o else None if not triple.s or not s_val: errors.append("Missing or empty subject") if not triple.p or not p_val: errors.append("Missing or empty predicate") if not triple.o or not o_val: errors.append("Missing or empty object") # Check URI validity for URI values uri_pattern = r'^https?://[^\s/$.?#].[^\s]*$' if triple.s.type == IRI and not re.match(uri_pattern, triple.s.iri or ""): errors.append("Invalid subject URI format") if triple.p.type == IRI and not re.match(uri_pattern, triple.p.iri or ""): errors.append("Invalid predicate URI format") if triple.o.type == IRI and not re.match(uri_pattern, triple.o.iri or ""): errors.append("Invalid object URI format") # Predicates should typically be URIs if triple.p.type != IRI: errors.append("Predicate should be a URI") return len(errors) == 0, errors # Test valid triple valid_triple = Triple( s=Term(type=IRI, iri="http://trustgraph.ai/kg/person/john"), p=Term(type=IRI, iri="http://schema.org/name"), o=Term(type=LITERAL, value="John Smith", datatype="string") ) # Test invalid triples invalid_triples = [ Triple(s=Term(type=IRI, iri=""), p=Term(type=IRI, iri="http://schema.org/name"), o=Term(type=LITERAL, value="John")), # Empty subject Triple(s=Term(type=IRI, iri="http://trustgraph.ai/kg/person/john"), p=Term(type=LITERAL, value="name"), # Non-URI predicate o=Term(type=LITERAL, value="John")), Triple(s=Term(type=IRI, iri="invalid-uri"), p=Term(type=IRI, iri="http://schema.org/name"), o=Term(type=LITERAL, value="John")) # Invalid URI format ] # Act & Assert is_valid, errors = validate_triple(valid_triple) assert is_valid, f"Valid triple failed validation: {errors}" for invalid_triple in invalid_triples: is_valid, errors = validate_triple(invalid_triple) assert not is_valid, f"Invalid triple passed validation: {invalid_triple}" assert len(errors) > 0 def test_batch_triple_construction(self): """Test constructing multiple triples from entity/relationship data""" # Arrange entities = [ {"text": "John Smith", "type": "PERSON"}, {"text": "OpenAI", "type": "ORG"}, {"text": "San Francisco", "type": "PLACE"} ] relationships = [ {"subject": "John Smith", "predicate": "works_for", "object": "OpenAI"}, {"subject": "OpenAI", "predicate": "located_in", "object": "San Francisco"} ] def construct_triple_batch(entities, relationships, document_id="doc-1"): triples = [] # Create type triples for entities for entity in entities: entity_uri = f"http://trustgraph.ai/kg/{entity['type'].lower()}/{entity['text'].lower().replace(' ', '-')}" type_uri = f"http://trustgraph.ai/kg/type/{entity['type']}" type_triple = Triple( s=Term(type=IRI, iri=entity_uri), p=Term(type=IRI, iri="http://www.w3.org/1999/02/22-rdf-syntax-ns#type"), o=Term(type=IRI, iri=type_uri) ) triples.append(type_triple) # Create relationship triples for rel in relationships: subject_uri = f"http://trustgraph.ai/kg/entity/{rel['subject'].lower().replace(' ', '-')}" object_uri = f"http://trustgraph.ai/kg/entity/{rel['object'].lower().replace(' ', '-')}" predicate_uri = f"http://schema.org/{rel['predicate'].replace('_', '')}" rel_triple = Triple( s=Term(type=IRI, iri=subject_uri), p=Term(type=IRI, iri=predicate_uri), o=Term(type=IRI, iri=object_uri) ) triples.append(rel_triple) return triples # Act triples = construct_triple_batch(entities, relationships) # Assert assert len(triples) == len(entities) + len(relationships) # Type triples + relationship triples # Check that all triples are valid Triple objects for triple in triples: assert isinstance(triple, Triple) assert triple.s.iri != "" assert triple.p.iri != "" assert triple.o.iri != "" def test_triples_batch_object_creation(self): """Test creating Triples batch objects with metadata""" # Arrange sample_triples = [ Triple( s=Term(type=IRI, iri="http://trustgraph.ai/kg/person/john"), p=Term(type=IRI, iri="http://schema.org/name"), o=Term(type=LITERAL, value="John Smith", datatype="string") ), Triple( s=Term(type=IRI, iri="http://trustgraph.ai/kg/person/john"), p=Term(type=IRI, iri="http://schema.org/worksFor"), o=Term(type=IRI, iri="http://trustgraph.ai/kg/org/openai") ) ] metadata = Metadata( id="test-doc-123", user="test_user", collection="test_collection", ) # Act triples_batch = Triples( metadata=metadata, triples=sample_triples ) # Assert assert isinstance(triples_batch, Triples) assert triples_batch.metadata.id == "test-doc-123" assert triples_batch.metadata.user == "test_user" assert triples_batch.metadata.collection == "test_collection" assert len(triples_batch.triples) == 2 # Check that triples are properly embedded for triple in triples_batch.triples: assert isinstance(triple, Triple) assert isinstance(triple.s, Term) assert isinstance(triple.p, Term) assert isinstance(triple.o, Term) def test_uri_collision_handling(self): """Test handling of URI collisions and duplicate detection""" # Arrange entities = [ {"text": "John Smith", "type": "PERSON", "context": "Engineer at OpenAI"}, {"text": "John Smith", "type": "PERSON", "context": "Professor at Stanford"}, {"text": "Apple Inc.", "type": "ORG", "context": "Technology company"}, {"text": "Apple", "type": "PRODUCT", "context": "Fruit"} ] def generate_unique_uri(entity, existing_uris): base_text = entity["text"].lower().replace(" ", "-") entity_type = entity["type"].lower() base_uri = f"http://trustgraph.ai/kg/{entity_type}/{base_text}" # If URI doesn't exist, use it if base_uri not in existing_uris: return base_uri # Generate hash from context to create unique identifier context = entity.get("context", "") context_hash = hashlib.md5(context.encode()).hexdigest()[:8] unique_uri = f"{base_uri}-{context_hash}" return unique_uri # Act generated_uris = [] existing_uris = set() for entity in entities: uri = generate_unique_uri(entity, existing_uris) generated_uris.append(uri) existing_uris.add(uri) # Assert # All URIs should be unique assert len(generated_uris) == len(set(generated_uris)) # Both John Smith entities should have different URIs john_smith_uris = [uri for uri in generated_uris if "john-smith" in uri] assert len(john_smith_uris) == 2 assert john_smith_uris[0] != john_smith_uris[1] # Apple entities should have different URIs due to different types apple_uris = [uri for uri in generated_uris if "apple" in uri] assert len(apple_uris) == 2 assert apple_uris[0] != apple_uris[1]