Changed schema for Value -> Term, majorly breaking change (#622)

* Changed schema for Value -> Term, majorly breaking change

* Following the schema change, Value -> Term into all processing

* Updated Cassandra for g, p, s, o index patterns (7 indexes)

* Reviewed and updated all tests

* Neo4j, Memgraph and FalkorDB remain broken, will look at once settled down
This commit is contained in:
cybermaggedon 2026-01-27 13:48:08 +00:00 committed by GitHub
parent e061f2c633
commit cf0daedefa
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
86 changed files with 2458 additions and 1764 deletions

View file

@ -2,13 +2,13 @@
Unit tests for triple construction logic
Tests the core business logic for constructing RDF triples from extracted
entities and relationships, including URI generation, Value object creation,
entities and relationships, including URI generation, Term object creation,
and triple validation.
"""
import pytest
from unittest.mock import Mock
from .conftest import Triple, Triples, Value, Metadata
from .conftest import Triple, Triples, Term, Metadata, IRI, LITERAL
import re
import hashlib
@ -48,80 +48,82 @@ class TestTripleConstructionLogic:
generated_uri = generate_uri(text, entity_type)
assert generated_uri == expected_uri, f"URI generation failed for '{text}'"
def test_value_object_creation(self):
"""Test creation of Value objects for subjects, predicates, and objects"""
def test_term_object_creation(self):
"""Test creation of Term objects for subjects, predicates, and objects"""
# Arrange
def create_value_object(text, is_uri, value_type=""):
return Value(
value=text,
is_uri=is_uri,
type=value_type
)
def create_term_object(text, is_uri, datatype=""):
if is_uri:
return Term(type=IRI, iri=text)
else:
return Term(type=LITERAL, value=text, datatype=datatype if datatype else None)
test_cases = [
("http://trustgraph.ai/kg/person/john-smith", True, ""),
("John Smith", False, "string"),
("42", False, "integer"),
("http://schema.org/worksFor", True, "")
]
# Act & Assert
for value_text, is_uri, value_type in test_cases:
value_obj = create_value_object(value_text, is_uri, value_type)
assert isinstance(value_obj, Value)
assert value_obj.value == value_text
assert value_obj.is_uri == is_uri
assert value_obj.type == value_type
for value_text, is_uri, datatype in test_cases:
term_obj = create_term_object(value_text, is_uri, datatype)
assert isinstance(term_obj, Term)
if is_uri:
assert term_obj.type == IRI
assert term_obj.iri == value_text
else:
assert term_obj.type == LITERAL
assert term_obj.value == value_text
def test_triple_construction_from_relationship(self):
"""Test constructing Triple objects from relationships"""
# Arrange
relationship = {
"subject": "John Smith",
"predicate": "works_for",
"predicate": "works_for",
"object": "OpenAI",
"subject_type": "PERSON",
"object_type": "ORG"
}
def construct_triple(relationship, uri_base="http://trustgraph.ai/kg"):
# Generate URIs
subject_uri = f"{uri_base}/person/{relationship['subject'].lower().replace(' ', '-')}"
object_uri = f"{uri_base}/org/{relationship['object'].lower().replace(' ', '-')}"
# Map predicate to schema.org URI
predicate_mappings = {
"works_for": "http://schema.org/worksFor",
"located_in": "http://schema.org/location",
"developed": "http://schema.org/creator"
}
predicate_uri = predicate_mappings.get(relationship["predicate"],
predicate_uri = predicate_mappings.get(relationship["predicate"],
f"{uri_base}/predicate/{relationship['predicate']}")
# Create Value objects
subject_value = Value(value=subject_uri, is_uri=True, type="")
predicate_value = Value(value=predicate_uri, is_uri=True, type="")
object_value = Value(value=object_uri, is_uri=True, type="")
# Create Term objects
subject_term = Term(type=IRI, iri=subject_uri)
predicate_term = Term(type=IRI, iri=predicate_uri)
object_term = Term(type=IRI, iri=object_uri)
# Create Triple
return Triple(
s=subject_value,
p=predicate_value,
o=object_value
s=subject_term,
p=predicate_term,
o=object_term
)
# Act
triple = construct_triple(relationship)
# Assert
assert isinstance(triple, Triple)
assert triple.s.value == "http://trustgraph.ai/kg/person/john-smith"
assert triple.s.is_uri is True
assert triple.p.value == "http://schema.org/worksFor"
assert triple.p.is_uri is True
assert triple.o.value == "http://trustgraph.ai/kg/org/openai"
assert triple.o.is_uri is True
assert triple.s.iri == "http://trustgraph.ai/kg/person/john-smith"
assert triple.s.type == IRI
assert triple.p.iri == "http://schema.org/worksFor"
assert triple.p.type == IRI
assert triple.o.iri == "http://trustgraph.ai/kg/org/openai"
assert triple.o.type == IRI
def test_literal_value_handling(self):
"""Test handling of literal values vs URI values"""
@ -132,10 +134,10 @@ class TestTripleConstructionLogic:
("John Smith", "email", "john@example.com", False), # Literal email
("John Smith", "worksFor", "http://trustgraph.ai/kg/org/openai", True) # URI reference
]
def create_triple_with_literal(subject_uri, predicate, object_value, object_is_uri):
subject_val = Value(value=subject_uri, is_uri=True, type="")
subject_term = Term(type=IRI, iri=subject_uri)
# Determine predicate URI
predicate_mappings = {
"name": "http://schema.org/name",
@ -144,32 +146,37 @@ class TestTripleConstructionLogic:
"worksFor": "http://schema.org/worksFor"
}
predicate_uri = predicate_mappings.get(predicate, f"http://trustgraph.ai/kg/predicate/{predicate}")
predicate_val = Value(value=predicate_uri, is_uri=True, type="")
# Create object value with appropriate type
object_type = ""
if not object_is_uri:
predicate_term = Term(type=IRI, iri=predicate_uri)
# Create object term with appropriate type
if object_is_uri:
object_term = Term(type=IRI, iri=object_value)
else:
datatype = None
if predicate == "age":
object_type = "integer"
datatype = "integer"
elif predicate in ["name", "email"]:
object_type = "string"
object_val = Value(value=object_value, is_uri=object_is_uri, type=object_type)
return Triple(s=subject_val, p=predicate_val, o=object_val)
datatype = "string"
object_term = Term(type=LITERAL, value=object_value, datatype=datatype)
return Triple(s=subject_term, p=predicate_term, o=object_term)
# Act & Assert
for subject_uri, predicate, object_value, object_is_uri in test_data:
subject_full_uri = "http://trustgraph.ai/kg/person/john-smith"
triple = create_triple_with_literal(subject_full_uri, predicate, object_value, object_is_uri)
assert triple.o.is_uri == object_is_uri
assert triple.o.value == object_value
if object_is_uri:
assert triple.o.type == IRI
assert triple.o.iri == object_value
else:
assert triple.o.type == LITERAL
assert triple.o.value == object_value
if predicate == "age":
assert triple.o.type == "integer"
assert triple.o.datatype == "integer"
elif predicate in ["name", "email"]:
assert triple.o.type == "string"
assert triple.o.datatype == "string"
def test_namespace_management(self):
"""Test namespace prefix management and expansion"""
@ -216,63 +223,74 @@ class TestTripleConstructionLogic:
def test_triple_validation(self):
"""Test triple validation rules"""
# Arrange
def get_term_value(term):
"""Extract value from a Term"""
if term.type == IRI:
return term.iri
else:
return term.value
def validate_triple(triple):
errors = []
# Check required components
if not triple.s or not triple.s.value:
s_val = get_term_value(triple.s) if triple.s else None
p_val = get_term_value(triple.p) if triple.p else None
o_val = get_term_value(triple.o) if triple.o else None
if not triple.s or not s_val:
errors.append("Missing or empty subject")
if not triple.p or not triple.p.value:
if not triple.p or not p_val:
errors.append("Missing or empty predicate")
if not triple.o or not triple.o.value:
if not triple.o or not o_val:
errors.append("Missing or empty object")
# Check URI validity for URI values
uri_pattern = r'^https?://[^\s/$.?#].[^\s]*$'
if triple.s.is_uri and not re.match(uri_pattern, triple.s.value):
if triple.s.type == IRI and not re.match(uri_pattern, triple.s.iri or ""):
errors.append("Invalid subject URI format")
if triple.p.is_uri and not re.match(uri_pattern, triple.p.value):
if triple.p.type == IRI and not re.match(uri_pattern, triple.p.iri or ""):
errors.append("Invalid predicate URI format")
if triple.o.is_uri and not re.match(uri_pattern, triple.o.value):
if triple.o.type == IRI and not re.match(uri_pattern, triple.o.iri or ""):
errors.append("Invalid object URI format")
# Predicates should typically be URIs
if not triple.p.is_uri:
if triple.p.type != IRI:
errors.append("Predicate should be a URI")
return len(errors) == 0, errors
# Test valid triple
valid_triple = Triple(
s=Value(value="http://trustgraph.ai/kg/person/john", is_uri=True, type=""),
p=Value(value="http://schema.org/name", is_uri=True, type=""),
o=Value(value="John Smith", is_uri=False, type="string")
s=Term(type=IRI, iri="http://trustgraph.ai/kg/person/john"),
p=Term(type=IRI, iri="http://schema.org/name"),
o=Term(type=LITERAL, value="John Smith", datatype="string")
)
# Test invalid triples
invalid_triples = [
Triple(s=Value(value="", is_uri=True, type=""),
p=Value(value="http://schema.org/name", is_uri=True, type=""),
o=Value(value="John", is_uri=False, type="")), # Empty subject
Triple(s=Value(value="http://trustgraph.ai/kg/person/john", is_uri=True, type=""),
p=Value(value="name", is_uri=False, type=""), # Non-URI predicate
o=Value(value="John", is_uri=False, type="")),
Triple(s=Value(value="invalid-uri", is_uri=True, type=""),
p=Value(value="http://schema.org/name", is_uri=True, type=""),
o=Value(value="John", is_uri=False, type="")) # Invalid URI format
Triple(s=Term(type=IRI, iri=""),
p=Term(type=IRI, iri="http://schema.org/name"),
o=Term(type=LITERAL, value="John")), # Empty subject
Triple(s=Term(type=IRI, iri="http://trustgraph.ai/kg/person/john"),
p=Term(type=LITERAL, value="name"), # Non-URI predicate
o=Term(type=LITERAL, value="John")),
Triple(s=Term(type=IRI, iri="invalid-uri"),
p=Term(type=IRI, iri="http://schema.org/name"),
o=Term(type=LITERAL, value="John")) # Invalid URI format
]
# Act & Assert
is_valid, errors = validate_triple(valid_triple)
assert is_valid, f"Valid triple failed validation: {errors}"
for invalid_triple in invalid_triples:
is_valid, errors = validate_triple(invalid_triple)
assert not is_valid, f"Invalid triple passed validation: {invalid_triple}"
@ -286,97 +304,97 @@ class TestTripleConstructionLogic:
{"text": "OpenAI", "type": "ORG"},
{"text": "San Francisco", "type": "PLACE"}
]
relationships = [
{"subject": "John Smith", "predicate": "works_for", "object": "OpenAI"},
{"subject": "OpenAI", "predicate": "located_in", "object": "San Francisco"}
]
def construct_triple_batch(entities, relationships, document_id="doc-1"):
triples = []
# Create type triples for entities
for entity in entities:
entity_uri = f"http://trustgraph.ai/kg/{entity['type'].lower()}/{entity['text'].lower().replace(' ', '-')}"
type_uri = f"http://trustgraph.ai/kg/type/{entity['type']}"
type_triple = Triple(
s=Value(value=entity_uri, is_uri=True, type=""),
p=Value(value="http://www.w3.org/1999/02/22-rdf-syntax-ns#type", is_uri=True, type=""),
o=Value(value=type_uri, is_uri=True, type="")
s=Term(type=IRI, iri=entity_uri),
p=Term(type=IRI, iri="http://www.w3.org/1999/02/22-rdf-syntax-ns#type"),
o=Term(type=IRI, iri=type_uri)
)
triples.append(type_triple)
# Create relationship triples
for rel in relationships:
subject_uri = f"http://trustgraph.ai/kg/entity/{rel['subject'].lower().replace(' ', '-')}"
object_uri = f"http://trustgraph.ai/kg/entity/{rel['object'].lower().replace(' ', '-')}"
predicate_uri = f"http://schema.org/{rel['predicate'].replace('_', '')}"
rel_triple = Triple(
s=Value(value=subject_uri, is_uri=True, type=""),
p=Value(value=predicate_uri, is_uri=True, type=""),
o=Value(value=object_uri, is_uri=True, type="")
s=Term(type=IRI, iri=subject_uri),
p=Term(type=IRI, iri=predicate_uri),
o=Term(type=IRI, iri=object_uri)
)
triples.append(rel_triple)
return triples
# Act
triples = construct_triple_batch(entities, relationships)
# Assert
assert len(triples) == len(entities) + len(relationships) # Type triples + relationship triples
# Check that all triples are valid Triple objects
for triple in triples:
assert isinstance(triple, Triple)
assert triple.s.value != ""
assert triple.p.value != ""
assert triple.o.value != ""
assert triple.s.iri != ""
assert triple.p.iri != ""
assert triple.o.iri != ""
def test_triples_batch_object_creation(self):
"""Test creating Triples batch objects with metadata"""
# Arrange
sample_triples = [
Triple(
s=Value(value="http://trustgraph.ai/kg/person/john", is_uri=True, type=""),
p=Value(value="http://schema.org/name", is_uri=True, type=""),
o=Value(value="John Smith", is_uri=False, type="string")
s=Term(type=IRI, iri="http://trustgraph.ai/kg/person/john"),
p=Term(type=IRI, iri="http://schema.org/name"),
o=Term(type=LITERAL, value="John Smith", datatype="string")
),
Triple(
s=Value(value="http://trustgraph.ai/kg/person/john", is_uri=True, type=""),
p=Value(value="http://schema.org/worksFor", is_uri=True, type=""),
o=Value(value="http://trustgraph.ai/kg/org/openai", is_uri=True, type="")
s=Term(type=IRI, iri="http://trustgraph.ai/kg/person/john"),
p=Term(type=IRI, iri="http://schema.org/worksFor"),
o=Term(type=IRI, iri="http://trustgraph.ai/kg/org/openai")
)
]
metadata = Metadata(
id="test-doc-123",
user="test_user",
user="test_user",
collection="test_collection",
metadata=[]
)
# Act
triples_batch = Triples(
metadata=metadata,
triples=sample_triples
)
# Assert
assert isinstance(triples_batch, Triples)
assert triples_batch.metadata.id == "test-doc-123"
assert triples_batch.metadata.user == "test_user"
assert triples_batch.metadata.collection == "test_collection"
assert len(triples_batch.triples) == 2
# Check that triples are properly embedded
for triple in triples_batch.triples:
assert isinstance(triple, Triple)
assert isinstance(triple.s, Value)
assert isinstance(triple.p, Value)
assert isinstance(triple.o, Value)
assert isinstance(triple.s, Term)
assert isinstance(triple.p, Term)
assert isinstance(triple.o, Term)
def test_uri_collision_handling(self):
"""Test handling of URI collisions and duplicate detection"""