Changed schema for Value -> Term, a major breaking change (#622)

* Changed schema for Value -> Term, a major breaking change

* Following the schema change, propagated Value -> Term into all processing

* Updated Cassandra for g, p, s, o index patterns (7 indexes)

* Reviewed and updated all tests

* Neo4j, Memgraph and FalkorDB remain broken; will look at them once things have settled down
This commit is contained in:
cybermaggedon 2026-01-27 13:48:08 +00:00 committed by GitHub
parent e061f2c633
commit cf0daedefa
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
86 changed files with 2458 additions and 1764 deletions

View file

@ -6,11 +6,21 @@ import pytest
from unittest.mock import Mock, AsyncMock
# Mock schema classes for testing
class Value:
def __init__(self, value, is_uri, type):
self.value = value
self.is_uri = is_uri
# Term type constants: single-character tags stored in Term.type
IRI = "i"
LITERAL = "l"
BLANK = "b"
TRIPLE = "t"


class Term:
    """Mock schema Term used by the tests: a plain attribute holder.

    ``type`` is one of the constants above (IRI, LITERAL, BLANK, TRIPLE).
    In the tests, IRI terms carry their identifier in ``iri`` and literal
    terms carry their text in ``value`` (optionally with ``datatype``).
    ``id``, ``language`` and ``triple`` are presumably for blank nodes,
    language-tagged literals and quoted triples -- confirm against the real
    schema.  No validation is performed; every field other than ``type``
    defaults to None.
    """

    def __init__(self, type, iri=None, value=None, id=None, datatype=None,
                 language=None, triple=None):
        # `type` and `id` shadow builtins, but they are the keyword names
        # callers pass, so they must stay as-is.
        self.type = type
        self.iri = iri
        self.value = value
        self.id = id
        self.datatype = datatype
        self.language = language
        self.triple = triple

    def __repr__(self):
        """Show the type plus only the fields that are set, for readable test failures."""
        parts = [f"type={self.type!r}"]
        parts.extend(
            f"{name}={getattr(self, name)!r}"
            for name in ("iri", "value", "id", "datatype", "language", "triple")
            if getattr(self, name) is not None
        )
        return f"Term({', '.join(parts)})"
class Triple:
def __init__(self, s, p, o):
@ -66,32 +76,30 @@ def sample_relationships():
@pytest.fixture
def sample_value_uri():
"""Sample URI Value object"""
return Value(
value="http://example.com/person/john-smith",
is_uri=True,
type=""
def sample_term_uri():
"""Sample URI Term object"""
return Term(
type=IRI,
iri="http://example.com/person/john-smith"
)
@pytest.fixture
def sample_value_literal():
"""Sample literal Value object"""
return Value(
value="John Smith",
is_uri=False,
type="string"
def sample_term_literal():
"""Sample literal Term object"""
return Term(
type=LITERAL,
value="John Smith"
)
@pytest.fixture
def sample_triple(sample_value_uri, sample_value_literal):
def sample_triple(sample_term_uri, sample_term_literal):
"""Sample Triple object"""
return Triple(
s=sample_value_uri,
p=Value(value="http://schema.org/name", is_uri=True, type=""),
o=sample_value_literal
s=sample_term_uri,
p=Term(type=IRI, iri="http://schema.org/name"),
o=sample_term_literal
)

View file

@ -11,7 +11,7 @@ import json
from unittest.mock import AsyncMock, MagicMock, patch
from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Value, Error
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
from trustgraph.schema import EntityContext, EntityContexts
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
from trustgraph.template.prompt_manager import PromptManager
@ -53,9 +53,9 @@ class TestAgentKgExtractor:
id="doc123",
metadata=[
Triple(
s=Value(value="doc123", is_uri=True),
p=Value(value="http://example.org/type", is_uri=True),
o=Value(value="document", is_uri=False)
s=Term(type=IRI, iri="doc123"),
p=Term(type=IRI, iri="http://example.org/type"),
o=Term(type=LITERAL, value="document")
)
]
)
@ -178,27 +178,27 @@ This is not JSON at all
triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
# Check entity label triple
label_triple = next((t for t in triples if t.p.value == RDF_LABEL and t.o.value == "Machine Learning"), None)
label_triple = next((t for t in triples if t.p.iri == RDF_LABEL and t.o.value == "Machine Learning"), None)
assert label_triple is not None
assert label_triple.s.value == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
assert label_triple.s.is_uri == True
assert label_triple.o.is_uri == False
assert label_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
assert label_triple.s.type == IRI
assert label_triple.o.type == LITERAL
# Check definition triple
def_triple = next((t for t in triples if t.p.value == DEFINITION), None)
def_triple = next((t for t in triples if t.p.iri == DEFINITION), None)
assert def_triple is not None
assert def_triple.s.value == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
assert def_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
assert def_triple.o.value == "A subset of AI that enables learning from data."
# Check subject-of triple
subject_of_triple = next((t for t in triples if t.p.value == SUBJECT_OF), None)
subject_of_triple = next((t for t in triples if t.p.iri == SUBJECT_OF), None)
assert subject_of_triple is not None
assert subject_of_triple.s.value == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
assert subject_of_triple.o.value == "doc123"
assert subject_of_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
assert subject_of_triple.o.iri == "doc123"
# Check entity context
assert len(entity_contexts) == 1
assert entity_contexts[0].entity.value == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
assert entity_contexts[0].entity.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
assert entity_contexts[0].context == "A subset of AI that enables learning from data."
def test_process_extraction_data_relationships(self, agent_extractor, sample_metadata):
@ -218,25 +218,25 @@ This is not JSON at all
# Check that subject, predicate, and object labels are created
subject_uri = f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
predicate_uri = f"{TRUSTGRAPH_ENTITIES}is_subset_of"
# Find label triples
subject_label = next((t for t in triples if t.s.value == subject_uri and t.p.value == RDF_LABEL), None)
subject_label = next((t for t in triples if t.s.iri == subject_uri and t.p.iri == RDF_LABEL), None)
assert subject_label is not None
assert subject_label.o.value == "Machine Learning"
predicate_label = next((t for t in triples if t.s.value == predicate_uri and t.p.value == RDF_LABEL), None)
predicate_label = next((t for t in triples if t.s.iri == predicate_uri and t.p.iri == RDF_LABEL), None)
assert predicate_label is not None
assert predicate_label.o.value == "is_subset_of"
# Check main relationship triple
object_uri = f"{TRUSTGRAPH_ENTITIES}Artificial%20Intelligence"
rel_triple = next((t for t in triples if t.s.value == subject_uri and t.p.value == predicate_uri), None)
rel_triple = next((t for t in triples if t.s.iri == subject_uri and t.p.iri == predicate_uri), None)
assert rel_triple is not None
assert rel_triple.o.value == object_uri
assert rel_triple.o.is_uri == True
assert rel_triple.o.iri == object_uri
assert rel_triple.o.type == IRI
# Check subject-of relationships
subject_of_triples = [t for t in triples if t.p.value == SUBJECT_OF and t.o.value == "doc123"]
subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF and t.o.iri == "doc123"]
assert len(subject_of_triples) >= 2 # At least subject and predicate should have subject-of relations
def test_process_extraction_data_literal_object(self, agent_extractor, sample_metadata):
@ -254,7 +254,7 @@ This is not JSON at all
triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
# Check that object labels are not created for literal objects
object_labels = [t for t in triples if t.p.value == RDF_LABEL and t.o.value == "95%"]
object_labels = [t for t in triples if t.p.iri == RDF_LABEL and t.o.value == "95%"]
# Based on the code logic, it should not create object labels for non-entity objects
# But there might be a bug in the original implementation
@ -263,12 +263,12 @@ This is not JSON at all
triples, entity_contexts = agent_extractor.process_extraction_data(sample_extraction_data, sample_metadata)
# Check that we have both definition and relationship triples
definition_triples = [t for t in triples if t.p.value == DEFINITION]
definition_triples = [t for t in triples if t.p.iri == DEFINITION]
assert len(definition_triples) == 2 # Two definitions
# Check entity contexts are created for definitions
assert len(entity_contexts) == 2
entity_uris = [ec.entity.value for ec in entity_contexts]
entity_uris = [ec.entity.iri for ec in entity_contexts]
assert f"{TRUSTGRAPH_ENTITIES}Machine%20Learning" in entity_uris
assert f"{TRUSTGRAPH_ENTITIES}Neural%20Networks" in entity_uris
@ -282,7 +282,7 @@ This is not JSON at all
triples, entity_contexts = agent_extractor.process_extraction_data(data, metadata)
# Should not create subject-of relationships when no metadata ID
subject_of_triples = [t for t in triples if t.p.value == SUBJECT_OF]
subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF]
assert len(subject_of_triples) == 0
# Should still create entity contexts
@ -327,17 +327,17 @@ This is not JSON at all
async def test_emit_triples(self, agent_extractor, sample_metadata):
"""Test emitting triples to publisher"""
mock_publisher = AsyncMock()
test_triples = [
Triple(
s=Value(value="test:subject", is_uri=True),
p=Value(value="test:predicate", is_uri=True),
o=Value(value="test object", is_uri=False)
s=Term(type=IRI, iri="test:subject"),
p=Term(type=IRI, iri="test:predicate"),
o=Term(type=LITERAL, value="test object")
)
]
await agent_extractor.emit_triples(mock_publisher, sample_metadata, test_triples)
mock_publisher.send.assert_called_once()
sent_triples = mock_publisher.send.call_args[0][0]
assert isinstance(sent_triples, Triples)
@ -348,22 +348,22 @@ This is not JSON at all
# Note: metadata.metadata is now empty array in the new implementation
assert sent_triples.metadata.metadata == []
assert len(sent_triples.triples) == 1
assert sent_triples.triples[0].s.value == "test:subject"
assert sent_triples.triples[0].s.iri == "test:subject"
@pytest.mark.asyncio
async def test_emit_entity_contexts(self, agent_extractor, sample_metadata):
"""Test emitting entity contexts to publisher"""
mock_publisher = AsyncMock()
test_contexts = [
EntityContext(
entity=Value(value="test:entity", is_uri=True),
entity=Term(type=IRI, iri="test:entity"),
context="Test context"
)
]
await agent_extractor.emit_entity_contexts(mock_publisher, sample_metadata, test_contexts)
mock_publisher.send.assert_called_once()
sent_contexts = mock_publisher.send.call_args[0][0]
assert isinstance(sent_contexts, EntityContexts)
@ -374,7 +374,7 @@ This is not JSON at all
# Note: metadata.metadata is now empty array in the new implementation
assert sent_contexts.metadata.metadata == []
assert len(sent_contexts.entities) == 1
assert sent_contexts.entities[0].entity.value == "test:entity"
assert sent_contexts.entities[0].entity.iri == "test:entity"
def test_agent_extractor_initialization_params(self):
"""Test agent extractor parameter validation"""

View file

@ -11,7 +11,7 @@ import urllib.parse
from unittest.mock import AsyncMock, MagicMock
from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Value
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
from trustgraph.schema import EntityContext, EntityContexts
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
@ -188,7 +188,7 @@ class TestAgentKgExtractionEdgeCases:
triples, contexts = agent_extractor.process_extraction_data(data, metadata)
# Should not create subject-of triples when ID is empty string
subject_of_triples = [t for t in triples if t.p.value == SUBJECT_OF]
subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF]
assert len(subject_of_triples) == 0
def test_process_extraction_data_special_entity_names(self, agent_extractor):
@ -221,7 +221,7 @@ class TestAgentKgExtractionEdgeCases:
# Verify URIs were properly encoded
for i, entity in enumerate(special_entities):
expected_uri = f"{TRUSTGRAPH_ENTITIES}{urllib.parse.quote(entity)}"
assert contexts[i].entity.value == expected_uri
assert contexts[i].entity.iri == expected_uri
def test_process_extraction_data_very_long_definitions(self, agent_extractor):
"""Test processing with very long entity definitions"""
@ -241,7 +241,7 @@ class TestAgentKgExtractionEdgeCases:
assert contexts[0].context == long_definition
# Find definition triple
def_triple = next((t for t in triples if t.p.value == DEFINITION), None)
def_triple = next((t for t in triples if t.p.iri == DEFINITION), None)
assert def_triple is not None
assert def_triple.o.value == long_definition
@ -262,7 +262,7 @@ class TestAgentKgExtractionEdgeCases:
assert len(contexts) == 4
# Check that both definitions for "Machine Learning" are present
ml_contexts = [ec for ec in contexts if "Machine%20Learning" in ec.entity.value]
ml_contexts = [ec for ec in contexts if "Machine%20Learning" in ec.entity.iri]
assert len(ml_contexts) == 2
assert ml_contexts[0].context == "First definition"
assert ml_contexts[1].context == "Second definition"
@ -286,7 +286,7 @@ class TestAgentKgExtractionEdgeCases:
assert len(contexts) == 3
# Empty entity should create empty URI after encoding
empty_entity_context = next((ec for ec in contexts if ec.entity.value == TRUSTGRAPH_ENTITIES), None)
empty_entity_context = next((ec for ec in contexts if ec.entity.iri == TRUSTGRAPH_ENTITIES), None)
assert empty_entity_context is not None
def test_process_extraction_data_nested_json_in_strings(self, agent_extractor):
@ -338,7 +338,7 @@ class TestAgentKgExtractionEdgeCases:
# Should process all relationships
# Note: The current implementation has some logic issues that these tests document
assert len([t for t in triples if t.p.value != RDF_LABEL and t.p.value != SUBJECT_OF]) >= 7
assert len([t for t in triples if t.p.iri != RDF_LABEL and t.p.iri != SUBJECT_OF]) >= 7
@pytest.mark.asyncio
async def test_emit_empty_collections(self, agent_extractor):

View file

@ -7,7 +7,7 @@ processing graph structures, and performing graph operations.
import pytest
from unittest.mock import Mock
from .conftest import Triple, Value, Metadata
from .conftest import Triple, Metadata
from collections import defaultdict, deque

View file

@ -2,13 +2,13 @@
Unit tests for triple construction logic
Tests the core business logic for constructing RDF triples from extracted
entities and relationships, including URI generation, Value object creation,
entities and relationships, including URI generation, Term object creation,
and triple validation.
"""
import pytest
from unittest.mock import Mock
from .conftest import Triple, Triples, Value, Metadata
from .conftest import Triple, Triples, Term, Metadata, IRI, LITERAL
import re
import hashlib
@ -48,80 +48,82 @@ class TestTripleConstructionLogic:
generated_uri = generate_uri(text, entity_type)
assert generated_uri == expected_uri, f"URI generation failed for '{text}'"
def test_value_object_creation(self):
"""Test creation of Value objects for subjects, predicates, and objects"""
def test_term_object_creation(self):
"""Test creation of Term objects for subjects, predicates, and objects"""
# Arrange
def create_value_object(text, is_uri, value_type=""):
return Value(
value=text,
is_uri=is_uri,
type=value_type
)
def create_term_object(text, is_uri, datatype=""):
    """Build an IRI Term when is_uri is truthy, otherwise a literal Term.

    For literals, an empty (falsy) datatype is stored as None.
    """
    if not is_uri:
        return Term(type=LITERAL, value=text, datatype=datatype or None)
    return Term(type=IRI, iri=text)
test_cases = [
("http://trustgraph.ai/kg/person/john-smith", True, ""),
("John Smith", False, "string"),
("42", False, "integer"),
("http://schema.org/worksFor", True, "")
]
# Act & Assert
for value_text, is_uri, value_type in test_cases:
value_obj = create_value_object(value_text, is_uri, value_type)
assert isinstance(value_obj, Value)
assert value_obj.value == value_text
assert value_obj.is_uri == is_uri
assert value_obj.type == value_type
for value_text, is_uri, datatype in test_cases:
term_obj = create_term_object(value_text, is_uri, datatype)
assert isinstance(term_obj, Term)
if is_uri:
assert term_obj.type == IRI
assert term_obj.iri == value_text
else:
assert term_obj.type == LITERAL
assert term_obj.value == value_text
def test_triple_construction_from_relationship(self):
"""Test constructing Triple objects from relationships"""
# Arrange
relationship = {
"subject": "John Smith",
"predicate": "works_for",
"predicate": "works_for",
"object": "OpenAI",
"subject_type": "PERSON",
"object_type": "ORG"
}
def construct_triple(relationship, uri_base="http://trustgraph.ai/kg"):
# Generate URIs
subject_uri = f"{uri_base}/person/{relationship['subject'].lower().replace(' ', '-')}"
object_uri = f"{uri_base}/org/{relationship['object'].lower().replace(' ', '-')}"
# Map predicate to schema.org URI
predicate_mappings = {
"works_for": "http://schema.org/worksFor",
"located_in": "http://schema.org/location",
"developed": "http://schema.org/creator"
}
predicate_uri = predicate_mappings.get(relationship["predicate"],
predicate_uri = predicate_mappings.get(relationship["predicate"],
f"{uri_base}/predicate/{relationship['predicate']}")
# Create Value objects
subject_value = Value(value=subject_uri, is_uri=True, type="")
predicate_value = Value(value=predicate_uri, is_uri=True, type="")
object_value = Value(value=object_uri, is_uri=True, type="")
# Create Term objects
subject_term = Term(type=IRI, iri=subject_uri)
predicate_term = Term(type=IRI, iri=predicate_uri)
object_term = Term(type=IRI, iri=object_uri)
# Create Triple
return Triple(
s=subject_value,
p=predicate_value,
o=object_value
s=subject_term,
p=predicate_term,
o=object_term
)
# Act
triple = construct_triple(relationship)
# Assert
assert isinstance(triple, Triple)
assert triple.s.value == "http://trustgraph.ai/kg/person/john-smith"
assert triple.s.is_uri is True
assert triple.p.value == "http://schema.org/worksFor"
assert triple.p.is_uri is True
assert triple.o.value == "http://trustgraph.ai/kg/org/openai"
assert triple.o.is_uri is True
assert triple.s.iri == "http://trustgraph.ai/kg/person/john-smith"
assert triple.s.type == IRI
assert triple.p.iri == "http://schema.org/worksFor"
assert triple.p.type == IRI
assert triple.o.iri == "http://trustgraph.ai/kg/org/openai"
assert triple.o.type == IRI
def test_literal_value_handling(self):
"""Test handling of literal values vs URI values"""
@ -132,10 +134,10 @@ class TestTripleConstructionLogic:
("John Smith", "email", "john@example.com", False), # Literal email
("John Smith", "worksFor", "http://trustgraph.ai/kg/org/openai", True) # URI reference
]
def create_triple_with_literal(subject_uri, predicate, object_value, object_is_uri):
subject_val = Value(value=subject_uri, is_uri=True, type="")
subject_term = Term(type=IRI, iri=subject_uri)
# Determine predicate URI
predicate_mappings = {
"name": "http://schema.org/name",
@ -144,32 +146,37 @@ class TestTripleConstructionLogic:
"worksFor": "http://schema.org/worksFor"
}
predicate_uri = predicate_mappings.get(predicate, f"http://trustgraph.ai/kg/predicate/{predicate}")
predicate_val = Value(value=predicate_uri, is_uri=True, type="")
# Create object value with appropriate type
object_type = ""
if not object_is_uri:
predicate_term = Term(type=IRI, iri=predicate_uri)
# Create object term with appropriate type
if object_is_uri:
object_term = Term(type=IRI, iri=object_value)
else:
datatype = None
if predicate == "age":
object_type = "integer"
datatype = "integer"
elif predicate in ["name", "email"]:
object_type = "string"
object_val = Value(value=object_value, is_uri=object_is_uri, type=object_type)
return Triple(s=subject_val, p=predicate_val, o=object_val)
datatype = "string"
object_term = Term(type=LITERAL, value=object_value, datatype=datatype)
return Triple(s=subject_term, p=predicate_term, o=object_term)
# Act & Assert
for subject_uri, predicate, object_value, object_is_uri in test_data:
subject_full_uri = "http://trustgraph.ai/kg/person/john-smith"
triple = create_triple_with_literal(subject_full_uri, predicate, object_value, object_is_uri)
assert triple.o.is_uri == object_is_uri
assert triple.o.value == object_value
if object_is_uri:
assert triple.o.type == IRI
assert triple.o.iri == object_value
else:
assert triple.o.type == LITERAL
assert triple.o.value == object_value
if predicate == "age":
assert triple.o.type == "integer"
assert triple.o.datatype == "integer"
elif predicate in ["name", "email"]:
assert triple.o.type == "string"
assert triple.o.datatype == "string"
def test_namespace_management(self):
"""Test namespace prefix management and expansion"""
@ -216,63 +223,74 @@ class TestTripleConstructionLogic:
def test_triple_validation(self):
"""Test triple validation rules"""
# Arrange
def get_term_value(term):
    """Return term.iri for IRI-typed terms and term.value for any other type."""
    return term.iri if term.type == IRI else term.value
def validate_triple(triple):
errors = []
# Check required components
if not triple.s or not triple.s.value:
s_val = get_term_value(triple.s) if triple.s else None
p_val = get_term_value(triple.p) if triple.p else None
o_val = get_term_value(triple.o) if triple.o else None
if not triple.s or not s_val:
errors.append("Missing or empty subject")
if not triple.p or not triple.p.value:
if not triple.p or not p_val:
errors.append("Missing or empty predicate")
if not triple.o or not triple.o.value:
if not triple.o or not o_val:
errors.append("Missing or empty object")
# Check URI validity for URI values
uri_pattern = r'^https?://[^\s/$.?#].[^\s]*$'
if triple.s.is_uri and not re.match(uri_pattern, triple.s.value):
if triple.s.type == IRI and not re.match(uri_pattern, triple.s.iri or ""):
errors.append("Invalid subject URI format")
if triple.p.is_uri and not re.match(uri_pattern, triple.p.value):
if triple.p.type == IRI and not re.match(uri_pattern, triple.p.iri or ""):
errors.append("Invalid predicate URI format")
if triple.o.is_uri and not re.match(uri_pattern, triple.o.value):
if triple.o.type == IRI and not re.match(uri_pattern, triple.o.iri or ""):
errors.append("Invalid object URI format")
# Predicates should typically be URIs
if not triple.p.is_uri:
if triple.p.type != IRI:
errors.append("Predicate should be a URI")
return len(errors) == 0, errors
# Test valid triple
valid_triple = Triple(
s=Value(value="http://trustgraph.ai/kg/person/john", is_uri=True, type=""),
p=Value(value="http://schema.org/name", is_uri=True, type=""),
o=Value(value="John Smith", is_uri=False, type="string")
s=Term(type=IRI, iri="http://trustgraph.ai/kg/person/john"),
p=Term(type=IRI, iri="http://schema.org/name"),
o=Term(type=LITERAL, value="John Smith", datatype="string")
)
# Test invalid triples
invalid_triples = [
Triple(s=Value(value="", is_uri=True, type=""),
p=Value(value="http://schema.org/name", is_uri=True, type=""),
o=Value(value="John", is_uri=False, type="")), # Empty subject
Triple(s=Value(value="http://trustgraph.ai/kg/person/john", is_uri=True, type=""),
p=Value(value="name", is_uri=False, type=""), # Non-URI predicate
o=Value(value="John", is_uri=False, type="")),
Triple(s=Value(value="invalid-uri", is_uri=True, type=""),
p=Value(value="http://schema.org/name", is_uri=True, type=""),
o=Value(value="John", is_uri=False, type="")) # Invalid URI format
Triple(s=Term(type=IRI, iri=""),
p=Term(type=IRI, iri="http://schema.org/name"),
o=Term(type=LITERAL, value="John")), # Empty subject
Triple(s=Term(type=IRI, iri="http://trustgraph.ai/kg/person/john"),
p=Term(type=LITERAL, value="name"), # Non-URI predicate
o=Term(type=LITERAL, value="John")),
Triple(s=Term(type=IRI, iri="invalid-uri"),
p=Term(type=IRI, iri="http://schema.org/name"),
o=Term(type=LITERAL, value="John")) # Invalid URI format
]
# Act & Assert
is_valid, errors = validate_triple(valid_triple)
assert is_valid, f"Valid triple failed validation: {errors}"
for invalid_triple in invalid_triples:
is_valid, errors = validate_triple(invalid_triple)
assert not is_valid, f"Invalid triple passed validation: {invalid_triple}"
@ -286,97 +304,97 @@ class TestTripleConstructionLogic:
{"text": "OpenAI", "type": "ORG"},
{"text": "San Francisco", "type": "PLACE"}
]
relationships = [
{"subject": "John Smith", "predicate": "works_for", "object": "OpenAI"},
{"subject": "OpenAI", "predicate": "located_in", "object": "San Francisco"}
]
def construct_triple_batch(entities, relationships, document_id="doc-1"):
triples = []
# Create type triples for entities
for entity in entities:
entity_uri = f"http://trustgraph.ai/kg/{entity['type'].lower()}/{entity['text'].lower().replace(' ', '-')}"
type_uri = f"http://trustgraph.ai/kg/type/{entity['type']}"
type_triple = Triple(
s=Value(value=entity_uri, is_uri=True, type=""),
p=Value(value="http://www.w3.org/1999/02/22-rdf-syntax-ns#type", is_uri=True, type=""),
o=Value(value=type_uri, is_uri=True, type="")
s=Term(type=IRI, iri=entity_uri),
p=Term(type=IRI, iri="http://www.w3.org/1999/02/22-rdf-syntax-ns#type"),
o=Term(type=IRI, iri=type_uri)
)
triples.append(type_triple)
# Create relationship triples
for rel in relationships:
subject_uri = f"http://trustgraph.ai/kg/entity/{rel['subject'].lower().replace(' ', '-')}"
object_uri = f"http://trustgraph.ai/kg/entity/{rel['object'].lower().replace(' ', '-')}"
predicate_uri = f"http://schema.org/{rel['predicate'].replace('_', '')}"
rel_triple = Triple(
s=Value(value=subject_uri, is_uri=True, type=""),
p=Value(value=predicate_uri, is_uri=True, type=""),
o=Value(value=object_uri, is_uri=True, type="")
s=Term(type=IRI, iri=subject_uri),
p=Term(type=IRI, iri=predicate_uri),
o=Term(type=IRI, iri=object_uri)
)
triples.append(rel_triple)
return triples
# Act
triples = construct_triple_batch(entities, relationships)
# Assert
assert len(triples) == len(entities) + len(relationships) # Type triples + relationship triples
# Check that all triples are valid Triple objects
for triple in triples:
assert isinstance(triple, Triple)
assert triple.s.value != ""
assert triple.p.value != ""
assert triple.o.value != ""
assert triple.s.iri != ""
assert triple.p.iri != ""
assert triple.o.iri != ""
def test_triples_batch_object_creation(self):
"""Test creating Triples batch objects with metadata"""
# Arrange
sample_triples = [
Triple(
s=Value(value="http://trustgraph.ai/kg/person/john", is_uri=True, type=""),
p=Value(value="http://schema.org/name", is_uri=True, type=""),
o=Value(value="John Smith", is_uri=False, type="string")
s=Term(type=IRI, iri="http://trustgraph.ai/kg/person/john"),
p=Term(type=IRI, iri="http://schema.org/name"),
o=Term(type=LITERAL, value="John Smith", datatype="string")
),
Triple(
s=Value(value="http://trustgraph.ai/kg/person/john", is_uri=True, type=""),
p=Value(value="http://schema.org/worksFor", is_uri=True, type=""),
o=Value(value="http://trustgraph.ai/kg/org/openai", is_uri=True, type="")
s=Term(type=IRI, iri="http://trustgraph.ai/kg/person/john"),
p=Term(type=IRI, iri="http://schema.org/worksFor"),
o=Term(type=IRI, iri="http://trustgraph.ai/kg/org/openai")
)
]
metadata = Metadata(
id="test-doc-123",
user="test_user",
user="test_user",
collection="test_collection",
metadata=[]
)
# Act
triples_batch = Triples(
metadata=metadata,
triples=sample_triples
)
# Assert
assert isinstance(triples_batch, Triples)
assert triples_batch.metadata.id == "test-doc-123"
assert triples_batch.metadata.user == "test_user"
assert triples_batch.metadata.collection == "test_collection"
assert len(triples_batch.triples) == 2
# Check that triples are properly embedded
for triple in triples_batch.triples:
assert isinstance(triple, Triple)
assert isinstance(triple.s, Value)
assert isinstance(triple.p, Value)
assert isinstance(triple.o, Value)
assert isinstance(triple.s, Term)
assert isinstance(triple.p, Term)
assert isinstance(triple.o, Term)
def test_uri_collision_handling(self):
"""Test handling of URI collisions and duplicate detection"""