mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
Changed schema for Value -> Term, majorly breaking change (#622)
* Changed schema for Value -> Term, majorly breaking change * Following the schema change, Value -> Term into all processing * Updated Cassandra for g, p, s, o index patterns (7 indexes) * Reviewed and updated all tests * Neo4j, Memgraph and FalkorDB remain broken, will look at once settled down
This commit is contained in:
parent
e061f2c633
commit
cf0daedefa
86 changed files with 2458 additions and 1764 deletions
|
|
@ -6,11 +6,21 @@ import pytest
|
|||
from unittest.mock import Mock, AsyncMock
|
||||
|
||||
# Mock schema classes for testing
|
||||
class Value:
|
||||
def __init__(self, value, is_uri, type):
|
||||
self.value = value
|
||||
self.is_uri = is_uri
|
||||
# Term type constants
|
||||
IRI = "i"
|
||||
LITERAL = "l"
|
||||
BLANK = "b"
|
||||
TRIPLE = "t"
|
||||
|
||||
class Term:
|
||||
def __init__(self, type, iri=None, value=None, id=None, datatype=None, language=None, triple=None):
|
||||
self.type = type
|
||||
self.iri = iri
|
||||
self.value = value
|
||||
self.id = id
|
||||
self.datatype = datatype
|
||||
self.language = language
|
||||
self.triple = triple
|
||||
|
||||
class Triple:
|
||||
def __init__(self, s, p, o):
|
||||
|
|
@ -66,32 +76,30 @@ def sample_relationships():
|
|||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_value_uri():
|
||||
"""Sample URI Value object"""
|
||||
return Value(
|
||||
value="http://example.com/person/john-smith",
|
||||
is_uri=True,
|
||||
type=""
|
||||
def sample_term_uri():
|
||||
"""Sample URI Term object"""
|
||||
return Term(
|
||||
type=IRI,
|
||||
iri="http://example.com/person/john-smith"
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_value_literal():
|
||||
"""Sample literal Value object"""
|
||||
return Value(
|
||||
value="John Smith",
|
||||
is_uri=False,
|
||||
type="string"
|
||||
def sample_term_literal():
|
||||
"""Sample literal Term object"""
|
||||
return Term(
|
||||
type=LITERAL,
|
||||
value="John Smith"
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_triple(sample_value_uri, sample_value_literal):
|
||||
def sample_triple(sample_term_uri, sample_term_literal):
|
||||
"""Sample Triple object"""
|
||||
return Triple(
|
||||
s=sample_value_uri,
|
||||
p=Value(value="http://schema.org/name", is_uri=True, type=""),
|
||||
o=sample_value_literal
|
||||
s=sample_term_uri,
|
||||
p=Term(type=IRI, iri="http://schema.org/name"),
|
||||
o=sample_term_literal
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ import json
|
|||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
|
||||
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Value, Error
|
||||
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
|
||||
from trustgraph.schema import EntityContext, EntityContexts
|
||||
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
|
||||
from trustgraph.template.prompt_manager import PromptManager
|
||||
|
|
@ -53,9 +53,9 @@ class TestAgentKgExtractor:
|
|||
id="doc123",
|
||||
metadata=[
|
||||
Triple(
|
||||
s=Value(value="doc123", is_uri=True),
|
||||
p=Value(value="http://example.org/type", is_uri=True),
|
||||
o=Value(value="document", is_uri=False)
|
||||
s=Term(type=IRI, iri="doc123"),
|
||||
p=Term(type=IRI, iri="http://example.org/type"),
|
||||
o=Term(type=LITERAL, value="document")
|
||||
)
|
||||
]
|
||||
)
|
||||
|
|
@ -178,27 +178,27 @@ This is not JSON at all
|
|||
triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
|
||||
|
||||
# Check entity label triple
|
||||
label_triple = next((t for t in triples if t.p.value == RDF_LABEL and t.o.value == "Machine Learning"), None)
|
||||
label_triple = next((t for t in triples if t.p.iri == RDF_LABEL and t.o.value == "Machine Learning"), None)
|
||||
assert label_triple is not None
|
||||
assert label_triple.s.value == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
||||
assert label_triple.s.is_uri == True
|
||||
assert label_triple.o.is_uri == False
|
||||
|
||||
assert label_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
||||
assert label_triple.s.type == IRI
|
||||
assert label_triple.o.type == LITERAL
|
||||
|
||||
# Check definition triple
|
||||
def_triple = next((t for t in triples if t.p.value == DEFINITION), None)
|
||||
def_triple = next((t for t in triples if t.p.iri == DEFINITION), None)
|
||||
assert def_triple is not None
|
||||
assert def_triple.s.value == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
||||
assert def_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
||||
assert def_triple.o.value == "A subset of AI that enables learning from data."
|
||||
|
||||
|
||||
# Check subject-of triple
|
||||
subject_of_triple = next((t for t in triples if t.p.value == SUBJECT_OF), None)
|
||||
subject_of_triple = next((t for t in triples if t.p.iri == SUBJECT_OF), None)
|
||||
assert subject_of_triple is not None
|
||||
assert subject_of_triple.s.value == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
||||
assert subject_of_triple.o.value == "doc123"
|
||||
|
||||
assert subject_of_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
||||
assert subject_of_triple.o.iri == "doc123"
|
||||
|
||||
# Check entity context
|
||||
assert len(entity_contexts) == 1
|
||||
assert entity_contexts[0].entity.value == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
||||
assert entity_contexts[0].entity.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
||||
assert entity_contexts[0].context == "A subset of AI that enables learning from data."
|
||||
|
||||
def test_process_extraction_data_relationships(self, agent_extractor, sample_metadata):
|
||||
|
|
@ -218,25 +218,25 @@ This is not JSON at all
|
|||
# Check that subject, predicate, and object labels are created
|
||||
subject_uri = f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
||||
predicate_uri = f"{TRUSTGRAPH_ENTITIES}is_subset_of"
|
||||
|
||||
|
||||
# Find label triples
|
||||
subject_label = next((t for t in triples if t.s.value == subject_uri and t.p.value == RDF_LABEL), None)
|
||||
subject_label = next((t for t in triples if t.s.iri == subject_uri and t.p.iri == RDF_LABEL), None)
|
||||
assert subject_label is not None
|
||||
assert subject_label.o.value == "Machine Learning"
|
||||
|
||||
predicate_label = next((t for t in triples if t.s.value == predicate_uri and t.p.value == RDF_LABEL), None)
|
||||
|
||||
predicate_label = next((t for t in triples if t.s.iri == predicate_uri and t.p.iri == RDF_LABEL), None)
|
||||
assert predicate_label is not None
|
||||
assert predicate_label.o.value == "is_subset_of"
|
||||
|
||||
|
||||
# Check main relationship triple
|
||||
object_uri = f"{TRUSTGRAPH_ENTITIES}Artificial%20Intelligence"
|
||||
rel_triple = next((t for t in triples if t.s.value == subject_uri and t.p.value == predicate_uri), None)
|
||||
rel_triple = next((t for t in triples if t.s.iri == subject_uri and t.p.iri == predicate_uri), None)
|
||||
assert rel_triple is not None
|
||||
assert rel_triple.o.value == object_uri
|
||||
assert rel_triple.o.is_uri == True
|
||||
|
||||
assert rel_triple.o.iri == object_uri
|
||||
assert rel_triple.o.type == IRI
|
||||
|
||||
# Check subject-of relationships
|
||||
subject_of_triples = [t for t in triples if t.p.value == SUBJECT_OF and t.o.value == "doc123"]
|
||||
subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF and t.o.iri == "doc123"]
|
||||
assert len(subject_of_triples) >= 2 # At least subject and predicate should have subject-of relations
|
||||
|
||||
def test_process_extraction_data_literal_object(self, agent_extractor, sample_metadata):
|
||||
|
|
@ -254,7 +254,7 @@ This is not JSON at all
|
|||
triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
|
||||
|
||||
# Check that object labels are not created for literal objects
|
||||
object_labels = [t for t in triples if t.p.value == RDF_LABEL and t.o.value == "95%"]
|
||||
object_labels = [t for t in triples if t.p.iri == RDF_LABEL and t.o.value == "95%"]
|
||||
# Based on the code logic, it should not create object labels for non-entity objects
|
||||
# But there might be a bug in the original implementation
|
||||
|
||||
|
|
@ -263,12 +263,12 @@ This is not JSON at all
|
|||
triples, entity_contexts = agent_extractor.process_extraction_data(sample_extraction_data, sample_metadata)
|
||||
|
||||
# Check that we have both definition and relationship triples
|
||||
definition_triples = [t for t in triples if t.p.value == DEFINITION]
|
||||
definition_triples = [t for t in triples if t.p.iri == DEFINITION]
|
||||
assert len(definition_triples) == 2 # Two definitions
|
||||
|
||||
|
||||
# Check entity contexts are created for definitions
|
||||
assert len(entity_contexts) == 2
|
||||
entity_uris = [ec.entity.value for ec in entity_contexts]
|
||||
entity_uris = [ec.entity.iri for ec in entity_contexts]
|
||||
assert f"{TRUSTGRAPH_ENTITIES}Machine%20Learning" in entity_uris
|
||||
assert f"{TRUSTGRAPH_ENTITIES}Neural%20Networks" in entity_uris
|
||||
|
||||
|
|
@ -282,7 +282,7 @@ This is not JSON at all
|
|||
triples, entity_contexts = agent_extractor.process_extraction_data(data, metadata)
|
||||
|
||||
# Should not create subject-of relationships when no metadata ID
|
||||
subject_of_triples = [t for t in triples if t.p.value == SUBJECT_OF]
|
||||
subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF]
|
||||
assert len(subject_of_triples) == 0
|
||||
|
||||
# Should still create entity contexts
|
||||
|
|
@ -327,17 +327,17 @@ This is not JSON at all
|
|||
async def test_emit_triples(self, agent_extractor, sample_metadata):
|
||||
"""Test emitting triples to publisher"""
|
||||
mock_publisher = AsyncMock()
|
||||
|
||||
|
||||
test_triples = [
|
||||
Triple(
|
||||
s=Value(value="test:subject", is_uri=True),
|
||||
p=Value(value="test:predicate", is_uri=True),
|
||||
o=Value(value="test object", is_uri=False)
|
||||
s=Term(type=IRI, iri="test:subject"),
|
||||
p=Term(type=IRI, iri="test:predicate"),
|
||||
o=Term(type=LITERAL, value="test object")
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
await agent_extractor.emit_triples(mock_publisher, sample_metadata, test_triples)
|
||||
|
||||
|
||||
mock_publisher.send.assert_called_once()
|
||||
sent_triples = mock_publisher.send.call_args[0][0]
|
||||
assert isinstance(sent_triples, Triples)
|
||||
|
|
@ -348,22 +348,22 @@ This is not JSON at all
|
|||
# Note: metadata.metadata is now empty array in the new implementation
|
||||
assert sent_triples.metadata.metadata == []
|
||||
assert len(sent_triples.triples) == 1
|
||||
assert sent_triples.triples[0].s.value == "test:subject"
|
||||
assert sent_triples.triples[0].s.iri == "test:subject"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_emit_entity_contexts(self, agent_extractor, sample_metadata):
|
||||
"""Test emitting entity contexts to publisher"""
|
||||
mock_publisher = AsyncMock()
|
||||
|
||||
|
||||
test_contexts = [
|
||||
EntityContext(
|
||||
entity=Value(value="test:entity", is_uri=True),
|
||||
entity=Term(type=IRI, iri="test:entity"),
|
||||
context="Test context"
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
await agent_extractor.emit_entity_contexts(mock_publisher, sample_metadata, test_contexts)
|
||||
|
||||
|
||||
mock_publisher.send.assert_called_once()
|
||||
sent_contexts = mock_publisher.send.call_args[0][0]
|
||||
assert isinstance(sent_contexts, EntityContexts)
|
||||
|
|
@ -374,7 +374,7 @@ This is not JSON at all
|
|||
# Note: metadata.metadata is now empty array in the new implementation
|
||||
assert sent_contexts.metadata.metadata == []
|
||||
assert len(sent_contexts.entities) == 1
|
||||
assert sent_contexts.entities[0].entity.value == "test:entity"
|
||||
assert sent_contexts.entities[0].entity.iri == "test:entity"
|
||||
|
||||
def test_agent_extractor_initialization_params(self):
|
||||
"""Test agent extractor parameter validation"""
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ import urllib.parse
|
|||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
|
||||
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Value
|
||||
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, IRI, LITERAL
|
||||
from trustgraph.schema import EntityContext, EntityContexts
|
||||
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
|
||||
|
||||
|
|
@ -188,7 +188,7 @@ class TestAgentKgExtractionEdgeCases:
|
|||
triples, contexts = agent_extractor.process_extraction_data(data, metadata)
|
||||
|
||||
# Should not create subject-of triples when ID is empty string
|
||||
subject_of_triples = [t for t in triples if t.p.value == SUBJECT_OF]
|
||||
subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF]
|
||||
assert len(subject_of_triples) == 0
|
||||
|
||||
def test_process_extraction_data_special_entity_names(self, agent_extractor):
|
||||
|
|
@ -221,7 +221,7 @@ class TestAgentKgExtractionEdgeCases:
|
|||
# Verify URIs were properly encoded
|
||||
for i, entity in enumerate(special_entities):
|
||||
expected_uri = f"{TRUSTGRAPH_ENTITIES}{urllib.parse.quote(entity)}"
|
||||
assert contexts[i].entity.value == expected_uri
|
||||
assert contexts[i].entity.iri == expected_uri
|
||||
|
||||
def test_process_extraction_data_very_long_definitions(self, agent_extractor):
|
||||
"""Test processing with very long entity definitions"""
|
||||
|
|
@ -241,7 +241,7 @@ class TestAgentKgExtractionEdgeCases:
|
|||
assert contexts[0].context == long_definition
|
||||
|
||||
# Find definition triple
|
||||
def_triple = next((t for t in triples if t.p.value == DEFINITION), None)
|
||||
def_triple = next((t for t in triples if t.p.iri == DEFINITION), None)
|
||||
assert def_triple is not None
|
||||
assert def_triple.o.value == long_definition
|
||||
|
||||
|
|
@ -262,7 +262,7 @@ class TestAgentKgExtractionEdgeCases:
|
|||
assert len(contexts) == 4
|
||||
|
||||
# Check that both definitions for "Machine Learning" are present
|
||||
ml_contexts = [ec for ec in contexts if "Machine%20Learning" in ec.entity.value]
|
||||
ml_contexts = [ec for ec in contexts if "Machine%20Learning" in ec.entity.iri]
|
||||
assert len(ml_contexts) == 2
|
||||
assert ml_contexts[0].context == "First definition"
|
||||
assert ml_contexts[1].context == "Second definition"
|
||||
|
|
@ -286,7 +286,7 @@ class TestAgentKgExtractionEdgeCases:
|
|||
assert len(contexts) == 3
|
||||
|
||||
# Empty entity should create empty URI after encoding
|
||||
empty_entity_context = next((ec for ec in contexts if ec.entity.value == TRUSTGRAPH_ENTITIES), None)
|
||||
empty_entity_context = next((ec for ec in contexts if ec.entity.iri == TRUSTGRAPH_ENTITIES), None)
|
||||
assert empty_entity_context is not None
|
||||
|
||||
def test_process_extraction_data_nested_json_in_strings(self, agent_extractor):
|
||||
|
|
@ -338,7 +338,7 @@ class TestAgentKgExtractionEdgeCases:
|
|||
|
||||
# Should process all relationships
|
||||
# Note: The current implementation has some logic issues that these tests document
|
||||
assert len([t for t in triples if t.p.value != RDF_LABEL and t.p.value != SUBJECT_OF]) >= 7
|
||||
assert len([t for t in triples if t.p.iri != RDF_LABEL and t.p.iri != SUBJECT_OF]) >= 7
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_emit_empty_collections(self, agent_extractor):
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ processing graph structures, and performing graph operations.
|
|||
|
||||
import pytest
|
||||
from unittest.mock import Mock
|
||||
from .conftest import Triple, Value, Metadata
|
||||
from .conftest import Triple, Metadata
|
||||
from collections import defaultdict, deque
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -2,13 +2,13 @@
|
|||
Unit tests for triple construction logic
|
||||
|
||||
Tests the core business logic for constructing RDF triples from extracted
|
||||
entities and relationships, including URI generation, Value object creation,
|
||||
entities and relationships, including URI generation, Term object creation,
|
||||
and triple validation.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import Mock
|
||||
from .conftest import Triple, Triples, Value, Metadata
|
||||
from .conftest import Triple, Triples, Term, Metadata, IRI, LITERAL
|
||||
import re
|
||||
import hashlib
|
||||
|
||||
|
|
@ -48,80 +48,82 @@ class TestTripleConstructionLogic:
|
|||
generated_uri = generate_uri(text, entity_type)
|
||||
assert generated_uri == expected_uri, f"URI generation failed for '{text}'"
|
||||
|
||||
def test_value_object_creation(self):
|
||||
"""Test creation of Value objects for subjects, predicates, and objects"""
|
||||
def test_term_object_creation(self):
|
||||
"""Test creation of Term objects for subjects, predicates, and objects"""
|
||||
# Arrange
|
||||
def create_value_object(text, is_uri, value_type=""):
|
||||
return Value(
|
||||
value=text,
|
||||
is_uri=is_uri,
|
||||
type=value_type
|
||||
)
|
||||
|
||||
def create_term_object(text, is_uri, datatype=""):
|
||||
if is_uri:
|
||||
return Term(type=IRI, iri=text)
|
||||
else:
|
||||
return Term(type=LITERAL, value=text, datatype=datatype if datatype else None)
|
||||
|
||||
test_cases = [
|
||||
("http://trustgraph.ai/kg/person/john-smith", True, ""),
|
||||
("John Smith", False, "string"),
|
||||
("42", False, "integer"),
|
||||
("http://schema.org/worksFor", True, "")
|
||||
]
|
||||
|
||||
|
||||
# Act & Assert
|
||||
for value_text, is_uri, value_type in test_cases:
|
||||
value_obj = create_value_object(value_text, is_uri, value_type)
|
||||
|
||||
assert isinstance(value_obj, Value)
|
||||
assert value_obj.value == value_text
|
||||
assert value_obj.is_uri == is_uri
|
||||
assert value_obj.type == value_type
|
||||
for value_text, is_uri, datatype in test_cases:
|
||||
term_obj = create_term_object(value_text, is_uri, datatype)
|
||||
|
||||
assert isinstance(term_obj, Term)
|
||||
if is_uri:
|
||||
assert term_obj.type == IRI
|
||||
assert term_obj.iri == value_text
|
||||
else:
|
||||
assert term_obj.type == LITERAL
|
||||
assert term_obj.value == value_text
|
||||
|
||||
def test_triple_construction_from_relationship(self):
|
||||
"""Test constructing Triple objects from relationships"""
|
||||
# Arrange
|
||||
relationship = {
|
||||
"subject": "John Smith",
|
||||
"predicate": "works_for",
|
||||
"predicate": "works_for",
|
||||
"object": "OpenAI",
|
||||
"subject_type": "PERSON",
|
||||
"object_type": "ORG"
|
||||
}
|
||||
|
||||
|
||||
def construct_triple(relationship, uri_base="http://trustgraph.ai/kg"):
|
||||
# Generate URIs
|
||||
subject_uri = f"{uri_base}/person/{relationship['subject'].lower().replace(' ', '-')}"
|
||||
object_uri = f"{uri_base}/org/{relationship['object'].lower().replace(' ', '-')}"
|
||||
|
||||
|
||||
# Map predicate to schema.org URI
|
||||
predicate_mappings = {
|
||||
"works_for": "http://schema.org/worksFor",
|
||||
"located_in": "http://schema.org/location",
|
||||
"developed": "http://schema.org/creator"
|
||||
}
|
||||
predicate_uri = predicate_mappings.get(relationship["predicate"],
|
||||
predicate_uri = predicate_mappings.get(relationship["predicate"],
|
||||
f"{uri_base}/predicate/{relationship['predicate']}")
|
||||
|
||||
# Create Value objects
|
||||
subject_value = Value(value=subject_uri, is_uri=True, type="")
|
||||
predicate_value = Value(value=predicate_uri, is_uri=True, type="")
|
||||
object_value = Value(value=object_uri, is_uri=True, type="")
|
||||
|
||||
|
||||
# Create Term objects
|
||||
subject_term = Term(type=IRI, iri=subject_uri)
|
||||
predicate_term = Term(type=IRI, iri=predicate_uri)
|
||||
object_term = Term(type=IRI, iri=object_uri)
|
||||
|
||||
# Create Triple
|
||||
return Triple(
|
||||
s=subject_value,
|
||||
p=predicate_value,
|
||||
o=object_value
|
||||
s=subject_term,
|
||||
p=predicate_term,
|
||||
o=object_term
|
||||
)
|
||||
|
||||
|
||||
# Act
|
||||
triple = construct_triple(relationship)
|
||||
|
||||
|
||||
# Assert
|
||||
assert isinstance(triple, Triple)
|
||||
assert triple.s.value == "http://trustgraph.ai/kg/person/john-smith"
|
||||
assert triple.s.is_uri is True
|
||||
assert triple.p.value == "http://schema.org/worksFor"
|
||||
assert triple.p.is_uri is True
|
||||
assert triple.o.value == "http://trustgraph.ai/kg/org/openai"
|
||||
assert triple.o.is_uri is True
|
||||
assert triple.s.iri == "http://trustgraph.ai/kg/person/john-smith"
|
||||
assert triple.s.type == IRI
|
||||
assert triple.p.iri == "http://schema.org/worksFor"
|
||||
assert triple.p.type == IRI
|
||||
assert triple.o.iri == "http://trustgraph.ai/kg/org/openai"
|
||||
assert triple.o.type == IRI
|
||||
|
||||
def test_literal_value_handling(self):
|
||||
"""Test handling of literal values vs URI values"""
|
||||
|
|
@ -132,10 +134,10 @@ class TestTripleConstructionLogic:
|
|||
("John Smith", "email", "john@example.com", False), # Literal email
|
||||
("John Smith", "worksFor", "http://trustgraph.ai/kg/org/openai", True) # URI reference
|
||||
]
|
||||
|
||||
|
||||
def create_triple_with_literal(subject_uri, predicate, object_value, object_is_uri):
|
||||
subject_val = Value(value=subject_uri, is_uri=True, type="")
|
||||
|
||||
subject_term = Term(type=IRI, iri=subject_uri)
|
||||
|
||||
# Determine predicate URI
|
||||
predicate_mappings = {
|
||||
"name": "http://schema.org/name",
|
||||
|
|
@ -144,32 +146,37 @@ class TestTripleConstructionLogic:
|
|||
"worksFor": "http://schema.org/worksFor"
|
||||
}
|
||||
predicate_uri = predicate_mappings.get(predicate, f"http://trustgraph.ai/kg/predicate/{predicate}")
|
||||
predicate_val = Value(value=predicate_uri, is_uri=True, type="")
|
||||
|
||||
# Create object value with appropriate type
|
||||
object_type = ""
|
||||
if not object_is_uri:
|
||||
predicate_term = Term(type=IRI, iri=predicate_uri)
|
||||
|
||||
# Create object term with appropriate type
|
||||
if object_is_uri:
|
||||
object_term = Term(type=IRI, iri=object_value)
|
||||
else:
|
||||
datatype = None
|
||||
if predicate == "age":
|
||||
object_type = "integer"
|
||||
datatype = "integer"
|
||||
elif predicate in ["name", "email"]:
|
||||
object_type = "string"
|
||||
|
||||
object_val = Value(value=object_value, is_uri=object_is_uri, type=object_type)
|
||||
|
||||
return Triple(s=subject_val, p=predicate_val, o=object_val)
|
||||
|
||||
datatype = "string"
|
||||
object_term = Term(type=LITERAL, value=object_value, datatype=datatype)
|
||||
|
||||
return Triple(s=subject_term, p=predicate_term, o=object_term)
|
||||
|
||||
# Act & Assert
|
||||
for subject_uri, predicate, object_value, object_is_uri in test_data:
|
||||
subject_full_uri = "http://trustgraph.ai/kg/person/john-smith"
|
||||
triple = create_triple_with_literal(subject_full_uri, predicate, object_value, object_is_uri)
|
||||
|
||||
assert triple.o.is_uri == object_is_uri
|
||||
assert triple.o.value == object_value
|
||||
|
||||
|
||||
if object_is_uri:
|
||||
assert triple.o.type == IRI
|
||||
assert triple.o.iri == object_value
|
||||
else:
|
||||
assert triple.o.type == LITERAL
|
||||
assert triple.o.value == object_value
|
||||
|
||||
if predicate == "age":
|
||||
assert triple.o.type == "integer"
|
||||
assert triple.o.datatype == "integer"
|
||||
elif predicate in ["name", "email"]:
|
||||
assert triple.o.type == "string"
|
||||
assert triple.o.datatype == "string"
|
||||
|
||||
def test_namespace_management(self):
|
||||
"""Test namespace prefix management and expansion"""
|
||||
|
|
@ -216,63 +223,74 @@ class TestTripleConstructionLogic:
|
|||
def test_triple_validation(self):
|
||||
"""Test triple validation rules"""
|
||||
# Arrange
|
||||
def get_term_value(term):
|
||||
"""Extract value from a Term"""
|
||||
if term.type == IRI:
|
||||
return term.iri
|
||||
else:
|
||||
return term.value
|
||||
|
||||
def validate_triple(triple):
|
||||
errors = []
|
||||
|
||||
|
||||
# Check required components
|
||||
if not triple.s or not triple.s.value:
|
||||
s_val = get_term_value(triple.s) if triple.s else None
|
||||
p_val = get_term_value(triple.p) if triple.p else None
|
||||
o_val = get_term_value(triple.o) if triple.o else None
|
||||
|
||||
if not triple.s or not s_val:
|
||||
errors.append("Missing or empty subject")
|
||||
|
||||
if not triple.p or not triple.p.value:
|
||||
|
||||
if not triple.p or not p_val:
|
||||
errors.append("Missing or empty predicate")
|
||||
|
||||
if not triple.o or not triple.o.value:
|
||||
|
||||
if not triple.o or not o_val:
|
||||
errors.append("Missing or empty object")
|
||||
|
||||
|
||||
# Check URI validity for URI values
|
||||
uri_pattern = r'^https?://[^\s/$.?#].[^\s]*$'
|
||||
|
||||
if triple.s.is_uri and not re.match(uri_pattern, triple.s.value):
|
||||
|
||||
if triple.s.type == IRI and not re.match(uri_pattern, triple.s.iri or ""):
|
||||
errors.append("Invalid subject URI format")
|
||||
|
||||
if triple.p.is_uri and not re.match(uri_pattern, triple.p.value):
|
||||
|
||||
if triple.p.type == IRI and not re.match(uri_pattern, triple.p.iri or ""):
|
||||
errors.append("Invalid predicate URI format")
|
||||
|
||||
if triple.o.is_uri and not re.match(uri_pattern, triple.o.value):
|
||||
|
||||
if triple.o.type == IRI and not re.match(uri_pattern, triple.o.iri or ""):
|
||||
errors.append("Invalid object URI format")
|
||||
|
||||
|
||||
# Predicates should typically be URIs
|
||||
if not triple.p.is_uri:
|
||||
if triple.p.type != IRI:
|
||||
errors.append("Predicate should be a URI")
|
||||
|
||||
|
||||
return len(errors) == 0, errors
|
||||
|
||||
|
||||
# Test valid triple
|
||||
valid_triple = Triple(
|
||||
s=Value(value="http://trustgraph.ai/kg/person/john", is_uri=True, type=""),
|
||||
p=Value(value="http://schema.org/name", is_uri=True, type=""),
|
||||
o=Value(value="John Smith", is_uri=False, type="string")
|
||||
s=Term(type=IRI, iri="http://trustgraph.ai/kg/person/john"),
|
||||
p=Term(type=IRI, iri="http://schema.org/name"),
|
||||
o=Term(type=LITERAL, value="John Smith", datatype="string")
|
||||
)
|
||||
|
||||
|
||||
# Test invalid triples
|
||||
invalid_triples = [
|
||||
Triple(s=Value(value="", is_uri=True, type=""),
|
||||
p=Value(value="http://schema.org/name", is_uri=True, type=""),
|
||||
o=Value(value="John", is_uri=False, type="")), # Empty subject
|
||||
|
||||
Triple(s=Value(value="http://trustgraph.ai/kg/person/john", is_uri=True, type=""),
|
||||
p=Value(value="name", is_uri=False, type=""), # Non-URI predicate
|
||||
o=Value(value="John", is_uri=False, type="")),
|
||||
|
||||
Triple(s=Value(value="invalid-uri", is_uri=True, type=""),
|
||||
p=Value(value="http://schema.org/name", is_uri=True, type=""),
|
||||
o=Value(value="John", is_uri=False, type="")) # Invalid URI format
|
||||
Triple(s=Term(type=IRI, iri=""),
|
||||
p=Term(type=IRI, iri="http://schema.org/name"),
|
||||
o=Term(type=LITERAL, value="John")), # Empty subject
|
||||
|
||||
Triple(s=Term(type=IRI, iri="http://trustgraph.ai/kg/person/john"),
|
||||
p=Term(type=LITERAL, value="name"), # Non-URI predicate
|
||||
o=Term(type=LITERAL, value="John")),
|
||||
|
||||
Triple(s=Term(type=IRI, iri="invalid-uri"),
|
||||
p=Term(type=IRI, iri="http://schema.org/name"),
|
||||
o=Term(type=LITERAL, value="John")) # Invalid URI format
|
||||
]
|
||||
|
||||
|
||||
# Act & Assert
|
||||
is_valid, errors = validate_triple(valid_triple)
|
||||
assert is_valid, f"Valid triple failed validation: {errors}"
|
||||
|
||||
|
||||
for invalid_triple in invalid_triples:
|
||||
is_valid, errors = validate_triple(invalid_triple)
|
||||
assert not is_valid, f"Invalid triple passed validation: {invalid_triple}"
|
||||
|
|
@ -286,97 +304,97 @@ class TestTripleConstructionLogic:
|
|||
{"text": "OpenAI", "type": "ORG"},
|
||||
{"text": "San Francisco", "type": "PLACE"}
|
||||
]
|
||||
|
||||
|
||||
relationships = [
|
||||
{"subject": "John Smith", "predicate": "works_for", "object": "OpenAI"},
|
||||
{"subject": "OpenAI", "predicate": "located_in", "object": "San Francisco"}
|
||||
]
|
||||
|
||||
|
||||
def construct_triple_batch(entities, relationships, document_id="doc-1"):
|
||||
triples = []
|
||||
|
||||
|
||||
# Create type triples for entities
|
||||
for entity in entities:
|
||||
entity_uri = f"http://trustgraph.ai/kg/{entity['type'].lower()}/{entity['text'].lower().replace(' ', '-')}"
|
||||
type_uri = f"http://trustgraph.ai/kg/type/{entity['type']}"
|
||||
|
||||
|
||||
type_triple = Triple(
|
||||
s=Value(value=entity_uri, is_uri=True, type=""),
|
||||
p=Value(value="http://www.w3.org/1999/02/22-rdf-syntax-ns#type", is_uri=True, type=""),
|
||||
o=Value(value=type_uri, is_uri=True, type="")
|
||||
s=Term(type=IRI, iri=entity_uri),
|
||||
p=Term(type=IRI, iri="http://www.w3.org/1999/02/22-rdf-syntax-ns#type"),
|
||||
o=Term(type=IRI, iri=type_uri)
|
||||
)
|
||||
triples.append(type_triple)
|
||||
|
||||
|
||||
# Create relationship triples
|
||||
for rel in relationships:
|
||||
subject_uri = f"http://trustgraph.ai/kg/entity/{rel['subject'].lower().replace(' ', '-')}"
|
||||
object_uri = f"http://trustgraph.ai/kg/entity/{rel['object'].lower().replace(' ', '-')}"
|
||||
predicate_uri = f"http://schema.org/{rel['predicate'].replace('_', '')}"
|
||||
|
||||
|
||||
rel_triple = Triple(
|
||||
s=Value(value=subject_uri, is_uri=True, type=""),
|
||||
p=Value(value=predicate_uri, is_uri=True, type=""),
|
||||
o=Value(value=object_uri, is_uri=True, type="")
|
||||
s=Term(type=IRI, iri=subject_uri),
|
||||
p=Term(type=IRI, iri=predicate_uri),
|
||||
o=Term(type=IRI, iri=object_uri)
|
||||
)
|
||||
triples.append(rel_triple)
|
||||
|
||||
|
||||
return triples
|
||||
|
||||
|
||||
# Act
|
||||
triples = construct_triple_batch(entities, relationships)
|
||||
|
||||
|
||||
# Assert
|
||||
assert len(triples) == len(entities) + len(relationships) # Type triples + relationship triples
|
||||
|
||||
|
||||
# Check that all triples are valid Triple objects
|
||||
for triple in triples:
|
||||
assert isinstance(triple, Triple)
|
||||
assert triple.s.value != ""
|
||||
assert triple.p.value != ""
|
||||
assert triple.o.value != ""
|
||||
assert triple.s.iri != ""
|
||||
assert triple.p.iri != ""
|
||||
assert triple.o.iri != ""
|
||||
|
||||
def test_triples_batch_object_creation(self):
|
||||
"""Test creating Triples batch objects with metadata"""
|
||||
# Arrange
|
||||
sample_triples = [
|
||||
Triple(
|
||||
s=Value(value="http://trustgraph.ai/kg/person/john", is_uri=True, type=""),
|
||||
p=Value(value="http://schema.org/name", is_uri=True, type=""),
|
||||
o=Value(value="John Smith", is_uri=False, type="string")
|
||||
s=Term(type=IRI, iri="http://trustgraph.ai/kg/person/john"),
|
||||
p=Term(type=IRI, iri="http://schema.org/name"),
|
||||
o=Term(type=LITERAL, value="John Smith", datatype="string")
|
||||
),
|
||||
Triple(
|
||||
s=Value(value="http://trustgraph.ai/kg/person/john", is_uri=True, type=""),
|
||||
p=Value(value="http://schema.org/worksFor", is_uri=True, type=""),
|
||||
o=Value(value="http://trustgraph.ai/kg/org/openai", is_uri=True, type="")
|
||||
s=Term(type=IRI, iri="http://trustgraph.ai/kg/person/john"),
|
||||
p=Term(type=IRI, iri="http://schema.org/worksFor"),
|
||||
o=Term(type=IRI, iri="http://trustgraph.ai/kg/org/openai")
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
metadata = Metadata(
|
||||
id="test-doc-123",
|
||||
user="test_user",
|
||||
user="test_user",
|
||||
collection="test_collection",
|
||||
metadata=[]
|
||||
)
|
||||
|
||||
|
||||
# Act
|
||||
triples_batch = Triples(
|
||||
metadata=metadata,
|
||||
triples=sample_triples
|
||||
)
|
||||
|
||||
|
||||
# Assert
|
||||
assert isinstance(triples_batch, Triples)
|
||||
assert triples_batch.metadata.id == "test-doc-123"
|
||||
assert triples_batch.metadata.user == "test_user"
|
||||
assert triples_batch.metadata.collection == "test_collection"
|
||||
assert len(triples_batch.triples) == 2
|
||||
|
||||
|
||||
# Check that triples are properly embedded
|
||||
for triple in triples_batch.triples:
|
||||
assert isinstance(triple, Triple)
|
||||
assert isinstance(triple.s, Value)
|
||||
assert isinstance(triple.p, Value)
|
||||
assert isinstance(triple.o, Value)
|
||||
assert isinstance(triple.s, Term)
|
||||
assert isinstance(triple.p, Term)
|
||||
assert isinstance(triple.o, Term)
|
||||
|
||||
def test_uri_collision_handling(self):
|
||||
"""Test handling of URI collisions and duplicate detection"""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue