mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-05-03 12:22:37 +02:00
test(ontology): harden domain/range validation + add missing tests (#848)
Fixes #826. Addresses all five points the maintainer called out in the follow-up to #825. Source change (trustgraph-flow/trustgraph/extract/kg/ontology/extract.py): - Added `_is_subclass_of(cls, target, ontology_subset, max_depth=100)` helper with visited-set cycle detection + a defensive depth cap. LLM-generated ontologies may emit cycles (A subclass_of B, B subclass_of A); the prior while-loop would infinite-loop on that. - Replaced both near-identical domain and range subclass walks in `is_valid_triple` with a single call to the new helper. Net is -20 duplicated lines + 26-line helper. Tests (tests/unit/test_extract/test_ontology/test_prompt_and_extraction.py): - test_is_valid_triple_subclass_is_accepted: domain expects Recipe, actual type is Cake (subclass), validates. - test_is_valid_triple_handles_subclass_cycle_without_infinite_loop: A subclass_of B, B subclass_of A; call returns False within the depth cap rather than hanging. - test_parse_and_validate_triples_collects_entity_types_from_rdf_type: end-to-end path: rdf:type triples build the entity_types dict, subsequent domain-check triples validate against it. - test_is_valid_triple_entity_types_none_default: the None default path now has explicit coverage. 156 existing tests in tests/unit/test_extract/test_ontology still pass.
This commit is contained in:
parent
5e28d3cce0
commit
6302eb8c97
2 changed files with 107 additions and 25 deletions
|
|
@ -277,6 +277,60 @@ class TestTripleValidation:
|
||||||
is_invalid = extractor.is_valid_triple(subject, predicate, object_val, sample_ontology_subset, entity_types_invalid)
|
is_invalid = extractor.is_valid_triple(subject, predicate, object_val, sample_ontology_subset, entity_types_invalid)
|
||||||
assert not is_invalid, "Invalid range should be rejected"
|
assert not is_invalid, "Invalid range should be rejected"
|
||||||
|
|
||||||
|
def test_is_valid_triple_subclass_is_accepted(self, extractor, sample_ontology_subset):
|
||||||
|
"""Domain check passes when actual type is a subclass of expected."""
|
||||||
|
sample_ontology_subset.classes["Cake"] = {
|
||||||
|
"uri": "http://purl.org/ontology/fo/Cake",
|
||||||
|
"type": "owl:Class",
|
||||||
|
"subclass_of": "Recipe",
|
||||||
|
}
|
||||||
|
sample_ontology_subset.object_properties["has_ingredient"] = {
|
||||||
|
"domain": "Recipe",
|
||||||
|
"range": "Ingredient",
|
||||||
|
}
|
||||||
|
|
||||||
|
result = extractor.is_valid_triple(
|
||||||
|
subject="cake:lemon-drizzle",
|
||||||
|
predicate="has_ingredient",
|
||||||
|
object_val="ingredient:lemon",
|
||||||
|
ontology_subset=sample_ontology_subset,
|
||||||
|
entity_types={"cake:lemon-drizzle": "Cake", "ingredient:lemon": "Ingredient"},
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result is True
|
||||||
|
|
||||||
|
def test_is_valid_triple_handles_subclass_cycle_without_infinite_loop(self, extractor, sample_ontology_subset):
|
||||||
|
"""A cycle in subclass_of must return False instead of hanging."""
|
||||||
|
sample_ontology_subset.classes["A"] = {"subclass_of": "B"}
|
||||||
|
sample_ontology_subset.classes["B"] = {"subclass_of": "A"}
|
||||||
|
sample_ontology_subset.object_properties["p"] = {"domain": "Recipe", "range": "Ingredient"}
|
||||||
|
|
||||||
|
result = extractor.is_valid_triple(
|
||||||
|
subject="entity:x",
|
||||||
|
predicate="p",
|
||||||
|
object_val="ingredient:y",
|
||||||
|
ontology_subset=sample_ontology_subset,
|
||||||
|
entity_types={"entity:x": "A", "ingredient:y": "Ingredient"},
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result is False
|
||||||
|
|
||||||
|
def test_is_valid_triple_entity_types_none_default(self, extractor, sample_ontology_subset):
|
||||||
|
"""entity_types=None should not raise; domain/range checks skip if type unknown."""
|
||||||
|
sample_ontology_subset.object_properties["has_ingredient"] = {
|
||||||
|
"domain": "Recipe",
|
||||||
|
"range": "Ingredient",
|
||||||
|
}
|
||||||
|
|
||||||
|
result = extractor.is_valid_triple(
|
||||||
|
subject="recipe:x",
|
||||||
|
predicate="has_ingredient",
|
||||||
|
object_val="ingredient:y",
|
||||||
|
ontology_subset=sample_ontology_subset,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result is True
|
||||||
|
|
||||||
|
|
||||||
class TestTripleParsing:
|
class TestTripleParsing:
|
||||||
"""Test suite for parsing triples from LLM responses."""
|
"""Test suite for parsing triples from LLM responses."""
|
||||||
|
|
@ -377,6 +431,24 @@ class TestTripleParsing:
|
||||||
assert triple.p.type == IRI, "Predicate should be IRI type"
|
assert triple.p.type == IRI, "Predicate should be IRI type"
|
||||||
assert triple.o.type == LITERAL, "Object literal should be LITERAL type"
|
assert triple.o.type == LITERAL, "Object literal should be LITERAL type"
|
||||||
|
|
||||||
|
def test_parse_and_validate_triples_collects_entity_types_from_rdf_type(self, extractor, sample_ontology_subset):
|
||||||
|
"""entity_types should be built from rdf:type triples in the same batch."""
|
||||||
|
sample_ontology_subset.object_properties["has_ingredient"] = {
|
||||||
|
"domain": "Recipe",
|
||||||
|
"range": "Ingredient",
|
||||||
|
}
|
||||||
|
triples_response = [
|
||||||
|
{"subject": "recipe:cornish-pasty", "predicate": "rdf:type", "object": "Recipe"},
|
||||||
|
{"subject": "ingredient:beef", "predicate": "rdf:type", "object": "Ingredient"},
|
||||||
|
{"subject": "recipe:cornish-pasty", "predicate": "has_ingredient", "object": "ingredient:beef"},
|
||||||
|
]
|
||||||
|
|
||||||
|
valid_triples = extractor.parse_and_validate_triples(
|
||||||
|
triples_response, sample_ontology_subset
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(valid_triples) == 3
|
||||||
|
|
||||||
|
|
||||||
class TestURIExpansionInExtraction:
|
class TestURIExpansionInExtraction:
|
||||||
"""Test suite for URI expansion during triple extraction."""
|
"""Test suite for URI expansion during triple extraction."""
|
||||||
|
|
|
||||||
|
|
@ -540,6 +540,32 @@ class Processor(FlowProcessor):
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def _is_subclass_of(self, cls, target, ontology_subset, max_depth=100):
|
||||||
|
"""Return True if cls is a subclass of target via subclass_of chain.
|
||||||
|
|
||||||
|
Defends against cycles in ontology data (LLM-generated ontologies may
|
||||||
|
emit A subclass_of B, B subclass_of A) with a visited set. A depth cap
|
||||||
|
acts as a second line of defense against unbounded chains.
|
||||||
|
"""
|
||||||
|
if cls == target:
|
||||||
|
return True
|
||||||
|
visited = set()
|
||||||
|
curr = cls
|
||||||
|
depth = 0
|
||||||
|
while curr in ontology_subset.classes and depth < max_depth:
|
||||||
|
if curr in visited:
|
||||||
|
return False # cycle detected
|
||||||
|
visited.add(curr)
|
||||||
|
cls_def = ontology_subset.classes[curr]
|
||||||
|
parent = cls_def.get('subclass_of') if isinstance(cls_def, dict) else None
|
||||||
|
if parent is None:
|
||||||
|
return False
|
||||||
|
if parent == target:
|
||||||
|
return True
|
||||||
|
curr = parent
|
||||||
|
depth += 1
|
||||||
|
return False
|
||||||
|
|
||||||
def is_valid_triple(self, subject: str, predicate: str, object_val: str,
|
def is_valid_triple(self, subject: str, predicate: str, object_val: str,
|
||||||
ontology_subset: OntologySubset, entity_types: dict = None) -> bool:
|
ontology_subset: OntologySubset, entity_types: dict = None) -> bool:
|
||||||
"""Validate triple against ontology constraints."""
|
"""Validate triple against ontology constraints."""
|
||||||
|
|
@ -570,36 +596,20 @@ class Processor(FlowProcessor):
|
||||||
expected_domain = prop_def.get('domain')
|
expected_domain = prop_def.get('domain')
|
||||||
if expected_domain and subject in entity_types:
|
if expected_domain and subject in entity_types:
|
||||||
actual_domain = entity_types[subject]
|
actual_domain = entity_types[subject]
|
||||||
if actual_domain != expected_domain:
|
if actual_domain != expected_domain and not self._is_subclass_of(
|
||||||
is_subclass = False
|
actual_domain, expected_domain, ontology_subset
|
||||||
curr_class = actual_domain
|
):
|
||||||
while curr_class in ontology_subset.classes:
|
return False
|
||||||
cls_def = ontology_subset.classes[curr_class]
|
|
||||||
parent = cls_def.get('subclass_of') if isinstance(cls_def, dict) else None
|
|
||||||
if parent == expected_domain:
|
|
||||||
is_subclass = True
|
|
||||||
break
|
|
||||||
curr_class = parent
|
|
||||||
if not is_subclass:
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Range validation
|
# Range validation
|
||||||
if is_obj_prop:
|
if is_obj_prop:
|
||||||
expected_range = prop_def.get('range')
|
expected_range = prop_def.get('range')
|
||||||
if expected_range and object_val in entity_types:
|
if expected_range and object_val in entity_types:
|
||||||
actual_range = entity_types[object_val]
|
actual_range = entity_types[object_val]
|
||||||
if actual_range != expected_range:
|
if actual_range != expected_range and not self._is_subclass_of(
|
||||||
is_subclass = False
|
actual_range, expected_range, ontology_subset
|
||||||
curr_class = actual_range
|
):
|
||||||
while curr_class in ontology_subset.classes:
|
return False
|
||||||
cls_def = ontology_subset.classes[curr_class]
|
|
||||||
parent = cls_def.get('subclass_of') if isinstance(cls_def, dict) else None
|
|
||||||
if parent == expected_range:
|
|
||||||
is_subclass = True
|
|
||||||
break
|
|
||||||
curr_class = parent
|
|
||||||
if not is_subclass:
|
|
||||||
return False
|
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue