test(ontology): harden domain/range validation + add missing tests (#848)

Fixes #826. Addresses all five points the maintainer called out in
the follow-up to #825.

Source change (trustgraph-flow/trustgraph/extract/kg/ontology/extract.py):
- Added `_is_subclass_of(cls, target, ontology_subset, max_depth=100)`
  helper with visited-set cycle detection + a defensive depth cap.
  LLM-generated ontologies may emit cycles (A subclass_of B,
  B subclass_of A); the prior while-loop would infinite-loop on that.
- Replaced both near-identical domain and range subclass walks in
  `is_valid_triple` with one call each to the new helper. Net change:
  20 duplicated lines removed, one 26-line helper added.

Tests (tests/unit/test_extract/test_ontology/test_prompt_and_extraction.py):
- test_is_valid_triple_subclass_is_accepted: domain expects Recipe,
  actual type is Cake (subclass), validates.
- test_is_valid_triple_handles_subclass_cycle_without_infinite_loop:
  A subclass_of B, B subclass_of A; the call returns False as soon as
  the visited set sees a repeated class (the depth cap is only a
  backstop) rather than hanging.
- test_parse_and_validate_triples_collects_entity_types_from_rdf_type:
  end-to-end path: rdf:type triples build the entity_types dict,
  subsequent domain-check triples validate against it.
- test_is_valid_triple_entity_types_none_default: the None default
  path now has explicit coverage.

156 existing tests in tests/unit/test_extract/test_ontology still pass.
This commit is contained in:
Trevin Chow 2026-04-28 08:33:49 -07:00 committed by Cyber MacGeddon
parent 5e28d3cce0
commit 6302eb8c97
2 changed files with 107 additions and 25 deletions

View file

@ -540,6 +540,32 @@ class Processor(FlowProcessor):
return True
return False
def _is_subclass_of(self, cls, target, ontology_subset, max_depth=100):
    """Return True if ``cls`` equals ``target`` or reaches it via ``subclass_of``.

    The walk follows the ``subclass_of`` entry of each class definition in
    ``ontology_subset.classes``. Two guards keep it finite on malformed
    ontology data:

    * a visited set, so a cycle (A subclass_of B, B subclass_of A) ends the
      walk instead of looping forever;
    * a depth cap, as a second line of defense against very long chains.

    Args:
        cls: Class identifier to start from.
        target: Class identifier being searched for among the ancestors.
        ontology_subset: Object exposing a ``classes`` mapping of class
            identifier to class definition. Only ``dict`` definitions are
            inspected; any other definition is treated as having no parent.
        max_depth: Maximum number of ``subclass_of`` hops examined along a
            single chain.

    Returns:
        True when ``target`` is ``cls`` itself or one of its ancestors within
        ``max_depth`` hops, otherwise False.

    Note:
        ``subclass_of`` is normally a single identifier. A list/tuple/set of
        identifiers is also tolerated and every entry is followed, and
        unhashable entries are skipped rather than raising ``TypeError``.
        NOTE(review): confirm whether the ontology loader can actually emit
        the multi-parent form.
    """
    if cls == target:
        return True

    classes = ontology_subset.classes
    visited = set()
    # Depth-first walk. Each entry carries its distance from ``cls`` so the
    # cap is applied per chain, exactly as in a single-parent walk.
    pending = [(cls, 0)]

    while pending:
        curr, depth = pending.pop()
        if depth >= max_depth:
            continue

        try:
            unseen_known_class = curr in classes and curr not in visited
        except TypeError:
            # Unhashable value (nested list/dict) cannot name a class.
            continue
        if not unseen_known_class:
            # Either not part of this ontology subset, or a cycle/diamond
            # brought us back to a class that was already expanded.
            continue
        visited.add(curr)

        cls_def = classes[curr]
        parent = cls_def.get('subclass_of') if isinstance(cls_def, dict) else None
        if parent is None:
            continue

        if isinstance(parent, (list, tuple, set, frozenset)):
            parents = parent
        else:
            parents = (parent,)

        for candidate in parents:
            # Compare before the lookup: the target itself need not be
            # defined in the subset for the relation to hold.
            if candidate == target:
                return True
            pending.append((candidate, depth + 1))

    return False
def is_valid_triple(self, subject: str, predicate: str, object_val: str,
ontology_subset: OntologySubset, entity_types: dict = None) -> bool:
"""Validate triple against ontology constraints."""
@ -570,36 +596,20 @@ class Processor(FlowProcessor):
expected_domain = prop_def.get('domain')
if expected_domain and subject in entity_types:
actual_domain = entity_types[subject]
if actual_domain != expected_domain:
is_subclass = False
curr_class = actual_domain
while curr_class in ontology_subset.classes:
cls_def = ontology_subset.classes[curr_class]
parent = cls_def.get('subclass_of') if isinstance(cls_def, dict) else None
if parent == expected_domain:
is_subclass = True
break
curr_class = parent
if not is_subclass:
return False
if actual_domain != expected_domain and not self._is_subclass_of(
actual_domain, expected_domain, ontology_subset
):
return False
# Range validation
if is_obj_prop:
expected_range = prop_def.get('range')
if expected_range and object_val in entity_types:
actual_range = entity_types[object_val]
if actual_range != expected_range:
is_subclass = False
curr_class = actual_range
while curr_class in ontology_subset.classes:
cls_def = ontology_subset.classes[curr_class]
parent = cls_def.get('subclass_of') if isinstance(cls_def, dict) else None
if parent == expected_range:
is_subclass = True
break
curr_class = parent
if not is_subclass:
return False
if actual_range != expected_range and not self._is_subclass_of(
actual_range, expected_range, ontology_subset
):
return False
return True
@ -988,4 +998,4 @@ class Processor(FlowProcessor):
def run():
    """Launch the OntoRAG extraction service."""
    # Launch exactly once, passing the default identifier and this module's
    # docstring through to Processor.launch.
    Processor.launch(default_ident, __doc__)