diff --git a/tests/unit/test_extract/test_ontology/test_prompt_and_extraction.py b/tests/unit/test_extract/test_ontology/test_prompt_and_extraction.py
index bae6bdbd..6a2048a5 100644
--- a/tests/unit/test_extract/test_ontology/test_prompt_and_extraction.py
+++ b/tests/unit/test_extract/test_ontology/test_prompt_and_extraction.py
@@ -277,6 +277,78 @@ class TestTripleValidation:
         is_invalid = extractor.is_valid_triple(subject, predicate, object_val, sample_ontology_subset, entity_types_invalid)
         assert not is_invalid, "Invalid range should be rejected"
 
+    def test_is_valid_triple_subclass_is_accepted(self, extractor, sample_ontology_subset):
+        """Domain check passes when actual type is a subclass of expected."""
+        sample_ontology_subset.classes["Cake"] = {
+            "uri": "http://purl.org/ontology/fo/Cake",
+            "type": "owl:Class",
+            "subclass_of": "Recipe",
+        }
+        sample_ontology_subset.object_properties["has_ingredient"] = {
+            "domain": "Recipe",
+            "range": "Ingredient",
+        }
+
+        result = extractor.is_valid_triple(
+            subject="cake:lemon-drizzle",
+            predicate="has_ingredient",
+            object_val="ingredient:lemon",
+            ontology_subset=sample_ontology_subset,
+            entity_types={"cake:lemon-drizzle": "Cake", "ingredient:lemon": "Ingredient"},
+        )
+
+        assert result is True
+
+    def test_is_valid_triple_accepts_one_of_multiple_parents(self, extractor, sample_ontology_subset):
+        """A list-valued subclass_of is treated as several parents, not a crash."""
+        sample_ontology_subset.classes["Cake"] = {"subclass_of": ["Dessert", "Recipe"]}
+        sample_ontology_subset.object_properties["has_ingredient"] = {
+            "domain": "Recipe",
+            "range": "Ingredient",
+        }
+
+        result = extractor.is_valid_triple(
+            subject="cake:lemon-drizzle",
+            predicate="has_ingredient",
+            object_val="ingredient:lemon",
+            ontology_subset=sample_ontology_subset,
+            entity_types={"cake:lemon-drizzle": "Cake", "ingredient:lemon": "Ingredient"},
+        )
+
+        assert result is True
+
+    def test_is_valid_triple_handles_subclass_cycle_without_infinite_loop(self, extractor, sample_ontology_subset):
+        """A cycle in subclass_of must return False instead of hanging."""
+        sample_ontology_subset.classes["A"] = {"subclass_of": "B"}
+        sample_ontology_subset.classes["B"] = {"subclass_of": "A"}
+        sample_ontology_subset.object_properties["p"] = {"domain": "Recipe", "range": "Ingredient"}
+
+        result = extractor.is_valid_triple(
+            subject="entity:x",
+            predicate="p",
+            object_val="ingredient:y",
+            ontology_subset=sample_ontology_subset,
+            entity_types={"entity:x": "A", "ingredient:y": "Ingredient"},
+        )
+
+        assert result is False
+
+    def test_is_valid_triple_entity_types_none_default(self, extractor, sample_ontology_subset):
+        """entity_types=None should not raise; domain/range checks skip if type unknown."""
+        sample_ontology_subset.object_properties["has_ingredient"] = {
+            "domain": "Recipe",
+            "range": "Ingredient",
+        }
+
+        result = extractor.is_valid_triple(
+            subject="recipe:x",
+            predicate="has_ingredient",
+            object_val="ingredient:y",
+            ontology_subset=sample_ontology_subset,
+        )
+
+        assert result is True
+
 
 class TestTripleParsing:
     """Test suite for parsing triples from LLM responses."""
@@ -377,6 +449,24 @@ class TestTripleParsing:
         assert triple.p.type == IRI, "Predicate should be IRI type"
         assert triple.o.type == LITERAL, "Object literal should be LITERAL type"
 
+    def test_parse_and_validate_triples_collects_entity_types_from_rdf_type(self, extractor, sample_ontology_subset):
+        """entity_types should be built from rdf:type triples in the same batch."""
+        sample_ontology_subset.object_properties["has_ingredient"] = {
+            "domain": "Recipe",
+            "range": "Ingredient",
+        }
+        triples_response = [
+            {"subject": "recipe:cornish-pasty", "predicate": "rdf:type", "object": "Recipe"},
+            {"subject": "ingredient:beef", "predicate": "rdf:type", "object": "Ingredient"},
+            {"subject": "recipe:cornish-pasty", "predicate": "has_ingredient", "object": "ingredient:beef"},
+        ]
+
+        valid_triples = extractor.parse_and_validate_triples(
+            triples_response, sample_ontology_subset
+        )
+
+        assert len(valid_triples) == 3
+
 
 class TestURIExpansionInExtraction:
     """Test suite for URI expansion during triple extraction."""
diff --git a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py
index ef9a7331..1d45d3f9 100644
--- a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py
+++ b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py
@@ -540,6 +540,53 @@ class Processor(FlowProcessor):
             return True
         return False
 
+    def _is_subclass_of(self, cls, target, ontology_subset, max_depth=100):
+        """Return True if cls is target or reaches it via subclass_of links.
+
+        A class definition's ``subclass_of`` may name one parent or hold a
+        list/tuple/set of parents; every parent branch is followed.
+
+        Defends against malformed ontology data (LLM-generated ontologies may
+        emit A subclass_of B, B subclass_of A): a visited set stops cycles, a
+        depth cap bounds the walk, and a parent value that cannot be looked
+        up (e.g. an unhashable object) ends that branch instead of raising.
+
+        Args:
+            cls: Name of the class to start from.
+            target: Name of the expected ancestor class.
+            ontology_subset: Object whose ``classes`` mapping holds the
+                class definitions.
+            max_depth: Maximum number of subclass_of hops to follow.
+
+        Returns:
+            True if target is cls itself or one of its ancestors, else False.
+        """
+        if cls == target:
+            return True
+        classes = ontology_subset.classes
+        visited = set()
+        pending = [(cls, 0)]  # (class name, hops from cls)
+        while pending:
+            curr, depth = pending.pop()
+            try:
+                expandable = curr not in visited and curr in classes
+            except TypeError:
+                continue  # unhashable value: nothing to look up
+            if not expandable or depth >= max_depth:
+                continue
+            visited.add(curr)
+            cls_def = classes[curr]
+            parents = cls_def.get('subclass_of') if isinstance(cls_def, dict) else None
+            if parents is None:
+                continue
+            if not isinstance(parents, (list, tuple, set, frozenset)):
+                parents = (parents,)
+            for parent in parents:
+                if parent == target:
+                    return True
+                pending.append((parent, depth + 1))
+        return False
+
     def is_valid_triple(self, subject: str, predicate: str, object_val: str,
                         ontology_subset: OntologySubset, entity_types: dict = None) -> bool:
         """Validate triple against ontology constraints."""
@@ -570,36 +617,20 @@ class Processor(FlowProcessor):
         expected_domain = prop_def.get('domain')
         if expected_domain and subject in entity_types:
             actual_domain = entity_types[subject]
-            if actual_domain != expected_domain:
-                is_subclass = False
-                curr_class = actual_domain
-                while curr_class in ontology_subset.classes:
-                    cls_def = ontology_subset.classes[curr_class]
-                    parent = cls_def.get('subclass_of') if isinstance(cls_def, dict) else None
-                    if parent == expected_domain:
-                        is_subclass = True
-                        break
-                    curr_class = parent
-                if not is_subclass:
-                    return False
+            if actual_domain != expected_domain and not self._is_subclass_of(
+                actual_domain, expected_domain, ontology_subset
+            ):
+                return False
 
         # Range validation
         if is_obj_prop:
             expected_range = prop_def.get('range')
             if expected_range and object_val in entity_types:
                 actual_range = entity_types[object_val]
-                if actual_range != expected_range:
-                    is_subclass = False
-                    curr_class = actual_range
-                    while curr_class in ontology_subset.classes:
-                        cls_def = ontology_subset.classes[curr_class]
-                        parent = cls_def.get('subclass_of') if isinstance(cls_def, dict) else None
-                        if parent == expected_range:
-                            is_subclass = True
-                            break
-                        curr_class = parent
-                    if not is_subclass:
-                        return False
+                if actual_range != expected_range and not self._is_subclass_of(
+                    actual_range, expected_range, ontology_subset
+                ):
+                    return False
 
         return True
 
@@ -988,4 +1019,4 @@ class Processor(FlowProcessor):
 
 def run():
     """Launch the OntoRAG extraction service."""
-    Processor.launch(default_ident, __doc__)
\ No newline at end of file
+    Processor.launch(default_ident, __doc__)