diff --git a/tests/unit/test_extract/test_ontology/README.md b/tests/unit/test_extract/test_ontology/README.md new file mode 100644 index 00000000..e3f0a164 --- /dev/null +++ b/tests/unit/test_extract/test_ontology/README.md @@ -0,0 +1,148 @@ +# Ontology Extractor Unit Tests + +Comprehensive unit tests for the OntoRAG ontology extraction system. + +## Test Coverage + +### 1. `test_ontology_selector.py` - Auto-Include Properties Feature + +Tests the critical dependency resolution that automatically includes all properties related to selected classes. + +**Key Tests:** +- `test_auto_include_properties_for_recipe_class` - Verifies Recipe class auto-includes `ingredients`, `method`, `produces`, `serves` +- `test_auto_include_properties_for_ingredient_class` - Verifies Ingredient class auto-includes `food` property +- `test_auto_include_properties_for_range_class` - Tests properties are included when class appears in range +- `test_auto_include_adds_domain_and_range_classes` - Ensures related classes are added too +- `test_multiple_classes_get_all_related_properties` - Tests combining multiple class selections +- `test_no_duplicate_properties_added` - Ensures properties aren't duplicated + +### 2. `test_uri_expansion.py` - URI Expansion + +Tests that URIs are properly expanded using ontology definitions instead of constructed fallback URIs. + +**Key Tests:** +- `test_expand_class_uri_from_ontology` - Class names expand to ontology URIs +- `test_expand_object_property_uri_from_ontology` - Object properties use ontology URIs +- `test_expand_datatype_property_uri_from_ontology` - Datatype properties use ontology URIs +- `test_expand_rdf_prefix` - Standard RDF prefixes expand correctly +- `test_expand_rdfs_prefix`, `test_expand_owl_prefix`, `test_expand_xsd_prefix` - Other standard prefixes +- `test_fallback_uri_for_instance` - Entity instances get constructed URIs +- `test_already_full_uri_unchanged` - Full URIs pass through +- `test_dict_access_not_object_attribute` - **Critical test** verifying dict access works (not object attributes) + +### 3. `test_ontology_triples.py` - Ontology Triple Generation + +Tests that ontology elements (classes and properties) are properly converted to RDF triples with labels, comments, domains, and ranges. + +**Key Tests:** +- `test_generates_class_type_triples` - Classes get `rdf:type owl:Class` triples +- `test_generates_class_labels` - Classes get `rdfs:label` triples +- `test_generates_class_comments` - Classes get `rdfs:comment` triples +- `test_generates_object_property_type_triples` - Object properties get proper type triples +- `test_generates_object_property_labels` - Object properties get labels +- `test_generates_object_property_domain` - Object properties get `rdfs:domain` triples +- `test_generates_object_property_range` - Object properties get `rdfs:range` triples +- `test_generates_datatype_property_type_triples` - Datatype properties get proper type triples +- `test_generates_datatype_property_range` - Datatype properties get XSD type ranges +- `test_uses_dict_field_names_not_rdf_names` - **Critical test** verifying dict field names work +- `test_total_triple_count_is_reasonable` - Validates expected number of triples + +### 4. `test_text_processing.py` - Text Processing and Segmentation + +Tests that text is properly split into sentences for ontology matching, including NLTK tokenization and TextSegment creation. + +**Key Tests:** +- `test_segment_single_sentence` - Single sentence produces one segment +- `test_segment_multiple_sentences` - Multiple sentences split correctly +- `test_segment_positions` - Segment start/end positions are correct +- `test_segment_complex_punctuation` - Handles abbreviations (Dr., U.S.A., Mr.) +- `test_segment_question_and_exclamation` - Different sentence terminators +- `test_segment_preserves_original_text` - Segments can reconstruct original +- `test_text_segment_non_overlapping` - Segments don't overlap +- `test_nltk_punkt_availability` - NLTK tokenizer is available +- `test_unicode_text` - Handles unicode characters +- `test_quoted_text` - Handles quoted text correctly + +### 5. `test_prompt_and_extraction.py` - LLM Prompt Construction and Triple Extraction + +Tests that the system correctly constructs prompts with ontology constraints and extracts/validates triples from LLM responses. + +**Key Tests:** +- `test_build_extraction_variables_includes_text` - Prompt includes input text +- `test_build_extraction_variables_includes_classes` - Prompt includes ontology classes +- `test_build_extraction_variables_includes_properties` - Prompt includes properties +- `test_validates_rdf_type_triple_with_valid_class` - Validates rdf:type against ontology +- `test_rejects_rdf_type_triple_with_invalid_class` - Rejects invalid classes +- `test_validates_object_property_triple` - Validates object properties +- `test_rejects_unknown_property` - Rejects properties not in ontology +- `test_parse_simple_triple_dict` - Parses triple from dict format +- `test_filters_invalid_triples` - Filters out invalid triples +- `test_expands_uris_in_parsed_triples` - Expands URIs using ontology +- `test_creates_proper_triple_objects` - Creates Triple objects with Value subjects/predicates/objects + +### 6. `test_embedding_and_similarity.py` - Ontology Embedding and Similarity Matching + +Tests that ontology elements are properly embedded and matched against input text using vector similarity. + +**Key Tests:** +- `test_create_text_from_class_with_id` - Text representation includes class ID +- `test_create_text_from_class_with_labels` - Includes labels in text +- `test_create_text_from_class_with_comment` - Includes comments in text +- `test_create_text_from_property_with_domain_range` - Includes domain/range in property text +- `test_normalizes_id_with_underscores` - Normalizes IDs (underscores to spaces) +- `test_includes_subclass_info_for_classes` - Includes subclass relationships +- `test_vector_store_api_structure` - Vector store has expected API +- `test_selector_handles_text_segments` - Selector processes text segments +- `test_merge_subsets_combines_elements` - Merging combines ontology elements +- `test_ontology_element_metadata_structure` - Metadata structure is correct + +## Running the Tests + +### Run all ontology extractor tests: +```bash +cd /home/mark/work/trustgraph.ai/trustgraph +pytest tests/unit/test_extract/test_ontology/ -v +``` + +### Run specific test file: +```bash +pytest tests/unit/test_extract/test_ontology/test_ontology_selector.py -v +pytest tests/unit/test_extract/test_ontology/test_uri_expansion.py -v +pytest tests/unit/test_extract/test_ontology/test_ontology_triples.py -v +pytest tests/unit/test_extract/test_ontology/test_text_processing.py -v +pytest tests/unit/test_extract/test_ontology/test_prompt_and_extraction.py -v +pytest tests/unit/test_extract/test_ontology/test_embedding_and_similarity.py -v +``` + +### Run specific test: +```bash +pytest tests/unit/test_extract/test_ontology/test_ontology_selector.py::TestOntologySelector::test_auto_include_properties_for_recipe_class -v +``` + +### Run with coverage: +```bash +pytest tests/unit/test_extract/test_ontology/ --cov=trustgraph.extract.kg.ontology --cov-report=html +``` + +## Test Fixtures + +- `sample_ontology` - Complete Food Ontology with Recipe, Ingredient, Food, Method classes +- `ontology_loader_with_sample` - Mock OntologyLoader with the sample ontology +- `ontology_embedder` - Mock embedder for testing +- `mock_embedding_service` - Mock service for generating deterministic embeddings +- `vector_store` - InMemoryVectorStore for testing +- `extractor` - Processor instance for URI expansion tests +- `ontology_subset_with_uris` - OntologySubset with proper URIs defined +- `sample_ontology_subset` - OntologySubset for testing triple generation +- `text_processor` - TextProcessor instance for text segmentation tests +- `sample_ontology_class` - Sample OntologyClass for testing +- `sample_ontology_property` - Sample OntologyProperty for testing + +## Implementation Notes + +These tests verify the fixes made to address: +1. **Disconnected graph problem** - Auto-include properties feature ensures all relevant relationships are available +2. **Wrong URIs problem** - URI expansion using ontology definitions instead of constructed fallbacks +3. **Dict vs object attribute problem** - URI expansion works with dicts (from `cls.__dict__`) not object attributes +4. **Ontology visibility in KG** - Ontology elements themselves appear in the knowledge graph with proper metadata +5. **Text segmentation** - Proper sentence splitting for ontology matching using NLTK diff --git a/tests/unit/test_extract/test_ontology/__init__.py b/tests/unit/test_extract/test_ontology/__init__.py new file mode 100644 index 00000000..22e958af --- /dev/null +++ b/tests/unit/test_extract/test_ontology/__init__.py @@ -0,0 +1 @@ +"""Tests for ontology-based extraction.""" diff --git a/tests/unit/test_extract/test_ontology/test_embedding_and_similarity.py b/tests/unit/test_extract/test_ontology/test_embedding_and_similarity.py new file mode 100644 index 00000000..fe6d3c5f --- /dev/null +++ b/tests/unit/test_extract/test_ontology/test_embedding_and_similarity.py @@ -0,0 +1,423 @@ +""" +Unit tests for ontology embedding and similarity matching. + +Tests that ontology elements are properly embedded and matched against +input text using vector similarity. +""" + +import pytest +import numpy as np +from unittest.mock import AsyncMock, MagicMock +from trustgraph.extract.kg.ontology.ontology_embedder import ( + OntologyEmbedder, + OntologyElementMetadata +) +from trustgraph.extract.kg.ontology.ontology_loader import ( + Ontology, + OntologyClass, + OntologyProperty +) +from trustgraph.extract.kg.ontology.vector_store import InMemoryVectorStore, SearchResult +from trustgraph.extract.kg.ontology.text_processor import TextSegment +from trustgraph.extract.kg.ontology.ontology_selector import OntologySelector, OntologySubset + + +@pytest.fixture +def mock_embedding_service(): + """Create a mock embedding service.""" + service = AsyncMock() + # Return deterministic embeddings for testing + async def mock_embed(text): + # Simple hash-based embedding for deterministic tests + hash_val = hash(text) % 1000 + return np.array([hash_val / 1000.0, (1000 - hash_val) / 1000.0]) + service.embed = mock_embed + return service + + +@pytest.fixture +def vector_store(): + """Create an empty vector store.""" + return InMemoryVectorStore() + + +@pytest.fixture +def ontology_embedder(mock_embedding_service, vector_store): + """Create an ontology embedder with mock service.""" + return OntologyEmbedder( + embedding_service=mock_embedding_service, + vector_store=vector_store + ) + + +@pytest.fixture +def sample_ontology_class(): + """Create a sample ontology class.""" + return OntologyClass( + uri="http://purl.org/ontology/fo/Recipe", + type="owl:Class", + labels=[{"value": "Recipe", "lang": "en-gb"}], + comment="A Recipe is a combination of ingredients and a method.", + subclass_of=None + ) + + +@pytest.fixture +def sample_ontology_property(): + """Create a sample ontology property.""" + return OntologyProperty( + uri="http://purl.org/ontology/fo/ingredients", + type="owl:ObjectProperty", + labels=[{"value": "ingredients", "lang": "en-gb"}], + comment="The ingredients property relates a recipe to an ingredient list.", + domain="Recipe", + range="IngredientList" + ) + + +class TestTextRepresentation: + """Test suite for creating text representations of ontology elements.""" + + def test_create_text_from_class_with_id(self, ontology_embedder, sample_ontology_class): + """Test that class ID is included in text representation.""" + text = ontology_embedder._create_text_representation( + "Recipe", + sample_ontology_class, + "class" + ) + + assert "Recipe" in text, "Should include class ID" + + def test_create_text_from_class_with_labels(self, ontology_embedder, sample_ontology_class): + """Test that class labels are included in text representation.""" + text = ontology_embedder._create_text_representation( + "Recipe", + sample_ontology_class, + "class" + ) + + assert "Recipe" in text, "Should include label value" + + def test_create_text_from_class_with_comment(self, ontology_embedder, sample_ontology_class): + """Test that class comments are included in text representation.""" + text = ontology_embedder._create_text_representation( + "Recipe", + sample_ontology_class, + "class" + ) + + assert "combination of ingredients" in text, "Should include comment" + + def test_create_text_from_property_with_domain_range(self, ontology_embedder, sample_ontology_property): + """Test that property domain and range are included in text.""" + text = ontology_embedder._create_text_representation( + "ingredients", + sample_ontology_property, + "objectProperty" + ) + + assert "domain: Recipe" in text, "Should include domain" + assert "range: IngredientList" in text, "Should include range" + + def test_normalizes_id_with_underscores(self, ontology_embedder): + """Test that IDs with underscores are normalized.""" + mock_element = MagicMock() + mock_element.labels = [] + mock_element.comment = None + + text = ontology_embedder._create_text_representation( + "some_property_name", + mock_element, + "objectProperty" + ) + + assert "some property name" in text, "Should replace underscores with spaces" + + def test_normalizes_id_with_hyphens(self, ontology_embedder): + """Test that IDs with hyphens are normalized.""" + mock_element = MagicMock() + mock_element.labels = [] + mock_element.comment = None + + text = ontology_embedder._create_text_representation( + "some-property-name", + mock_element, + "objectProperty" + ) + + assert "some property name" in text, "Should replace hyphens with spaces" + + def test_handles_element_without_labels(self, ontology_embedder): + """Test handling of elements without labels.""" + mock_element = MagicMock() + mock_element.labels = None + mock_element.comment = "Test comment" + + text = ontology_embedder._create_text_representation( + "TestElement", + mock_element, + "class" + ) + + assert "TestElement" in text, "Should still include ID" + assert "Test comment" in text, "Should include comment" + + def test_includes_subclass_info_for_classes(self, ontology_embedder): + """Test that subclass information is included for classes.""" + mock_class = MagicMock() + mock_class.labels = [] + mock_class.comment = None + mock_class.subclass_of = "ParentClass" + + text = ontology_embedder._create_text_representation( + "ChildClass", + mock_class, + "class" + ) + + assert "subclass of ParentClass" in text, "Should include subclass relationship" + + +class TestVectorStoreOperations: + """Test suite for vector store operations.""" + + def test_vector_store_starts_empty(self, vector_store): + """Test that vector store initializes empty.""" + assert vector_store.size() == 0, "New vector store should be empty" + + def test_vector_store_api_structure(self, vector_store): + """Test that vector store has expected API methods.""" + assert hasattr(vector_store, 'add'), "Should have add method" + assert hasattr(vector_store, 'add_batch'), "Should have add_batch method" + assert hasattr(vector_store, 'search'), "Should have search method" + assert hasattr(vector_store, 'size'), "Should have size method" + + def test_search_result_class_structure(self): + """Test that SearchResult has expected structure.""" + # Create a sample SearchResult + result = SearchResult(id="test-1", score=0.95, metadata={"element": "Test"}) + + assert hasattr(result, 'id'), "Should have id attribute" + assert hasattr(result, 'score'), "Should have score attribute" + assert hasattr(result, 'metadata'), "Should have metadata attribute" + assert result.id == "test-1" + assert result.score == 0.95 + assert result.metadata["element"] == "Test" + + +class TestOntologySelectorIntegration: + """Test suite for ontology selector with embeddings.""" + + @pytest.fixture + def sample_ontology(self): + """Create a sample ontology for testing.""" + return Ontology( + id="food", + classes={ + "Recipe": OntologyClass( + uri="http://purl.org/ontology/fo/Recipe", + type="owl:Class", + labels=[{"value": "Recipe", "lang": "en-gb"}], + comment="A Recipe is a combination of ingredients and a method." + ), + "Ingredient": OntologyClass( + uri="http://purl.org/ontology/fo/Ingredient", + type="owl:Class", + labels=[{"value": "Ingredient", "lang": "en-gb"}], + comment="An Ingredient combines a quantity and a food." + ) + }, + object_properties={ + "ingredients": OntologyProperty( + uri="http://purl.org/ontology/fo/ingredients", + type="owl:ObjectProperty", + labels=[{"value": "ingredients", "lang": "en-gb"}], + comment="Relates a recipe to its ingredients.", + domain="Recipe", + range="IngredientList" + ) + }, + datatype_properties={}, + metadata={"name": "Food Ontology"} + ) + + @pytest.fixture + def ontology_loader_mock(self, sample_ontology): + """Create a mock ontology loader.""" + loader = MagicMock() + loader.get_ontology.return_value = sample_ontology + loader.get_all_ontology_ids.return_value = ["food"] + return loader + + async def test_selector_handles_text_segments( + self, ontology_embedder, ontology_loader_mock + ): + """Test that selector can process text segments.""" + # Create selector + selector = OntologySelector( + ontology_embedder=ontology_embedder, + ontology_loader=ontology_loader_mock, + top_k=5, + similarity_threshold=0.3 + ) + + # Create text segments + segments = [ + TextSegment(text="Recipe for cornish pasty", type="sentence", position=0), + TextSegment(text="ingredients needed", type="sentence", position=1) + ] + + # Select ontology subset (will be empty since we haven't embedded anything) + subsets = await selector.select_ontology_subset(segments) + + # Should return a list (even if empty) + assert isinstance(subsets, list), "Should return a list of subsets" + + async def test_selector_with_no_embedding_service(self, vector_store, ontology_loader_mock): + """Test that selector handles missing embedding service gracefully.""" + embedder = OntologyEmbedder(embedding_service=None, vector_store=vector_store) + + selector = OntologySelector( + ontology_embedder=embedder, + ontology_loader=ontology_loader_mock, + top_k=5, + similarity_threshold=0.7 + ) + + segments = [ + TextSegment(text="Test text", type="sentence", position=0) + ] + + # Should return empty results without crashing + subsets = await selector.select_ontology_subset(segments) + assert isinstance(subsets, list), "Should return a list even without embeddings" + + def test_merge_subsets_combines_elements(self, ontology_loader_mock, ontology_embedder): + """Test that merging subsets combines all elements.""" + selector = OntologySelector( + ontology_embedder=ontology_embedder, + ontology_loader=ontology_loader_mock, + top_k=5, + similarity_threshold=0.7 + ) + + # Create two subsets from same ontology + subset1 = OntologySubset( + ontology_id="food", + classes={"Recipe": {"uri": "http://example.com/Recipe"}}, + object_properties={}, + datatype_properties={}, + metadata={}, + relevance_score=0.8 + ) + + subset2 = OntologySubset( + ontology_id="food", + classes={"Ingredient": {"uri": "http://example.com/Ingredient"}}, + object_properties={"ingredients": {"uri": "http://example.com/ingredients"}}, + datatype_properties={}, + metadata={}, + relevance_score=0.9 + ) + + merged = selector.merge_subsets([subset1, subset2]) + + assert len(merged.classes) == 2, "Should combine classes" + # Keys may be prefixed with ontology id + assert any("Recipe" in key for key in merged.classes.keys()) + assert any("Ingredient" in key for key in merged.classes.keys()) + assert len(merged.object_properties) == 1, "Should include properties" + + +class TestEmbeddingEdgeCases: + """Test suite for edge cases in embedding.""" + + async def test_embed_element_with_no_labels(self, ontology_embedder): + """Test embedding element without labels.""" + mock_element = MagicMock() + mock_element.labels = None + mock_element.comment = "Test element" + + text = ontology_embedder._create_text_representation( + "TestElement", + mock_element, + "class" + ) + + # Should not crash and should include ID and comment + assert "TestElement" in text + assert "Test element" in text + + async def test_embed_element_with_empty_comment(self, ontology_embedder): + """Test embedding element with empty comment.""" + mock_element = MagicMock() + mock_element.labels = [{"value": "Label"}] + mock_element.comment = None + + text = ontology_embedder._create_text_representation( + "TestElement", + mock_element, + "class" + ) + + # Should not crash + assert "Label" in text + + def test_ontology_element_metadata_structure(self): + """Test OntologyElementMetadata structure.""" + metadata = OntologyElementMetadata( + type="class", + ontology="food", + element="Recipe", + definition={"uri": "http://example.com/Recipe"}, + text="Recipe A combination of ingredients" + ) + + assert metadata.type == "class" + assert metadata.ontology == "food" + assert metadata.element == "Recipe" + assert "uri" in metadata.definition + + def test_vector_store_search_on_empty_store(self): + """Test searching empty vector store.""" + # Need a non-empty store for faiss to work + # This test verifies the store can be created but searching requires dimension + store = InMemoryVectorStore() + assert store.size() == 0, "Empty store should have size 0" + + +class TestOntologySubsetStructure: + """Test suite for OntologySubset structure.""" + + def test_ontology_subset_creation(self): + """Test creating an OntologySubset.""" + subset = OntologySubset( + ontology_id="test", + classes={"Recipe": {}}, + object_properties={"produces": {}}, + datatype_properties={"serves": {}}, + metadata={"name": "Test"}, + relevance_score=0.85 + ) + + assert subset.ontology_id == "test" + assert len(subset.classes) == 1 + assert len(subset.object_properties) == 1 + assert len(subset.datatype_properties) == 1 + assert subset.relevance_score == 0.85 + + def test_ontology_subset_default_score(self): + """Test that OntologySubset has default score.""" + subset = OntologySubset( + ontology_id="test", + classes={}, + object_properties={}, + datatype_properties={}, + metadata={} + ) + + assert subset.relevance_score == 0.0, "Should have default score of 0.0" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/unit/test_extract/test_ontology/test_entity_contexts.py b/tests/unit/test_extract/test_ontology/test_entity_contexts.py new file mode 100644 index 00000000..c867b05a --- /dev/null +++ b/tests/unit/test_extract/test_ontology/test_entity_contexts.py @@ -0,0 +1,353 @@ +""" +Unit tests for entity context building. + +Tests that entity contexts are properly created from extracted triples, +collecting labels and definitions for entity embedding and retrieval. +""" + +import pytest +from trustgraph.extract.kg.ontology.extract import Processor +from trustgraph.schema.core.primitives import Triple, Value +from trustgraph.schema.knowledge.graph import EntityContext + + +@pytest.fixture +def processor(): + """Create a Processor instance for testing.""" + processor = object.__new__(Processor) + return processor + + +class TestEntityContextBuilding: + """Test suite for entity context building from triples.""" + + def test_builds_context_from_label(self, processor): + """Test that entity context is built from rdfs:label.""" + triples = [ + Triple( + s=Value(value="https://example.com/entity/cornish-pasty", is_uri=True), + p=Value(value="http://www.w3.org/2000/01/rdf-schema#label", is_uri=True), + o=Value(value="Cornish Pasty", is_uri=False) + ) + ] + + contexts = processor.build_entity_contexts(triples) + + assert len(contexts) == 1, "Should create one entity context" + assert isinstance(contexts[0], EntityContext) + assert contexts[0].entity.value == "https://example.com/entity/cornish-pasty" + assert "Label: Cornish Pasty" in contexts[0].context + + def test_builds_context_from_definition(self, processor): + """Test that entity context includes definitions.""" + triples = [ + Triple( + s=Value(value="https://example.com/entity/pasty", is_uri=True), + p=Value(value="http://www.w3.org/2004/02/skos/core#definition", is_uri=True), + o=Value(value="A baked pastry filled with savory ingredients", is_uri=False) + ) + ] + + contexts = processor.build_entity_contexts(triples) + + assert len(contexts) == 1 + assert "A baked pastry filled with savory ingredients" in contexts[0].context + + def test_combines_label_and_definition(self, processor): + """Test that label and definition are combined in context.""" + triples = [ + Triple( + s=Value(value="https://example.com/entity/recipe1", is_uri=True), + p=Value(value="http://www.w3.org/2000/01/rdf-schema#label", is_uri=True), + o=Value(value="Pasty Recipe", is_uri=False) + ), + Triple( + s=Value(value="https://example.com/entity/recipe1", is_uri=True), + p=Value(value="http://www.w3.org/2004/02/skos/core#definition", is_uri=True), + o=Value(value="Traditional Cornish pastry recipe", is_uri=False) + ) + ] + + contexts = processor.build_entity_contexts(triples) + + assert len(contexts) == 1 + context_text = contexts[0].context + assert "Label: Pasty Recipe" in context_text + assert "Traditional Cornish pastry recipe" in context_text + assert ". " in context_text, "Should join parts with period and space" + + def test_uses_first_label_only(self, processor): + """Test that only the first label is used in context.""" + triples = [ + Triple( + s=Value(value="https://example.com/entity/food1", is_uri=True), + p=Value(value="http://www.w3.org/2000/01/rdf-schema#label", is_uri=True), + o=Value(value="First Label", is_uri=False) + ), + Triple( + s=Value(value="https://example.com/entity/food1", is_uri=True), + p=Value(value="http://www.w3.org/2000/01/rdf-schema#label", is_uri=True), + o=Value(value="Second Label", is_uri=False) + ) + ] + + contexts = processor.build_entity_contexts(triples) + + assert len(contexts) == 1 + assert "Label: First Label" in contexts[0].context + assert "Second Label" not in contexts[0].context + + def test_includes_all_definitions(self, processor): + """Test that all definitions are included in context.""" + triples = [ + Triple( + s=Value(value="https://example.com/entity/food1", is_uri=True), + p=Value(value="http://www.w3.org/2004/02/skos/core#definition", is_uri=True), + o=Value(value="First definition", is_uri=False) + ), + Triple( + s=Value(value="https://example.com/entity/food1", is_uri=True), + p=Value(value="http://www.w3.org/2004/02/skos/core#definition", is_uri=True), + o=Value(value="Second definition", is_uri=False) + ) + ] + + contexts = processor.build_entity_contexts(triples) + + assert len(contexts) == 1 + context_text = contexts[0].context + assert "First definition" in context_text + assert "Second definition" in context_text + + def test_supports_schema_org_description(self, processor): + """Test that schema.org description is treated as definition.""" + triples = [ + Triple( + s=Value(value="https://example.com/entity/food1", is_uri=True), + p=Value(value="https://schema.org/description", is_uri=True), + o=Value(value="A delicious food item", is_uri=False) + ) + ] + + contexts = processor.build_entity_contexts(triples) + + assert len(contexts) == 1 + assert "A delicious food item" in contexts[0].context + + def test_handles_multiple_entities(self, processor): + """Test that contexts are created for multiple entities.""" + triples = [ + Triple( + s=Value(value="https://example.com/entity/entity1", is_uri=True), + p=Value(value="http://www.w3.org/2000/01/rdf-schema#label", is_uri=True), + o=Value(value="Entity One", is_uri=False) + ), + Triple( + s=Value(value="https://example.com/entity/entity2", is_uri=True), + p=Value(value="http://www.w3.org/2000/01/rdf-schema#label", is_uri=True), + o=Value(value="Entity Two", is_uri=False) + ), + Triple( + s=Value(value="https://example.com/entity/entity3", is_uri=True), + p=Value(value="http://www.w3.org/2000/01/rdf-schema#label", is_uri=True), + o=Value(value="Entity Three", is_uri=False) + ) + ] + + contexts = processor.build_entity_contexts(triples) + + assert len(contexts) == 3, "Should create context for each entity" + entity_uris = [ctx.entity.value for ctx in contexts] + assert "https://example.com/entity/entity1" in entity_uris + assert "https://example.com/entity/entity2" in entity_uris + assert "https://example.com/entity/entity3" in entity_uris + + def test_ignores_uri_literals(self, processor): + """Test that URI objects are ignored (only literal labels/definitions).""" + triples = [ + Triple( + s=Value(value="https://example.com/entity/food1", is_uri=True), + p=Value(value="http://www.w3.org/2000/01/rdf-schema#label", is_uri=True), + o=Value(value="https://example.com/some/uri", is_uri=True) # URI, not literal + ) + ] + + contexts = processor.build_entity_contexts(triples) + + # Should not create context since label is URI + assert len(contexts) == 0, "Should not create context for URI labels" + + def test_ignores_non_label_non_definition_triples(self, processor): + """Test that other predicates are ignored.""" + triples = [ + Triple( + s=Value(value="https://example.com/entity/food1", is_uri=True), + p=Value(value="http://www.w3.org/1999/02/22-rdf-syntax-ns#type", is_uri=True), + o=Value(value="http://example.com/Food", is_uri=True) + ), + Triple( + s=Value(value="https://example.com/entity/food1", is_uri=True), + p=Value(value="http://example.com/produces", is_uri=True), + o=Value(value="https://example.com/entity/food2", is_uri=True) + ) + ] + + contexts = processor.build_entity_contexts(triples) + + # Should not create context since no labels or definitions + assert len(contexts) == 0, "Should not create context without labels/definitions" + + def test_handles_empty_triple_list(self, processor): + """Test handling of empty triple list.""" + triples = [] + + contexts = processor.build_entity_contexts(triples) + + assert len(contexts) == 0, "Empty triple list should return empty contexts" + + def test_entity_context_has_value_object(self, processor): + """Test that EntityContext.entity is a Value object.""" + triples = [ + Triple( + s=Value(value="https://example.com/entity/test", is_uri=True), + p=Value(value="http://www.w3.org/2000/01/rdf-schema#label", is_uri=True), + o=Value(value="Test Entity", is_uri=False) + ) + ] + + contexts = processor.build_entity_contexts(triples) + + assert len(contexts) == 1 + assert isinstance(contexts[0].entity, Value), "Entity should be Value object" + assert contexts[0].entity.is_uri, "Entity should be marked as URI" + + def test_entity_context_text_is_string(self, processor): + """Test that EntityContext.context is a string.""" + triples = [ + Triple( + s=Value(value="https://example.com/entity/test", is_uri=True), + p=Value(value="http://www.w3.org/2000/01/rdf-schema#label", is_uri=True), + o=Value(value="Test Entity", is_uri=False) + ) + ] + + contexts = processor.build_entity_contexts(triples) + + assert len(contexts) == 1 + assert isinstance(contexts[0].context, str), "Context should be string" + + def test_only_creates_contexts_with_meaningful_info(self, processor): + """Test that contexts are only created when there's meaningful information.""" + triples = [ + # Entity with label - should create context + Triple( + s=Value(value="https://example.com/entity/entity1", is_uri=True), + p=Value(value="http://www.w3.org/2000/01/rdf-schema#label", is_uri=True), + o=Value(value="Entity One", is_uri=False) + ), + # Entity with only rdf:type - should NOT create context + Triple( + s=Value(value="https://example.com/entity/entity2", is_uri=True), + p=Value(value="http://www.w3.org/1999/02/22-rdf-syntax-ns#type", is_uri=True), + o=Value(value="http://example.com/Food", is_uri=True) + ) + ] + + contexts = processor.build_entity_contexts(triples) + + assert len(contexts) == 1, "Should only create context for entity with label/definition" + assert contexts[0].entity.value == "https://example.com/entity/entity1" + + +class TestEntityContextEdgeCases: + """Test suite for edge cases in entity context building.""" + + def test_handles_unicode_in_labels(self, processor): + """Test handling of unicode characters in labels.""" + triples = [ + Triple( + s=Value(value="https://example.com/entity/café", is_uri=True), + p=Value(value="http://www.w3.org/2000/01/rdf-schema#label", is_uri=True), + o=Value(value="Café Spécial", is_uri=False) + ) + ] + + contexts = processor.build_entity_contexts(triples) + + assert len(contexts) == 1 + assert "Café Spécial" in contexts[0].context + + def test_handles_long_definitions(self, processor): + """Test handling of very long definitions.""" + long_def = "This is a very long definition " * 50 + triples = [ + Triple( + s=Value(value="https://example.com/entity/test", is_uri=True), + p=Value(value="http://www.w3.org/2004/02/skos/core#definition", is_uri=True), + o=Value(value=long_def, is_uri=False) + ) + ] + + contexts = processor.build_entity_contexts(triples) + + assert len(contexts) == 1 + assert long_def in contexts[0].context + + def test_handles_special_characters_in_context(self, processor): + """Test handling of special characters in context text.""" + triples = [ + Triple( + s=Value(value="https://example.com/entity/test", is_uri=True), + p=Value(value="http://www.w3.org/2000/01/rdf-schema#label", is_uri=True), + o=Value(value="Test & Entity \"quotes\"", is_uri=False) + ) + ] + + contexts = processor.build_entity_contexts(triples) + + assert len(contexts) == 1 + assert "Test & Entity \"quotes\"" in contexts[0].context + + def test_mixed_relevant_and_irrelevant_triples(self, processor): + """Test extracting contexts from mixed triple types.""" + triples = [ + # Label - relevant + Triple( + s=Value(value="https://example.com/entity/recipe1", is_uri=True), + p=Value(value="http://www.w3.org/2000/01/rdf-schema#label", is_uri=True), + o=Value(value="Cornish Pasty Recipe", is_uri=False) + ), + # Type - irrelevant + Triple( + s=Value(value="https://example.com/entity/recipe1", is_uri=True), + p=Value(value="http://www.w3.org/1999/02/22-rdf-syntax-ns#type", is_uri=True), + o=Value(value="http://example.com/Recipe", is_uri=True) + ), + # Property - irrelevant + Triple( + s=Value(value="https://example.com/entity/recipe1", is_uri=True), + p=Value(value="http://example.com/produces", is_uri=True), + o=Value(value="https://example.com/entity/pasty", is_uri=True) + ), + # Definition - relevant + Triple( + s=Value(value="https://example.com/entity/recipe1", is_uri=True), + p=Value(value="http://www.w3.org/2004/02/skos/core#definition", is_uri=True), + o=Value(value="Traditional British pastry recipe", is_uri=False) + ) + ] + + contexts = processor.build_entity_contexts(triples) + + assert len(contexts) == 1 + context_text = contexts[0].context + # Should include label and definition + assert "Label: Cornish Pasty Recipe" in context_text + assert "Traditional British pastry recipe" in context_text + # Should not include type or property info + assert "Recipe" not in context_text or "Cornish Pasty Recipe" in context_text # Only in label + assert "produces" not in context_text + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/unit/test_extract/test_ontology/test_ontology_loading.py b/tests/unit/test_extract/test_ontology/test_ontology_loading.py new file mode 100644 index 00000000..27e34e1f --- /dev/null +++ b/tests/unit/test_extract/test_ontology/test_ontology_loading.py @@ -0,0 +1,518 @@ +""" +Unit tests for ontology loading and configuration. + +Tests that ontologies are properly loaded from configuration, +parsed, validated, and managed by the OntologyLoader. +""" + +import pytest +from trustgraph.extract.kg.ontology.ontology_loader import ( + OntologyLoader, + Ontology, + OntologyClass, + OntologyProperty +) + + +@pytest.fixture +def ontology_loader(): + """Create an OntologyLoader instance.""" + return OntologyLoader() + + +@pytest.fixture +def sample_ontology_config(): + """Create a sample ontology configuration.""" + return { + "food": { + "metadata": { + "name": "Food Ontology", + "namespace": "http://purl.org/ontology/fo/" + }, + "classes": { + "Recipe": { + "uri": "http://purl.org/ontology/fo/Recipe", + "type": "owl:Class", + "rdfs:label": [{"value": "Recipe", "lang": "en-gb"}], + "rdfs:comment": "A Recipe is a combination of ingredients and a method." + }, + "Ingredient": { + "uri": "http://purl.org/ontology/fo/Ingredient", + "type": "owl:Class", + "rdfs:label": [{"value": "Ingredient", "lang": "en-gb"}], + "rdfs:comment": "An Ingredient combines a quantity and a food." + }, + "Food": { + "uri": "http://purl.org/ontology/fo/Food", + "type": "owl:Class", + "rdfs:label": [{"value": "Food", "lang": "en-gb"}], + "rdfs:comment": "A Food is something that can be eaten.", + "rdfs:subClassOf": "EdibleThing" + } + }, + "objectProperties": { + "ingredients": { + "uri": "http://purl.org/ontology/fo/ingredients", + "type": "owl:ObjectProperty", + "rdfs:label": [{"value": "ingredients", "lang": "en-gb"}], + "rdfs:domain": "Recipe", + "rdfs:range": "IngredientList" + }, + "produces": { + "uri": "http://purl.org/ontology/fo/produces", + "type": "owl:ObjectProperty", + "rdfs:label": [{"value": "produces", "lang": "en-gb"}], + "rdfs:domain": "Recipe", + "rdfs:range": "Food" + } + }, + "datatypeProperties": { + "serves": { + "uri": "http://purl.org/ontology/fo/serves", + "type": "owl:DatatypeProperty", + "rdfs:label": [{"value": "serves", "lang": "en-gb"}], + "rdfs:domain": "Recipe", + "rdfs:range": "xsd:string" + } + } + } + } + + +class TestOntologyLoaderInitialization: + """Test suite for OntologyLoader initialization.""" + + def test_loader_starts_empty(self, ontology_loader): + """Test that loader initializes with no ontologies.""" + assert len(ontology_loader.get_all_ontologies()) == 0 + + def test_loader_get_nonexistent_ontology(self, ontology_loader): + """Test getting non-existent ontology returns None.""" + result = ontology_loader.get_ontology("nonexistent") + assert result is None + + +class TestOntologyLoading: + """Test suite for loading ontologies from configuration.""" + + def test_loads_single_ontology(self, ontology_loader, sample_ontology_config): + """Test loading a single ontology.""" + ontology_loader.update_ontologies(sample_ontology_config) + + ontologies = ontology_loader.get_all_ontologies() + assert len(ontologies) == 1 + assert "food" in ontologies + + def test_loaded_ontology_has_correct_id(self, ontology_loader, sample_ontology_config): + """Test that loaded ontology has correct ID.""" + ontology_loader.update_ontologies(sample_ontology_config) + + ontology = ontology_loader.get_ontology("food") + assert ontology is not None + assert ontology.id == "food" + + def test_loaded_ontology_has_metadata(self, ontology_loader, sample_ontology_config): + """Test that loaded ontology includes metadata.""" + ontology_loader.update_ontologies(sample_ontology_config) + + ontology = ontology_loader.get_ontology("food") + assert ontology.metadata["name"] == "Food Ontology" + assert ontology.metadata["namespace"] == "http://purl.org/ontology/fo/" + + def test_loaded_ontology_has_classes(self, ontology_loader, sample_ontology_config): + """Test that loaded ontology includes classes.""" + ontology_loader.update_ontologies(sample_ontology_config) + + ontology = ontology_loader.get_ontology("food") + assert len(ontology.classes) == 3 + assert "Recipe" in ontology.classes + assert "Ingredient" in ontology.classes + assert "Food" in ontology.classes + + def test_loaded_classes_have_correct_properties(self, ontology_loader, sample_ontology_config): + """Test that loaded classes have correct properties.""" + ontology_loader.update_ontologies(sample_ontology_config) + + ontology = ontology_loader.get_ontology("food") + recipe = ontology.get_class("Recipe") + + assert isinstance(recipe, OntologyClass) + assert recipe.uri == "http://purl.org/ontology/fo/Recipe" + assert recipe.type == "owl:Class" + assert len(recipe.labels) == 1 + assert recipe.labels[0]["value"] == "Recipe" + assert "combination of ingredients" in recipe.comment + + def test_loaded_ontology_has_object_properties(self, ontology_loader, sample_ontology_config): + """Test that loaded ontology includes object properties.""" + ontology_loader.update_ontologies(sample_ontology_config) + + ontology = ontology_loader.get_ontology("food") + assert len(ontology.object_properties) == 2 + assert "ingredients" in ontology.object_properties + assert "produces" in ontology.object_properties + + def test_loaded_properties_have_domain_and_range(self, ontology_loader, sample_ontology_config): + """Test that loaded properties have domain and range.""" + ontology_loader.update_ontologies(sample_ontology_config) + + ontology = ontology_loader.get_ontology("food") + produces = ontology.get_property("produces") + + assert isinstance(produces, OntologyProperty) + assert produces.domain == "Recipe" + assert produces.range == "Food" + + def test_loaded_ontology_has_datatype_properties(self, ontology_loader, sample_ontology_config): + """Test that loaded ontology includes datatype properties.""" + ontology_loader.update_ontologies(sample_ontology_config) + + ontology = ontology_loader.get_ontology("food") + assert len(ontology.datatype_properties) == 1 + assert "serves" in ontology.datatype_properties + + def test_loads_multiple_ontologies(self, ontology_loader): + """Test loading multiple ontologies.""" + config = { + "food": { + "metadata": {"name": "Food Ontology"}, + "classes": {"Recipe": {"uri": "http://example.com/Recipe"}}, + "objectProperties": {}, + "datatypeProperties": {} + }, + "music": { + "metadata": {"name": "Music Ontology"}, + "classes": {"Song": {"uri": "http://example.com/Song"}}, + "objectProperties": {}, + "datatypeProperties": {} + } + } + + ontology_loader.update_ontologies(config) + + ontologies = ontology_loader.get_all_ontologies() + assert len(ontologies) == 2 + assert "food" in ontologies + assert "music" in ontologies + + def test_update_replaces_existing_ontologies(self, ontology_loader, sample_ontology_config): + """Test that update replaces existing ontologies.""" + # Load initial ontologies + ontology_loader.update_ontologies(sample_ontology_config) + assert len(ontology_loader.get_all_ontologies()) == 1 + + # Update with different config + new_config = { + "music": { + "metadata": {"name": "Music Ontology"}, + "classes": {}, + "objectProperties": {}, + "datatypeProperties": {} + } + } + ontology_loader.update_ontologies(new_config) + + # Old ontologies should be replaced + ontologies = ontology_loader.get_all_ontologies() + assert len(ontologies) == 1 + assert "music" in ontologies + assert "food" not in ontologies + + +class TestOntologyRetrieval: + """Test suite for retrieving ontologies.""" + + def test_get_ontology_by_id(self, ontology_loader, sample_ontology_config): + """Test retrieving ontology by ID.""" + ontology_loader.update_ontologies(sample_ontology_config) + + ontology = ontology_loader.get_ontology("food") + assert ontology is not None + assert isinstance(ontology, Ontology) + + def test_get_all_ontologies(self, ontology_loader): + """Test retrieving all ontologies.""" + config = { + "food": { + "metadata": {}, + "classes": {}, + "objectProperties": {}, + "datatypeProperties": {} + }, + "music": { + "metadata": {}, + "classes": {}, + "objectProperties": {}, + "datatypeProperties": {} + } + } + ontology_loader.update_ontologies(config) + + ontologies = ontology_loader.get_all_ontologies() + assert isinstance(ontologies, dict) + assert len(ontologies) == 2 + + def test_get_all_ontology_ids(self, ontology_loader): + """Test retrieving all ontology IDs.""" + config = { + "food": { + "metadata": {}, + "classes": {}, + "objectProperties": {}, + "datatypeProperties": {} + }, + "music": { + "metadata": {}, + "classes": {}, + "objectProperties": {}, + "datatypeProperties": {} + } + } + ontology_loader.update_ontologies(config) + + ontologies = ontology_loader.get_all_ontologies() + ids = list(ontologies.keys()) + assert len(ids) == 2 + assert "food" in ids + assert "music" in ids + + +class TestOntologyClassMethods: + """Test suite for Ontology helper methods.""" + + def test_get_class(self, ontology_loader, sample_ontology_config): + """Test getting a class from ontology.""" + ontology_loader.update_ontologies(sample_ontology_config) + ontology = ontology_loader.get_ontology("food") + + recipe = ontology.get_class("Recipe") + assert recipe is not None + assert recipe.uri == "http://purl.org/ontology/fo/Recipe" + + def test_get_nonexistent_class(self, ontology_loader, sample_ontology_config): + """Test getting non-existent class returns None.""" + ontology_loader.update_ontologies(sample_ontology_config) + ontology = ontology_loader.get_ontology("food") + + result = ontology.get_class("NonExistent") + assert result is None + + def test_get_property(self, ontology_loader, sample_ontology_config): + """Test getting a property from ontology.""" + ontology_loader.update_ontologies(sample_ontology_config) + ontology = ontology_loader.get_ontology("food") + + produces = ontology.get_property("produces") + assert produces is not None + assert produces.domain == "Recipe" + + def test_get_property_checks_both_types(self, ontology_loader, sample_ontology_config): + """Test that get_property checks both object and datatype properties.""" + ontology_loader.update_ontologies(sample_ontology_config) + ontology = ontology_loader.get_ontology("food") + + # Object property + produces = ontology.get_property("produces") + assert produces is not None + + # Datatype property + serves = ontology.get_property("serves") + assert serves is not None + + def test_get_parent_classes(self, ontology_loader, sample_ontology_config): + """Test getting parent classes following subClassOf.""" + ontology_loader.update_ontologies(sample_ontology_config) + ontology = ontology_loader.get_ontology("food") + + parents = ontology.get_parent_classes("Food") + assert "EdibleThing" in parents + + def test_get_parent_classes_empty_for_root(self, ontology_loader, sample_ontology_config): + """Test that root classes have no parents.""" + ontology_loader.update_ontologies(sample_ontology_config) + ontology = ontology_loader.get_ontology("food") + + parents = ontology.get_parent_classes("Recipe") + assert len(parents) == 0 + + +class TestOntologyValidation: + """Test suite for ontology validation.""" + + def test_validates_property_domain_exists(self, ontology_loader): + """Test validation of property domain.""" + config = { + "test": { + "metadata": {}, + "classes": { + "Recipe": {"uri": "http://example.com/Recipe"} + }, + "objectProperties": { + "produces": { + "uri": "http://example.com/produces", + "type": "owl:ObjectProperty", + "rdfs:domain": "NonExistentClass", # Invalid + "rdfs:range": "Food" + } + }, + "datatypeProperties": {} + } + } + + ontology_loader.update_ontologies(config) + ontology = ontology_loader.get_ontology("test") + + issues = ontology.validate_structure() + assert len(issues) > 0 + assert any("unknown domain" in issue.lower() for issue in issues) + + def test_validates_object_property_range_exists(self, ontology_loader): + """Test validation of object property range.""" + config = { + "test": { + "metadata": {}, + "classes": { + "Recipe": {"uri": "http://example.com/Recipe"} + }, + "objectProperties": { + "produces": { + "uri": "http://example.com/produces", + "type": "owl:ObjectProperty", + "rdfs:domain": "Recipe", + "rdfs:range": "NonExistentClass" # Invalid + } + }, + "datatypeProperties": {} + } + } + + ontology_loader.update_ontologies(config) + ontology = ontology_loader.get_ontology("test") + + issues = ontology.validate_structure() + assert len(issues) > 0 + assert any("unknown range" in issue.lower() for issue in issues) + + def test_detects_circular_inheritance(self, ontology_loader): + """Test detection of circular inheritance.""" + config = { + "test": { + "metadata": {}, + "classes": { + "A": { + "uri": "http://example.com/A", + "rdfs:subClassOf": "B" + }, + "B": { + "uri": "http://example.com/B", + "rdfs:subClassOf": "C" + }, + "C": { + "uri": "http://example.com/C", + "rdfs:subClassOf": "A" # Circular! + } + }, + "objectProperties": {}, + "datatypeProperties": {} + } + } + + ontology_loader.update_ontologies(config) + ontology = ontology_loader.get_ontology("test") + + issues = ontology.validate_structure() + assert len(issues) > 0 + assert any("circular" in issue.lower() for issue in issues) + + def test_valid_ontology_has_no_issues(self, ontology_loader, sample_ontology_config): + """Test that valid ontology passes validation.""" + # Modify config to have valid references + config = sample_ontology_config.copy() + config["food"]["classes"]["EdibleThing"] = { + "uri": "http://purl.org/ontology/fo/EdibleThing" + } + config["food"]["classes"]["IngredientList"] = { + "uri": "http://purl.org/ontology/fo/IngredientList" + } + + ontology_loader.update_ontologies(config) + ontology = ontology_loader.get_ontology("food") + + issues = ontology.validate_structure() + # Should have minimal or no issues for valid ontology + assert isinstance(issues, list) + + +class TestEdgeCases: + """Test suite for edge cases in ontology loading.""" + + def test_handles_empty_config(self, ontology_loader): + """Test handling of empty configuration.""" + ontology_loader.update_ontologies({}) + + ontologies = ontology_loader.get_all_ontologies() + assert len(ontologies) == 0 + + def test_handles_ontology_without_classes(self, ontology_loader): + """Test handling of ontology with no classes.""" + config = { + "minimal": { + "metadata": {"name": "Minimal"}, + "classes": {}, + "objectProperties": {}, + "datatypeProperties": {} + } + } + + ontology_loader.update_ontologies(config) + ontology = ontology_loader.get_ontology("minimal") + + assert ontology is not None + assert len(ontology.classes) == 0 + + def test_handles_ontology_without_properties(self, ontology_loader): + """Test handling of ontology with no properties.""" + config = { + "test": { + "metadata": {}, + "classes": { + "Recipe": {"uri": "http://example.com/Recipe"} + }, + "objectProperties": {}, + "datatypeProperties": {} + } + } + + ontology_loader.update_ontologies(config) + ontology = ontology_loader.get_ontology("test") + + assert ontology is not None + assert len(ontology.object_properties) == 0 + assert len(ontology.datatype_properties) == 0 + + def test_handles_missing_optional_fields(self, ontology_loader): + """Test handling of missing optional fields.""" + config = { + "test": { + "metadata": {}, + "classes": { + "Simple": { + "uri": "http://example.com/Simple" + # No labels, comments, subclass, etc. + } + }, + "objectProperties": {}, + "datatypeProperties": {} + } + } + + ontology_loader.update_ontologies(config) + ontology = ontology_loader.get_ontology("test") + + simple = ontology.get_class("Simple") + assert simple is not None + assert simple.uri == "http://example.com/Simple" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/unit/test_extract/test_ontology/test_ontology_selector.py b/tests/unit/test_extract/test_ontology/test_ontology_selector.py new file mode 100644 index 00000000..37526d74 --- /dev/null +++ b/tests/unit/test_extract/test_ontology/test_ontology_selector.py @@ -0,0 +1,336 @@ +""" +Unit tests for OntologySelector component. + +Tests the critical auto-include properties feature that automatically pulls in +all properties related to selected classes. +""" + +import pytest +from unittest.mock import Mock, AsyncMock +from trustgraph.extract.kg.ontology.ontology_selector import ( + OntologySelector, + OntologySubset +) +from trustgraph.extract.kg.ontology.ontology_loader import ( + Ontology, + OntologyClass, + OntologyProperty +) +from trustgraph.extract.kg.ontology.text_processor import TextSegment + + +@pytest.fixture +def sample_ontology(): + """Create a sample food ontology for testing.""" + # Create classes + recipe_class = OntologyClass( + uri="http://purl.org/ontology/fo/Recipe", + type="owl:Class", + labels=[{"value": "Recipe", "lang": "en-gb"}], + comment="A Recipe is a combination of ingredients and a method." + ) + + ingredient_class = OntologyClass( + uri="http://purl.org/ontology/fo/Ingredient", + type="owl:Class", + labels=[{"value": "Ingredient", "lang": "en-gb"}], + comment="An Ingredient is a combination of a quantity and a food." + ) + + food_class = OntologyClass( + uri="http://purl.org/ontology/fo/Food", + type="owl:Class", + labels=[{"value": "Food", "lang": "en-gb"}], + comment="A Food is something that can be eaten." + ) + + method_class = OntologyClass( + uri="http://purl.org/ontology/fo/Method", + type="owl:Class", + labels=[{"value": "Method", "lang": "en-gb"}], + comment="A Method is the way in which ingredients are combined." + ) + + # Create object properties + ingredients_prop = OntologyProperty( + uri="http://purl.org/ontology/fo/ingredients", + type="owl:ObjectProperty", + labels=[{"value": "ingredients", "lang": "en-gb"}], + comment="The ingredients property relates a recipe to an ingredient list.", + domain="Recipe", + range="IngredientList" + ) + + food_prop = OntologyProperty( + uri="http://purl.org/ontology/fo/food", + type="owl:ObjectProperty", + labels=[{"value": "food", "lang": "en-gb"}], + comment="The food property relates an ingredient to the food that is required.", + domain="Ingredient", + range="Food" + ) + + method_prop = OntologyProperty( + uri="http://purl.org/ontology/fo/method", + type="owl:ObjectProperty", + labels=[{"value": "method", "lang": "en-gb"}], + comment="The method property relates a recipe to the method used.", + domain="Recipe", + range="Method" + ) + + produces_prop = OntologyProperty( + uri="http://purl.org/ontology/fo/produces", + type="owl:ObjectProperty", + labels=[{"value": "produces", "lang": "en-gb"}], + comment="The produces property relates a recipe to the food it produces.", + domain="Recipe", + range="Food" + ) + + # Create datatype properties + serves_prop = OntologyProperty( + uri="http://purl.org/ontology/fo/serves", + type="owl:DatatypeProperty", + labels=[{"value": "serves", "lang": "en-gb"}], + comment="The serves property indicates what the recipe is intended to serve.", + domain="Recipe", + range="xsd:string" + ) + + # Build ontology + ontology = Ontology( + id="food", + metadata={ + "name": "Food Ontology", + "namespace": "http://purl.org/ontology/fo/" + }, + classes={ + "Recipe": recipe_class, + "Ingredient": ingredient_class, + "Food": food_class, + "Method": method_class + }, + object_properties={ + "ingredients": ingredients_prop, + "food": food_prop, + "method": method_prop, + "produces": produces_prop + }, + datatype_properties={ + "serves": serves_prop + } + ) + + return ontology + + +@pytest.fixture +def ontology_loader_with_sample(sample_ontology): + """Create an OntologyLoader with the sample ontology.""" + loader = Mock() + loader.get_ontology = Mock(return_value=sample_ontology) + loader.ontologies = {"food": sample_ontology} + return loader + + +@pytest.fixture +def ontology_embedder(): + """Create a mock OntologyEmbedder.""" + embedder = Mock() + embedder.embed_text = AsyncMock(return_value=[0.1, 0.2, 0.3]) # Mock embedding + + # Mock vector store with search results + vector_store = Mock() + embedder.get_vector_store = Mock(return_value=vector_store) + + return embedder + + +class TestOntologySelector: + """Test suite for OntologySelector.""" + + def test_auto_include_properties_for_recipe_class( + self, ontology_loader_with_sample, ontology_embedder, sample_ontology + ): + """Test that selecting Recipe class automatically includes all related properties.""" + selector = OntologySelector( + ontology_embedder=ontology_embedder, + ontology_loader=ontology_loader_with_sample, + top_k=10, + similarity_threshold=0.3 + ) + + # Create a subset with only Recipe class initially selected + subset = OntologySubset( + ontology_id="food", + classes={"Recipe": sample_ontology.classes["Recipe"].__dict__}, + object_properties={}, + datatype_properties={}, + metadata=sample_ontology.metadata, + relevance_score=0.8 + ) + + # Resolve dependencies (this is where auto-include happens) + selector._resolve_dependencies(subset) + + # Assert that properties with Recipe in domain are included + assert "ingredients" in subset.object_properties, \ + "ingredients property should be auto-included (Recipe in domain)" + assert "method" in subset.object_properties, \ + "method property should be auto-included (Recipe in domain)" + assert "produces" in subset.object_properties, \ + "produces property should be auto-included (Recipe in domain)" + assert "serves" in subset.datatype_properties, \ + "serves property should be auto-included (Recipe in domain)" + + # Assert that unrelated property is NOT included + assert "food" not in subset.object_properties, \ + "food property should NOT be included (Recipe not in domain/range)" + + def test_auto_include_properties_for_ingredient_class( + self, ontology_loader_with_sample, ontology_embedder, sample_ontology + ): + """Test that selecting Ingredient class includes properties with Ingredient in domain.""" + selector = OntologySelector( + ontology_embedder=ontology_embedder, + ontology_loader=ontology_loader_with_sample + ) + + subset = OntologySubset( + ontology_id="food", + classes={"Ingredient": sample_ontology.classes["Ingredient"].__dict__}, + object_properties={}, + datatype_properties={}, + metadata=sample_ontology.metadata + ) + + selector._resolve_dependencies(subset) + + # Ingredient has 'food' property in domain + assert "food" in subset.object_properties, \ + "food property should be auto-included (Ingredient in domain)" + + # Recipe-related properties should NOT be included + assert "ingredients" not in subset.object_properties + assert "method" not in subset.object_properties + + def test_auto_include_properties_for_range_class( + self, ontology_loader_with_sample, ontology_embedder, sample_ontology + ): + """Test that selecting a class includes properties with that class in range.""" + selector = OntologySelector( + ontology_embedder=ontology_embedder, + ontology_loader=ontology_loader_with_sample + ) + + subset = OntologySubset( + ontology_id="food", + classes={"Food": sample_ontology.classes["Food"].__dict__}, + object_properties={}, + datatype_properties={}, + metadata=sample_ontology.metadata + ) + + selector._resolve_dependencies(subset) + + # Food appears in range of 'food' and 'produces' properties + assert "food" in subset.object_properties, \ + "food property should be auto-included (Food in range)" + assert "produces" in subset.object_properties, \ + "produces property should be auto-included (Food in range)" + + def test_auto_include_adds_domain_and_range_classes( + self, ontology_loader_with_sample, ontology_embedder, sample_ontology + ): + """Test that auto-included properties also add their domain/range classes.""" + selector = OntologySelector( + ontology_embedder=ontology_embedder, + ontology_loader=ontology_loader_with_sample + ) + + # Start with only Recipe class + subset = OntologySubset( + ontology_id="food", + classes={"Recipe": sample_ontology.classes["Recipe"].__dict__}, + object_properties={}, + datatype_properties={}, + metadata=sample_ontology.metadata + ) + + selector._resolve_dependencies(subset) + + # Should auto-include 'produces' property (Recipe → Food) + assert "produces" in subset.object_properties + + # Should also add Food class (range of produces) + assert "Food" in subset.classes, \ + "Food class should be added (range of auto-included produces property)" + + # Should also add Method class (range of method property) + assert "Method" in subset.classes, \ + "Method class should be added (range of auto-included method property)" + + def test_multiple_classes_get_all_related_properties( + self, ontology_loader_with_sample, ontology_embedder, sample_ontology + ): + """Test that selecting multiple classes includes all their related properties.""" + selector = OntologySelector( + ontology_embedder=ontology_embedder, + ontology_loader=ontology_loader_with_sample + ) + + # Select both Recipe and Ingredient classes + subset = OntologySubset( + ontology_id="food", + classes={ + "Recipe": sample_ontology.classes["Recipe"].__dict__, + "Ingredient": sample_ontology.classes["Ingredient"].__dict__ + }, + object_properties={}, + datatype_properties={}, + metadata=sample_ontology.metadata + ) + + selector._resolve_dependencies(subset) + + # Should include Recipe-related properties + assert "ingredients" in subset.object_properties + assert "method" in subset.object_properties + assert "produces" in subset.object_properties + assert "serves" in subset.datatype_properties + + # Should also include Ingredient-related properties + assert "food" in subset.object_properties + + def test_no_duplicate_properties_added( + self, ontology_loader_with_sample, ontology_embedder, sample_ontology + ): + """Test that properties aren't added multiple times.""" + selector = OntologySelector( + ontology_embedder=ontology_embedder, + ontology_loader=ontology_loader_with_sample + ) + + # Start with Recipe and Food (both related to 'produces') + subset = OntologySubset( + ontology_id="food", + classes={ + "Recipe": sample_ontology.classes["Recipe"].__dict__, + "Food": sample_ontology.classes["Food"].__dict__ + }, + object_properties={}, + datatype_properties={}, + metadata=sample_ontology.metadata + ) + + selector._resolve_dependencies(subset) + + # 'produces' should be included once (not duplicated) + assert "produces" in subset.object_properties + # Count would be 1 - dict keys are unique, so this is guaranteed + # but worth documenting the expected behavior + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/unit/test_extract/test_ontology/test_ontology_triples.py b/tests/unit/test_extract/test_ontology/test_ontology_triples.py new file mode 100644 index 00000000..70ade79d --- /dev/null +++ b/tests/unit/test_extract/test_ontology/test_ontology_triples.py @@ -0,0 +1,300 @@ +""" +Unit tests for ontology triple generation. + +Tests that ontology elements (classes and properties) are properly converted +to RDF triples with labels, comments, domains, and ranges so they appear in +the knowledge graph. +""" + +import pytest +from trustgraph.extract.kg.ontology.extract import Processor +from trustgraph.extract.kg.ontology.ontology_selector import OntologySubset +from trustgraph.schema.core.primitives import Triple, Value + + +@pytest.fixture +def extractor(): + """Create a Processor instance for testing.""" + extractor = object.__new__(Processor) + return extractor + + +@pytest.fixture +def sample_ontology_subset(): + """Create a sample ontology subset with classes and properties.""" + return OntologySubset( + ontology_id="food", + classes={ + "Recipe": { + "uri": "http://purl.org/ontology/fo/Recipe", + "type": "owl:Class", + "labels": [{"value": "Recipe", "lang": "en-gb"}], + "comment": "A Recipe is a combination of ingredients and a method.", + "subclass_of": None + }, + "Ingredient": { + "uri": "http://purl.org/ontology/fo/Ingredient", + "type": "owl:Class", + "labels": [{"value": "Ingredient", "lang": "en-gb"}], + "comment": "An Ingredient combines a quantity and a food.", + "subclass_of": None + }, + "Food": { + "uri": "http://purl.org/ontology/fo/Food", + "type": "owl:Class", + "labels": [{"value": "Food", "lang": "en-gb"}], + "comment": "A Food is something that can be eaten.", + "subclass_of": None + } + }, + object_properties={ + "ingredients": { + "uri": "http://purl.org/ontology/fo/ingredients", + "type": "owl:ObjectProperty", + "labels": [{"value": "ingredients", "lang": "en-gb"}], + "comment": "The ingredients property relates a recipe to an ingredient list.", + "domain": "Recipe", + "range": "IngredientList" + }, + "produces": { + "uri": "http://purl.org/ontology/fo/produces", + "type": "owl:ObjectProperty", + "labels": [{"value": "produces", "lang": "en-gb"}], + "comment": "The produces property relates a recipe to the food it produces.", + "domain": "Recipe", + "range": "Food" + } + }, + datatype_properties={ + "serves": { + "uri": "http://purl.org/ontology/fo/serves", + "type": "owl:DatatypeProperty", + "labels": [{"value": "serves", "lang": "en-gb"}], + "comment": "The serves property indicates serving size.", + "domain": "Recipe", + "rdfs:range": "xsd:string" + } + }, + metadata={ + "name": "Food Ontology", + "namespace": "http://purl.org/ontology/fo/" + } + ) + + +class TestOntologyTripleGeneration: + """Test suite for ontology triple generation.""" + + def test_generates_class_type_triples(self, extractor, sample_ontology_subset): + """Test that classes get rdf:type owl:Class triples.""" + triples = extractor.build_ontology_triples(sample_ontology_subset) + + # Find type triples for Recipe class + recipe_type_triples = [ + t for t in triples + if t.s.value == "http://purl.org/ontology/fo/Recipe" + and t.p.value == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" + ] + + assert len(recipe_type_triples) == 1, "Should generate exactly one type triple per class" + assert recipe_type_triples[0].o.value == "http://www.w3.org/2002/07/owl#Class", \ + "Class type should be owl:Class" + + def test_generates_class_labels(self, extractor, sample_ontology_subset): + """Test that classes get rdfs:label triples.""" + triples = extractor.build_ontology_triples(sample_ontology_subset) + + # Find label triples for Recipe class + recipe_label_triples = [ + t for t in triples + if t.s.value == "http://purl.org/ontology/fo/Recipe" + and t.p.value == "http://www.w3.org/2000/01/rdf-schema#label" + ] + + assert len(recipe_label_triples) == 1, "Should generate label triple for class" + assert recipe_label_triples[0].o.value == "Recipe", \ + "Label should match class label from ontology" + assert not recipe_label_triples[0].o.is_uri, \ + "Label should be a literal, not URI" + + def test_generates_class_comments(self, extractor, sample_ontology_subset): + """Test that classes get rdfs:comment triples.""" + triples = extractor.build_ontology_triples(sample_ontology_subset) + + # Find comment triples for Recipe class + recipe_comment_triples = [ + t for t in triples + if t.s.value == "http://purl.org/ontology/fo/Recipe" + and t.p.value == "http://www.w3.org/2000/01/rdf-schema#comment" + ] + + assert len(recipe_comment_triples) == 1, "Should generate comment triple for class" + assert "combination of ingredients and a method" in recipe_comment_triples[0].o.value, \ + "Comment should match class description from ontology" + + def test_generates_object_property_type_triples(self, extractor, sample_ontology_subset): + """Test that object properties get rdf:type owl:ObjectProperty triples.""" + triples = extractor.build_ontology_triples(sample_ontology_subset) + + # Find type triples for ingredients property + ingredients_type_triples = [ + t for t in triples + if t.s.value == "http://purl.org/ontology/fo/ingredients" + and t.p.value == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" + ] + + assert len(ingredients_type_triples) == 1, \ + "Should generate exactly one type triple per object property" + assert ingredients_type_triples[0].o.value == "http://www.w3.org/2002/07/owl#ObjectProperty", \ + "Object property type should be owl:ObjectProperty" + + def test_generates_object_property_labels(self, extractor, sample_ontology_subset): + """Test that object properties get rdfs:label triples.""" + triples = extractor.build_ontology_triples(sample_ontology_subset) + + # Find label triples for ingredients property + ingredients_label_triples = [ + t for t in triples + if t.s.value == "http://purl.org/ontology/fo/ingredients" + and t.p.value == "http://www.w3.org/2000/01/rdf-schema#label" + ] + + assert len(ingredients_label_triples) == 1, \ + "Should generate label triple for object property" + assert ingredients_label_triples[0].o.value == "ingredients", \ + "Label should match property label from ontology" + + def test_generates_object_property_domain(self, extractor, sample_ontology_subset): + """Test that object properties get rdfs:domain triples.""" + triples = extractor.build_ontology_triples(sample_ontology_subset) + + # Find domain triples for ingredients property + ingredients_domain_triples = [ + t for t in triples + if t.s.value == "http://purl.org/ontology/fo/ingredients" + and t.p.value == "http://www.w3.org/2000/01/rdf-schema#domain" + ] + + assert len(ingredients_domain_triples) == 1, \ + "Should generate domain triple for object property" + assert ingredients_domain_triples[0].o.value == "http://purl.org/ontology/fo/Recipe", \ + "Domain should be Recipe class URI" + assert ingredients_domain_triples[0].o.is_uri, \ + "Domain should be a URI reference" + + def test_generates_object_property_range(self, extractor, sample_ontology_subset): + """Test that object properties get rdfs:range triples.""" + triples = extractor.build_ontology_triples(sample_ontology_subset) + + # Find range triples for produces property + produces_range_triples = [ + t for t in triples + if t.s.value == "http://purl.org/ontology/fo/produces" + and t.p.value == "http://www.w3.org/2000/01/rdf-schema#range" + ] + + assert len(produces_range_triples) == 1, \ + "Should generate range triple for object property" + assert produces_range_triples[0].o.value == "http://purl.org/ontology/fo/Food", \ + "Range should be Food class URI" + + def test_generates_datatype_property_type_triples(self, extractor, sample_ontology_subset): + """Test that datatype properties get rdf:type owl:DatatypeProperty triples.""" + triples = extractor.build_ontology_triples(sample_ontology_subset) + + # Find type triples for serves property + serves_type_triples = [ + t for t in triples + if t.s.value == "http://purl.org/ontology/fo/serves" + and t.p.value == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" + ] + + assert len(serves_type_triples) == 1, \ + "Should generate exactly one type triple per datatype property" + assert serves_type_triples[0].o.value == "http://www.w3.org/2002/07/owl#DatatypeProperty", \ + "Datatype property type should be owl:DatatypeProperty" + + def test_generates_datatype_property_range(self, extractor, sample_ontology_subset): + """Test that datatype properties get rdfs:range triples with XSD types.""" + triples = extractor.build_ontology_triples(sample_ontology_subset) + + # Find range triples for serves property + serves_range_triples = [ + t for t in triples + if t.s.value == "http://purl.org/ontology/fo/serves" + and t.p.value == "http://www.w3.org/2000/01/rdf-schema#range" + ] + + assert len(serves_range_triples) == 1, \ + "Should generate range triple for datatype property" + assert serves_range_triples[0].o.value == "http://www.w3.org/2001/XMLSchema#string", \ + "Range should be XSD type URI (xsd:string expanded)" + + def test_generates_triples_for_all_classes(self, extractor, sample_ontology_subset): + """Test that triples are generated for all classes in the subset.""" + triples = extractor.build_ontology_triples(sample_ontology_subset) + + # Count unique class subjects + class_subjects = set( + t.s.value for t in triples + if t.p.value == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" + and t.o.value == "http://www.w3.org/2002/07/owl#Class" + ) + + assert len(class_subjects) == 3, \ + "Should generate triples for all 3 classes (Recipe, Ingredient, Food)" + + def test_generates_triples_for_all_properties(self, extractor, sample_ontology_subset): + """Test that triples are generated for all properties in the subset.""" + triples = extractor.build_ontology_triples(sample_ontology_subset) + + # Count unique property subjects (object + datatype properties) + property_subjects = set( + t.s.value for t in triples + if t.p.value == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" + and ("ObjectProperty" in t.o.value or "DatatypeProperty" in t.o.value) + ) + + assert len(property_subjects) == 3, \ + "Should generate triples for all 3 properties (ingredients, produces, serves)" + + def test_uses_dict_field_names_not_rdf_names(self, extractor, sample_ontology_subset): + """Test that triple generation works with dict field names (labels, comment, domain, range). + + This is critical - the ontology subset has dicts with Python field names, + not RDF property names. + """ + # Verify the subset uses dict field names + recipe_def = sample_ontology_subset.classes["Recipe"] + assert isinstance(recipe_def, dict), "Class definitions should be dicts" + assert "labels" in recipe_def, "Should use 'labels' not 'rdfs:label'" + assert "comment" in recipe_def, "Should use 'comment' not 'rdfs:comment'" + + # Now verify triple generation works + triples = extractor.build_ontology_triples(sample_ontology_subset) + + # Should still generate proper RDF triples despite dict field names + label_triples = [ + t for t in triples + if t.p.value == "http://www.w3.org/2000/01/rdf-schema#label" + ] + assert len(label_triples) > 0, \ + "Should generate rdfs:label triples from dict 'labels' field" + + def test_total_triple_count_is_reasonable(self, extractor, sample_ontology_subset): + """Test that we generate a reasonable number of triples.""" + triples = extractor.build_ontology_triples(sample_ontology_subset) + + # Each class gets: type, label, comment (3 triples) + # Each object property gets: type, label, comment, domain, range (5 triples) + # Each datatype property gets: type, label, comment, domain, range (5 triples) + # Expected: 3 classes * 3 + 2 object props * 5 + 1 datatype prop * 5 = 9 + 10 + 5 = 24 + + assert len(triples) >= 20, \ + "Should generate substantial number of triples for ontology elements" + assert len(triples) < 50, \ + "Should not generate excessive duplicate triples" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/unit/test_extract/test_ontology/test_prompt_and_extraction.py b/tests/unit/test_extract/test_ontology/test_prompt_and_extraction.py new file mode 100644 index 00000000..e6d5bf36 --- /dev/null +++ b/tests/unit/test_extract/test_ontology/test_prompt_and_extraction.py @@ -0,0 +1,414 @@ +""" +Unit tests for LLM prompt construction and triple extraction. + +Tests that the system correctly constructs prompts with ontology constraints +and extracts/validates triples from LLM responses. +""" + +import pytest +from trustgraph.extract.kg.ontology.extract import Processor +from trustgraph.extract.kg.ontology.ontology_selector import OntologySubset +from trustgraph.schema.core.primitives import Triple, Value + + +@pytest.fixture +def extractor(): + """Create a Processor instance for testing.""" + extractor = object.__new__(Processor) + extractor.URI_PREFIXES = { + "rdf:": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "rdfs:": "http://www.w3.org/2000/01/rdf-schema#", + "owl:": "http://www.w3.org/2002/07/owl#", + "xsd:": "http://www.w3.org/2001/XMLSchema#", + } + return extractor + + +@pytest.fixture +def sample_ontology_subset(): + """Create a sample ontology subset for extraction testing.""" + return OntologySubset( + ontology_id="food", + classes={ + "Recipe": { + "uri": "http://purl.org/ontology/fo/Recipe", + "type": "owl:Class", + "labels": [{"value": "Recipe", "lang": "en-gb"}], + "comment": "A Recipe is a combination of ingredients and a method." + }, + "Ingredient": { + "uri": "http://purl.org/ontology/fo/Ingredient", + "type": "owl:Class", + "labels": [{"value": "Ingredient", "lang": "en-gb"}], + "comment": "An Ingredient combines a quantity and a food." + }, + "Food": { + "uri": "http://purl.org/ontology/fo/Food", + "type": "owl:Class", + "labels": [{"value": "Food", "lang": "en-gb"}], + "comment": "A Food is something that can be eaten." + } + }, + object_properties={ + "ingredients": { + "uri": "http://purl.org/ontology/fo/ingredients", + "type": "owl:ObjectProperty", + "labels": [{"value": "ingredients", "lang": "en-gb"}], + "comment": "The ingredients property relates a recipe to an ingredient list.", + "domain": "Recipe", + "range": "IngredientList" + }, + "food": { + "uri": "http://purl.org/ontology/fo/food", + "type": "owl:ObjectProperty", + "labels": [{"value": "food", "lang": "en-gb"}], + "comment": "The food property relates an ingredient to food.", + "domain": "Ingredient", + "range": "Food" + }, + "produces": { + "uri": "http://purl.org/ontology/fo/produces", + "type": "owl:ObjectProperty", + "labels": [{"value": "produces", "lang": "en-gb"}], + "comment": "The produces property relates a recipe to the food it produces.", + "domain": "Recipe", + "range": "Food" + } + }, + datatype_properties={ + "serves": { + "uri": "http://purl.org/ontology/fo/serves", + "type": "owl:DatatypeProperty", + "labels": [{"value": "serves", "lang": "en-gb"}], + "comment": "The serves property indicates serving size.", + "domain": "Recipe", + "rdfs:range": "xsd:string" + } + }, + metadata={ + "name": "Food Ontology", + "namespace": "http://purl.org/ontology/fo/" + } + ) + + +class TestPromptConstruction: + """Test suite for LLM prompt construction.""" + + def test_build_extraction_variables_includes_text(self, extractor, sample_ontology_subset): + """Test that extraction variables include the input text.""" + chunk = "Cornish pasty is a traditional British pastry filled with meat and vegetables." + + variables = extractor.build_extraction_variables(chunk, sample_ontology_subset) + + assert "text" in variables, "Should include text key" + assert variables["text"] == chunk, "Text should match input chunk" + + def test_build_extraction_variables_includes_classes(self, extractor, sample_ontology_subset): + """Test that extraction variables include ontology classes.""" + chunk = "Test text" + + variables = extractor.build_extraction_variables(chunk, sample_ontology_subset) + + assert "classes" in variables, "Should include classes key" + assert len(variables["classes"]) == 3, "Should include all classes from subset" + assert "Recipe" in variables["classes"] + assert "Ingredient" in variables["classes"] + assert "Food" in variables["classes"] + + def test_build_extraction_variables_includes_properties(self, extractor, sample_ontology_subset): + """Test that extraction variables include ontology properties.""" + chunk = "Test text" + + variables = extractor.build_extraction_variables(chunk, sample_ontology_subset) + + assert "object_properties" in variables, "Should include object_properties key" + assert "datatype_properties" in variables, "Should include datatype_properties key" + assert len(variables["object_properties"]) == 3 + assert len(variables["datatype_properties"]) == 1 + + def test_build_extraction_variables_structure(self, extractor, sample_ontology_subset): + """Test the overall structure of extraction variables.""" + chunk = "Test text" + + variables = extractor.build_extraction_variables(chunk, sample_ontology_subset) + + # Should have exactly 4 keys + assert set(variables.keys()) == {"text", "classes", "object_properties", "datatype_properties"} + + def test_build_extraction_variables_with_empty_subset(self, extractor): + """Test building variables with minimal ontology subset.""" + minimal_subset = OntologySubset( + ontology_id="minimal", + classes={}, + object_properties={}, + datatype_properties={}, + metadata={} + ) + chunk = "Test text" + + variables = extractor.build_extraction_variables(chunk, minimal_subset) + + assert variables["text"] == chunk + assert len(variables["classes"]) == 0 + assert len(variables["object_properties"]) == 0 + assert len(variables["datatype_properties"]) == 0 + + +class TestTripleValidation: + """Test suite for triple validation against ontology.""" + + def test_validates_rdf_type_triple_with_valid_class(self, extractor, sample_ontology_subset): + """Test that rdf:type triples are validated against ontology classes.""" + subject = "cornish-pasty" + predicate = "rdf:type" + object_val = "Recipe" + + is_valid = extractor.is_valid_triple(subject, predicate, object_val, sample_ontology_subset) + + assert is_valid, "rdf:type with valid class should be valid" + + def test_rejects_rdf_type_triple_with_invalid_class(self, extractor, sample_ontology_subset): + """Test that rdf:type triples with non-existent classes are rejected.""" + subject = "cornish-pasty" + predicate = "rdf:type" + object_val = "NonExistentClass" + + is_valid = extractor.is_valid_triple(subject, predicate, object_val, sample_ontology_subset) + + assert not is_valid, "rdf:type with invalid class should be rejected" + + def test_validates_rdfs_label_triple(self, extractor, sample_ontology_subset): + """Test that rdfs:label triples are always valid.""" + subject = "cornish-pasty" + predicate = "rdfs:label" + object_val = "Cornish Pasty" + + is_valid = extractor.is_valid_triple(subject, predicate, object_val, sample_ontology_subset) + + assert is_valid, "rdfs:label should always be valid" + + def test_validates_object_property_triple(self, extractor, sample_ontology_subset): + """Test that object property triples are validated.""" + subject = "cornish-pasty-recipe" + predicate = "produces" + object_val = "cornish-pasty" + + is_valid = extractor.is_valid_triple(subject, predicate, object_val, sample_ontology_subset) + + assert is_valid, "Valid object property should be accepted" + + def test_validates_datatype_property_triple(self, extractor, sample_ontology_subset): + """Test that datatype property triples are validated.""" + subject = "cornish-pasty-recipe" + predicate = "serves" + object_val = "4-6 people" + + is_valid = extractor.is_valid_triple(subject, predicate, object_val, sample_ontology_subset) + + assert is_valid, "Valid datatype property should be accepted" + + def test_rejects_unknown_property(self, extractor, sample_ontology_subset): + """Test that unknown properties are rejected.""" + subject = "cornish-pasty" + predicate = "unknownProperty" + object_val = "some value" + + is_valid = extractor.is_valid_triple(subject, predicate, object_val, sample_ontology_subset) + + assert not is_valid, "Unknown property should be rejected" + + def test_validates_multiple_valid_properties(self, extractor, sample_ontology_subset): + """Test validation of different property types.""" + test_cases = [ + ("recipe1", "produces", "food1", True), + ("ingredient1", "food", "food1", True), + ("recipe1", "serves", "4", True), + ("recipe1", "invalidProp", "value", False), + ] + + for subject, predicate, object_val, expected in test_cases: + is_valid = extractor.is_valid_triple(subject, predicate, object_val, sample_ontology_subset) + assert is_valid == expected, f"Validation of {predicate} should be {expected}" + + +class TestTripleParsing: + """Test suite for parsing triples from LLM responses.""" + + def test_parse_simple_triple_dict(self, extractor, sample_ontology_subset): + """Test parsing a simple triple from dict format.""" + triples_response = [ + { + "subject": "cornish-pasty", + "predicate": "rdf:type", + "object": "Recipe" + } + ] + + validated = extractor.parse_and_validate_triples(triples_response, sample_ontology_subset) + + assert len(validated) == 1, "Should parse one valid triple" + assert validated[0].s.value == "https://trustgraph.ai/food/cornish-pasty" + assert validated[0].p.value == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" + assert validated[0].o.value == "http://purl.org/ontology/fo/Recipe" + + def test_parse_multiple_triples(self, extractor, sample_ontology_subset): + """Test parsing multiple triples.""" + triples_response = [ + {"subject": "cornish-pasty", "predicate": "rdf:type", "object": "Recipe"}, + {"subject": "cornish-pasty", "predicate": "rdfs:label", "object": "Cornish Pasty"}, + {"subject": "cornish-pasty", "predicate": "serves", "object": "1-2 people"} + ] + + validated = extractor.parse_and_validate_triples(triples_response, sample_ontology_subset) + + assert len(validated) == 3, "Should parse all valid triples" + + def test_filters_invalid_triples(self, extractor, sample_ontology_subset): + """Test that invalid triples are filtered out.""" + triples_response = [ + {"subject": "cornish-pasty", "predicate": "rdf:type", "object": "Recipe"}, # Valid + {"subject": "cornish-pasty", "predicate": "invalidProp", "object": "value"}, # Invalid + {"subject": "cornish-pasty", "predicate": "produces", "object": "food1"} # Valid + ] + + validated = extractor.parse_and_validate_triples(triples_response, sample_ontology_subset) + + assert len(validated) == 2, "Should filter out invalid triple" + + def test_handles_missing_fields(self, extractor, sample_ontology_subset): + """Test that triples with missing fields are skipped.""" + triples_response = [ + {"subject": "cornish-pasty", "predicate": "rdf:type"}, # Missing object + {"subject": "cornish-pasty", "object": "Recipe"}, # Missing predicate + {"predicate": "rdf:type", "object": "Recipe"}, # Missing subject + {"subject": "cornish-pasty", "predicate": "rdf:type", "object": "Recipe"} # Valid + ] + + validated = extractor.parse_and_validate_triples(triples_response, sample_ontology_subset) + + assert len(validated) == 1, "Should skip triples with missing fields" + + def test_handles_empty_response(self, extractor, sample_ontology_subset): + """Test handling of empty LLM response.""" + triples_response = [] + + validated = extractor.parse_and_validate_triples(triples_response, sample_ontology_subset) + + assert len(validated) == 0, "Empty response should return no triples" + + def test_expands_uris_in_parsed_triples(self, extractor, sample_ontology_subset): + """Test that URIs are properly expanded in parsed triples.""" + triples_response = [ + {"subject": "recipe1", "predicate": "produces", "object": "Food"} + ] + + validated = extractor.parse_and_validate_triples(triples_response, sample_ontology_subset) + + assert len(validated) == 1 + # Subject should be expanded to entity URI + assert validated[0].s.value.startswith("https://trustgraph.ai/food/") + # Predicate should be expanded to ontology URI + assert validated[0].p.value == "http://purl.org/ontology/fo/produces" + # Object should be expanded to class URI + assert validated[0].o.value == "http://purl.org/ontology/fo/Food" + + def test_creates_proper_triple_objects(self, extractor, sample_ontology_subset): + """Test that Triple objects are properly created.""" + triples_response = [ + {"subject": "cornish-pasty", "predicate": "rdfs:label", "object": "Cornish Pasty"} + ] + + validated = extractor.parse_and_validate_triples(triples_response, sample_ontology_subset) + + assert len(validated) == 1 + triple = validated[0] + assert isinstance(triple, Triple), "Should create Triple objects" + assert isinstance(triple.s, Value), "Subject should be Value object" + assert isinstance(triple.p, Value), "Predicate should be Value object" + assert isinstance(triple.o, Value), "Object should be Value object" + assert triple.s.is_uri, "Subject should be marked as URI" + assert triple.p.is_uri, "Predicate should be marked as URI" + assert not triple.o.is_uri, "Object literal should not be marked as URI" + + +class TestURIExpansionInExtraction: + """Test suite for URI expansion during triple extraction.""" + + def test_expands_class_names_in_objects(self, extractor, sample_ontology_subset): + """Test that class names in object position are expanded.""" + triples_response = [ + {"subject": "entity1", "predicate": "rdf:type", "object": "Recipe"} + ] + + validated = extractor.parse_and_validate_triples(triples_response, sample_ontology_subset) + + assert validated[0].o.value == "http://purl.org/ontology/fo/Recipe" + assert validated[0].o.is_uri, "Class reference should be URI" + + def test_expands_property_names(self, extractor, sample_ontology_subset): + """Test that property names are expanded to full URIs.""" + triples_response = [ + {"subject": "recipe1", "predicate": "produces", "object": "food1"} + ] + + validated = extractor.parse_and_validate_triples(triples_response, sample_ontology_subset) + + assert validated[0].p.value == "http://purl.org/ontology/fo/produces" + + def test_expands_entity_instances(self, extractor, sample_ontology_subset): + """Test that entity instances get constructed URIs.""" + triples_response = [ + {"subject": "my-special-recipe", "predicate": "rdf:type", "object": "Recipe"} + ] + + validated = extractor.parse_and_validate_triples(triples_response, sample_ontology_subset) + + assert validated[0].s.value.startswith("https://trustgraph.ai/food/") + assert "my-special-recipe" in validated[0].s.value + + +class TestEdgeCases: + """Test suite for edge cases in extraction.""" + + def test_handles_non_dict_response_items(self, extractor, sample_ontology_subset): + """Test that non-dict items in response are skipped.""" + triples_response = [ + {"subject": "entity1", "predicate": "rdf:type", "object": "Recipe"}, # Valid + "invalid string item", # Invalid + None, # Invalid + {"subject": "entity2", "predicate": "rdf:type", "object": "Food"} # Valid + ] + + validated = extractor.parse_and_validate_triples(triples_response, sample_ontology_subset) + + # Should skip non-dict items gracefully + assert len(validated) >= 0, "Should handle non-dict items without crashing" + + def test_handles_empty_string_values(self, extractor, sample_ontology_subset): + """Test that empty string values are skipped.""" + triples_response = [ + {"subject": "", "predicate": "rdf:type", "object": "Recipe"}, + {"subject": "entity1", "predicate": "", "object": "Recipe"}, + {"subject": "entity1", "predicate": "rdf:type", "object": ""}, + {"subject": "entity1", "predicate": "rdf:type", "object": "Recipe"} # Valid + ] + + validated = extractor.parse_and_validate_triples(triples_response, sample_ontology_subset) + + assert len(validated) == 1, "Should skip triples with empty strings" + + def test_handles_unicode_in_literals(self, extractor, sample_ontology_subset): + """Test handling of unicode characters in literal values.""" + triples_response = [ + {"subject": "café-recipe", "predicate": "rdfs:label", "object": "Café Spécial"} + ] + + validated = extractor.parse_and_validate_triples(triples_response, sample_ontology_subset) + + assert len(validated) == 1 + assert "Café Spécial" in validated[0].o.value + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/unit/test_extract/test_ontology/test_text_processing.py b/tests/unit/test_extract/test_ontology/test_text_processing.py new file mode 100644 index 00000000..67686297 --- /dev/null +++ b/tests/unit/test_extract/test_ontology/test_text_processing.py @@ -0,0 +1,290 @@ +""" +Unit tests for text processing and segmentation. + +Tests that text is properly split into sentences for ontology matching, +including NLTK tokenization and TextSegment creation. +""" + +import pytest +from trustgraph.extract.kg.ontology.text_processor import TextProcessor, TextSegment + + +@pytest.fixture +def text_processor(): + """Create a TextProcessor instance for testing.""" + return TextProcessor() + + +class TestTextSegmentation: + """Test suite for text segmentation functionality.""" + + def test_segment_single_sentence(self, text_processor): + """Test segmentation of a single sentence.""" + text = "This is a simple sentence." + + segments = text_processor.process_chunk(text, extract_phrases=False) + + # Filter to only sentences + sentences = [s for s in segments if s.type == 'sentence'] + assert len(sentences) == 1, "Single sentence should produce one sentence segment" + assert text in sentences[0].text, "Segment text should contain input" + + def test_segment_multiple_sentences(self, text_processor): + """Test segmentation of multiple sentences.""" + text = "First sentence. Second sentence. Third sentence." + + segments = text_processor.process_chunk(text, extract_phrases=False) + + # Filter to only sentences + sentences = [s for s in segments if s.type == 'sentence'] + assert len(sentences) == 3, "Should create three sentence segments for three sentences" + assert "First sentence" in sentences[0].text + assert "Second sentence" in sentences[1].text + assert "Third sentence" in sentences[2].text + + def test_segment_positions(self, text_processor): + """Test that segment positions are tracked.""" + text = "First sentence. Second sentence." + + segments = text_processor.process_chunk(text, extract_phrases=False) + + # Filter to only sentences + sentences = [s for s in segments if s.type == 'sentence'] + assert len(sentences) == 2 + assert sentences[0].position == 0 + assert sentences[1].position > 0 + + def test_segment_empty_text(self, text_processor): + """Test handling of empty text.""" + text = "" + + segments = text_processor.process_chunk(text, extract_phrases=False) + + assert len(segments) == 0, "Empty text should produce no segments" + + def test_segment_whitespace_only(self, text_processor): + """Test handling of whitespace-only text.""" + text = " \n\t " + + segments = text_processor.process_chunk(text, extract_phrases=False) + + # May produce empty segments or no segments depending on implementation + assert len(segments) <= 1, "Whitespace-only text should produce minimal segments" + + def test_segment_with_newlines(self, text_processor): + """Test segmentation of text with newlines.""" + text = "First sentence.\nSecond sentence." + + segments = text_processor.process_chunk(text, extract_phrases=False) + + # Filter to only sentences + sentences = [s for s in segments if s.type == 'sentence'] + assert len(sentences) == 2 + assert "First sentence" in sentences[0].text + assert "Second sentence" in sentences[1].text + + def test_segment_complex_punctuation(self, text_processor): + """Test segmentation with complex punctuation.""" + text = "Dr. Smith went to the U.S.A. yesterday. He met Mr. Jones." + + segments = text_processor.process_chunk(text, extract_phrases=False) + + # Filter to only sentences + sentences = [s for s in segments if s.type == 'sentence'] + # NLTK should handle abbreviations correctly + assert len(sentences) == 2, "Should recognize abbreviations and not split on them" + assert "Dr. Smith" in sentences[0].text + assert "Mr. Jones" in sentences[1].text + + def test_segment_question_and_exclamation(self, text_processor): + """Test segmentation with different sentence terminators.""" + text = "Is this working? Yes, it is! Great news." + + segments = text_processor.process_chunk(text, extract_phrases=False) + + # Filter to only sentences + sentences = [s for s in segments if s.type == 'sentence'] + assert len(sentences) == 3 + assert "Is this working?" in sentences[0].text + assert "Yes, it is!" in sentences[1].text + assert "Great news" in sentences[2].text + + def test_segment_long_paragraph(self, text_processor): + """Test segmentation of a longer paragraph.""" + text = ( + "The recipe requires several ingredients. " + "First, gather flour and sugar. " + "Then, add eggs and milk. " + "Finally, mix everything together." + ) + + segments = text_processor.process_chunk(text, extract_phrases=False) + + # Filter to only sentences + sentences = [s for s in segments if s.type == 'sentence'] + assert len(sentences) == 4, "Should split paragraph into individual sentences" + assert all(isinstance(seg, TextSegment) for seg in sentences) + + def test_extract_phrases_option(self, text_processor): + """Test that phrase extraction can be enabled.""" + text = "The recipe requires several ingredients." + + # With phrases + segments_with_phrases = text_processor.process_chunk(text, extract_phrases=True) + # Without phrases + segments_without_phrases = text_processor.process_chunk(text, extract_phrases=False) + + # With phrases should have more segments (sentences + phrases) + assert len(segments_with_phrases) >= len(segments_without_phrases) + + +class TestTextSegmentCreation: + """Test suite for TextSegment object creation.""" + + def test_text_segment_attributes(self, text_processor): + """Test that TextSegment objects have correct attributes.""" + text = "This is a test sentence." + + segments = text_processor.process_chunk(text, extract_phrases=False) + + assert len(segments) >= 1 + segment = segments[0] + + assert hasattr(segment, 'text'), "Segment should have text attribute" + assert hasattr(segment, 'type'), "Segment should have type attribute" + assert hasattr(segment, 'position'), "Segment should have position attribute" + assert segment.type in ['sentence', 'phrase', 'noun_phrase', 'verb_phrase'] + + def test_text_segment_types(self, text_processor): + """Test that different segment types are created correctly.""" + text = "The recipe requires several ingredients." + + # Without phrases + segments = text_processor.process_chunk(text, extract_phrases=False) + types = set(s.type for s in segments) + assert 'sentence' in types, "Should create sentence segments" + + # With phrases + segments = text_processor.process_chunk(text, extract_phrases=True) + types = set(s.type for s in segments) + assert 'sentence' in types, "Should create sentence segments" + # May also have phrase types + + def test_text_segment_sentence_tracking(self, text_processor): + """Test that segments track their parent sentence.""" + text = "This is a test sentence." + + segments = text_processor.process_chunk(text, extract_phrases=True) + + # Phrases should reference their parent sentence + phrases = [s for s in segments if s.type != 'sentence'] + if phrases: + for phrase in phrases: + # parent_sentence may be set for phrases + assert hasattr(phrase, 'parent_sentence') + + +class TestNLTKCompatibility: + """Test suite for NLTK version compatibility.""" + + def test_nltk_punkt_availability(self, text_processor): + """Test that NLTK punkt tokenizer is available.""" + # This test verifies the text_processor can use NLTK + # If punkt/punkt_tab is not available, this will fail during setup + import nltk + + # Try to use sentence tokenizer + text = "Test sentence. Another sentence." + + try: + from nltk.tokenize import sent_tokenize + result = sent_tokenize(text) + assert len(result) > 0, "NLTK sentence tokenizer should work" + except LookupError: + pytest.fail("NLTK punkt tokenizer not available") + + def test_text_processor_uses_nltk(self, text_processor): + """Test that TextProcessor successfully uses NLTK for segmentation.""" + # This verifies the integration works + text = "First sentence. Second sentence." + + segments = text_processor.process_chunk(text, extract_phrases=False) + + # Should successfully segment using NLTK + sentences = [s for s in segments if s.type == 'sentence'] + assert len(sentences) >= 1, "Should successfully segment text using NLTK" + + +class TestEdgeCases: + """Test suite for edge cases in text processing.""" + + def test_sentence_with_only_punctuation(self, text_processor): + """Test handling of unusual punctuation patterns.""" + text = "...!?!" + + segments = text_processor.process_chunk(text, extract_phrases=False) + + # Should handle gracefully (NLTK may split this oddly, that's ok) + assert len(segments) <= 3, "Should handle punctuation-only text gracefully" + + def test_very_long_sentence(self, text_processor): + """Test handling of very long sentences.""" + # Create a long sentence with many clauses + text = ( + "This is a very long sentence with many clauses, " + "including subordinate clauses, coordinate clauses, " + "and various other grammatical structures that make it " + "quite lengthy but still technically a single sentence." + ) + + segments = text_processor.process_chunk(text, extract_phrases=False) + + sentences = [s for s in segments if s.type == 'sentence'] + assert len(sentences) == 1, "Long sentence should still be one sentence segment" + assert len(sentences[0].text) > 100 + + def test_unicode_text(self, text_processor): + """Test handling of unicode characters.""" + text = "Café serves crêpes. The recipe is français." + + segments = text_processor.process_chunk(text, extract_phrases=False) + + sentences = [s for s in segments if s.type == 'sentence'] + assert len(sentences) == 2 + assert "Café" in sentences[0].text + assert "français" in sentences[1].text + + def test_numbers_and_dates(self, text_processor): + """Test handling of numbers and dates in text.""" + text = "The recipe was created on Jan. 1, 2024. It serves 4-6 people." + + segments = text_processor.process_chunk(text, extract_phrases=False) + + sentences = [s for s in segments if s.type == 'sentence'] + assert len(sentences) == 2 + assert "2024" in sentences[0].text + assert "4-6" in sentences[1].text + + def test_ellipsis_handling(self, text_processor): + """Test handling of ellipsis in text.""" + text = "First sentence... Second sentence." + + segments = text_processor.process_chunk(text, extract_phrases=False) + + # NLTK may handle ellipsis differently + assert len(segments) >= 1, "Should produce at least one segment" + # The exact behavior depends on NLTK version + + def test_quoted_text(self, text_processor): + """Test handling of quoted text.""" + text = 'He said "Hello world." Then he left.' + + segments = text_processor.process_chunk(text, extract_phrases=False) + + sentences = [s for s in segments if s.type == 'sentence'] + assert len(sentences) == 2 + assert '"Hello world."' in sentences[0].text or "Hello world" in sentences[0].text + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/unit/test_extract/test_ontology/test_uri_expansion.py b/tests/unit/test_extract/test_ontology/test_uri_expansion.py new file mode 100644 index 00000000..dec33ff1 --- /dev/null +++ b/tests/unit/test_extract/test_ontology/test_uri_expansion.py @@ -0,0 +1,258 @@ +""" +Unit tests for URI expansion functionality. + +Tests that URIs are properly expanded using ontology definitions instead of +constructed fallback URIs. +""" + +import pytest +from trustgraph.extract.kg.ontology.extract import Processor +from trustgraph.extract.kg.ontology.ontology_selector import OntologySubset + + +class MockParams: + """Mock parameters for Processor.""" + def get(self, key, default=None): + return default + + +@pytest.fixture +def extractor(): + """Create a Processor instance for testing.""" + params = MockParams() + # We only need the expand_uri method, so minimal initialization + extractor = object.__new__(Processor) + extractor.URI_PREFIXES = { + "rdf:": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "rdfs:": "http://www.w3.org/2000/01/rdf-schema#", + "owl:": "http://www.w3.org/2002/07/owl#", + "xsd:": "http://www.w3.org/2001/XMLSchema#", + } + return extractor + + +@pytest.fixture +def ontology_subset_with_uris(): + """Create an ontology subset with proper URIs defined.""" + return OntologySubset( + ontology_id="food", + classes={ + "Recipe": { + "uri": "http://purl.org/ontology/fo/Recipe", + "type": "owl:Class", + "labels": [{"value": "Recipe", "lang": "en-gb"}], + "comment": "A Recipe is a combination of ingredients and a method." + }, + "Ingredient": { + "uri": "http://purl.org/ontology/fo/Ingredient", + "type": "owl:Class", + "labels": [{"value": "Ingredient", "lang": "en-gb"}], + "comment": "An Ingredient combines a quantity and a food." + }, + "Food": { + "uri": "http://purl.org/ontology/fo/Food", + "type": "owl:Class", + "labels": [{"value": "Food", "lang": "en-gb"}], + "comment": "A Food is something that can be eaten." + } + }, + object_properties={ + "ingredients": { + "uri": "http://purl.org/ontology/fo/ingredients", + "type": "owl:ObjectProperty", + "labels": [{"value": "ingredients", "lang": "en-gb"}], + "domain": "Recipe", + "range": "IngredientList" + }, + "food": { + "uri": "http://purl.org/ontology/fo/food", + "type": "owl:ObjectProperty", + "labels": [{"value": "food", "lang": "en-gb"}], + "domain": "Ingredient", + "range": "Food" + }, + "produces": { + "uri": "http://purl.org/ontology/fo/produces", + "type": "owl:ObjectProperty", + "labels": [{"value": "produces", "lang": "en-gb"}], + "domain": "Recipe", + "range": "Food" + } + }, + datatype_properties={ + "serves": { + "uri": "http://purl.org/ontology/fo/serves", + "type": "owl:DatatypeProperty", + "labels": [{"value": "serves", "lang": "en-gb"}], + "domain": "Recipe", + "range": "xsd:string" + } + }, + metadata={ + "name": "Food Ontology", + "namespace": "http://purl.org/ontology/fo/" + } + ) + + +class TestURIExpansion: + """Test suite for URI expansion functionality.""" + + def test_expand_class_uri_from_ontology(self, extractor, ontology_subset_with_uris): + """Test that class names are expanded to their ontology URIs.""" + result = extractor.expand_uri("Recipe", ontology_subset_with_uris, "food") + + assert result == "http://purl.org/ontology/fo/Recipe", \ + "Recipe should expand to its ontology URI" + + def test_expand_object_property_uri_from_ontology(self, extractor, ontology_subset_with_uris): + """Test that object properties are expanded to their ontology URIs.""" + result = extractor.expand_uri("ingredients", ontology_subset_with_uris, "food") + + assert result == "http://purl.org/ontology/fo/ingredients", \ + "ingredients property should expand to its ontology URI" + + def test_expand_datatype_property_uri_from_ontology(self, extractor, ontology_subset_with_uris): + """Test that datatype properties are expanded to their ontology URIs.""" + result = extractor.expand_uri("serves", ontology_subset_with_uris, "food") + + assert result == "http://purl.org/ontology/fo/serves", \ + "serves property should expand to its ontology URI" + + def test_expand_multiple_classes(self, extractor, ontology_subset_with_uris): + """Test expansion of multiple different classes.""" + recipe_uri = extractor.expand_uri("Recipe", ontology_subset_with_uris, "food") + ingredient_uri = extractor.expand_uri("Ingredient", ontology_subset_with_uris, "food") + food_uri = extractor.expand_uri("Food", ontology_subset_with_uris, "food") + + assert recipe_uri == "http://purl.org/ontology/fo/Recipe" + assert ingredient_uri == "http://purl.org/ontology/fo/Ingredient" + assert food_uri == "http://purl.org/ontology/fo/Food" + + def test_expand_rdf_prefix(self, extractor, ontology_subset_with_uris): + """Test that standard RDF prefixes are expanded.""" + result = extractor.expand_uri("rdf:type", ontology_subset_with_uris, "food") + + assert result == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", \ + "rdf:type should expand to full RDF namespace URI" + + def test_expand_rdfs_prefix(self, extractor, ontology_subset_with_uris): + """Test that RDFS prefixes are expanded.""" + result = extractor.expand_uri("rdfs:label", ontology_subset_with_uris, "food") + + assert result == "http://www.w3.org/2000/01/rdf-schema#label", \ + "rdfs:label should expand to full RDFS namespace URI" + + def test_expand_owl_prefix(self, extractor, ontology_subset_with_uris): + """Test that OWL prefixes are expanded.""" + result = extractor.expand_uri("owl:Class", ontology_subset_with_uris, "food") + + assert result == "http://www.w3.org/2002/07/owl#Class", \ + "owl:Class should expand to full OWL namespace URI" + + def test_expand_xsd_prefix(self, extractor, ontology_subset_with_uris): + """Test that XSD prefixes are expanded.""" + result = extractor.expand_uri("xsd:string", ontology_subset_with_uris, "food") + + assert result == "http://www.w3.org/2001/XMLSchema#string", \ + "xsd:string should expand to full XSD namespace URI" + + def test_fallback_uri_for_instance(self, extractor, ontology_subset_with_uris): + """Test that entity instances get constructed URIs when not in ontology.""" + result = extractor.expand_uri("recipe:cornish-pasty", ontology_subset_with_uris, "food") + + # Should construct a URI for the instance + assert result.startswith("https://trustgraph.ai/food/"), \ + "Entity instance should get constructed URI under trustgraph.ai domain" + assert "cornish-pasty" in result.lower(), \ + "Instance URI should include normalized entity name" + + def test_already_full_uri_unchanged(self, extractor, ontology_subset_with_uris): + """Test that full URIs are returned unchanged.""" + full_uri = "http://example.com/custom/entity" + result = extractor.expand_uri(full_uri, ontology_subset_with_uris, "food") + + assert result == full_uri, \ + "Full URIs should be returned unchanged" + + def test_https_uri_unchanged(self, extractor, ontology_subset_with_uris): + """Test that HTTPS URIs are returned unchanged.""" + full_uri = "https://example.com/custom/entity" + result = extractor.expand_uri(full_uri, ontology_subset_with_uris, "food") + + assert result == full_uri, \ + "HTTPS URIs should be returned unchanged" + + def test_class_without_uri_gets_fallback(self, extractor): + """Test that classes without URI definitions get constructed fallback URIs.""" + # Create subset with class that has no URI + subset_no_uri = OntologySubset( + ontology_id="test", + classes={ + "SomeClass": { + "type": "owl:Class", + "labels": [{"value": "Some Class"}], + # No 'uri' field + } + }, + object_properties={}, + datatype_properties={}, + metadata={} + ) + + result = extractor.expand_uri("SomeClass", subset_no_uri, "test") + + assert result == "https://trustgraph.ai/ontology/test#SomeClass", \ + "Class without URI should get fallback constructed URI" + + def test_property_without_uri_gets_fallback(self, extractor): + """Test that properties without URI definitions get constructed fallback URIs.""" + subset_no_uri = OntologySubset( + ontology_id="test", + classes={}, + object_properties={ + "someProperty": { + "type": "owl:ObjectProperty", + # No 'uri' field + } + }, + datatype_properties={}, + metadata={} + ) + + result = extractor.expand_uri("someProperty", subset_no_uri, "test") + + assert result == "https://trustgraph.ai/ontology/test#someProperty", \ + "Property without URI should get fallback constructed URI" + + def test_entity_normalization_in_constructed_uri(self, extractor, ontology_subset_with_uris): + """Test that entity names are normalized when constructing URIs.""" + # Entity with spaces and mixed case + result = extractor.expand_uri("Cornish Pasty Recipe", ontology_subset_with_uris, "food") + + # Should be normalized: lowercase, spaces to hyphens + assert result == "https://trustgraph.ai/food/cornish-pasty-recipe", \ + "Entity names should be normalized (lowercase, spaces to hyphens)" + + def test_dict_access_not_object_attribute(self, extractor, ontology_subset_with_uris): + """Test that URI expansion works with dict access (not object attributes). + + This is the key fix - ontology_selector stores cls.__dict__ which means + we get dicts, not objects, so we must use dict key access. + """ + # The ontology_subset_with_uris uses dicts (with 'uri' key) + # This test verifies we can access it correctly + class_def = ontology_subset_with_uris.classes["Recipe"] + + # Verify it's a dict + assert isinstance(class_def, dict), "Class definitions should be dicts" + assert "uri" in class_def, "Dict should have 'uri' key" + + # Now test expansion works + result = extractor.expand_uri("Recipe", ontology_subset_with_uris, "food") + assert result == class_def["uri"], \ + "URI expansion must work with dict access (not object attributes)" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"])