""" Integration tests for Knowledge Graph Extract → Store Pipeline These tests verify the end-to-end functionality of the knowledge graph extraction and storage pipeline, testing text-to-graph transformation, entity extraction, relationship extraction, and graph database storage. Following the TEST_STRATEGY.md approach for integration testing. """ import pytest import json import urllib.parse from unittest.mock import AsyncMock, MagicMock, patch from trustgraph.extract.kg.definitions.extract import Processor as DefinitionsProcessor from trustgraph.extract.kg.relationships.extract import Processor as RelationshipsProcessor from trustgraph.storage.knowledge.store import Processor as KnowledgeStoreProcessor from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL from trustgraph.schema import EntityContext, EntityContexts, GraphEmbeddings, EntityEmbeddings from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL from trustgraph.base import PromptResult @pytest.mark.integration class TestKnowledgeGraphPipelineIntegration: """Integration tests for Knowledge Graph Extract → Store Pipeline""" @pytest.fixture def mock_flow_context(self): """Mock flow context for service coordination""" context = MagicMock() # Mock prompt client for definitions extraction prompt_client = AsyncMock() prompt_client.extract_definitions.return_value = PromptResult( response_type="jsonl", objects=[ { "entity": "Machine Learning", "definition": "A subset of artificial intelligence that enables computers to learn from data without explicit programming." }, { "entity": "Neural Networks", "definition": "Computing systems inspired by biological neural networks that process information." } ] ) # Mock prompt client for relationships extraction prompt_client.extract_relationships.return_value = PromptResult( response_type="jsonl", objects=[ { "subject": "Machine Learning", "predicate": "is_subset_of", "object": "Artificial Intelligence", "object-entity": True }, { "subject": "Neural Networks", "predicate": "is_used_in", "object": "Machine Learning", "object-entity": True } ] ) # Mock producers for output streams triples_producer = AsyncMock() entity_contexts_producer = AsyncMock() # Configure context routing def context_router(service_name): if service_name == "prompt-request": return prompt_client elif service_name == "triples": return triples_producer elif service_name == "entity-contexts": return entity_contexts_producer else: return AsyncMock() context.side_effect = context_router return context @pytest.fixture def mock_cassandra_store(self): """Mock Cassandra knowledge table store""" store = AsyncMock() store.add_triples.return_value = None store.add_graph_embeddings.return_value = None return store @pytest.fixture def sample_chunk(self): """Sample text chunk for processing""" return Chunk( metadata=Metadata( id="doc-123", collection="test_collection", ), chunk=b"Machine Learning is a subset of Artificial Intelligence. Neural Networks are used in Machine Learning to process complex patterns." ) @pytest.fixture def sample_definitions_response(self): """Sample definitions extraction response""" return [ { "entity": "Machine Learning", "definition": "A subset of artificial intelligence that enables computers to learn from data." }, { "entity": "Artificial Intelligence", "definition": "The simulation of human intelligence in machines." }, { "entity": "Neural Networks", "definition": "Computing systems inspired by biological neural networks." } ] @pytest.fixture def sample_relationships_response(self): """Sample relationships extraction response""" return [ { "subject": "Machine Learning", "predicate": "is_subset_of", "object": "Artificial Intelligence", "object-entity": True }, { "subject": "Neural Networks", "predicate": "is_used_in", "object": "Machine Learning", "object-entity": True }, { "subject": "Machine Learning", "predicate": "processes", "object": "data patterns", "object-entity": False } ] @pytest.fixture def definitions_processor(self): """Create definitions processor with minimal configuration""" processor = MagicMock() processor.to_uri = DefinitionsProcessor.to_uri.__get__(processor, DefinitionsProcessor) processor.emit_triples = DefinitionsProcessor.emit_triples.__get__(processor, DefinitionsProcessor) processor.emit_ecs = DefinitionsProcessor.emit_ecs.__get__(processor, DefinitionsProcessor) processor.on_message = DefinitionsProcessor.on_message.__get__(processor, DefinitionsProcessor) processor.triples_batch_size = 50 processor.entity_batch_size = 5 return processor @pytest.fixture def relationships_processor(self): """Create relationships processor with minimal configuration""" processor = MagicMock() processor.to_uri = RelationshipsProcessor.to_uri.__get__(processor, RelationshipsProcessor) processor.emit_triples = RelationshipsProcessor.emit_triples.__get__(processor, RelationshipsProcessor) processor.on_message = RelationshipsProcessor.on_message.__get__(processor, RelationshipsProcessor) processor.triples_batch_size = 50 return processor @pytest.mark.asyncio async def test_definitions_extraction_pipeline(self, definitions_processor, mock_flow_context, sample_chunk): """Test definitions extraction from text chunk to graph triples""" # Arrange mock_msg = MagicMock() mock_msg.value.return_value = sample_chunk mock_consumer = MagicMock() # Act await definitions_processor.on_message(mock_msg, mock_consumer, mock_flow_context) # Assert # Verify prompt client was called for definitions extraction prompt_client = mock_flow_context("prompt-request") prompt_client.extract_definitions.assert_called_once() call_args = prompt_client.extract_definitions.call_args assert "Machine Learning" in call_args.kwargs['text'] assert "Neural Networks" in call_args.kwargs['text'] # Verify triples producer was called triples_producer = mock_flow_context("triples") triples_producer.send.assert_called_once() # Verify entity contexts producer was called entity_contexts_producer = mock_flow_context("entity-contexts") entity_contexts_producer.send.assert_called_once() @pytest.mark.asyncio async def test_relationships_extraction_pipeline(self, relationships_processor, mock_flow_context, sample_chunk): """Test relationships extraction from text chunk to graph triples""" # Arrange mock_msg = MagicMock() mock_msg.value.return_value = sample_chunk mock_consumer = MagicMock() # Act await relationships_processor.on_message(mock_msg, mock_consumer, mock_flow_context) # Assert # Verify prompt client was called for relationships extraction prompt_client = mock_flow_context("prompt-request") prompt_client.extract_relationships.assert_called_once() call_args = prompt_client.extract_relationships.call_args assert "Machine Learning" in call_args.kwargs['text'] # Verify triples producer was called triples_producer = mock_flow_context("triples") triples_producer.send.assert_called_once() @pytest.mark.asyncio async def test_uri_generation_consistency(self, definitions_processor, relationships_processor): """Test URI generation consistency between processors""" # Arrange test_entities = [ "Machine Learning", "Artificial Intelligence", "Neural Networks", "Deep Learning", "Natural Language Processing" ] # Act & Assert for entity in test_entities: def_uri = definitions_processor.to_uri(entity) rel_uri = relationships_processor.to_uri(entity) # URIs should be identical between processors assert def_uri == rel_uri # URI should be properly encoded assert def_uri.startswith(TRUSTGRAPH_ENTITIES) assert " " not in def_uri assert def_uri.endswith(urllib.parse.quote(entity.replace(" ", "-").lower().encode("utf-8"))) @pytest.mark.asyncio async def test_definitions_triple_generation(self, definitions_processor, sample_definitions_response): """Test triple generation from definitions extraction""" # Arrange metadata = Metadata( id="test-doc", collection="test_collection", ) # Act triples = [] entities = [] for defn in sample_definitions_response: s = defn["entity"] o = defn["definition"] if s and o: s_uri = definitions_processor.to_uri(s) s_term = Term(type=IRI, iri=str(s_uri)) o_term = Term(type=LITERAL, value=str(o)) # Generate triples as the processor would triples.append(Triple( s=s_term, p=Term(type=IRI, iri=RDF_LABEL), o=Term(type=LITERAL, value=s) )) triples.append(Triple( s=s_term, p=Term(type=IRI, iri=DEFINITION), o=o_term )) entities.append(EntityContext( entity=s_term, context=defn["definition"] )) # Assert assert len(triples) == 6 # 2 triples per entity * 3 entities assert len(entities) == 3 # 1 entity context per entity # Verify triple structure label_triples = [t for t in triples if t.p.iri == RDF_LABEL] definition_triples = [t for t in triples if t.p.iri == DEFINITION] assert len(label_triples) == 3 assert len(definition_triples) == 3 # Verify entity contexts for entity in entities: assert entity.entity.type == IRI assert entity.entity.iri.startswith(TRUSTGRAPH_ENTITIES) assert len(entity.context) > 0 @pytest.mark.asyncio async def test_relationships_triple_generation(self, relationships_processor, sample_relationships_response): """Test triple generation from relationships extraction""" # Arrange metadata = Metadata( id="test-doc", collection="test_collection", ) # Act triples = [] for rel in sample_relationships_response: s = rel["subject"] p = rel["predicate"] o = rel["object"] if s and p and o: s_uri = relationships_processor.to_uri(s) s_term = Term(type=IRI, iri=str(s_uri)) p_uri = relationships_processor.to_uri(p) p_term = Term(type=IRI, iri=str(p_uri)) if rel["object-entity"]: o_uri = relationships_processor.to_uri(o) o_term = Term(type=IRI, iri=str(o_uri)) else: o_term = Term(type=LITERAL, value=str(o)) # Main relationship triple triples.append(Triple(s=s_term, p=p_term, o=o_term)) # Label triples triples.append(Triple( s=s_term, p=Term(type=IRI, iri=RDF_LABEL), o=Term(type=LITERAL, value=str(s)) )) triples.append(Triple( s=p_term, p=Term(type=IRI, iri=RDF_LABEL), o=Term(type=LITERAL, value=str(p)) )) if rel["object-entity"]: triples.append(Triple( s=o_term, p=Term(type=IRI, iri=RDF_LABEL), o=Term(type=LITERAL, value=str(o)) )) # Assert assert len(triples) > 0 # Verify relationship triples exist relationship_triples = [t for t in triples if t.p.iri.endswith("is_subset_of") or t.p.iri.endswith("is_used_in")] assert len(relationship_triples) >= 2 # Verify label triples label_triples = [t for t in triples if t.p.iri == RDF_LABEL] assert len(label_triples) > 0 @pytest.mark.asyncio async def test_knowledge_store_triples_storage(self, mock_cassandra_store): """Test knowledge store triples storage integration""" # Arrange processor = MagicMock() processor.table_store = mock_cassandra_store processor.on_triples = KnowledgeStoreProcessor.on_triples.__get__(processor, KnowledgeStoreProcessor) sample_triples = Triples( metadata=Metadata( id="test-doc", collection="test_collection", ), triples=[ Triple( s=Term(type=IRI, iri="http://trustgraph.ai/e/machine-learning"), p=Term(type=IRI, iri=DEFINITION), o=Term(type=LITERAL, value="A subset of AI") ) ] ) mock_msg = MagicMock() mock_msg.value.return_value = sample_triples mock_flow = MagicMock() mock_flow.workspace = "test_workspace" # Act await processor.on_triples(mock_msg, None, mock_flow) # Assert mock_cassandra_store.add_triples.assert_called_once_with("test_workspace", sample_triples) @pytest.mark.asyncio async def test_knowledge_store_graph_embeddings_storage(self, mock_cassandra_store): """Test knowledge store graph embeddings storage integration""" # Arrange processor = MagicMock() processor.table_store = mock_cassandra_store processor.on_graph_embeddings = KnowledgeStoreProcessor.on_graph_embeddings.__get__(processor, KnowledgeStoreProcessor) sample_embeddings = GraphEmbeddings( metadata=Metadata( id="test-doc", collection="test_collection", ), entities=[ EntityEmbeddings( entity=Term(type=IRI, iri="http://example.org/entity"), vector=[0.1, 0.2, 0.3] ) ] ) mock_msg = MagicMock() mock_msg.value.return_value = sample_embeddings mock_flow = MagicMock() mock_flow.workspace = "test_workspace" # Act await processor.on_graph_embeddings(mock_msg, None, mock_flow) # Assert mock_cassandra_store.add_graph_embeddings.assert_called_once_with("test_workspace", sample_embeddings) @pytest.mark.asyncio async def test_end_to_end_pipeline_coordination(self, definitions_processor, relationships_processor, mock_flow_context, sample_chunk): """Test end-to-end pipeline coordination from chunk to storage""" # Arrange mock_msg = MagicMock() mock_msg.value.return_value = sample_chunk mock_consumer = MagicMock() # Act - Process through definitions extractor await definitions_processor.on_message(mock_msg, mock_consumer, mock_flow_context) # Act - Process through relationships extractor await relationships_processor.on_message(mock_msg, mock_consumer, mock_flow_context) # Assert # Verify both extractors called prompt service prompt_client = mock_flow_context("prompt-request") prompt_client.extract_definitions.assert_called_once() prompt_client.extract_relationships.assert_called_once() # Verify triples were produced from both extractors triples_producer = mock_flow_context("triples") assert triples_producer.send.call_count == 2 # One from each extractor # Verify entity contexts were produced from definitions extractor entity_contexts_producer = mock_flow_context("entity-contexts") entity_contexts_producer.send.assert_called_once() @pytest.mark.asyncio async def test_error_handling_in_definitions_extraction(self, definitions_processor, mock_flow_context, sample_chunk): """Test error handling in definitions extraction""" # Arrange mock_flow_context("prompt-request").extract_definitions.side_effect = Exception("Prompt service unavailable") mock_msg = MagicMock() mock_msg.value.return_value = sample_chunk mock_consumer = MagicMock() # Act & Assert # Should not raise exception, but should handle it gracefully await definitions_processor.on_message(mock_msg, mock_consumer, mock_flow_context) # Verify prompt was attempted prompt_client = mock_flow_context("prompt-request") prompt_client.extract_definitions.assert_called_once() @pytest.mark.asyncio async def test_error_handling_in_relationships_extraction(self, relationships_processor, mock_flow_context, sample_chunk): """Test error handling in relationships extraction""" # Arrange mock_flow_context("prompt-request").extract_relationships.side_effect = Exception("Prompt service unavailable") mock_msg = MagicMock() mock_msg.value.return_value = sample_chunk mock_consumer = MagicMock() # Act & Assert # Should not raise exception, but should handle it gracefully await relationships_processor.on_message(mock_msg, mock_consumer, mock_flow_context) # Verify prompt was attempted prompt_client = mock_flow_context("prompt-request") prompt_client.extract_relationships.assert_called_once() @pytest.mark.asyncio async def test_empty_extraction_results_handling(self, definitions_processor, mock_flow_context, sample_chunk): """Test handling of empty extraction results""" # Arrange mock_flow_context("prompt-request").extract_definitions.return_value = PromptResult( response_type="jsonl", objects=[] ) mock_msg = MagicMock() mock_msg.value.return_value = sample_chunk mock_consumer = MagicMock() # Act await definitions_processor.on_message(mock_msg, mock_consumer, mock_flow_context) # Assert # Should NOT call producers with empty results (avoids Cassandra NULL issues) triples_producer = mock_flow_context("triples") entity_contexts_producer = mock_flow_context("entity-contexts") triples_producer.send.assert_not_called() entity_contexts_producer.send.assert_not_called() @pytest.mark.asyncio async def test_invalid_extraction_format_handling(self, definitions_processor, mock_flow_context, sample_chunk): """Test handling of invalid extraction response format""" # Arrange mock_flow_context("prompt-request").extract_definitions.return_value = PromptResult( response_type="text", text="invalid format" ) # Should be jsonl with objects list mock_msg = MagicMock() mock_msg.value.return_value = sample_chunk mock_consumer = MagicMock() # Act & Assert # Should handle invalid format gracefully await definitions_processor.on_message(mock_msg, mock_consumer, mock_flow_context) # Verify prompt was attempted prompt_client = mock_flow_context("prompt-request") prompt_client.extract_definitions.assert_called_once() @pytest.mark.asyncio async def test_entity_filtering_and_validation(self, definitions_processor, mock_flow_context): """Test entity filtering and validation in extraction""" # Arrange mock_flow_context("prompt-request").extract_definitions.return_value = PromptResult( response_type="jsonl", objects=[ {"entity": "Valid Entity", "definition": "Valid definition"}, {"entity": "", "definition": "Empty entity"}, # Should be filtered {"entity": "Valid Entity 2", "definition": ""}, # Should be filtered {"entity": None, "definition": "None entity"}, # Should be filtered {"entity": "Valid Entity 3", "definition": None}, # Should be filtered ] ) sample_chunk = Chunk( metadata=Metadata(id="test", collection="collection"), chunk=b"Test chunk" ) mock_msg = MagicMock() mock_msg.value.return_value = sample_chunk mock_consumer = MagicMock() # Act await definitions_processor.on_message(mock_msg, mock_consumer, mock_flow_context) # Assert # Should only process valid entities triples_producer = mock_flow_context("triples") entity_contexts_producer = mock_flow_context("entity-contexts") triples_producer.send.assert_called_once() entity_contexts_producer.send.assert_called_once() @pytest.mark.asyncio @pytest.mark.slow async def test_large_batch_processing_performance(self, definitions_processor, relationships_processor, mock_flow_context): """Test performance with large batch of chunks""" # Arrange large_chunk_batch = [ Chunk( metadata=Metadata(id=f"doc-{i}", collection="collection"), chunk=f"Document {i} contains machine learning and AI content.".encode("utf-8") ) for i in range(100) # Large batch ] mock_consumer = MagicMock() # Act import time start_time = time.time() for chunk in large_chunk_batch: mock_msg = MagicMock() mock_msg.value.return_value = chunk # Process through both extractors await definitions_processor.on_message(mock_msg, mock_consumer, mock_flow_context) await relationships_processor.on_message(mock_msg, mock_consumer, mock_flow_context) end_time = time.time() execution_time = end_time - start_time # Assert assert execution_time < 30.0 # Should complete within reasonable time # Verify all chunks were processed prompt_client = mock_flow_context("prompt-request") assert prompt_client.extract_definitions.call_count == 100 assert prompt_client.extract_relationships.call_count == 100 @pytest.mark.asyncio async def test_metadata_propagation_through_pipeline(self, definitions_processor, mock_flow_context): """Test metadata propagation through the pipeline""" # Arrange original_metadata = Metadata( id="test-doc-123", collection="test_collection", ) sample_chunk = Chunk( metadata=original_metadata, chunk=b"Test content for metadata propagation" ) mock_msg = MagicMock() mock_msg.value.return_value = sample_chunk mock_consumer = MagicMock() # Act await definitions_processor.on_message(mock_msg, mock_consumer, mock_flow_context) # Assert # Verify metadata was propagated to output triples_producer = mock_flow_context("triples") entity_contexts_producer = mock_flow_context("entity-contexts") triples_producer.send.assert_called_once() entity_contexts_producer.send.assert_called_once() # Check that metadata was included in the calls triples_call = triples_producer.send.call_args[0][0] entity_contexts_call = entity_contexts_producer.send.call_args[0][0] assert triples_call.metadata.id == "test-doc-123" assert triples_call.metadata.collection == "test_collection" assert entity_contexts_call.metadata.id == "test-doc-123" assert entity_contexts_call.metadata.collection == "test_collection"