mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
* Bump setup.py versions for 1.1 * PoC MCP server (#419) * Very initial MCP server PoC for TrustGraph * Put service on port 8000 * Add MCP container and packages to buildout * Update docs for API/CLI changes in 1.0 (#421) * Update some API basics for the 0.23/1.0 API change * Add MCP container push (#425) * Add command args to the MCP server (#426) * Host and port parameters * Added websocket arg * More docs * MCP client support (#427) - MCP client service - Tool request/response schema - API gateway support for mcp-tool - Message translation for tool request & response - Make mcp-tool using configuration service for information about where the MCP services are. * Feature/react call mcp (#428) Key Features - MCP Tool Integration: Added core MCP tool support with ToolClientSpec and ToolClient classes - API Enhancement: New mcp_tool method for flow-specific tool invocation - CLI Tooling: New tg-invoke-mcp-tool command for testing MCP integration - React Agent Enhancement: Fixed and improved multi-tool invocation capabilities - Tool Management: Enhanced CLI for tool configuration and management Changes - Added MCP tool invocation to API with flow-specific integration - Implemented ToolClientSpec and ToolClient for tool call handling - Updated agent-manager-react to invoke MCP tools with configurable types - Enhanced CLI with new commands and improved help text - Added comprehensive documentation for new CLI commands - Improved tool configuration management Testing - Added tg-invoke-mcp-tool CLI command for isolated MCP integration testing - Enhanced agent capability to invoke multiple tools simultaneously * Test suite executed from CI pipeline (#433) * Test strategy & test cases * Unit tests * Integration tests * Extending test coverage (#434) * Contract tests * Testing embeedings * Agent unit tests * Knowledge pipeline tests * Turn on contract tests * Increase storage test coverage (#435) * Fixing storage and adding tests * PR pipeline only runs quick tests * Empty 
configuration is returned as empty list, previously was not in response (#436) * Update config util to take files as well as command-line text (#437) * Updated CLI invocation and config model for tools and mcp (#438) * Updated CLI invocation and config model for tools and mcp * CLI anomalies * Tweaked the MCP tool implementation for new model * Update agent implementation to match the new model * Fix agent tools, now all tested * Fixed integration tests * Fix MCP delete tool params * Update Python deps to 1.2 * Update to enable knowledge extraction using the agent framework (#439) * Implement KG extraction agent (kg-extract-agent) * Using ReAct framework (agent-manager-react) * ReAct manager had an issue when emitting JSON, which conflicts which ReAct manager's own JSON messages, so refactored ReAct manager to use traditional ReAct messages, non-JSON structure. * Minor refactor to take the prompt template client out of prompt-template so it can be more readily used by other modules. kg-extract-agent uses this framework. 
* Migrate from setup.py to pyproject.toml (#440) * Converted setup.py to pyproject.toml * Modern package infrastructure as recommended by py docs * Install missing build deps (#441) * Install missing build deps (#442) * Implement logging strategy (#444) * Logging strategy and convert all prints() to logging invocations * Fix/startup failure (#445) * Fix loggin startup problems * Fix logging startup problems (#446) * Fix logging startup problems (#447) * Fixed Mistral OCR to use current API (#448) * Fixed Mistral OCR to use current API * Added PDF decoder tests * Fix Mistral OCR ident to be standard pdf-decoder (#450) * Fix Mistral OCR ident to be standard pdf-decoder * Correct test * Schema structure refactor (#451) * Write schema refactor spec * Implemented schema refactor spec * Structure data mvp (#452) * Structured data tech spec * Architecture principles * New schemas * Updated schemas and specs * Object extractor * Add .coveragerc * New tests * Cassandra object storage * Trying to object extraction working, issues exist * Validate librarian collection (#453) * Fix token chunker, broken API invocation (#454) * Fix token chunker, broken API invocation (#455) * Knowledge load utility CLI (#456) * Knowledge loader * More tests
642 lines
No EOL
25 KiB
Python
642 lines
No EOL
25 KiB
Python
"""
|
|
Integration tests for Knowledge Graph Extract → Store Pipeline
|
|
|
|
These tests verify the end-to-end functionality of the knowledge graph extraction
|
|
and storage pipeline, testing text-to-graph transformation, entity extraction,
|
|
relationship extraction, and graph database storage.
|
|
Following the TEST_STRATEGY.md approach for integration testing.
|
|
"""
|
|
|
|
import pytest
|
|
import json
|
|
import urllib.parse
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
from trustgraph.extract.kg.definitions.extract import Processor as DefinitionsProcessor
|
|
from trustgraph.extract.kg.relationships.extract import Processor as RelationshipsProcessor
|
|
from trustgraph.storage.knowledge.store import Processor as KnowledgeStoreProcessor
|
|
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Value, Error
|
|
from trustgraph.schema import EntityContext, EntityContexts, GraphEmbeddings
|
|
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
|
|
|
|
|
|
@pytest.mark.integration
|
|
class TestKnowledgeGraphPipelineIntegration:
|
|
"""Integration tests for Knowledge Graph Extract → Store Pipeline"""
|
|
|
|
@pytest.fixture
def mock_flow_context(self):
    """Provide a callable flow-context double that routes service names to mocks.

    Calling the returned MagicMock with "prompt-request", "triples" or
    "entity-contexts" always yields the same AsyncMock for that name; any
    other name yields a brand-new AsyncMock on each call.
    """
    # Prompt client shared by both extractors, preloaded with canned answers.
    prompt = AsyncMock()
    prompt.extract_definitions.return_value = [
        {
            "entity": "Machine Learning",
            "definition": "A subset of artificial intelligence that enables computers to learn from data without explicit programming."
        },
        {
            "entity": "Neural Networks",
            "definition": "Computing systems inspired by biological neural networks that process information."
        }
    ]
    prompt.extract_relationships.return_value = [
        {
            "subject": "Machine Learning",
            "predicate": "is_subset_of",
            "object": "Artificial Intelligence",
            "object-entity": True
        },
        {
            "subject": "Neural Networks",
            "predicate": "is_used_in",
            "object": "Machine Learning",
            "object-entity": True
        }
    ]

    # Fixed (name, mock) routes; the producers are the output streams.
    routes = (
        ("prompt-request", prompt),
        ("triples", AsyncMock()),
        ("entity-contexts", AsyncMock()),
    )

    def route(service_name):
        for known_name, client in routes:
            if service_name == known_name:
                return client
        # Unknown services get a throwaway mock.
        return AsyncMock()

    flow = MagicMock()
    flow.side_effect = route
    return flow
|
|
|
|
@pytest.fixture
def mock_cassandra_store(self):
    """Provide an AsyncMock standing in for the Cassandra knowledge table store.

    Both write methods used by the tests resolve to ``None`` when awaited.
    """
    table_store = AsyncMock()
    for method_name in ("add_triples", "add_graph_embeddings"):
        getattr(table_store, method_name).return_value = None
    return table_store
|
|
|
|
@pytest.fixture
def sample_chunk(self):
    """Provide one text Chunk (with document metadata) to feed the extractors."""
    doc_metadata = Metadata(
        id="doc-123",
        user="test_user",
        collection="test_collection",
        metadata=[],
    )
    text = b"Machine Learning is a subset of Artificial Intelligence. Neural Networks are used in Machine Learning to process complex patterns."
    return Chunk(metadata=doc_metadata, chunk=text)
|
|
|
|
@pytest.fixture
def sample_definitions_response(self):
    """Provide three canned definition records with "entity"/"definition" keys."""
    entity_definitions = (
        (
            "Machine Learning",
            "A subset of artificial intelligence that enables computers to learn from data.",
        ),
        (
            "Artificial Intelligence",
            "The simulation of human intelligence in machines.",
        ),
        (
            "Neural Networks",
            "Computing systems inspired by biological neural networks.",
        ),
    )
    return [
        {"entity": name, "definition": text}
        for name, text in entity_definitions
    ]
|
|
|
|
@pytest.fixture
def sample_relationships_response(self):
    """Provide three canned relationship records.

    The first two have an entity object ("object-entity" is True); the last
    one has a literal object ("object-entity" is False).
    """
    rows = (
        ("Machine Learning", "is_subset_of", "Artificial Intelligence", True),
        ("Neural Networks", "is_used_in", "Machine Learning", True),
        ("Machine Learning", "processes", "data patterns", False),
    )
    return [
        {
            "subject": subject,
            "predicate": predicate,
            "object": obj,
            "object-entity": obj_is_entity,
        }
        for subject, predicate, obj, obj_is_entity in rows
    ]
|
|
|
|
@pytest.fixture
def definitions_processor(self):
    """Provide a MagicMock carrying the real DefinitionsProcessor methods.

    The methods are bound to the mock through the descriptor protocol, so
    they run against the mock without the processor class being constructed.
    """
    stub = MagicMock()
    for method_name in ("to_uri", "emit_triples", "emit_ecs", "on_message"):
        bound = getattr(DefinitionsProcessor, method_name).__get__(stub, DefinitionsProcessor)
        setattr(stub, method_name, bound)
    return stub
|
|
|
|
@pytest.fixture
def relationships_processor(self):
    """Provide a MagicMock carrying the real RelationshipsProcessor methods.

    The methods are bound to the mock through the descriptor protocol, so
    they run against the mock without the processor class being constructed.
    """
    stub = MagicMock()
    for method_name in ("to_uri", "emit_triples", "on_message"):
        bound = getattr(RelationshipsProcessor, method_name).__get__(stub, RelationshipsProcessor)
        setattr(stub, method_name, bound)
    return stub
|
|
|
|
@pytest.mark.asyncio
async def test_definitions_extraction_pipeline(self, definitions_processor, mock_flow_context, sample_chunk):
    """A chunk pushed through the definitions extractor reaches prompt and producers."""
    # Arrange: a message whose value() yields the sample chunk
    message = MagicMock()
    message.value.return_value = sample_chunk
    consumer = MagicMock()

    # Act
    await definitions_processor.on_message(message, consumer, mock_flow_context)

    # Assert: the prompt service was asked exactly once, with the chunk text
    extract = mock_flow_context("prompt-request").extract_definitions
    extract.assert_called_once()
    prompt_text = extract.call_args.kwargs['text']
    for expected_phrase in ("Machine Learning", "Neural Networks"):
        assert expected_phrase in prompt_text

    # Assert: one message on each output stream
    mock_flow_context("triples").send.assert_called_once()
    mock_flow_context("entity-contexts").send.assert_called_once()
|
|
|
|
@pytest.mark.asyncio
async def test_relationships_extraction_pipeline(self, relationships_processor, mock_flow_context, sample_chunk):
    """A chunk pushed through the relationships extractor reaches prompt and triples producer."""
    # Arrange: a message whose value() yields the sample chunk
    message = MagicMock()
    message.value.return_value = sample_chunk
    consumer = MagicMock()

    # Act
    await relationships_processor.on_message(message, consumer, mock_flow_context)

    # Assert: the prompt service was asked exactly once, with the chunk text
    extract = mock_flow_context("prompt-request").extract_relationships
    extract.assert_called_once()
    assert "Machine Learning" in extract.call_args.kwargs['text']

    # Assert: exactly one triples message was emitted
    mock_flow_context("triples").send.assert_called_once()
|
|
|
|
@pytest.mark.asyncio
async def test_uri_generation_consistency(self, definitions_processor, relationships_processor):
    """Both extractors must map the same entity name to the same URI."""
    entity_names = (
        "Machine Learning",
        "Artificial Intelligence",
        "Neural Networks",
        "Deep Learning",
        "Natural Language Processing",
    )

    for name in entity_names:
        def_uri = definitions_processor.to_uri(name)
        rel_uri = relationships_processor.to_uri(name)

        # The two processors agree on the URI
        assert def_uri == rel_uri

        # The URI lives under the entity namespace and contains no raw spaces
        assert def_uri.startswith(TRUSTGRAPH_ENTITIES)
        assert " " not in def_uri

        # The tail is the lower-cased, dash-joined, percent-quoted name
        slug = name.replace(" ", "-").lower().encode("utf-8")
        assert def_uri.endswith(urllib.parse.quote(slug))
|
|
|
|
@pytest.mark.asyncio
async def test_definitions_triple_generation(self, definitions_processor, sample_definitions_response):
    """Check the triples and entity contexts derived from definition records.

    The triples are assembled here in the test itself; only ``to_uri`` is
    taken from the processor. Each usable record contributes an RDF_LABEL
    triple, a DEFINITION triple and one EntityContext.
    """
    # Act
    triples = []
    entities = []

    for defn in sample_definitions_response:
        s = defn["entity"]
        o = defn["definition"]

        # Skip records whose entity or definition is empty/None
        if not (s and o):
            continue

        s_uri = definitions_processor.to_uri(s)
        s_value = Value(value=str(s_uri), is_uri=True)
        o_value = Value(value=str(o), is_uri=False)

        # Label triple: entity URI -> human-readable name
        triples.append(Triple(
            s=s_value,
            p=Value(value=RDF_LABEL, is_uri=True),
            o=Value(value=s, is_uri=False)
        ))

        # Definition triple: entity URI -> definition text
        triples.append(Triple(
            s=s_value,
            p=Value(value=DEFINITION, is_uri=True),
            o=o_value
        ))

        entities.append(EntityContext(
            entity=s_value,
            context=defn["definition"]
        ))

    # Assert
    assert len(triples) == 6  # 2 triples per entity * 3 entities
    assert len(entities) == 3  # 1 entity context per entity

    # Verify triple structure
    label_triples = [t for t in triples if t.p.value == RDF_LABEL]
    definition_triples = [t for t in triples if t.p.value == DEFINITION]

    assert len(label_triples) == 3
    assert len(definition_triples) == 3

    # Verify entity contexts
    for entity in entities:
        assert entity.entity.is_uri is True
        assert entity.entity.value.startswith(TRUSTGRAPH_ENTITIES)
        assert len(entity.context) > 0
|
|
|
|
@pytest.mark.asyncio
async def test_relationships_triple_generation(self, relationships_processor, sample_relationships_response):
    """Check the triples derived from relationship records.

    The triples are assembled here in the test itself; only ``to_uri`` is
    taken from the processor. Each usable record contributes the main
    subject/predicate/object triple, RDF_LABEL triples for subject and
    predicate, and an RDF_LABEL triple for the object when it is an entity.
    """
    # Act
    triples = []

    for rel in sample_relationships_response:
        s = rel["subject"]
        p = rel["predicate"]
        o = rel["object"]

        # Skip records with any empty/None component
        if not (s and p and o):
            continue

        object_is_entity = rel["object-entity"]

        s_value = Value(value=str(relationships_processor.to_uri(s)), is_uri=True)
        p_value = Value(value=str(relationships_processor.to_uri(p)), is_uri=True)

        # Entity objects become URIs; anything else stays a literal
        if object_is_entity:
            o_value = Value(value=str(relationships_processor.to_uri(o)), is_uri=True)
        else:
            o_value = Value(value=str(o), is_uri=False)

        # Main relationship triple
        triples.append(Triple(s=s_value, p=p_value, o=o_value))

        # Label triples
        triples.append(Triple(
            s=s_value,
            p=Value(value=RDF_LABEL, is_uri=True),
            o=Value(value=str(s), is_uri=False)
        ))

        triples.append(Triple(
            s=p_value,
            p=Value(value=RDF_LABEL, is_uri=True),
            o=Value(value=str(p), is_uri=False)
        ))

        if object_is_entity:
            triples.append(Triple(
                s=o_value,
                p=Value(value=RDF_LABEL, is_uri=True),
                o=Value(value=str(o), is_uri=False)
            ))

    # Assert
    assert len(triples) > 0

    # Verify relationship triples exist
    relationship_triples = [t for t in triples if t.p.value.endswith("is_subset_of") or t.p.value.endswith("is_used_in")]
    assert len(relationship_triples) >= 2

    # Verify label triples
    label_triples = [t for t in triples if t.p.value == RDF_LABEL]
    assert len(label_triples) > 0
|
|
|
|
@pytest.mark.asyncio
async def test_knowledge_store_triples_storage(self, mock_cassandra_store):
    """``on_triples`` must hand the received Triples message to the table store."""
    # Arrange: a mock processor running the real on_triples against the mock store
    store_processor = MagicMock()
    store_processor.table_store = mock_cassandra_store
    store_processor.on_triples = KnowledgeStoreProcessor.on_triples.__get__(
        store_processor, KnowledgeStoreProcessor
    )

    doc_metadata = Metadata(
        id="test-doc",
        user="test_user",
        collection="test_collection",
        metadata=[],
    )
    definition_triple = Triple(
        s=Value(value="http://trustgraph.ai/e/machine-learning", is_uri=True),
        p=Value(value=DEFINITION, is_uri=True),
        o=Value(value="A subset of AI", is_uri=False),
    )
    payload = Triples(metadata=doc_metadata, triples=[definition_triple])

    message = MagicMock()
    message.value.return_value = payload

    # Act
    await store_processor.on_triples(message, None, None)

    # Assert: the store saw exactly the message payload
    mock_cassandra_store.add_triples.assert_called_once_with(payload)
|
|
|
|
@pytest.mark.asyncio
async def test_knowledge_store_graph_embeddings_storage(self, mock_cassandra_store):
    """``on_graph_embeddings`` must hand the received message to the table store."""
    # Arrange: a mock processor running the real handler against the mock store
    store_processor = MagicMock()
    store_processor.table_store = mock_cassandra_store
    store_processor.on_graph_embeddings = KnowledgeStoreProcessor.on_graph_embeddings.__get__(
        store_processor, KnowledgeStoreProcessor
    )

    doc_metadata = Metadata(
        id="test-doc",
        user="test_user",
        collection="test_collection",
        metadata=[],
    )
    payload = GraphEmbeddings(metadata=doc_metadata, entities=[])

    message = MagicMock()
    message.value.return_value = payload

    # Act
    await store_processor.on_graph_embeddings(message, None, None)

    # Assert: the store saw exactly the message payload
    mock_cassandra_store.add_graph_embeddings.assert_called_once_with(payload)
|
|
|
|
@pytest.mark.asyncio
async def test_end_to_end_pipeline_coordination(self, definitions_processor, relationships_processor,
                                                mock_flow_context, sample_chunk):
    """Run one chunk through both extractors and check the shared flow mocks."""
    # Arrange
    message = MagicMock()
    message.value.return_value = sample_chunk
    consumer = MagicMock()

    # Act: definitions extractor first, then relationships extractor
    for extractor in (definitions_processor, relationships_processor):
        await extractor.on_message(message, consumer, mock_flow_context)

    # Assert: each extractor queried the prompt service once
    prompt = mock_flow_context("prompt-request")
    prompt.extract_definitions.assert_called_once()
    prompt.extract_relationships.assert_called_once()

    # Assert: one triples message per extractor
    triples_out = mock_flow_context("triples")
    assert triples_out.send.call_count == 2  # One from each extractor

    # Assert: only the definitions extractor emits entity contexts
    contexts_out = mock_flow_context("entity-contexts")
    contexts_out.send.assert_called_once()
|
|
|
|
@pytest.mark.asyncio
async def test_error_handling_in_definitions_extraction(self, definitions_processor, mock_flow_context, sample_chunk):
    """A failing prompt call must not propagate out of the definitions handler."""
    # Arrange: make the prompt service blow up
    failure = Exception("Prompt service unavailable")
    mock_flow_context("prompt-request").extract_definitions.side_effect = failure

    message = MagicMock()
    message.value.return_value = sample_chunk
    consumer = MagicMock()

    # Act & Assert: the await itself is the check -- no exception may escape
    await definitions_processor.on_message(message, consumer, mock_flow_context)

    # The prompt call was still attempted exactly once
    mock_flow_context("prompt-request").extract_definitions.assert_called_once()
|
|
|
|
@pytest.mark.asyncio
async def test_error_handling_in_relationships_extraction(self, relationships_processor, mock_flow_context, sample_chunk):
    """A failing prompt call must not propagate out of the relationships handler."""
    # Arrange: make the prompt service blow up
    failure = Exception("Prompt service unavailable")
    mock_flow_context("prompt-request").extract_relationships.side_effect = failure

    message = MagicMock()
    message.value.return_value = sample_chunk
    consumer = MagicMock()

    # Act & Assert: the await itself is the check -- no exception may escape
    await relationships_processor.on_message(message, consumer, mock_flow_context)

    # The prompt call was still attempted exactly once
    mock_flow_context("prompt-request").extract_relationships.assert_called_once()
|
|
|
|
@pytest.mark.asyncio
async def test_empty_extraction_results_handling(self, definitions_processor, mock_flow_context, sample_chunk):
    """An empty definitions list must still produce one send per output stream."""
    # Arrange: the prompt service finds nothing
    mock_flow_context("prompt-request").extract_definitions.return_value = []

    message = MagicMock()
    message.value.return_value = sample_chunk
    consumer = MagicMock()

    # Act
    await definitions_processor.on_message(message, consumer, mock_flow_context)

    # Assert: both producers are still invoked once (with empty results)
    triples_out = mock_flow_context("triples")
    contexts_out = mock_flow_context("entity-contexts")
    for producer in (triples_out, contexts_out):
        producer.send.assert_called_once()
|
|
|
|
@pytest.mark.asyncio
async def test_invalid_extraction_format_handling(self, definitions_processor, mock_flow_context, sample_chunk):
    """A non-list prompt response must not make the definitions handler raise."""
    # Arrange: a plain string where a list of records is expected
    malformed_response = "invalid format"  # Should be list
    mock_flow_context("prompt-request").extract_definitions.return_value = malformed_response

    message = MagicMock()
    message.value.return_value = sample_chunk
    consumer = MagicMock()

    # Act & Assert: the await itself is the check -- no exception may escape
    await definitions_processor.on_message(message, consumer, mock_flow_context)

    # The prompt call was still attempted exactly once
    mock_flow_context("prompt-request").extract_definitions.assert_called_once()
|
|
|
|
@pytest.mark.asyncio
async def test_entity_filtering_and_validation(self, definitions_processor, mock_flow_context):
    """Records with an empty/None entity or definition must not break processing.

    Only the number of sends is asserted; the content of the emitted
    messages is not inspected here.
    """
    # Arrange: one fully valid record mixed with four incomplete ones
    mixed_records = [
        {"entity": "Valid Entity", "definition": "Valid definition"},
        {"entity": "", "definition": "Empty entity"},  # Should be filtered
        {"entity": "Valid Entity 2", "definition": ""},  # Should be filtered
        {"entity": None, "definition": "None entity"},  # Should be filtered
        {"entity": "Valid Entity 3", "definition": None},  # Should be filtered
    ]
    mock_flow_context("prompt-request").extract_definitions.return_value = mixed_records

    chunk = Chunk(
        metadata=Metadata(id="test", user="user", collection="collection", metadata=[]),
        chunk=b"Test chunk",
    )

    message = MagicMock()
    message.value.return_value = chunk
    consumer = MagicMock()

    # Act
    await definitions_processor.on_message(message, consumer, mock_flow_context)

    # Assert: each output stream received exactly one message
    triples_out = mock_flow_context("triples")
    contexts_out = mock_flow_context("entity-contexts")
    for producer in (triples_out, contexts_out):
        producer.send.assert_called_once()
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.slow
async def test_large_batch_processing_performance(self, definitions_processor, relationships_processor,
                                                  mock_flow_context):
    """Push a batch of chunks through both extractors within a time budget.

    Elapsed time is measured with ``time.perf_counter()`` (monotonic) so a
    wall-clock adjustment during the run cannot distort the measurement.
    """
    # Kept local: the module's visible import block does not bring in `time`.
    import time

    # Arrange
    batch_size = 100  # Large batch
    time_budget_seconds = 30.0

    large_chunk_batch = [
        Chunk(
            metadata=Metadata(id=f"doc-{i}", user="user", collection="collection", metadata=[]),
            chunk=f"Document {i} contains machine learning and AI content.".encode("utf-8")
        )
        for i in range(batch_size)
    ]

    mock_consumer = MagicMock()

    # Act
    start_time = time.perf_counter()

    for chunk in large_chunk_batch:
        mock_msg = MagicMock()
        mock_msg.value.return_value = chunk

        # Process through both extractors
        await definitions_processor.on_message(mock_msg, mock_consumer, mock_flow_context)
        await relationships_processor.on_message(mock_msg, mock_consumer, mock_flow_context)

    execution_time = time.perf_counter() - start_time

    # Assert
    assert execution_time < time_budget_seconds  # Should complete within reasonable time

    # Verify all chunks were processed: one prompt call per chunk per extractor
    prompt_client = mock_flow_context("prompt-request")
    assert prompt_client.extract_definitions.call_count == batch_size
    assert prompt_client.extract_relationships.call_count == batch_size
|
|
|
|
@pytest.mark.asyncio
async def test_metadata_propagation_through_pipeline(self, definitions_processor, mock_flow_context):
    """The chunk's id/user/collection must appear on both emitted messages."""
    # Arrange: a chunk whose metadata carries one descriptive triple
    title_triple = Triple(
        s=Value(value="doc:test", is_uri=True),
        p=Value(value="dc:title", is_uri=True),
        o=Value(value="Test Document", is_uri=False),
    )
    source_metadata = Metadata(
        id="test-doc-123",
        user="test_user",
        collection="test_collection",
        metadata=[title_triple],
    )
    chunk = Chunk(
        metadata=source_metadata,
        chunk=b"Test content for metadata propagation",
    )

    message = MagicMock()
    message.value.return_value = chunk
    consumer = MagicMock()

    # Act
    await definitions_processor.on_message(message, consumer, mock_flow_context)

    # Assert: each output stream received exactly one message
    triples_out = mock_flow_context("triples")
    contexts_out = mock_flow_context("entity-contexts")
    triples_out.send.assert_called_once()
    contexts_out.send.assert_called_once()

    # The first positional argument of each send() is the emitted message
    emitted_messages = (
        triples_out.send.call_args.args[0],
        contexts_out.send.call_args.args[0],
    )
    for emitted in emitted_messages:
        assert emitted.metadata.id == "test-doc-123"
        assert emitted.metadata.user == "test_user"
        assert emitted.metadata.collection == "test_collection"