mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 08:26:21 +02:00
The subjectOf triples were redundant with the subgraph provenance model
introduced in e8407b34. Entity-to-source lineage can be traced via
tg:contains -> subgraph -> prov:wasDerivedFrom -> chunk, making the
direct subjectOf edges unnecessary metadata polluting the knowledge graph.
Removed from all three extractors (agent, definitions, relationships),
cleaned up the SUBJECT_OF constant and vocabulary label, and updated
tests accordingly.
394 lines
No EOL
16 KiB
Python
"""
|
|
Unit tests for Agent-based Knowledge Graph Extraction
|
|
|
|
These tests verify the core functionality of the agent-driven KG extractor,
|
|
including JSON response parsing, triple generation, entity context creation,
|
|
and RDF URI handling.
|
|
"""
|
|
|
|
import pytest
|
|
import json
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
|
|
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
|
|
from trustgraph.schema import EntityContext, EntityContexts
|
|
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
|
|
from trustgraph.template.prompt_manager import PromptManager
|
|
|
|
|
|
@pytest.mark.unit
class TestAgentKgExtractor:
    """Unit tests for the Agent-based Knowledge Graph Extractor.

    Exercises URI conversion, JSONL response parsing, extraction-data
    processing, and message emission against mocked publishers.
    """
|
@pytest.fixture
|
|
def agent_extractor(self):
|
|
"""Create a mock agent extractor for testing core functionality"""
|
|
# Create a mock that has the methods we want to test
|
|
extractor = MagicMock()
|
|
|
|
# Add real implementations of the methods we want to test
|
|
from trustgraph.extract.kg.agent.extract import Processor
|
|
real_extractor = Processor.__new__(Processor) # Create without calling __init__
|
|
|
|
# Set up the methods we want to test
|
|
extractor.to_uri = real_extractor.to_uri
|
|
extractor.parse_jsonl = real_extractor.parse_jsonl
|
|
extractor.process_extraction_data = real_extractor.process_extraction_data
|
|
extractor.emit_triples = real_extractor.emit_triples
|
|
extractor.emit_entity_contexts = real_extractor.emit_entity_contexts
|
|
|
|
# Mock the prompt manager
|
|
extractor.manager = PromptManager()
|
|
extractor.template_id = "agent-kg-extract"
|
|
extractor.config_key = "prompt"
|
|
extractor.concurrency = 1
|
|
|
|
return extractor
|
|
|
|
@pytest.fixture
|
|
def sample_metadata(self):
|
|
"""Sample metadata for testing"""
|
|
return Metadata(
|
|
id="doc123",
|
|
)
|
|
|
|
@pytest.fixture
|
|
def sample_extraction_data(self):
|
|
"""Sample extraction data in JSONL format (list with type discriminators)"""
|
|
return [
|
|
{
|
|
"type": "definition",
|
|
"entity": "Machine Learning",
|
|
"definition": "A subset of artificial intelligence that enables computers to learn from data without explicit programming."
|
|
},
|
|
{
|
|
"type": "definition",
|
|
"entity": "Neural Networks",
|
|
"definition": "Computing systems inspired by biological neural networks that process information."
|
|
},
|
|
{
|
|
"type": "relationship",
|
|
"subject": "Machine Learning",
|
|
"predicate": "is_subset_of",
|
|
"object": "Artificial Intelligence",
|
|
"object-entity": True
|
|
},
|
|
{
|
|
"type": "relationship",
|
|
"subject": "Neural Networks",
|
|
"predicate": "used_in",
|
|
"object": "Machine Learning",
|
|
"object-entity": True
|
|
},
|
|
{
|
|
"type": "relationship",
|
|
"subject": "Deep Learning",
|
|
"predicate": "accuracy",
|
|
"object": "95%",
|
|
"object-entity": False
|
|
}
|
|
]
|
|
|
|
def test_to_uri_conversion(self, agent_extractor):
|
|
"""Test URI conversion for entities"""
|
|
# Test simple entity name
|
|
uri = agent_extractor.to_uri("Machine Learning")
|
|
expected = f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
|
assert uri == expected
|
|
|
|
# Test entity with special characters
|
|
uri = agent_extractor.to_uri("Entity with & special chars!")
|
|
expected = f"{TRUSTGRAPH_ENTITIES}Entity%20with%20%26%20special%20chars%21"
|
|
assert uri == expected
|
|
|
|
# Test empty string
|
|
uri = agent_extractor.to_uri("")
|
|
expected = f"{TRUSTGRAPH_ENTITIES}"
|
|
assert uri == expected
|
|
|
|
def test_parse_jsonl_with_code_blocks(self, agent_extractor):
|
|
"""Test JSONL parsing from code blocks"""
|
|
# Test JSONL in code blocks - note: JSON uses lowercase true/false
|
|
response = '''```json
|
|
{"type": "definition", "entity": "AI", "definition": "Artificial Intelligence"}
|
|
{"type": "relationship", "subject": "AI", "predicate": "is", "object": "technology", "object-entity": false}
|
|
```'''
|
|
|
|
result = agent_extractor.parse_jsonl(response)
|
|
|
|
assert len(result) == 2
|
|
assert result[0]["entity"] == "AI"
|
|
assert result[0]["definition"] == "Artificial Intelligence"
|
|
assert result[1]["type"] == "relationship"
|
|
|
|
def test_parse_jsonl_without_code_blocks(self, agent_extractor):
|
|
"""Test JSONL parsing without code blocks"""
|
|
response = '''{"type": "definition", "entity": "ML", "definition": "Machine Learning"}
|
|
{"type": "definition", "entity": "AI", "definition": "Artificial Intelligence"}'''
|
|
|
|
result = agent_extractor.parse_jsonl(response)
|
|
|
|
assert len(result) == 2
|
|
assert result[0]["entity"] == "ML"
|
|
assert result[1]["entity"] == "AI"
|
|
|
|
def test_parse_jsonl_invalid_lines_skipped(self, agent_extractor):
|
|
"""Test JSONL parsing skips invalid lines gracefully"""
|
|
response = '''{"type": "definition", "entity": "Valid", "definition": "Valid def"}
|
|
This is not JSON at all
|
|
{"type": "definition", "entity": "Also Valid", "definition": "Another def"}'''
|
|
|
|
result = agent_extractor.parse_jsonl(response)
|
|
|
|
# Should get 2 valid objects, skipping the invalid line
|
|
assert len(result) == 2
|
|
assert result[0]["entity"] == "Valid"
|
|
assert result[1]["entity"] == "Also Valid"
|
|
|
|
def test_parse_jsonl_truncation_resilience(self, agent_extractor):
|
|
"""Test JSONL parsing handles truncated responses"""
|
|
# Simulates output cut off mid-line
|
|
response = '''{"type": "definition", "entity": "Complete", "definition": "Full def"}
|
|
{"type": "definition", "entity": "Trunca'''
|
|
|
|
result = agent_extractor.parse_jsonl(response)
|
|
|
|
# Should get 1 valid object, the truncated line is skipped
|
|
assert len(result) == 1
|
|
assert result[0]["entity"] == "Complete"
|
|
|
|
def test_process_extraction_data_definitions(self, agent_extractor, sample_metadata):
|
|
"""Test processing of definition data"""
|
|
data = [
|
|
{
|
|
"type": "definition",
|
|
"entity": "Machine Learning",
|
|
"definition": "A subset of AI that enables learning from data."
|
|
}
|
|
]
|
|
|
|
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata)
|
|
|
|
# Check entity label triple
|
|
label_triple = next((t for t in triples if t.p.iri == RDF_LABEL and t.o.value == "Machine Learning"), None)
|
|
assert label_triple is not None
|
|
assert label_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
|
assert label_triple.s.type == IRI
|
|
assert label_triple.o.type == LITERAL
|
|
|
|
# Check definition triple
|
|
def_triple = next((t for t in triples if t.p.iri == DEFINITION), None)
|
|
assert def_triple is not None
|
|
assert def_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
|
assert def_triple.o.value == "A subset of AI that enables learning from data."
|
|
|
|
# Check entity context
|
|
assert len(entity_contexts) == 1
|
|
assert entity_contexts[0].entity.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
|
assert entity_contexts[0].context == "A subset of AI that enables learning from data."
|
|
|
|
def test_process_extraction_data_relationships(self, agent_extractor, sample_metadata):
|
|
"""Test processing of relationship data"""
|
|
data = [
|
|
{
|
|
"type": "relationship",
|
|
"subject": "Machine Learning",
|
|
"predicate": "is_subset_of",
|
|
"object": "Artificial Intelligence",
|
|
"object-entity": True
|
|
}
|
|
]
|
|
|
|
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata)
|
|
|
|
# Check that subject, predicate, and object labels are created
|
|
subject_uri = f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
|
predicate_uri = f"{TRUSTGRAPH_ENTITIES}is_subset_of"
|
|
|
|
# Find label triples
|
|
subject_label = next((t for t in triples if t.s.iri == subject_uri and t.p.iri == RDF_LABEL), None)
|
|
assert subject_label is not None
|
|
assert subject_label.o.value == "Machine Learning"
|
|
|
|
predicate_label = next((t for t in triples if t.s.iri == predicate_uri and t.p.iri == RDF_LABEL), None)
|
|
assert predicate_label is not None
|
|
assert predicate_label.o.value == "is_subset_of"
|
|
|
|
# Check main relationship triple
|
|
object_uri = f"{TRUSTGRAPH_ENTITIES}Artificial%20Intelligence"
|
|
rel_triple = next((t for t in triples if t.s.iri == subject_uri and t.p.iri == predicate_uri), None)
|
|
assert rel_triple is not None
|
|
assert rel_triple.o.iri == object_uri
|
|
assert rel_triple.o.type == IRI
|
|
|
|
def test_process_extraction_data_literal_object(self, agent_extractor, sample_metadata):
|
|
"""Test processing of relationships with literal objects"""
|
|
data = [
|
|
{
|
|
"type": "relationship",
|
|
"subject": "Deep Learning",
|
|
"predicate": "accuracy",
|
|
"object": "95%",
|
|
"object-entity": False
|
|
}
|
|
]
|
|
|
|
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata)
|
|
|
|
# Check that object labels are not created for literal objects
|
|
object_labels = [t for t in triples if t.p.iri == RDF_LABEL and t.o.value == "95%"]
|
|
# Based on the code logic, it should not create object labels for non-entity objects
|
|
# But there might be a bug in the original implementation
|
|
|
|
def test_process_extraction_data_combined(self, agent_extractor, sample_metadata, sample_extraction_data):
|
|
"""Test processing of combined definitions and relationships"""
|
|
triples, entity_contexts, _ = agent_extractor.process_extraction_data(sample_extraction_data, sample_metadata)
|
|
|
|
# Check that we have both definition and relationship triples
|
|
definition_triples = [t for t in triples if t.p.iri == DEFINITION]
|
|
assert len(definition_triples) == 2 # Two definitions
|
|
|
|
# Check entity contexts are created for definitions
|
|
assert len(entity_contexts) == 2
|
|
entity_uris = [ec.entity.iri for ec in entity_contexts]
|
|
assert f"{TRUSTGRAPH_ENTITIES}Machine%20Learning" in entity_uris
|
|
assert f"{TRUSTGRAPH_ENTITIES}Neural%20Networks" in entity_uris
|
|
|
|
def test_process_extraction_data_no_metadata_id(self, agent_extractor):
|
|
"""Test processing when metadata has no ID"""
|
|
metadata = Metadata(id=None)
|
|
data = [
|
|
{"type": "definition", "entity": "Test Entity", "definition": "Test definition"}
|
|
]
|
|
|
|
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, metadata)
|
|
|
|
# Should still create entity contexts
|
|
assert len(entity_contexts) == 1
|
|
|
|
def test_process_extraction_data_empty_data(self, agent_extractor, sample_metadata):
|
|
"""Test processing of empty extraction data"""
|
|
data = []
|
|
|
|
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata)
|
|
|
|
# Should have no entity contexts
|
|
assert len(entity_contexts) == 0
|
|
# Triples should be empty
|
|
assert len(triples) == 0
|
|
|
|
def test_process_extraction_data_unknown_types_ignored(self, agent_extractor, sample_metadata):
|
|
"""Test processing data with unknown type values"""
|
|
data = [
|
|
{"type": "definition", "entity": "Valid", "definition": "Valid def"},
|
|
{"type": "unknown_type", "foo": "bar"}, # Unknown type - should be ignored
|
|
{"type": "relationship", "subject": "A", "predicate": "rel", "object": "B", "object-entity": True}
|
|
]
|
|
|
|
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata)
|
|
|
|
# Should process valid items and ignore unknown types
|
|
assert len(entity_contexts) == 1 # Only the definition creates entity context
|
|
|
|
def test_process_extraction_data_malformed_entries(self, agent_extractor, sample_metadata):
|
|
"""Test processing data with malformed entries"""
|
|
# Test items missing required fields - should raise KeyError
|
|
data = [
|
|
{"type": "definition", "entity": "Test"}, # Missing definition
|
|
]
|
|
|
|
# Should handle gracefully or raise appropriate errors
|
|
with pytest.raises(KeyError):
|
|
agent_extractor.process_extraction_data(data, sample_metadata)
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_emit_triples(self, agent_extractor, sample_metadata):
|
|
"""Test emitting triples to publisher"""
|
|
mock_publisher = AsyncMock()
|
|
|
|
test_triples = [
|
|
Triple(
|
|
s=Term(type=IRI, iri="test:subject"),
|
|
p=Term(type=IRI, iri="test:predicate"),
|
|
o=Term(type=LITERAL, value="test object")
|
|
)
|
|
]
|
|
|
|
await agent_extractor.emit_triples(mock_publisher, sample_metadata, test_triples)
|
|
|
|
mock_publisher.send.assert_called_once()
|
|
sent_triples = mock_publisher.send.call_args[0][0]
|
|
assert isinstance(sent_triples, Triples)
|
|
# Check metadata fields individually since implementation creates new Metadata object
|
|
assert sent_triples.metadata.id == sample_metadata.id
|
|
assert sent_triples.metadata.user == sample_metadata.user
|
|
assert sent_triples.metadata.collection == sample_metadata.collection
|
|
assert len(sent_triples.triples) == 1
|
|
assert sent_triples.triples[0].s.iri == "test:subject"
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_emit_entity_contexts(self, agent_extractor, sample_metadata):
|
|
"""Test emitting entity contexts to publisher"""
|
|
mock_publisher = AsyncMock()
|
|
|
|
test_contexts = [
|
|
EntityContext(
|
|
entity=Term(type=IRI, iri="test:entity"),
|
|
context="Test context"
|
|
)
|
|
]
|
|
|
|
await agent_extractor.emit_entity_contexts(mock_publisher, sample_metadata, test_contexts)
|
|
|
|
mock_publisher.send.assert_called_once()
|
|
sent_contexts = mock_publisher.send.call_args[0][0]
|
|
assert isinstance(sent_contexts, EntityContexts)
|
|
# Check metadata fields individually since implementation creates new Metadata object
|
|
assert sent_contexts.metadata.id == sample_metadata.id
|
|
assert sent_contexts.metadata.user == sample_metadata.user
|
|
assert sent_contexts.metadata.collection == sample_metadata.collection
|
|
assert len(sent_contexts.entities) == 1
|
|
assert sent_contexts.entities[0].entity.iri == "test:entity"
|
|
|
|
def test_agent_extractor_initialization_params(self):
|
|
"""Test agent extractor parameter validation"""
|
|
# Test default parameters (we'll mock the initialization)
|
|
def mock_init(self, **kwargs):
|
|
self.template_id = kwargs.get('template-id', 'agent-kg-extract')
|
|
self.config_key = kwargs.get('config-type', 'prompt')
|
|
self.concurrency = kwargs.get('concurrency', 1)
|
|
|
|
with patch.object(AgentKgExtractor, '__init__', mock_init):
|
|
extractor = AgentKgExtractor()
|
|
|
|
# This tests the default parameter logic
|
|
assert extractor.template_id == 'agent-kg-extract'
|
|
assert extractor.config_key == 'prompt'
|
|
assert extractor.concurrency == 1
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_prompt_config_loading_logic(self, agent_extractor):
|
|
"""Test prompt configuration loading logic"""
|
|
# Test the core logic without requiring full FlowProcessor initialization
|
|
config = {
|
|
"prompt": {
|
|
"system": json.dumps("Test system"),
|
|
"template-index": json.dumps(["agent-kg-extract"]),
|
|
"template.agent-kg-extract": json.dumps({
|
|
"prompt": "Extract knowledge from: {{ text }}",
|
|
"response-type": "json"
|
|
})
|
|
}
|
|
}
|
|
|
|
# Test the manager loading directly
|
|
if "prompt" in config:
|
|
agent_extractor.manager.load_config(config["prompt"])
|
|
|
|
# Should not raise an exception
|
|
assert agent_extractor.manager is not None
|
|
|
|
# Test with empty config
|
|
empty_config = {}
|
|
# Should handle gracefully - no config to load |