mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 08:26:21 +02:00
Introduces `workspace` as the isolation boundary for config, flows,
library, and knowledge data. Removes `user` as a schema-level field
throughout the code, API specs, and tests; workspace provides the
same separation more cleanly at the trusted flow.workspace layer
rather than through client-supplied message fields.
Design
------
- IAM tech spec (docs/tech-specs/iam.md) documents current state,
proposed auth/access model, and migration direction.
- Data ownership model (docs/tech-specs/data-ownership-model.md)
captures the workspace/collection/flow hierarchy.
Schema + messaging
------------------
- Drop `user` field from AgentRequest/Step, GraphRagQuery,
DocumentRagQuery, Triples/Graph/Document/Row EmbeddingsRequest,
Sparql/Rows/Structured QueryRequest, ToolServiceRequest.
- Keep collection/workspace routing via flow.workspace at the
service layer.
- Translators updated to not serialise/deserialise user.
API specs
---------
- OpenAPI schemas and path examples cleaned of user fields.
- Websocket async-api messages updated.
- Removed the unused parameters/User.yaml.
Services + base
---------------
- Librarian, collection manager, knowledge, config: all operations
scoped by workspace. Config client API takes workspace as first
positional arg.
- `flow.workspace` set at flow start time by the infrastructure;
no longer pass-through from clients.
- Tool service drops user-personalisation passthrough.
CLI + SDK
---------
- tg-init-workspace and workspace-aware import/export.
- All tg-* commands drop user args; accept --workspace.
- Python API/SDK (flow, socket_client, async_*, explainability,
library) drop user kwargs from every method signature.
MCP server
----------
- All tool endpoints drop user parameters; socket_manager no longer
keyed per user.
Flow service
------------
- Closure-based topic cleanup on flow stop: only delete topics
whose blueprint template was parameterised AND no remaining
live flow (across all workspaces) still resolves to that topic.
Three scopes fall out naturally from template analysis:
* {id} -> per-flow, deleted on stop
* {blueprint} -> per-blueprint, kept while any flow of the
same blueprint exists
* {workspace} -> per-workspace, kept while any flow in the
workspace exists
* literal -> global, never deleted (e.g. tg.request.librarian)
Fixes a bug where stopping a flow silently destroyed the global
librarian exchange, wedging all library operations until manual
restart.
RabbitMQ backend
----------------
- heartbeat=60, blocked_connection_timeout=300. Catches silently
dead connections (broker restart, orphaned channels, network
partitions) within ~2 heartbeat windows, so the consumer
reconnects and re-binds its queue rather than sitting forever
on a zombie connection.
Tests
-----
- Full test refresh: unit, integration, contract, provenance.
- Dropped user-field assertions and constructor kwargs across
~100 test files.
- Renamed user-collection isolation tests to workspace-collection.
392 lines · No EOL · 16 KiB · Python
"""
|
|
Unit tests for Agent-based Knowledge Graph Extraction
|
|
|
|
These tests verify the core functionality of the agent-driven KG extractor,
|
|
including JSON response parsing, triple generation, entity context creation,
|
|
and RDF URI handling.
|
|
"""
|
|
|
|
import pytest
|
|
import json
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
|
|
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
|
|
from trustgraph.schema import EntityContext, EntityContexts
|
|
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL
|
|
from trustgraph.template.prompt_manager import PromptManager
|
|
|
|
|
|
@pytest.mark.unit
|
|
class TestAgentKgExtractor:
|
|
"""Unit tests for Agent-based Knowledge Graph Extractor"""
|
|
|
|
@pytest.fixture
|
|
def agent_extractor(self):
|
|
"""Create a mock agent extractor for testing core functionality"""
|
|
# Create a mock that has the methods we want to test
|
|
extractor = MagicMock()
|
|
|
|
# Add real implementations of the methods we want to test
|
|
from trustgraph.extract.kg.agent.extract import Processor
|
|
real_extractor = Processor.__new__(Processor) # Create without calling __init__
|
|
|
|
# Set up the methods we want to test
|
|
extractor.to_uri = real_extractor.to_uri
|
|
extractor.parse_jsonl = real_extractor.parse_jsonl
|
|
extractor.process_extraction_data = real_extractor.process_extraction_data
|
|
extractor.emit_triples = real_extractor.emit_triples
|
|
extractor.emit_entity_contexts = real_extractor.emit_entity_contexts
|
|
|
|
# Mock the prompt manager
|
|
extractor.manager = PromptManager()
|
|
extractor.template_id = "agent-kg-extract"
|
|
extractor.config_key = "prompt"
|
|
extractor.concurrency = 1
|
|
|
|
return extractor
|
|
|
|
@pytest.fixture
|
|
def sample_metadata(self):
|
|
"""Sample metadata for testing"""
|
|
return Metadata(
|
|
id="doc123",
|
|
)
|
|
|
|
@pytest.fixture
|
|
def sample_extraction_data(self):
|
|
"""Sample extraction data in JSONL format (list with type discriminators)"""
|
|
return [
|
|
{
|
|
"type": "definition",
|
|
"entity": "Machine Learning",
|
|
"definition": "A subset of artificial intelligence that enables computers to learn from data without explicit programming."
|
|
},
|
|
{
|
|
"type": "definition",
|
|
"entity": "Neural Networks",
|
|
"definition": "Computing systems inspired by biological neural networks that process information."
|
|
},
|
|
{
|
|
"type": "relationship",
|
|
"subject": "Machine Learning",
|
|
"predicate": "is_subset_of",
|
|
"object": "Artificial Intelligence",
|
|
"object-entity": True
|
|
},
|
|
{
|
|
"type": "relationship",
|
|
"subject": "Neural Networks",
|
|
"predicate": "used_in",
|
|
"object": "Machine Learning",
|
|
"object-entity": True
|
|
},
|
|
{
|
|
"type": "relationship",
|
|
"subject": "Deep Learning",
|
|
"predicate": "accuracy",
|
|
"object": "95%",
|
|
"object-entity": False
|
|
}
|
|
]
|
|
|
|
def test_to_uri_conversion(self, agent_extractor):
|
|
"""Test URI conversion for entities"""
|
|
# Test simple entity name
|
|
uri = agent_extractor.to_uri("Machine Learning")
|
|
expected = f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
|
assert uri == expected
|
|
|
|
# Test entity with special characters
|
|
uri = agent_extractor.to_uri("Entity with & special chars!")
|
|
expected = f"{TRUSTGRAPH_ENTITIES}Entity%20with%20%26%20special%20chars%21"
|
|
assert uri == expected
|
|
|
|
# Test empty string
|
|
uri = agent_extractor.to_uri("")
|
|
expected = f"{TRUSTGRAPH_ENTITIES}"
|
|
assert uri == expected
|
|
|
|
def test_parse_jsonl_with_code_blocks(self, agent_extractor):
|
|
"""Test JSONL parsing from code blocks"""
|
|
# Test JSONL in code blocks - note: JSON uses lowercase true/false
|
|
response = '''```json
|
|
{"type": "definition", "entity": "AI", "definition": "Artificial Intelligence"}
|
|
{"type": "relationship", "subject": "AI", "predicate": "is", "object": "technology", "object-entity": false}
|
|
```'''
|
|
|
|
result = agent_extractor.parse_jsonl(response)
|
|
|
|
assert len(result) == 2
|
|
assert result[0]["entity"] == "AI"
|
|
assert result[0]["definition"] == "Artificial Intelligence"
|
|
assert result[1]["type"] == "relationship"
|
|
|
|
def test_parse_jsonl_without_code_blocks(self, agent_extractor):
|
|
"""Test JSONL parsing without code blocks"""
|
|
response = '''{"type": "definition", "entity": "ML", "definition": "Machine Learning"}
|
|
{"type": "definition", "entity": "AI", "definition": "Artificial Intelligence"}'''
|
|
|
|
result = agent_extractor.parse_jsonl(response)
|
|
|
|
assert len(result) == 2
|
|
assert result[0]["entity"] == "ML"
|
|
assert result[1]["entity"] == "AI"
|
|
|
|
def test_parse_jsonl_invalid_lines_skipped(self, agent_extractor):
|
|
"""Test JSONL parsing skips invalid lines gracefully"""
|
|
response = '''{"type": "definition", "entity": "Valid", "definition": "Valid def"}
|
|
This is not JSON at all
|
|
{"type": "definition", "entity": "Also Valid", "definition": "Another def"}'''
|
|
|
|
result = agent_extractor.parse_jsonl(response)
|
|
|
|
# Should get 2 valid objects, skipping the invalid line
|
|
assert len(result) == 2
|
|
assert result[0]["entity"] == "Valid"
|
|
assert result[1]["entity"] == "Also Valid"
|
|
|
|
def test_parse_jsonl_truncation_resilience(self, agent_extractor):
|
|
"""Test JSONL parsing handles truncated responses"""
|
|
# Simulates output cut off mid-line
|
|
response = '''{"type": "definition", "entity": "Complete", "definition": "Full def"}
|
|
{"type": "definition", "entity": "Trunca'''
|
|
|
|
result = agent_extractor.parse_jsonl(response)
|
|
|
|
# Should get 1 valid object, the truncated line is skipped
|
|
assert len(result) == 1
|
|
assert result[0]["entity"] == "Complete"
|
|
|
|
def test_process_extraction_data_definitions(self, agent_extractor, sample_metadata):
|
|
"""Test processing of definition data"""
|
|
data = [
|
|
{
|
|
"type": "definition",
|
|
"entity": "Machine Learning",
|
|
"definition": "A subset of AI that enables learning from data."
|
|
}
|
|
]
|
|
|
|
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata)
|
|
|
|
# Check entity label triple
|
|
label_triple = next((t for t in triples if t.p.iri == RDF_LABEL and t.o.value == "Machine Learning"), None)
|
|
assert label_triple is not None
|
|
assert label_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
|
assert label_triple.s.type == IRI
|
|
assert label_triple.o.type == LITERAL
|
|
|
|
# Check definition triple
|
|
def_triple = next((t for t in triples if t.p.iri == DEFINITION), None)
|
|
assert def_triple is not None
|
|
assert def_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
|
assert def_triple.o.value == "A subset of AI that enables learning from data."
|
|
|
|
# Check entity context
|
|
assert len(entity_contexts) == 1
|
|
assert entity_contexts[0].entity.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
|
assert entity_contexts[0].context == "A subset of AI that enables learning from data."
|
|
|
|
def test_process_extraction_data_relationships(self, agent_extractor, sample_metadata):
|
|
"""Test processing of relationship data"""
|
|
data = [
|
|
{
|
|
"type": "relationship",
|
|
"subject": "Machine Learning",
|
|
"predicate": "is_subset_of",
|
|
"object": "Artificial Intelligence",
|
|
"object-entity": True
|
|
}
|
|
]
|
|
|
|
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata)
|
|
|
|
# Check that subject, predicate, and object labels are created
|
|
subject_uri = f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
|
|
predicate_uri = f"{TRUSTGRAPH_ENTITIES}is_subset_of"
|
|
|
|
# Find label triples
|
|
subject_label = next((t for t in triples if t.s.iri == subject_uri and t.p.iri == RDF_LABEL), None)
|
|
assert subject_label is not None
|
|
assert subject_label.o.value == "Machine Learning"
|
|
|
|
predicate_label = next((t for t in triples if t.s.iri == predicate_uri and t.p.iri == RDF_LABEL), None)
|
|
assert predicate_label is not None
|
|
assert predicate_label.o.value == "is_subset_of"
|
|
|
|
# Check main relationship triple
|
|
object_uri = f"{TRUSTGRAPH_ENTITIES}Artificial%20Intelligence"
|
|
rel_triple = next((t for t in triples if t.s.iri == subject_uri and t.p.iri == predicate_uri), None)
|
|
assert rel_triple is not None
|
|
assert rel_triple.o.iri == object_uri
|
|
assert rel_triple.o.type == IRI
|
|
|
|
def test_process_extraction_data_literal_object(self, agent_extractor, sample_metadata):
|
|
"""Test processing of relationships with literal objects"""
|
|
data = [
|
|
{
|
|
"type": "relationship",
|
|
"subject": "Deep Learning",
|
|
"predicate": "accuracy",
|
|
"object": "95%",
|
|
"object-entity": False
|
|
}
|
|
]
|
|
|
|
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata)
|
|
|
|
# Check that object labels are not created for literal objects
|
|
object_labels = [t for t in triples if t.p.iri == RDF_LABEL and t.o.value == "95%"]
|
|
# Based on the code logic, it should not create object labels for non-entity objects
|
|
# But there might be a bug in the original implementation
|
|
|
|
def test_process_extraction_data_combined(self, agent_extractor, sample_metadata, sample_extraction_data):
|
|
"""Test processing of combined definitions and relationships"""
|
|
triples, entity_contexts, _ = agent_extractor.process_extraction_data(sample_extraction_data, sample_metadata)
|
|
|
|
# Check that we have both definition and relationship triples
|
|
definition_triples = [t for t in triples if t.p.iri == DEFINITION]
|
|
assert len(definition_triples) == 2 # Two definitions
|
|
|
|
# Check entity contexts are created for definitions
|
|
assert len(entity_contexts) == 2
|
|
entity_uris = [ec.entity.iri for ec in entity_contexts]
|
|
assert f"{TRUSTGRAPH_ENTITIES}Machine%20Learning" in entity_uris
|
|
assert f"{TRUSTGRAPH_ENTITIES}Neural%20Networks" in entity_uris
|
|
|
|
def test_process_extraction_data_no_metadata_id(self, agent_extractor):
|
|
"""Test processing when metadata has no ID"""
|
|
metadata = Metadata(id=None)
|
|
data = [
|
|
{"type": "definition", "entity": "Test Entity", "definition": "Test definition"}
|
|
]
|
|
|
|
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, metadata)
|
|
|
|
# Should still create entity contexts
|
|
assert len(entity_contexts) == 1
|
|
|
|
def test_process_extraction_data_empty_data(self, agent_extractor, sample_metadata):
|
|
"""Test processing of empty extraction data"""
|
|
data = []
|
|
|
|
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata)
|
|
|
|
# Should have no entity contexts
|
|
assert len(entity_contexts) == 0
|
|
# Triples should be empty
|
|
assert len(triples) == 0
|
|
|
|
def test_process_extraction_data_unknown_types_ignored(self, agent_extractor, sample_metadata):
|
|
"""Test processing data with unknown type values"""
|
|
data = [
|
|
{"type": "definition", "entity": "Valid", "definition": "Valid def"},
|
|
{"type": "unknown_type", "foo": "bar"}, # Unknown type - should be ignored
|
|
{"type": "relationship", "subject": "A", "predicate": "rel", "object": "B", "object-entity": True}
|
|
]
|
|
|
|
triples, entity_contexts, _ = agent_extractor.process_extraction_data(data, sample_metadata)
|
|
|
|
# Should process valid items and ignore unknown types
|
|
assert len(entity_contexts) == 1 # Only the definition creates entity context
|
|
|
|
def test_process_extraction_data_malformed_entries(self, agent_extractor, sample_metadata):
|
|
"""Test processing data with malformed entries"""
|
|
# Test items missing required fields - should raise KeyError
|
|
data = [
|
|
{"type": "definition", "entity": "Test"}, # Missing definition
|
|
]
|
|
|
|
# Should handle gracefully or raise appropriate errors
|
|
with pytest.raises(KeyError):
|
|
agent_extractor.process_extraction_data(data, sample_metadata)
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_emit_triples(self, agent_extractor, sample_metadata):
|
|
"""Test emitting triples to publisher"""
|
|
mock_publisher = AsyncMock()
|
|
|
|
test_triples = [
|
|
Triple(
|
|
s=Term(type=IRI, iri="test:subject"),
|
|
p=Term(type=IRI, iri="test:predicate"),
|
|
o=Term(type=LITERAL, value="test object")
|
|
)
|
|
]
|
|
|
|
await agent_extractor.emit_triples(mock_publisher, sample_metadata, test_triples)
|
|
|
|
mock_publisher.send.assert_called_once()
|
|
sent_triples = mock_publisher.send.call_args[0][0]
|
|
assert isinstance(sent_triples, Triples)
|
|
# Check metadata fields individually since implementation creates new Metadata object
|
|
assert sent_triples.metadata.id == sample_metadata.id
|
|
assert sent_triples.metadata.collection == sample_metadata.collection
|
|
assert len(sent_triples.triples) == 1
|
|
assert sent_triples.triples[0].s.iri == "test:subject"
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_emit_entity_contexts(self, agent_extractor, sample_metadata):
|
|
"""Test emitting entity contexts to publisher"""
|
|
mock_publisher = AsyncMock()
|
|
|
|
test_contexts = [
|
|
EntityContext(
|
|
entity=Term(type=IRI, iri="test:entity"),
|
|
context="Test context"
|
|
)
|
|
]
|
|
|
|
await agent_extractor.emit_entity_contexts(mock_publisher, sample_metadata, test_contexts)
|
|
|
|
mock_publisher.send.assert_called_once()
|
|
sent_contexts = mock_publisher.send.call_args[0][0]
|
|
assert isinstance(sent_contexts, EntityContexts)
|
|
# Check metadata fields individually since implementation creates new Metadata object
|
|
assert sent_contexts.metadata.id == sample_metadata.id
|
|
assert sent_contexts.metadata.collection == sample_metadata.collection
|
|
assert len(sent_contexts.entities) == 1
|
|
assert sent_contexts.entities[0].entity.iri == "test:entity"
|
|
|
|
def test_agent_extractor_initialization_params(self):
|
|
"""Test agent extractor parameter validation"""
|
|
# Test default parameters (we'll mock the initialization)
|
|
def mock_init(self, **kwargs):
|
|
self.template_id = kwargs.get('template-id', 'agent-kg-extract')
|
|
self.config_key = kwargs.get('config-type', 'prompt')
|
|
self.concurrency = kwargs.get('concurrency', 1)
|
|
|
|
with patch.object(AgentKgExtractor, '__init__', mock_init):
|
|
extractor = AgentKgExtractor()
|
|
|
|
# This tests the default parameter logic
|
|
assert extractor.template_id == 'agent-kg-extract'
|
|
assert extractor.config_key == 'prompt'
|
|
assert extractor.concurrency == 1
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_prompt_config_loading_logic(self, agent_extractor):
|
|
"""Test prompt configuration loading logic"""
|
|
# Test the core logic without requiring full FlowProcessor initialization
|
|
config = {
|
|
"prompt": {
|
|
"system": json.dumps("Test system"),
|
|
"template-index": json.dumps(["agent-kg-extract"]),
|
|
"template.agent-kg-extract": json.dumps({
|
|
"prompt": "Extract knowledge from: {{ text }}",
|
|
"response-type": "json"
|
|
})
|
|
}
|
|
}
|
|
|
|
# Test the manager loading directly
|
|
if "prompt" in config:
|
|
agent_extractor.manager.load_config(config["prompt"])
|
|
|
|
# Should not raise an exception
|
|
assert agent_extractor.manager is not None
|
|
|
|
# Test with empty config
|
|
empty_config = {}
|
|
# Should handle gracefully - no config to load |