Merge 2.0 to master (#651)

This commit is contained in:
cybermaggedon 2026-02-28 11:03:14 +00:00 committed by GitHub
parent 3666ece2c5
commit b9d7bf9a8b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
212 changed files with 13940 additions and 6180 deletions

View file

@ -12,7 +12,7 @@ import json
from unittest.mock import AsyncMock, MagicMock, patch
from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Value, Error
from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
from trustgraph.schema import EntityContext, EntityContexts, AgentRequest, AgentResponse
from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
from trustgraph.template.prompt_manager import PromptManager
@ -30,38 +30,16 @@ class TestAgentKgExtractionIntegration:
# Mock agent client
agent_client = AsyncMock()
# Mock successful agent response
# Mock successful agent response in JSONL format
def mock_agent_response(recipient, question):
# Simulate agent processing and return structured response
# Simulate agent processing and return structured JSONL response
mock_response = MagicMock()
mock_response.error = None
mock_response.answer = '''```json
{
"definitions": [
{
"entity": "Machine Learning",
"definition": "A subset of artificial intelligence that enables computers to learn from data without explicit programming."
},
{
"entity": "Neural Networks",
"definition": "Computing systems inspired by biological neural networks that process information."
}
],
"relationships": [
{
"subject": "Machine Learning",
"predicate": "is_subset_of",
"object": "Artificial Intelligence",
"object-entity": true
},
{
"subject": "Neural Networks",
"predicate": "used_in",
"object": "Machine Learning",
"object-entity": true
}
]
}
{"type": "definition", "entity": "Machine Learning", "definition": "A subset of artificial intelligence that enables computers to learn from data without explicit programming."}
{"type": "definition", "entity": "Neural Networks", "definition": "Computing systems inspired by biological neural networks that process information."}
{"type": "relationship", "subject": "Machine Learning", "predicate": "is_subset_of", "object": "Artificial Intelligence", "object-entity": true}
{"type": "relationship", "subject": "Neural Networks", "predicate": "used_in", "object": "Machine Learning", "object-entity": true}
```'''
return mock_response.answer
@ -100,9 +78,9 @@ class TestAgentKgExtractionIntegration:
id="doc123",
metadata=[
Triple(
s=Value(value="doc123", is_uri=True),
p=Value(value="http://example.org/type", is_uri=True),
o=Value(value="document", is_uri=False)
s=Term(type=IRI, iri="doc123"),
p=Term(type=IRI, iri="http://example.org/type"),
o=Term(type=LITERAL, value="document")
)
]
)
@ -120,7 +98,7 @@ class TestAgentKgExtractionIntegration:
# Copy the methods we want to test
extractor.to_uri = real_extractor.to_uri
extractor.parse_json = real_extractor.parse_json
extractor.parse_jsonl = real_extractor.parse_jsonl
extractor.process_extraction_data = real_extractor.process_extraction_data
extractor.emit_triples = real_extractor.emit_triples
extractor.emit_entity_contexts = real_extractor.emit_entity_contexts
@ -156,7 +134,7 @@ class TestAgentKgExtractionIntegration:
agent_response = agent_client.invoke(recipient=lambda x: True, question=prompt)
# Parse and process
extraction_data = extractor.parse_json(agent_response)
extraction_data = extractor.parse_jsonl(agent_response)
triples, entity_contexts = extractor.process_extraction_data(extraction_data, v.metadata)
# Add metadata triples
@ -200,15 +178,15 @@ class TestAgentKgExtractionIntegration:
assert len(sent_triples.triples) > 0
# Check that we have definition triples
definition_triples = [t for t in sent_triples.triples if t.p.value == DEFINITION]
definition_triples = [t for t in sent_triples.triples if t.p.iri == DEFINITION]
assert len(definition_triples) >= 2 # Should have definitions for ML and Neural Networks
# Check that we have label triples
label_triples = [t for t in sent_triples.triples if t.p.value == RDF_LABEL]
label_triples = [t for t in sent_triples.triples if t.p.iri == RDF_LABEL]
assert len(label_triples) >= 2 # Should have labels for entities
# Check subject-of relationships
subject_of_triples = [t for t in sent_triples.triples if t.p.value == SUBJECT_OF]
subject_of_triples = [t for t in sent_triples.triples if t.p.iri == SUBJECT_OF]
assert len(subject_of_triples) >= 2 # Entities should be linked to document
# Verify entity contexts were emitted
@ -220,7 +198,7 @@ class TestAgentKgExtractionIntegration:
assert len(sent_contexts.entities) >= 2 # Should have contexts for both entities
# Verify entity URIs are properly formed
entity_uris = [ec.entity.value for ec in sent_contexts.entities]
entity_uris = [ec.entity.iri for ec in sent_contexts.entities]
assert f"{TRUSTGRAPH_ENTITIES}Machine%20Learning" in entity_uris
assert f"{TRUSTGRAPH_ENTITIES}Neural%20Networks" in entity_uris
@ -248,22 +226,28 @@ class TestAgentKgExtractionIntegration:
@pytest.mark.asyncio
async def test_invalid_json_response_handling(self, configured_agent_extractor, sample_chunk, mock_flow_context):
"""Test handling of invalid JSON responses from agent"""
"""Test handling of invalid JSON responses from agent - JSONL is lenient and skips invalid lines"""
# Arrange - mock invalid JSON response
agent_client = mock_flow_context("agent-request")
def mock_invalid_json_response(recipient, question):
return "This is not valid JSON at all"
agent_client.invoke = mock_invalid_json_response
mock_message = MagicMock()
mock_message.value.return_value = sample_chunk
mock_consumer = MagicMock()
# Act & Assert
with pytest.raises((ValueError, json.JSONDecodeError)):
await configured_agent_extractor.on_message(mock_message, mock_consumer, mock_flow_context)
# Act - JSONL parsing is lenient, invalid lines are skipped
await configured_agent_extractor.on_message(mock_message, mock_consumer, mock_flow_context)
# Assert - should emit triples (with just metadata) but no entity contexts
triples_publisher = mock_flow_context("triples")
triples_publisher.send.assert_called_once()
entity_contexts_publisher = mock_flow_context("entity-contexts")
entity_contexts_publisher.send.assert_not_called()
@pytest.mark.asyncio
async def test_empty_extraction_results(self, configured_agent_extractor, sample_chunk, mock_flow_context):
@ -272,7 +256,8 @@ class TestAgentKgExtractionIntegration:
agent_client = mock_flow_context("agent-request")
def mock_empty_response(recipient, question):
return '{"definitions": [], "relationships": []}'
# Return empty JSONL (just empty/whitespace)
return ''
agent_client.invoke = mock_empty_response
@ -303,7 +288,8 @@ class TestAgentKgExtractionIntegration:
agent_client = mock_flow_context("agent-request")
def mock_malformed_response(recipient, question):
return '''{"definitions": [{"entity": "Missing Definition"}], "relationships": [{"subject": "Missing Object"}]}'''
# JSONL with definition missing required field
return '{"type": "definition", "entity": "Missing Definition"}'
agent_client.invoke = mock_malformed_response
@ -330,7 +316,7 @@ class TestAgentKgExtractionIntegration:
def capture_prompt(recipient, question):
# Verify the prompt contains the test text
assert test_text in question
return '{"definitions": [], "relationships": []}'
return '' # Empty JSONL response
agent_client.invoke = capture_prompt
@ -361,7 +347,7 @@ class TestAgentKgExtractionIntegration:
responses = []
def mock_response(recipient, question):
response = f'{{"definitions": [{{"entity": "Entity {len(responses)}", "definition": "Definition {len(responses)}"}}], "relationships": []}}'
response = f'{{"type": "definition", "entity": "Entity {len(responses)}", "definition": "Definition {len(responses)}"}}'
responses.append(response)
return response
@ -398,7 +384,7 @@ class TestAgentKgExtractionIntegration:
# Verify unicode text was properly decoded and included
assert "学习机器" in question
assert "人工知能" in question
return '''{"definitions": [{"entity": "機械学習", "definition": "人工知能の一分野"}], "relationships": []}'''
return '{"type": "definition", "entity": "機械学習", "definition": "人工知能の一分野"}'
agent_client.invoke = mock_unicode_response
@ -415,7 +401,7 @@ class TestAgentKgExtractionIntegration:
sent_triples = triples_publisher.send.call_args[0][0]
# Check that unicode entity was properly processed
entity_labels = [t for t in sent_triples.triples if t.p.value == RDF_LABEL and t.o.value == "機械学習"]
entity_labels = [t for t in sent_triples.triples if t.p.iri == RDF_LABEL and t.o.value == "機械学習"]
assert len(entity_labels) > 0
@pytest.mark.asyncio
@ -433,7 +419,7 @@ class TestAgentKgExtractionIntegration:
def mock_large_text_response(recipient, question):
# Verify large text was included
assert len(question) > 10000
return '''{"definitions": [{"entity": "Machine Learning", "definition": "Important AI technique"}], "relationships": []}'''
return '{"type": "definition", "entity": "Machine Learning", "definition": "Important AI technique"}'
agent_client.invoke = mock_large_text_response