Merge 2.0 to master (#651)

2026-04-29 10:26:21 +02:00 · 2026-02-28 11:03:14 +00:00 · 2026-02-28 11:03:14 +00:00 · b9d7bf9a8b
commit b9d7bf9a8b
parent 3666ece2c5
212 changed files with 13940 additions and 6180 deletions
--- a/tests/integration/test_agent_kg_extraction_integration.py
+++ b/tests/integration/test_agent_kg_extraction_integration.py
@ -12,7 +12,7 @@ import json
 from unittest.mock import AsyncMock, MagicMock, patch

 from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
-from trustgraph.schema import Chunk, Triple, Triples, Metadata, Value, Error
+from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
 from trustgraph.schema import EntityContext, EntityContexts, AgentRequest, AgentResponse
 from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
 from trustgraph.template.prompt_manager import PromptManager
@ -30,38 +30,16 @@ class TestAgentKgExtractionIntegration:
        # Mock agent client
        agent_client = AsyncMock()
        
-        # Mock successful agent response
+        # Mock successful agent response in JSONL format
        def mock_agent_response(recipient, question):
-            # Simulate agent processing and return structured response
+            # Simulate agent processing and return structured JSONL response
            mock_response = MagicMock()
            mock_response.error = None
            mock_response.answer = '''```json
-{
-    "definitions": [
-        {
-            "entity": "Machine Learning",
-            "definition": "A subset of artificial intelligence that enables computers to learn from data without explicit programming."
-        },
-        {
-            "entity": "Neural Networks", 
-            "definition": "Computing systems inspired by biological neural networks that process information."
-        }
-    ],
-    "relationships": [
-        {
-            "subject": "Machine Learning",
-            "predicate": "is_subset_of", 
-            "object": "Artificial Intelligence",
-            "object-entity": true
-        },
-        {
-            "subject": "Neural Networks",
-            "predicate": "used_in",
-            "object": "Machine Learning", 
-            "object-entity": true
-        }
-    ]
-}
+{"type": "definition", "entity": "Machine Learning", "definition": "A subset of artificial intelligence that enables computers to learn from data without explicit programming."}
+{"type": "definition", "entity": "Neural Networks", "definition": "Computing systems inspired by biological neural networks that process information."}
+{"type": "relationship", "subject": "Machine Learning", "predicate": "is_subset_of", "object": "Artificial Intelligence", "object-entity": true}
+{"type": "relationship", "subject": "Neural Networks", "predicate": "used_in", "object": "Machine Learning", "object-entity": true}
 ```'''
            return mock_response.answer
        
@ -100,9 +78,9 @@ class TestAgentKgExtractionIntegration:
                id="doc123",
                metadata=[
                    Triple(
-                        s=Value(value="doc123", is_uri=True),
-                        p=Value(value="http://example.org/type", is_uri=True),
-                        o=Value(value="document", is_uri=False)
+                        s=Term(type=IRI, iri="doc123"),
+                        p=Term(type=IRI, iri="http://example.org/type"),
+                        o=Term(type=LITERAL, value="document")
                    )
                ]
            )
@ -120,7 +98,7 @@ class TestAgentKgExtractionIntegration:
        
        # Copy the methods we want to test
        extractor.to_uri = real_extractor.to_uri
-        extractor.parse_json = real_extractor.parse_json
+        extractor.parse_jsonl = real_extractor.parse_jsonl
        extractor.process_extraction_data = real_extractor.process_extraction_data
        extractor.emit_triples = real_extractor.emit_triples
        extractor.emit_entity_contexts = real_extractor.emit_entity_contexts
@ -156,7 +134,7 @@ class TestAgentKgExtractionIntegration:
            agent_response = agent_client.invoke(recipient=lambda x: True, question=prompt)
            
            # Parse and process
-            extraction_data = extractor.parse_json(agent_response)
+            extraction_data = extractor.parse_jsonl(agent_response)
            triples, entity_contexts = extractor.process_extraction_data(extraction_data, v.metadata)
            
            # Add metadata triples
@ -200,15 +178,15 @@ class TestAgentKgExtractionIntegration:
        assert len(sent_triples.triples) > 0
        
        # Check that we have definition triples
-        definition_triples = [t for t in sent_triples.triples if t.p.value == DEFINITION]
+        definition_triples = [t for t in sent_triples.triples if t.p.iri == DEFINITION]
        assert len(definition_triples) >= 2  # Should have definitions for ML and Neural Networks
-        
+
        # Check that we have label triples
-        label_triples = [t for t in sent_triples.triples if t.p.value == RDF_LABEL]
+        label_triples = [t for t in sent_triples.triples if t.p.iri == RDF_LABEL]
        assert len(label_triples) >= 2  # Should have labels for entities
-        
+
        # Check subject-of relationships
-        subject_of_triples = [t for t in sent_triples.triples if t.p.value == SUBJECT_OF]
+        subject_of_triples = [t for t in sent_triples.triples if t.p.iri == SUBJECT_OF]
        assert len(subject_of_triples) >= 2  # Entities should be linked to document

        # Verify entity contexts were emitted
@ -220,7 +198,7 @@ class TestAgentKgExtractionIntegration:
        assert len(sent_contexts.entities) >= 2  # Should have contexts for both entities
        
        # Verify entity URIs are properly formed
-        entity_uris = [ec.entity.value for ec in sent_contexts.entities]
+        entity_uris = [ec.entity.iri for ec in sent_contexts.entities]
        assert f"{TRUSTGRAPH_ENTITIES}Machine%20Learning" in entity_uris
        assert f"{TRUSTGRAPH_ENTITIES}Neural%20Networks" in entity_uris

@ -248,22 +226,28 @@ class TestAgentKgExtractionIntegration:

    @pytest.mark.asyncio
    async def test_invalid_json_response_handling(self, configured_agent_extractor, sample_chunk, mock_flow_context):
-        """Test handling of invalid JSON responses from agent"""
+        """Test handling of invalid JSON responses from agent - JSONL is lenient and skips invalid lines"""
        # Arrange - mock invalid JSON response
        agent_client = mock_flow_context("agent-request")
-        
+
        def mock_invalid_json_response(recipient, question):
            return "This is not valid JSON at all"
-        
+
        agent_client.invoke = mock_invalid_json_response
-        
+
        mock_message = MagicMock()
        mock_message.value.return_value = sample_chunk
        mock_consumer = MagicMock()

-        # Act & Assert
-        with pytest.raises((ValueError, json.JSONDecodeError)):
-            await configured_agent_extractor.on_message(mock_message, mock_consumer, mock_flow_context)
+        # Act - JSONL parsing is lenient, invalid lines are skipped
+        await configured_agent_extractor.on_message(mock_message, mock_consumer, mock_flow_context)
+
+        # Assert - should emit triples (with just metadata) but no entity contexts
+        triples_publisher = mock_flow_context("triples")
+        triples_publisher.send.assert_called_once()
+
+        entity_contexts_publisher = mock_flow_context("entity-contexts")
+        entity_contexts_publisher.send.assert_not_called()

    @pytest.mark.asyncio
    async def test_empty_extraction_results(self, configured_agent_extractor, sample_chunk, mock_flow_context):
@ -272,7 +256,8 @@ class TestAgentKgExtractionIntegration:
        agent_client = mock_flow_context("agent-request")
        
        def mock_empty_response(recipient, question):
-            return '{"definitions": [], "relationships": []}'
+            # Return an empty JSONL response (empty string, no records)
+            return ''
        
        agent_client.invoke = mock_empty_response
        
@ -303,7 +288,8 @@ class TestAgentKgExtractionIntegration:
        agent_client = mock_flow_context("agent-request")
        
        def mock_malformed_response(recipient, question):
-            return '''{"definitions": [{"entity": "Missing Definition"}], "relationships": [{"subject": "Missing Object"}]}'''
+            # JSONL definition record that omits its "definition" field
+            return '{"type": "definition", "entity": "Missing Definition"}'
        
        agent_client.invoke = mock_malformed_response
        
@ -330,7 +316,7 @@ class TestAgentKgExtractionIntegration:
        def capture_prompt(recipient, question):
            # Verify the prompt contains the test text
            assert test_text in question
-            return '{"definitions": [], "relationships": []}'
+            return ''  # Empty JSONL response
        
        agent_client.invoke = capture_prompt
        
@ -361,7 +347,7 @@ class TestAgentKgExtractionIntegration:
        responses = []
        
        def mock_response(recipient, question):
-            response = f'{{"definitions": [{{"entity": "Entity {len(responses)}", "definition": "Definition {len(responses)}"}}], "relationships": []}}'
+            response = f'{{"type": "definition", "entity": "Entity {len(responses)}", "definition": "Definition {len(responses)}"}}'
            responses.append(response)
            return response
        
@ -398,7 +384,7 @@ class TestAgentKgExtractionIntegration:
            # Verify unicode text was properly decoded and included
            assert "学习机器" in question
            assert "人工知能" in question
-            return '''{"definitions": [{"entity": "機械学習", "definition": "人工知能の一分野"}], "relationships": []}'''
+            return '{"type": "definition", "entity": "機械学習", "definition": "人工知能の一分野"}'
        
        agent_client.invoke = mock_unicode_response
        
@ -415,7 +401,7 @@ class TestAgentKgExtractionIntegration:
        
        sent_triples = triples_publisher.send.call_args[0][0]
        # Check that unicode entity was properly processed
-        entity_labels = [t for t in sent_triples.triples if t.p.value == RDF_LABEL and t.o.value == "機械学習"]
+        entity_labels = [t for t in sent_triples.triples if t.p.iri == RDF_LABEL and t.o.value == "機械学習"]
        assert len(entity_labels) > 0

    @pytest.mark.asyncio
@ -433,7 +419,7 @@ class TestAgentKgExtractionIntegration:
        def mock_large_text_response(recipient, question):
            # Verify large text was included
            assert len(question) > 10000
-            return '''{"definitions": [{"entity": "Machine Learning", "definition": "Important AI technique"}], "relationships": []}'''
+            return '{"type": "definition", "entity": "Machine Learning", "definition": "Important AI technique"}'
        
        agent_client.invoke = mock_large_text_response