Merge 2.0 to master (#651)

2026-04-25 08:26:21 +02:00 · 2026-02-28 11:03:14 +00:00 · 2026-02-28 11:03:14 +00:00 · b9d7bf9a8b
commit b9d7bf9a8b
parent 3666ece2c5
212 changed files with 13940 additions and 6180 deletions
--- a/tests/unit/test_knowledge_graph/test_agent_extraction.py
+++ b/tests/unit/test_knowledge_graph/test_agent_extraction.py
@@ -11,7 +11,7 @@ import json
 from unittest.mock import AsyncMock, MagicMock, patch

 from trustgraph.extract.kg.agent.extract import Processor as AgentKgExtractor
-from trustgraph.schema import Chunk, Triple, Triples, Metadata, Value, Error
+from trustgraph.schema import Chunk, Triple, Triples, Metadata, Term, Error, IRI, LITERAL
 from trustgraph.schema import EntityContext, EntityContexts
 from trustgraph.rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
 from trustgraph.template.prompt_manager import PromptManager
@@ -33,7 +33,7 @@ class TestAgentKgExtractor:
        
        # Set up the methods we want to test
        extractor.to_uri = real_extractor.to_uri
-        extractor.parse_json = real_extractor.parse_json
+        extractor.parse_jsonl = real_extractor.parse_jsonl
        extractor.process_extraction_data = real_extractor.process_extraction_data
        extractor.emit_triples = real_extractor.emit_triples
        extractor.emit_entity_contexts = real_extractor.emit_entity_contexts
@@ -53,48 +53,49 @@ class TestAgentKgExtractor:
            id="doc123",
            metadata=[
                Triple(
-                    s=Value(value="doc123", is_uri=True),
-                    p=Value(value="http://example.org/type", is_uri=True),
-                    o=Value(value="document", is_uri=False)
+                    s=Term(type=IRI, iri="doc123"),
+                    p=Term(type=IRI, iri="http://example.org/type"),
+                    o=Term(type=LITERAL, value="document")
                )
            ]
        )

    @pytest.fixture
    def sample_extraction_data(self):
-        """Sample extraction data in expected format"""
-        return {
-            "definitions": [
-                {
-                    "entity": "Machine Learning",
-                    "definition": "A subset of artificial intelligence that enables computers to learn from data without explicit programming."
-                },
-                {
-                    "entity": "Neural Networks",
-                    "definition": "Computing systems inspired by biological neural networks that process information."
-                }
-            ],
-            "relationships": [
-                {
-                    "subject": "Machine Learning",
-                    "predicate": "is_subset_of",
-                    "object": "Artificial Intelligence",
-                    "object-entity": True
-                },
-                {
-                    "subject": "Neural Networks",
-                    "predicate": "used_in",
-                    "object": "Machine Learning",
-                    "object-entity": True
-                },
-                {
-                    "subject": "Deep Learning",
-                    "predicate": "accuracy",
-                    "object": "95%",
-                    "object-entity": False
-                }
-            ]
-        }
+        """Sample extraction data in JSONL format (list with type discriminators)"""
+        return [
+            {
+                "type": "definition",
+                "entity": "Machine Learning",
+                "definition": "A subset of artificial intelligence that enables computers to learn from data without explicit programming."
+            },
+            {
+                "type": "definition",
+                "entity": "Neural Networks",
+                "definition": "Computing systems inspired by biological neural networks that process information."
+            },
+            {
+                "type": "relationship",
+                "subject": "Machine Learning",
+                "predicate": "is_subset_of",
+                "object": "Artificial Intelligence",
+                "object-entity": True
+            },
+            {
+                "type": "relationship",
+                "subject": "Neural Networks",
+                "predicate": "used_in",
+                "object": "Machine Learning",
+                "object-entity": True
+            },
+            {
+                "type": "relationship",
+                "subject": "Deep Learning",
+                "predicate": "accuracy",
+                "object": "95%",
+                "object-entity": False
+            }
+        ]

    def test_to_uri_conversion(self, agent_extractor):
        """Test URI conversion for entities"""
@@ -113,148 +114,147 @@ class TestAgentKgExtractor:
        expected = f"{TRUSTGRAPH_ENTITIES}"
        assert uri == expected

-    def test_parse_json_with_code_blocks(self, agent_extractor):
-        """Test JSON parsing from code blocks"""
-        # Test JSON in code blocks
+    def test_parse_jsonl_with_code_blocks(self, agent_extractor):
+        """Test JSONL parsing from code blocks"""
+        # Test JSONL in code blocks - note: JSON uses lowercase true/false
        response = '''```json
-        {
-            "definitions": [{"entity": "AI", "definition": "Artificial Intelligence"}],
-            "relationships": []
-        }
-        ```'''
-        
-        result = agent_extractor.parse_json(response)
-        
-        assert result["definitions"][0]["entity"] == "AI"
-        assert result["definitions"][0]["definition"] == "Artificial Intelligence"
-        assert result["relationships"] == []
+{"type": "definition", "entity": "AI", "definition": "Artificial Intelligence"}
+{"type": "relationship", "subject": "AI", "predicate": "is", "object": "technology", "object-entity": false}
+```'''

-    def test_parse_json_without_code_blocks(self, agent_extractor):
-        """Test JSON parsing without code blocks"""
-        response = '''{"definitions": [{"entity": "ML", "definition": "Machine Learning"}], "relationships": []}'''
-        
-        result = agent_extractor.parse_json(response)
-        
-        assert result["definitions"][0]["entity"] == "ML"
-        assert result["definitions"][0]["definition"] == "Machine Learning"
+        result = agent_extractor.parse_jsonl(response)

-    def test_parse_json_invalid_format(self, agent_extractor):
-        """Test JSON parsing with invalid format"""
-        invalid_response = "This is not JSON at all"
-        
-        with pytest.raises(json.JSONDecodeError):
-            agent_extractor.parse_json(invalid_response)
+        assert len(result) == 2
+        assert result[0]["entity"] == "AI"
+        assert result[0]["definition"] == "Artificial Intelligence"
+        assert result[1]["type"] == "relationship"

-    def test_parse_json_malformed_code_blocks(self, agent_extractor):
-        """Test JSON parsing with malformed code blocks"""
-        # Missing closing backticks
-        response = '''```json
-        {"definitions": [], "relationships": []}
-        '''
-        
-        # Should still parse the JSON content
-        with pytest.raises(json.JSONDecodeError):
-            agent_extractor.parse_json(response)
+    def test_parse_jsonl_without_code_blocks(self, agent_extractor):
+        """Test JSONL parsing without code blocks"""
+        response = '''{"type": "definition", "entity": "ML", "definition": "Machine Learning"}
+{"type": "definition", "entity": "AI", "definition": "Artificial Intelligence"}'''
+
+        result = agent_extractor.parse_jsonl(response)
+
+        assert len(result) == 2
+        assert result[0]["entity"] == "ML"
+        assert result[1]["entity"] == "AI"
+
+    def test_parse_jsonl_invalid_lines_skipped(self, agent_extractor):
+        """Test JSONL parsing skips invalid lines gracefully"""
+        response = '''{"type": "definition", "entity": "Valid", "definition": "Valid def"}
+This is not JSON at all
+{"type": "definition", "entity": "Also Valid", "definition": "Another def"}'''
+
+        result = agent_extractor.parse_jsonl(response)
+
+        # Should get 2 valid objects, skipping the invalid line
+        assert len(result) == 2
+        assert result[0]["entity"] == "Valid"
+        assert result[1]["entity"] == "Also Valid"
+
+    def test_parse_jsonl_truncation_resilience(self, agent_extractor):
+        """Test JSONL parsing handles truncated responses"""
+        # Simulates output cut off mid-line
+        response = '''{"type": "definition", "entity": "Complete", "definition": "Full def"}
+{"type": "definition", "entity": "Trunca'''
+
+        result = agent_extractor.parse_jsonl(response)
+
+        # Should get 1 valid object, the truncated line is skipped
+        assert len(result) == 1
+        assert result[0]["entity"] == "Complete"

    def test_process_extraction_data_definitions(self, agent_extractor, sample_metadata):
        """Test processing of definition data"""
-        data = {
-            "definitions": [
-                {
-                    "entity": "Machine Learning",
-                    "definition": "A subset of AI that enables learning from data."
-                }
-            ],
-            "relationships": []
-        }
-        
+        data = [
+            {
+                "type": "definition",
+                "entity": "Machine Learning",
+                "definition": "A subset of AI that enables learning from data."
+            }
+        ]
+
        triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
        
        # Check entity label triple
-        label_triple = next((t for t in triples if t.p.value == RDF_LABEL and t.o.value == "Machine Learning"), None)
+        label_triple = next((t for t in triples if t.p.iri == RDF_LABEL and t.o.value == "Machine Learning"), None)
        assert label_triple is not None
-        assert label_triple.s.value == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
-        assert label_triple.s.is_uri == True
-        assert label_triple.o.is_uri == False
-        
+        assert label_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
+        assert label_triple.s.type == IRI
+        assert label_triple.o.type == LITERAL
+
        # Check definition triple
-        def_triple = next((t for t in triples if t.p.value == DEFINITION), None)
+        def_triple = next((t for t in triples if t.p.iri == DEFINITION), None)
        assert def_triple is not None
-        assert def_triple.s.value == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
+        assert def_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
        assert def_triple.o.value == "A subset of AI that enables learning from data."
-        
+
        # Check subject-of triple
-        subject_of_triple = next((t for t in triples if t.p.value == SUBJECT_OF), None)
+        subject_of_triple = next((t for t in triples if t.p.iri == SUBJECT_OF), None)
        assert subject_of_triple is not None
-        assert subject_of_triple.s.value == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
-        assert subject_of_triple.o.value == "doc123"
-        
+        assert subject_of_triple.s.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
+        assert subject_of_triple.o.iri == "doc123"
+
        # Check entity context
        assert len(entity_contexts) == 1
-        assert entity_contexts[0].entity.value == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
+        assert entity_contexts[0].entity.iri == f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
        assert entity_contexts[0].context == "A subset of AI that enables learning from data."

    def test_process_extraction_data_relationships(self, agent_extractor, sample_metadata):
        """Test processing of relationship data"""
-        data = {
-            "definitions": [],
-            "relationships": [
-                {
-                    "subject": "Machine Learning",
-                    "predicate": "is_subset_of",
-                    "object": "Artificial Intelligence",
-                    "object-entity": True
-                }
-            ]
-        }
-        
+        data = [
+            {
+                "type": "relationship",
+                "subject": "Machine Learning",
+                "predicate": "is_subset_of",
+                "object": "Artificial Intelligence",
+                "object-entity": True
+            }
+        ]
+
        triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
        
        # Check that subject, predicate, and object labels are created
        subject_uri = f"{TRUSTGRAPH_ENTITIES}Machine%20Learning"
        predicate_uri = f"{TRUSTGRAPH_ENTITIES}is_subset_of"
-        
+
        # Find label triples
-        subject_label = next((t for t in triples if t.s.value == subject_uri and t.p.value == RDF_LABEL), None)
+        subject_label = next((t for t in triples if t.s.iri == subject_uri and t.p.iri == RDF_LABEL), None)
        assert subject_label is not None
        assert subject_label.o.value == "Machine Learning"
-        
-        predicate_label = next((t for t in triples if t.s.value == predicate_uri and t.p.value == RDF_LABEL), None)
+
+        predicate_label = next((t for t in triples if t.s.iri == predicate_uri and t.p.iri == RDF_LABEL), None)
        assert predicate_label is not None
        assert predicate_label.o.value == "is_subset_of"
-        
-        # Check main relationship triple 
-        # NOTE: Current implementation has bugs:
-        # 1. Uses data.get("object-entity") instead of rel.get("object-entity")
-        # 2. Sets object_value to predicate_uri instead of actual object URI
-        # This test documents the current buggy behavior
-        rel_triple = next((t for t in triples if t.s.value == subject_uri and t.p.value == predicate_uri), None)
+
+        # Check main relationship triple
+        object_uri = f"{TRUSTGRAPH_ENTITIES}Artificial%20Intelligence"
+        rel_triple = next((t for t in triples if t.s.iri == subject_uri and t.p.iri == predicate_uri), None)
        assert rel_triple is not None
-        # Due to bug, object value is set to predicate_uri
-        assert rel_triple.o.value == predicate_uri
-        
+        assert rel_triple.o.iri == object_uri
+        assert rel_triple.o.type == IRI
+
        # Check subject-of relationships
-        subject_of_triples = [t for t in triples if t.p.value == SUBJECT_OF and t.o.value == "doc123"]
+        subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF and t.o.iri == "doc123"]
        assert len(subject_of_triples) >= 2  # At least subject and predicate should have subject-of relations

    def test_process_extraction_data_literal_object(self, agent_extractor, sample_metadata):
        """Test processing of relationships with literal objects"""
-        data = {
-            "definitions": [],
-            "relationships": [
-                {
-                    "subject": "Deep Learning",
-                    "predicate": "accuracy",
-                    "object": "95%",
-                    "object-entity": False
-                }
-            ]
-        }
-        
+        data = [
+            {
+                "type": "relationship",
+                "subject": "Deep Learning",
+                "predicate": "accuracy",
+                "object": "95%",
+                "object-entity": False
+            }
+        ]
+
        triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
-        
+
        # Check that object labels are not created for literal objects
-        object_labels = [t for t in triples if t.p.value == RDF_LABEL and t.o.value == "95%"]
+        object_labels = [t for t in triples if t.p.iri == RDF_LABEL and t.o.value == "95%"]
        # Based on the code logic, it should not create object labels for non-entity objects
        # But there might be a bug in the original implementation

@@ -263,75 +263,62 @@ class TestAgentKgExtractor:
        triples, entity_contexts = agent_extractor.process_extraction_data(sample_extraction_data, sample_metadata)
        
        # Check that we have both definition and relationship triples
-        definition_triples = [t for t in triples if t.p.value == DEFINITION]
+        definition_triples = [t for t in triples if t.p.iri == DEFINITION]
        assert len(definition_triples) == 2  # Two definitions
-        
+
        # Check entity contexts are created for definitions
        assert len(entity_contexts) == 2
-        entity_uris = [ec.entity.value for ec in entity_contexts]
+        entity_uris = [ec.entity.iri for ec in entity_contexts]
        assert f"{TRUSTGRAPH_ENTITIES}Machine%20Learning" in entity_uris
        assert f"{TRUSTGRAPH_ENTITIES}Neural%20Networks" in entity_uris

    def test_process_extraction_data_no_metadata_id(self, agent_extractor):
        """Test processing when metadata has no ID"""
        metadata = Metadata(id=None, metadata=[])
-        data = {
-            "definitions": [
-                {"entity": "Test Entity", "definition": "Test definition"}
-            ],
-            "relationships": []
-        }
-        
+        data = [
+            {"type": "definition", "entity": "Test Entity", "definition": "Test definition"}
+        ]
+
        triples, entity_contexts = agent_extractor.process_extraction_data(data, metadata)
-        
+
        # Should not create subject-of relationships when no metadata ID
-        subject_of_triples = [t for t in triples if t.p.value == SUBJECT_OF]
+        subject_of_triples = [t for t in triples if t.p.iri == SUBJECT_OF]
        assert len(subject_of_triples) == 0
-        
+
        # Should still create entity contexts
        assert len(entity_contexts) == 1

    def test_process_extraction_data_empty_data(self, agent_extractor, sample_metadata):
        """Test processing of empty extraction data"""
-        data = {"definitions": [], "relationships": []}
-        
-        triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
-        
-        # Should only have metadata triples
-        assert len(entity_contexts) == 0
-        # Triples should only contain metadata triples if any
+        data = []

-    def test_process_extraction_data_missing_keys(self, agent_extractor, sample_metadata):
-        """Test processing data with missing keys"""
-        # Test missing definitions key
-        data = {"relationships": []}
        triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
+
+        # Should have no entity contexts
        assert len(entity_contexts) == 0
-        
-        # Test missing relationships key
-        data = {"definitions": []}
+        # Triples should be empty
+        assert len(triples) == 0
+
+    def test_process_extraction_data_unknown_types_ignored(self, agent_extractor, sample_metadata):
+        """Test processing data with unknown type values"""
+        data = [
+            {"type": "definition", "entity": "Valid", "definition": "Valid def"},
+            {"type": "unknown_type", "foo": "bar"},  # Unknown type - should be ignored
+            {"type": "relationship", "subject": "A", "predicate": "rel", "object": "B", "object-entity": True}
+        ]
+
        triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
-        assert len(entity_contexts) == 0
-        
-        # Test completely missing keys
-        data = {}
-        triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
-        assert len(entity_contexts) == 0
+
+        # Should process valid items and ignore unknown types
+        assert len(entity_contexts) == 1  # Only the definition creates entity context

    def test_process_extraction_data_malformed_entries(self, agent_extractor, sample_metadata):
        """Test processing data with malformed entries"""
-        # Test definition missing required fields
-        data = {
-            "definitions": [
-                {"entity": "Test"},  # Missing definition
-                {"definition": "Test def"}  # Missing entity
-            ],
-            "relationships": [
-                {"subject": "A", "predicate": "rel"},  # Missing object
-                {"subject": "B", "object": "C"}  # Missing predicate
-            ]
-        }
-        
+        # Test items missing required fields - should raise KeyError
+        data = [
+            {"type": "definition", "entity": "Test"},  # Missing definition
+        ]
+
        # Should handle gracefully or raise appropriate errors
        with pytest.raises(KeyError):
            agent_extractor.process_extraction_data(data, sample_metadata)
@@ -340,17 +327,17 @@ class TestAgentKgExtractor:
    async def test_emit_triples(self, agent_extractor, sample_metadata):
        """Test emitting triples to publisher"""
        mock_publisher = AsyncMock()
-        
+
        test_triples = [
            Triple(
-                s=Value(value="test:subject", is_uri=True),
-                p=Value(value="test:predicate", is_uri=True),
-                o=Value(value="test object", is_uri=False)
+                s=Term(type=IRI, iri="test:subject"),
+                p=Term(type=IRI, iri="test:predicate"),
+                o=Term(type=LITERAL, value="test object")
            )
        ]
-        
+
        await agent_extractor.emit_triples(mock_publisher, sample_metadata, test_triples)
-        
+
        mock_publisher.send.assert_called_once()
        sent_triples = mock_publisher.send.call_args[0][0]
        assert isinstance(sent_triples, Triples)
@@ -361,22 +348,22 @@ class TestAgentKgExtractor:
        # Note: metadata.metadata is now empty array in the new implementation
        assert sent_triples.metadata.metadata == []
        assert len(sent_triples.triples) == 1
-        assert sent_triples.triples[0].s.value == "test:subject"
+        assert sent_triples.triples[0].s.iri == "test:subject"

    @pytest.mark.asyncio
    async def test_emit_entity_contexts(self, agent_extractor, sample_metadata):
        """Test emitting entity contexts to publisher"""
        mock_publisher = AsyncMock()
-        
+
        test_contexts = [
            EntityContext(
-                entity=Value(value="test:entity", is_uri=True),
+                entity=Term(type=IRI, iri="test:entity"),
                context="Test context"
            )
        ]
-        
+
        await agent_extractor.emit_entity_contexts(mock_publisher, sample_metadata, test_contexts)
-        
+
        mock_publisher.send.assert_called_once()
        sent_contexts = mock_publisher.send.call_args[0][0]
        assert isinstance(sent_contexts, EntityContexts)
@@ -387,7 +374,7 @@ class TestAgentKgExtractor:
        # Note: metadata.metadata is now empty array in the new implementation
        assert sent_contexts.metadata.metadata == []
        assert len(sent_contexts.entities) == 1
-        assert sent_contexts.entities[0].entity.value == "test:entity"
+        assert sent_contexts.entities[0].entity.iri == "test:entity"

    def test_agent_extractor_initialization_params(self):
        """Test agent extractor parameter validation"""