fix: ontology extractor reads .objects, not .object, from PromptResult (#842)

The extract-with-ontologies prompt is a JSONL prompt, which means the prompt service returns a PromptResult with response_type="jsonl" and the parsed items in `.objects` (plural). The ontology extractor was reading `.object` (singular) — the field used for response_type="json" — which is always None for JSONL prompts.

Effect: the parser received None on every chunk, hit its "Unexpected response type: <class 'NoneType'>" branch, returned no ExtractionResult, and extract_with_simplified_format returned []. Every extraction silently produced zero triples. Graphs were populated only with the seed ontology schema (TBox) and document/chunk provenance — no instance triples at all. The e2e test threshold of >=100 edges per collection was met by schema + provenance alone, so the failure mode was invisible until RAG queries couldn't find any content.

Regression introduced in v2.3 with the token-usage work (commit 56d700f3 / 14e49d83) when PromptClient.prompt() began returning a PromptResult wrapper instead of the raw text/dict/list. All other call sites of .prompt() across retrieval/, agent/, orchestrator/ were already reading the correct field for their prompt's response_type; ontology extraction was the sole stranded caller.

Also adds tests/unit/test_extract/test_ontology/test_extract_with_simplified_format.py covering:

- happy path: populated .objects produces non-empty triples
- production failure shape: .objects=None returns [] cleanly
- empty .objects returns [] without raising
- defensive: do not silently fall back to .object for a JSONL prompt
2026-06-10 23:35:14 +02:00 · 2026-04-22 12:05:47 +01:00 · 2026-04-22 12:05:47 +01:00 · f04f7fa154
commit f04f7fa154
parent 6027ba6bb5
2 changed files with 207 additions and 1 deletions
--- a/tests/unit/test_extract/test_ontology/test_extract_with_simplified_format.py
+++ b/tests/unit/test_extract/test_ontology/test_extract_with_simplified_format.py
@@ -0,0 +1,200 @@
+"""
+Unit tests for extract_with_simplified_format.
+
+Regression guard for the bug where the extractor read
+``result.object`` (singular, used for response_type="json") instead of
+``result.objects`` (plural, used for response_type="jsonl"). The
+extract-with-ontologies prompt is JSONL, so reading the wrong field
+silently dropped every extraction and left the knowledge graph
+populated only by ontology schema + document provenance.
+"""
+
+import pytest
+from unittest.mock import AsyncMock, MagicMock
+
+from trustgraph.extract.kg.ontology.extract import Processor
+from trustgraph.extract.kg.ontology.ontology_selector import OntologySubset
+from trustgraph.base import PromptResult
+
+
+@pytest.fixture
+def extractor():
+    """Return a bare Processor built with object.__new__ (no __init__).
+
+    Only URI_PREFIXES (rdf/rdfs/owl/xsd namespaces) is set on it; this
+    follows the pattern used in test_prompt_and_extraction.py.
+    """
+    ex = object.__new__(Processor)
+    ex.URI_PREFIXES = {
+        "rdf:":  "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+        "rdfs:": "http://www.w3.org/2000/01/rdf-schema#",
+        "owl:":  "http://www.w3.org/2002/07/owl#",
+        "xsd:":  "http://www.w3.org/2001/XMLSchema#",
+    }
+    return ex
+
+
+@pytest.fixture
+def food_subset():
+    """Minimal food ontology: Recipe/Food classes + ingredients property."""
+    return OntologySubset(
+        ontology_id="food",
+        classes={
+            "Recipe": {
+                "uri": "http://purl.org/ontology/fo/Recipe",
+                "type": "owl:Class",
+                "labels": [{"value": "Recipe", "lang": "en-gb"}],
+                "comment": "A Recipe.",
+            },
+            "Food": {
+                "uri": "http://purl.org/ontology/fo/Food",
+                "type": "owl:Class",
+                "labels": [{"value": "Food", "lang": "en-gb"}],
+                "comment": "A Food.",
+            },
+        },
+        object_properties={
+            "ingredients": {
+                "uri": "http://purl.org/ontology/fo/ingredients",
+                "type": "owl:ObjectProperty",
+                "labels": [{"value": "ingredients", "lang": "en-gb"}],
+                "comment": "Relates a recipe to its ingredients.",
+                "domain": "Recipe",
+                "range": "Food",
+            },
+        },
+        datatype_properties={},
+        metadata={
+            "name": "Food Ontology",
+            "namespace": "http://purl.org/ontology/fo/",
+        },
+    )
+
+
+def _flow_with_prompt_result(prompt_result):
+    """Return ``(flow, prompt_mock)`` for driving the extractor in tests.
+
+    ``flow(name)`` asserts ``name == "prompt-request"`` and returns a
+    MagicMock whose ``.prompt`` is an AsyncMock resolving to
+    ``prompt_result``; that same AsyncMock is the second tuple item,
+    so callers can assert on how it was awaited.
+    """
+    prompt_service = MagicMock()
+    prompt_service.prompt = AsyncMock(return_value=prompt_result)
+
+    def flow(name):
+        assert name == "prompt-request", (
+            f"extractor should only invoke flow('prompt-request'), "
+            f"got {name!r}"
+        )
+        return prompt_service
+
+    return flow, prompt_service.prompt
+
+
+class TestReadsObjectsForJsonlPrompt:
+    """extract-with-ontologies is a JSONL prompt; the extractor must
+    read ``result.objects``, not ``result.object``."""
+
+    async def test_populated_objects_produces_triples(
+            self, extractor, food_subset,
+    ):
+        """Happy path: PromptResult with populated .objects -> non-empty
+        triples list."""
+
+        prompt_result = PromptResult(
+            response_type="jsonl",
+            objects=[
+                {"type": "entity", "entity": "Cornish Pasty",
+                 "entity_type": "Recipe"},
+                {"type": "entity", "entity": "beef",
+                 "entity_type": "Food"},
+                {"type": "relationship",
+                 "subject": "Cornish Pasty", "subject_type": "Recipe",
+                 "relation": "ingredients",
+                 "object": "beef", "object_type": "Food"},
+            ],
+        )
+
+        flow, prompt_mock = _flow_with_prompt_result(prompt_result)
+
+        triples = await extractor.extract_with_simplified_format(
+            flow, "some chunk", food_subset, {"text": "some chunk"},
+        )
+
+        prompt_mock.assert_awaited_once()
+        assert triples, (
+            "extract_with_simplified_format returned no triples; if "
+            "this fails, the extractor is probably reading .object "
+            "instead of .objects again"
+        )
+
+    async def test_none_objects_returns_empty_without_crashing(
+            self, extractor, food_subset,
+    ):
+        """Feeds the parser the same ``None`` it received in production
+        on v2.3, when the extractor read ``.object`` for a JSONL prompt
+        and tripped the parser's 'Unexpected response type' path.
+        With the fix we read ``.objects``; if that is itself ``None``
+        we must still return ``[]`` cleanly, not crash."""
+
+        prompt_result = PromptResult(
+            response_type="jsonl",
+            objects=None,
+        )
+
+        flow, _ = _flow_with_prompt_result(prompt_result)
+
+        triples = await extractor.extract_with_simplified_format(
+            flow, "chunk", food_subset, {"text": "chunk"},
+        )
+
+        assert triples == []
+
+    async def test_empty_objects_returns_empty(
+            self, extractor, food_subset,
+    ):
+        """Valid JSONL response with zero entries should yield zero
+        triples, not raise."""
+
+        prompt_result = PromptResult(
+            response_type="jsonl",
+            objects=[],
+        )
+
+        flow, _ = _flow_with_prompt_result(prompt_result)
+
+        triples = await extractor.extract_with_simplified_format(
+            flow, "chunk", food_subset, {"text": "chunk"},
+        )
+
+        assert triples == []
+
+    async def test_ignores_object_field_for_jsonl_prompt(
+            self, extractor, food_subset,
+    ):
+        """If ``.object`` is set but ``.objects`` is None, the extractor
+        must not silently fall back to ``.object``.  This guards
+        against a well-meaning regression that re-adds fallback fields.
+
+        The result is built with ``response_type="json"`` so ``.object``
+        is the populated field; the extractor should still read only
+        ``.objects`` and take the empty-result path.
+        """
+
+        prompt_result = PromptResult(
+            response_type="json",
+            object={"not": "the field we should be reading"},
+            objects=None,
+        )
+
+        flow, _ = _flow_with_prompt_result(prompt_result)
+
+        triples = await extractor.extract_with_simplified_format(
+            flow, "chunk", food_subset, {"text": "chunk"},
+        )
+
+        assert triples == [], (
+            "Extractor fell back to .object for a JSONL prompt — "
+            "this is the regression shape we are trying to prevent"
+        )
--- a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py
+++ b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py
@@ -380,7 +380,13 @@ class Processor(FlowProcessor):
                id="extract-with-ontologies",
                variables=prompt_variables
            )
-            extraction_response = result.object
+
+            # extract-with-ontologies is a JSONL prompt, so PromptResult
+            # always populates .objects (a list of dicts).  Reading .object
+            # (singular) silently gives None for JSONL responses and drops
+            # every extraction.
+            extraction_response = result.objects
+
            logger.debug(f"Simplified extraction response: {extraction_response}")

            # Parse response into structured format