From bcd7a1694aeb5a0e44b5fee250786aa9bc0bbde7 Mon Sep 17 00:00:00 2001 From: Cyber MacGeddon Date: Wed, 22 Apr 2026 11:47:20 +0100 Subject: [PATCH] fix: ontology extractor reads .objects, not .object, from PromptResult MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The extract-with-ontologies prompt is a JSONL prompt, which means the prompt service returns a PromptResult with response_type="jsonl" and the parsed items in `.objects` (plural). The ontology extractor was reading `.object` (singular) — the field used for response_type="json" — which is always None for JSONL prompts. Effect: the parser received None on every chunk, hit its "Unexpected response type: " branch, returned no ExtractionResult, and extract_with_simplified_format returned []. Every extraction silently produced zero triples. Graphs were populated only with the seed ontology schema (TBox) and document/chunk provenance — no instance triples at all. The e2e test threshold of >=100 edges per collection was met by schema + provenance alone, so the failure mode was invisible until RAG queries couldn't find any content. The regression was introduced in v2.3 with the token-usage work (commit 56d700f3 / 14e49d83) when PromptClient.prompt() began returning a PromptResult wrapper instead of the raw text/dict/list. All other call sites of .prompt() across retrieval/, agent/, orchestrator/ were already reading the correct field for their prompt's response_type; ontology extraction was the sole stranded caller. 
Also adds tests/unit/test_extract/test_ontology/test_extract_with_simplified_format.py covering: - happy path: populated .objects produces non-empty triples - production failure shape: .objects=None returns [] cleanly - empty .objects returns [] without raising - defensive: do not silently fall back to .object for a JSONL prompt --- .../test_extract_with_simplified_format.py | 200 ++++++++++++++++++ .../trustgraph/extract/kg/ontology/extract.py | 8 +- 2 files changed, 207 insertions(+), 1 deletion(-) create mode 100644 tests/unit/test_extract/test_ontology/test_extract_with_simplified_format.py diff --git a/tests/unit/test_extract/test_ontology/test_extract_with_simplified_format.py b/tests/unit/test_extract/test_ontology/test_extract_with_simplified_format.py new file mode 100644 index 00000000..7130bd73 --- /dev/null +++ b/tests/unit/test_extract/test_ontology/test_extract_with_simplified_format.py @@ -0,0 +1,200 @@ +""" +Unit tests for extract_with_simplified_format. + +Regression guard for the bug where the extractor read +``result.object`` (singular, used for response_type="json") instead of +``result.objects`` (plural, used for response_type="jsonl"). The +extract-with-ontologies prompt is JSONL, so reading the wrong field +silently dropped every extraction and left the knowledge graph +populated only by ontology schema + document provenance. +""" + +import pytest +from unittest.mock import AsyncMock, MagicMock + +from trustgraph.extract.kg.ontology.extract import Processor +from trustgraph.extract.kg.ontology.ontology_selector import OntologySubset +from trustgraph.base import PromptResult + + +@pytest.fixture +def extractor(): + """Create a Processor instance without running its heavy __init__. + + Matches the pattern used in test_prompt_and_extraction.py: only + the attributes the code under test touches need to be set. 
+ """ + ex = object.__new__(Processor) + ex.URI_PREFIXES = { + "rdf:": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "rdfs:": "http://www.w3.org/2000/01/rdf-schema#", + "owl:": "http://www.w3.org/2002/07/owl#", + "xsd:": "http://www.w3.org/2001/XMLSchema#", + } + return ex + + +@pytest.fixture +def food_subset(): + """A minimal food ontology subset the extracted entities reference.""" + return OntologySubset( + ontology_id="food", + classes={ + "Recipe": { + "uri": "http://purl.org/ontology/fo/Recipe", + "type": "owl:Class", + "labels": [{"value": "Recipe", "lang": "en-gb"}], + "comment": "A Recipe.", + }, + "Food": { + "uri": "http://purl.org/ontology/fo/Food", + "type": "owl:Class", + "labels": [{"value": "Food", "lang": "en-gb"}], + "comment": "A Food.", + }, + }, + object_properties={ + "ingredients": { + "uri": "http://purl.org/ontology/fo/ingredients", + "type": "owl:ObjectProperty", + "labels": [{"value": "ingredients", "lang": "en-gb"}], + "comment": "Relates a recipe to its ingredients.", + "domain": "Recipe", + "range": "Food", + }, + }, + datatype_properties={}, + metadata={ + "name": "Food Ontology", + "namespace": "http://purl.org/ontology/fo/", + }, + ) + + +def _flow_with_prompt_result(prompt_result): + """Build the ``flow(name)`` callable the extractor invokes. + + ``extract_with_simplified_format`` calls + ``flow("prompt-request").prompt(...)`` — so we need ``flow`` to be + callable, return an object whose ``.prompt`` is an AsyncMock that + resolves to ``prompt_result``. 
+ """ + prompt_service = MagicMock() + prompt_service.prompt = AsyncMock(return_value=prompt_result) + + def flow(name): + assert name == "prompt-request", ( + f"extractor should only invoke flow('prompt-request'), " + f"got {name!r}" + ) + return prompt_service + + return flow, prompt_service.prompt + + +class TestReadsObjectsForJsonlPrompt: + """extract-with-ontologies is a JSONL prompt; the extractor must + read ``result.objects``, not ``result.object``.""" + + async def test_populated_objects_produces_triples( + self, extractor, food_subset, + ): + """Happy path: PromptResult with populated .objects -> non-empty + triples list.""" + + prompt_result = PromptResult( + response_type="jsonl", + objects=[ + {"type": "entity", "entity": "Cornish Pasty", + "entity_type": "Recipe"}, + {"type": "entity", "entity": "beef", + "entity_type": "Food"}, + {"type": "relationship", + "subject": "Cornish Pasty", "subject_type": "Recipe", + "relation": "ingredients", + "object": "beef", "object_type": "Food"}, + ], + ) + + flow, prompt_mock = _flow_with_prompt_result(prompt_result) + + triples = await extractor.extract_with_simplified_format( + flow, "some chunk", food_subset, {"text": "some chunk"}, + ) + + prompt_mock.assert_awaited_once() + assert triples, ( + "extract_with_simplified_format returned no triples; if " + "this fails, the extractor is probably reading .object " + "instead of .objects again" + ) + + async def test_none_objects_returns_empty_without_crashing( + self, extractor, food_subset, + ): + """The exact shape that hit production on v2.3: the extractor + was reading ``.object`` for a JSONL prompt, which returned + ``None`` and tripped the parser's 'Unexpected response type' + path. 
With the fix we read ``.objects``; if that's also + ``None`` we must still return ``[]`` cleanly, not crash.""" + + prompt_result = PromptResult( + response_type="jsonl", + objects=None, + ) + + flow, _ = _flow_with_prompt_result(prompt_result) + + triples = await extractor.extract_with_simplified_format( + flow, "chunk", food_subset, {"text": "chunk"}, + ) + + assert triples == [] + + async def test_empty_objects_returns_empty( + self, extractor, food_subset, + ): + """Valid JSONL response with zero entries should yield zero + triples, not raise.""" + + prompt_result = PromptResult( + response_type="jsonl", + objects=[], + ) + + flow, _ = _flow_with_prompt_result(prompt_result) + + triples = await extractor.extract_with_simplified_format( + flow, "chunk", food_subset, {"text": "chunk"}, + ) + + assert triples == [] + + async def test_ignores_object_field_for_jsonl_prompt( + self, extractor, food_subset, + ): + """If ``.object`` is somehow set but ``.objects`` is None, the + extractor must not silently fall back to ``.object``. This + guards against a well-meaning regression that "helpfully" + re-adds fallback fields. + + The extractor should read only ``.objects`` for this prompt; + when that is None we expect the empty-result path. 
+ """ + + prompt_result = PromptResult( + response_type="json", + object={"not": "the field we should be reading"}, + objects=None, + ) + + flow, _ = _flow_with_prompt_result(prompt_result) + + triples = await extractor.extract_with_simplified_format( + flow, "chunk", food_subset, {"text": "chunk"}, + ) + + assert triples == [], ( + "Extractor fell back to .object for a JSONL prompt — " + "this is the regression shape we are trying to prevent" + ) diff --git a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py index e024ad40..cb090589 100644 --- a/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py +++ b/trustgraph-flow/trustgraph/extract/kg/ontology/extract.py @@ -380,7 +380,13 @@ class Processor(FlowProcessor): id="extract-with-ontologies", variables=prompt_variables ) - extraction_response = result.object + + # extract-with-ontologies is a JSONL prompt, so PromptResult + # always populates .objects (a list of dicts). Reading .object + # (singular) silently gives None for JSONL responses and drops + # every extraction. + extraction_response = result.objects + logger.debug(f"Simplified extraction response: {extraction_response}") # Parse response into structured format