mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
fix: ontology extractor reads .objects, not .object, from PromptResult (#842)
The extract-with-ontologies prompt is a JSONL prompt, which means the prompt service returns a PromptResult with response_type="jsonl" and the parsed items in `.objects` (plural). The ontology extractor was reading `.object` (singular) — the field used for response_type="json" — which is always None for JSONL prompts.

Effect: the parser received None on every chunk, hit its "Unexpected response type: <class 'NoneType'>" branch, returned no ExtractionResult, and extract_with_simplified_format returned []. Every extraction silently produced zero triples. Graphs were populated only with the seed ontology schema (TBox) and document/chunk provenance — no instance triples at all. The e2e test threshold of >=100 edges per collection was met by schema + provenance alone, so the failure mode was invisible until RAG queries couldn't find any content.

Regression introduced in v2.3 with the token-usage work (commits 56d700f3/14e49d83) when PromptClient.prompt() began returning a PromptResult wrapper instead of the raw text/dict/list. All other call sites of .prompt() across retrieval/, agent/, orchestrator/ were already reading the correct field for their prompt's response_type; ontology extraction was the sole stranded caller.

Also adds tests/unit/test_extract/test_ontology/test_extract_with_simplified_format.py covering:
- happy path: populated .objects produces non-empty triples
- production failure shape: .objects=None returns [] cleanly
- empty .objects returns [] without raising
- defensive: do not silently fall back to .object for a JSONL prompt
This commit is contained in:
parent
8be128aa59
commit
6cbaf88fc6
2 changed files with 207 additions and 1 deletions
|
|
@ -0,0 +1,200 @@
|
|||
"""
|
||||
Unit tests for extract_with_simplified_format.
|
||||
|
||||
Regression guard for the bug where the extractor read
|
||||
``result.object`` (singular, used for response_type="json") instead of
|
||||
``result.objects`` (plural, used for response_type="jsonl"). The
|
||||
extract-with-ontologies prompt is JSONL, so reading the wrong field
|
||||
silently dropped every extraction and left the knowledge graph
|
||||
populated only by ontology schema + document provenance.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
from trustgraph.extract.kg.ontology.extract import Processor
|
||||
from trustgraph.extract.kg.ontology.ontology_selector import OntologySubset
|
||||
from trustgraph.base import PromptResult
|
||||
|
||||
|
||||
@pytest.fixture
def extractor():
    """Provide a bare ``Processor`` whose ``__init__`` has never run.

    The instance is allocated directly with ``object.__new__`` and is
    given only a ``URI_PREFIXES`` mapping, i.e. just the attributes the
    code under test is expected to touch (the same minimal-setup
    pattern as test_prompt_and_extraction.py).
    """
    namespace_prefixes = {
        "rdf:": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "rdfs:": "http://www.w3.org/2000/01/rdf-schema#",
        "owl:": "http://www.w3.org/2002/07/owl#",
        "xsd:": "http://www.w3.org/2001/XMLSchema#",
    }

    # Skip Processor.__init__ entirely; only attach what the test needs.
    processor = object.__new__(Processor)
    processor.URI_PREFIXES = namespace_prefixes
    return processor
|
||||
|
||||
|
||||
@pytest.fixture
def food_subset():
    """Return a small ``OntologySubset`` for the "food" ontology.

    The subset declares two classes (``Recipe`` and ``Food``) and one
    object property (``ingredients``, domain ``Recipe``, range ``Food``)
    -- the names the extracted entities in this module refer to.
    """
    recipe_class = {
        "uri": "http://purl.org/ontology/fo/Recipe",
        "type": "owl:Class",
        "labels": [{"value": "Recipe", "lang": "en-gb"}],
        "comment": "A Recipe.",
    }
    food_class = {
        "uri": "http://purl.org/ontology/fo/Food",
        "type": "owl:Class",
        "labels": [{"value": "Food", "lang": "en-gb"}],
        "comment": "A Food.",
    }
    ingredients_property = {
        "uri": "http://purl.org/ontology/fo/ingredients",
        "type": "owl:ObjectProperty",
        "labels": [{"value": "ingredients", "lang": "en-gb"}],
        "comment": "Relates a recipe to its ingredients.",
        "domain": "Recipe",
        "range": "Food",
    }
    ontology_metadata = {
        "name": "Food Ontology",
        "namespace": "http://purl.org/ontology/fo/",
    }

    return OntologySubset(
        ontology_id="food",
        classes={"Recipe": recipe_class, "Food": food_class},
        object_properties={"ingredients": ingredients_property},
        datatype_properties={},
        metadata=ontology_metadata,
    )
|
||||
|
||||
|
||||
def _flow_with_prompt_result(prompt_result):
|
||||
"""Build the ``flow(name)`` callable the extractor invokes.
|
||||
|
||||
``extract_with_simplified_format`` calls
|
||||
``flow("prompt-request").prompt(...)`` — so we need ``flow`` to be
|
||||
callable, return an object whose ``.prompt`` is an AsyncMock that
|
||||
resolves to ``prompt_result``.
|
||||
"""
|
||||
prompt_service = MagicMock()
|
||||
prompt_service.prompt = AsyncMock(return_value=prompt_result)
|
||||
|
||||
def flow(name):
|
||||
assert name == "prompt-request", (
|
||||
f"extractor should only invoke flow('prompt-request'), "
|
||||
f"got {name!r}"
|
||||
)
|
||||
return prompt_service
|
||||
|
||||
return flow, prompt_service.prompt
|
||||
|
||||
|
||||
# Explicit marker: without it these coroutine tests only run when the
# project configures pytest-asyncio with asyncio_mode=auto. In strict mode
# an unmarked ``async def`` test is not executed as a coroutine, which
# would silently disable this regression guard.
@pytest.mark.asyncio
class TestReadsObjectsForJsonlPrompt:
    """extract-with-ontologies is a JSONL prompt; the extractor must
    read ``result.objects``, not ``result.object``.

    Every test feeds a hand-built ``PromptResult`` through a mocked
    ``flow("prompt-request").prompt(...)`` and inspects the list that
    ``extract_with_simplified_format`` returns.
    """

    async def test_populated_objects_produces_triples(
        self, extractor, food_subset,
    ):
        """Happy path: PromptResult with populated .objects -> non-empty
        triples list."""

        prompt_result = PromptResult(
            response_type="jsonl",
            objects=[
                {"type": "entity", "entity": "Cornish Pasty",
                 "entity_type": "Recipe"},
                {"type": "entity", "entity": "beef",
                 "entity_type": "Food"},
                {"type": "relationship",
                 "subject": "Cornish Pasty", "subject_type": "Recipe",
                 "relation": "ingredients",
                 "object": "beef", "object_type": "Food"},
            ],
        )

        flow, prompt_mock = _flow_with_prompt_result(prompt_result)

        triples = await extractor.extract_with_simplified_format(
            flow, "some chunk", food_subset, {"text": "some chunk"},
        )

        # The prompt service must be consulted exactly once per chunk.
        prompt_mock.assert_awaited_once()
        assert triples, (
            "extract_with_simplified_format returned no triples; if "
            "this fails, the extractor is probably reading .object "
            "instead of .objects again"
        )

    async def test_none_objects_returns_empty_without_crashing(
        self, extractor, food_subset,
    ):
        """The exact shape that hit production on v2.3: the extractor
        was reading ``.object`` for a JSONL prompt, which returned
        ``None`` and tripped the parser's 'Unexpected response type'
        path. With the fix we read ``.objects``; if that's also
        ``None`` we must still return ``[]`` cleanly, not crash."""

        prompt_result = PromptResult(
            response_type="jsonl",
            objects=None,
        )

        flow, _ = _flow_with_prompt_result(prompt_result)

        triples = await extractor.extract_with_simplified_format(
            flow, "chunk", food_subset, {"text": "chunk"},
        )

        assert triples == []

    async def test_empty_objects_returns_empty(
        self, extractor, food_subset,
    ):
        """Valid JSONL response with zero entries should yield zero
        triples, not raise."""

        prompt_result = PromptResult(
            response_type="jsonl",
            objects=[],
        )

        flow, _ = _flow_with_prompt_result(prompt_result)

        triples = await extractor.extract_with_simplified_format(
            flow, "chunk", food_subset, {"text": "chunk"},
        )

        assert triples == []

    async def test_ignores_object_field_for_jsonl_prompt(
        self, extractor, food_subset,
    ):
        """If ``.object`` is somehow set but ``.objects`` is None, the
        extractor must not silently fall back to ``.object``. This
        guards against a well-meaning regression that "helpfully"
        re-adds fallback fields.

        The extractor should read only ``.objects`` for this prompt;
        when that is None we expect the empty-result path.
        """

        # NOTE(review): response_type is "json" here, presumably so that
        # a populated ``.object`` is a plausible PromptResult shape; the
        # expectation is that ``.objects`` is read regardless -- confirm.
        prompt_result = PromptResult(
            response_type="json",
            object={"not": "the field we should be reading"},
            objects=None,
        )

        flow, _ = _flow_with_prompt_result(prompt_result)

        triples = await extractor.extract_with_simplified_format(
            flow, "chunk", food_subset, {"text": "chunk"},
        )

        assert triples == [], (
            "Extractor fell back to .object for a JSONL prompt — "
            "this is the regression shape we are trying to prevent"
        )
|
||||
|
|
@ -412,7 +412,13 @@ class Processor(FlowProcessor):
|
|||
id="extract-with-ontologies",
|
||||
variables=prompt_variables
|
||||
)
|
||||
extraction_response = result.object
|
||||
|
||||
# extract-with-ontologies is a JSONL prompt, so PromptResult
|
||||
# always populates .objects (a list of dicts). Reading .object
|
||||
# (singular) silently gives None for JSONL responses and drops
|
||||
# every extraction.
|
||||
extraction_response = result.objects
|
||||
|
||||
logger.debug(f"Simplified extraction response: {extraction_response}")
|
||||
|
||||
# Parse response into structured format
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue