fix(qa): tolerate non-dict JSON from QA LLM instead of crashing (#408)

* fix(qa): tolerate non-dict JSON from QA LLM instead of crashing parse_llm_json is explicitly designed to return a list when the model emits a top-level JSON array (it has a dedicated test for that). The QA analyzers then call parsed.get("tags", ...) directly on the result. When parsed is a list, that raises AttributeError, which is NOT caught by the surrounding except (json.JSONDecodeError, ValueError) — so a single stray array response from the QA model crashed the entire QA analysis run instead of degrading to empty results. The live variable-extraction path already guards this exact case with an isinstance(..., dict) check; mirror it in both QA analysis call sites (_run_qa_analysis per-node and _run_whole_call_qa_analysis fallback) so a non-dict parse result coerces to {} and the run produces empty defaults. Adds a regression test that drives the whole-call analyzer with an array response and asserts empty results rather than a crash. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> * fix(qa): log non-object QA JSON responses --------- Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com> Co-authored-by: Abhishek Kumar <abhishek@a6k.me>
2026-06-22 08:38:13 +02:00 · 2026-06-19 13:40:53 +05:00 · 2026-06-19 13:40:53 +05:00 · 6e194f4b59
commit 6e194f4b59
parent ed06de73a3
2 changed files with 92 additions and 0 deletions
--- a/api/services/workflow/qa/analysis.py
+++ b/api/services/workflow/qa/analysis.py
@ -206,6 +206,16 @@ async def run_per_node_qa_analysis(
        }
        try:
            parsed = parse_llm_json(raw_response)
+            # parse_llm_json can return a list (e.g. when the model emits a
+            # top-level JSON array); coerce non-dict results so the .get()
+            # lookups below don't raise AttributeError.
+            if not isinstance(parsed, dict):
+                logger.warning(
+                    f"QA LLM returned non-object JSON for node '{node_name}' "
+                    f"on run {workflow_run_id}; got {type(parsed).__name__}, "
+                    "using empty QA result"
+                )
+                parsed = {}
            node_result["tags"] = parsed.get("tags", [])
            node_result["summary"] = parsed.get("summary", "")
            node_result["score"] = parsed.get("call_quality_score")
@ -296,6 +306,16 @@ async def _run_whole_call_qa_analysis(
    }
    try:
        parsed = parse_llm_json(raw_response)
+        # parse_llm_json can return a list (e.g. when the model emits a
+        # top-level JSON array); coerce non-dict results so the .get()
+        # lookups below don't raise AttributeError.
+        if not isinstance(parsed, dict):
+            logger.warning(
+                f"QA LLM returned non-object JSON for whole-call QA on run "
+                f"{workflow_run_id}; got {type(parsed).__name__}, using empty "
+                "QA result"
+            )
+            parsed = {}
        node_result["tags"] = parsed.get("tags", [])
        node_result["summary"] = parsed.get("summary", "")
        node_result["score"] = parsed.get("call_quality_score")
--- a/api/tests/test_qa_analysis_non_dict_response.py
+++ b/api/tests/test_qa_analysis_non_dict_response.py
@ -0,0 +1,72 @@
+"""Regression test for QA analysis when the LLM returns a non-dict JSON value.
+
+``parse_llm_json`` is explicitly designed to return a list when the model emits
+a top-level JSON array (see ``test_json_parser.py``). The QA analyzers then call
+``parsed.get(...)`` on the result. For a list that raises ``AttributeError``,
+which is NOT caught by the surrounding ``except (json.JSONDecodeError, ValueError)``
+— so a stray array response crashed the whole QA run instead of degrading to
+empty results.
+"""
+
+from types import SimpleNamespace
+from unittest.mock import AsyncMock, Mock, patch
+
+import pytest
+
+from api.services.workflow.qa import analysis as qa_analysis
+
+
+@pytest.mark.asyncio
+async def test_whole_call_qa_tolerates_array_llm_response():
+    """A top-level JSON array from the QA LLM degrades to empty results."""
+    qa_data = SimpleNamespace(qa_system_prompt="Summarize: {transcript}")
+    workflow_run = SimpleNamespace(
+        logs={
+            "realtime_feedback_events": [
+                {"role": "user", "content": "hello"},
+                {"role": "assistant", "content": "hi there"},
+            ]
+        },
+        usage_info={"call_duration_seconds": 12},
+    )
+    warning_mock = Mock()
+
+    with (
+        patch.object(
+            qa_analysis, "build_conversation_structure", return_value=[{"x": 1}]
+        ),
+        patch.object(qa_analysis, "format_transcript", return_value="user: hello"),
+        patch.object(qa_analysis, "compute_call_metrics", return_value={}),
+        patch.object(
+            qa_analysis,
+            "resolve_llm_config",
+            new=AsyncMock(return_value=("openai", "gpt-4o", "sk-test", {})),
+        ),
+        patch.object(qa_analysis, "create_llm_service_from_provider", return_value=object()),
+        patch.object(
+            qa_analysis,
+            "_run_llm_inference",
+            new=AsyncMock(return_value='["tag1", "tag2"]'),
+        ),
+        patch.object(
+            qa_analysis, "setup_langfuse_parent_context", return_value=None
+        ),
+        patch.object(qa_analysis, "add_qa_span_to_trace", return_value=None),
+        patch.object(qa_analysis.logger, "warning", warning_mock),
+    ):
+        # Before the fix this raised AttributeError: 'list' object has no
+        # attribute 'get'.
+        result = await qa_analysis._run_whole_call_qa_analysis(
+            qa_data, workflow_run, workflow_run_id=99
+        )
+
+    node_result = result["node_results"]["whole_call"]
+    assert node_result["tags"] == []
+    assert node_result["summary"] == ""
+    assert node_result["score"] is None
+    warning_mock.assert_called_once()
+    warning_message = warning_mock.call_args.args[0]
+    assert "non-object JSON" in warning_message
+    assert "run 99" in warning_message
+    assert "list" in warning_message
+    assert "tag1" not in warning_message