Added XML, JSON, CSV detection (#519)

* Improved XML detect, added schema selection
* Add schema select + tests
* API additions
* More tests
* Fixed tests
2026-07-13 23:32:11 +02:00 · 2025-09-16 23:53:43 +01:00 · 2025-09-16 23:53:43 +01:00 · 48016d8fb2
commit 48016d8fb2
parent 3d783f4bd4
10 changed files with 1240 additions and 54 deletions
--- a/tests/unit/test_retrieval/test_structured_diag/__init__.py
+++ b/tests/unit/test_retrieval/test_structured_diag/__init__.py
@@ -0,0 +1,3 @@
+"""
+Unit and contract tests for structured-diag service
+"""
--- a/tests/unit/test_retrieval/test_structured_diag/test_message_translation.py
+++ b/tests/unit/test_retrieval/test_structured_diag/test_message_translation.py
@@ -0,0 +1,172 @@
+"""
+Unit tests for message translation in structured-diag service
+"""
+
+import pytest
+from trustgraph.messaging.translators.diagnosis import (
+    StructuredDataDiagnosisRequestTranslator,
+    StructuredDataDiagnosisResponseTranslator
+)
+from trustgraph.schema.services.diagnosis import (
+    StructuredDataDiagnosisRequest,
+    StructuredDataDiagnosisResponse
+)
+
+
+class TestRequestTranslation:
+    """Test request message translation"""
+
+    def test_translate_schema_selection_request(self):
+        """to_pulsar() carries operation, sample and options through unchanged"""
+        translator = StructuredDataDiagnosisRequestTranslator()
+
+        # API format (hyphenated names; here only the operation value has one)
+        api_data = {
+            "operation": "schema-selection",
+            "sample": "test data sample",
+            "options": {"filter": "catalog"}
+        }
+
+        # Translate to Pulsar
+        pulsar_msg = translator.to_pulsar(api_data)
+
+        assert pulsar_msg.operation == "schema-selection"
+        assert pulsar_msg.sample == "test data sample"
+        assert pulsar_msg.options == {"filter": "catalog"}
+
+    def test_translate_request_with_all_fields(self):
+        """to_pulsar() maps all request fields; API key schema-name becomes schema_name"""
+        translator = StructuredDataDiagnosisRequestTranslator()
+
+        api_data = {
+            "operation": "generate-descriptor",
+            "sample": "csv data",
+            "type": "csv",
+            "schema-name": "products",
+            "options": {"delimiter": ","}
+        }
+
+        pulsar_msg = translator.to_pulsar(api_data)
+
+        assert pulsar_msg.operation == "generate-descriptor"
+        assert pulsar_msg.sample == "csv data"
+        assert pulsar_msg.type == "csv"
+        assert pulsar_msg.schema_name == "products"
+        assert pulsar_msg.options == {"delimiter": ","}
+
+
+class TestResponseTranslation:
+    """Test response message translation"""
+
+    def test_translate_schema_selection_response(self):
+        """from_pulsar() emits schema_matches under the schema-matches key and omits a None error"""
+        translator = StructuredDataDiagnosisResponseTranslator()
+
+        # Create Pulsar response with schema_matches
+        pulsar_response = StructuredDataDiagnosisResponse(
+            operation="schema-selection",
+            schema_matches=["products", "inventory", "catalog"],
+            error=None
+        )
+
+        # Translate to API format
+        api_data = translator.from_pulsar(pulsar_response)
+
+        assert api_data["operation"] == "schema-selection"
+        assert api_data["schema-matches"] == ["products", "inventory", "catalog"]
+        assert "error" not in api_data  # None errors shouldn't be included
+
+    def test_translate_empty_schema_matches(self):
+        """from_pulsar() keeps an empty schema_matches list as [] under schema-matches"""
+        translator = StructuredDataDiagnosisResponseTranslator()
+
+        pulsar_response = StructuredDataDiagnosisResponse(
+            operation="schema-selection",
+            schema_matches=[],
+            error=None
+        )
+
+        api_data = translator.from_pulsar(pulsar_response)
+
+        assert api_data["operation"] == "schema-selection"
+        assert api_data["schema-matches"] == []
+
+    def test_translate_response_without_schema_matches(self):
+        """from_pulsar() omits schema-matches when the response does not set schema_matches"""
+        translator = StructuredDataDiagnosisResponseTranslator()
+
+        # Old-style response without schema_matches
+        pulsar_response = StructuredDataDiagnosisResponse(
+            operation="detect-type",
+            detected_type="xml",
+            confidence=0.9,
+            error=None
+        )
+
+        api_data = translator.from_pulsar(pulsar_response)
+
+        assert api_data["operation"] == "detect-type"
+        assert api_data["detected-type"] == "xml"
+        assert api_data["confidence"] == 0.9
+        assert "schema-matches" not in api_data  # None values shouldn't be included
+
+    def test_translate_response_with_error(self):
+        """from_pulsar() does not raise on a response carrying an Error and still maps operation"""
+        translator = StructuredDataDiagnosisResponseTranslator()
+        from trustgraph.schema.core.primitives import Error
+
+        pulsar_response = StructuredDataDiagnosisResponse(
+            operation="schema-selection",
+            error=Error(
+                type="PromptServiceError",
+                message="Service unavailable"
+            )
+        )
+
+        api_data = translator.from_pulsar(pulsar_response)
+
+        assert api_data["operation"] == "schema-selection"
+        # Error objects are typically handled separately by the gateway
+        # but the translator shouldn't break on them
+
+    def test_translate_all_response_fields(self):
+        """from_pulsar() translates every response field and JSON-decodes the descriptor"""
+        translator = StructuredDataDiagnosisResponseTranslator()
+        import json
+
+        descriptor_data = {"mapping": {"field1": "column1"}}
+
+        pulsar_response = StructuredDataDiagnosisResponse(
+            operation="diagnose",
+            detected_type="csv",
+            confidence=0.95,
+            descriptor=json.dumps(descriptor_data),
+            metadata={"field_count": "5"},
+            schema_matches=["schema1", "schema2"],
+            error=None
+        )
+
+        api_data = translator.from_pulsar(pulsar_response)
+
+        assert api_data["operation"] == "diagnose"
+        assert api_data["detected-type"] == "csv"
+        assert api_data["confidence"] == 0.95
+        assert api_data["descriptor"] == descriptor_data  # Should be parsed from JSON
+        assert api_data["metadata"] == {"field_count": "5"}
+        assert api_data["schema-matches"] == ["schema1", "schema2"]
+
+    def test_response_completion_flag(self):
+        """from_response_with_completion() returns (api_data, is_final) with is_final True"""
+        translator = StructuredDataDiagnosisResponseTranslator()
+
+        pulsar_response = StructuredDataDiagnosisResponse(
+            operation="schema-selection",
+            schema_matches=["products"],
+            error=None
+        )
+
+        api_data, is_final = translator.from_response_with_completion(pulsar_response)
+
+        assert is_final is True  # Structured-diag responses are always final
+        assert api_data["operation"] == "schema-selection"
+        assert api_data["schema-matches"] == ["products"]
--- a/tests/unit/test_retrieval/test_structured_diag/test_schema_contracts.py
+++ b/tests/unit/test_retrieval/test_structured_diag/test_schema_contracts.py
@@ -0,0 +1,258 @@
+"""
+Contract tests for structured-diag service schemas
+"""
+
+import pytest
+import json
+from pulsar.schema import JsonSchema
+from trustgraph.schema.services.diagnosis import (
+    StructuredDataDiagnosisRequest,
+    StructuredDataDiagnosisResponse
+)
+
+
+class TestStructuredDiagnosisSchemaContract:
+    """Contract tests for structured diagnosis message schemas"""
+
+    def test_request_schema_basic_fields(self):
+        """Request keeps operation and sample; type, schema_name and options default to None"""
+        request = StructuredDataDiagnosisRequest(
+            operation="detect-type",
+            sample="test data"
+        )
+
+        assert request.operation == "detect-type"
+        assert request.sample == "test data"
+        assert request.type is None  # Optional, defaults to None
+        assert request.schema_name is None  # Optional, defaults to None
+        assert request.options is None  # Optional, defaults to None
+
+    def test_request_schema_all_operations(self):
+        """Request accepts each of the four operation names and stores it unchanged"""
+        operations = ["detect-type", "generate-descriptor", "diagnose", "schema-selection"]
+
+        for op in operations:
+            request = StructuredDataDiagnosisRequest(
+                operation=op,
+                sample="test data"
+            )
+            assert request.operation == op
+
+    def test_request_schema_with_options(self):
+        """Request stores type, schema_name and an options mapping as given"""
+        options = {"delimiter": ",", "has_header": "true"}
+        request = StructuredDataDiagnosisRequest(
+            operation="generate-descriptor",
+            sample="test data",
+            type="csv",
+            schema_name="products",
+            options=options
+        )
+
+        assert request.options == options
+        assert request.type == "csv"
+        assert request.schema_name == "products"
+
+    def test_response_schema_basic_fields(self):
+        """Response keeps the fields that were set; the unset fields read back as None"""
+        response = StructuredDataDiagnosisResponse(
+            operation="detect-type",
+            detected_type="xml",
+            confidence=0.9,
+            error=None  # Explicitly set to None
+        )
+
+        assert response.operation == "detect-type"
+        assert response.detected_type == "xml"
+        assert response.confidence == 0.9
+        assert response.error is None
+        assert response.descriptor is None
+        assert response.metadata is None
+        assert response.schema_matches is None  # New field, defaults to None
+
+    def test_response_schema_with_error(self):
+        """Response holds an Error object and exposes its type and message"""
+        from trustgraph.schema.core.primitives import Error
+
+        error = Error(
+            type="ServiceError",
+            message="Service unavailable"
+        )
+        response = StructuredDataDiagnosisResponse(
+            operation="schema-selection",
+            error=error
+        )
+
+        assert response.error == error
+        assert response.error.type == "ServiceError"
+        assert response.error.message == "Service unavailable"
+
+    def test_response_schema_with_schema_matches(self):
+        """Response stores a three-element schema_matches list as given"""
+        matches = ["products", "inventory", "catalog"]
+        response = StructuredDataDiagnosisResponse(
+            operation="schema-selection",
+            schema_matches=matches
+        )
+
+        assert response.operation == "schema-selection"
+        assert response.schema_matches == matches
+        assert len(response.schema_matches) == 3
+
+    def test_response_schema_empty_schema_matches(self):
+        """Response keeps an empty schema_matches value as an empty list"""
+        response = StructuredDataDiagnosisResponse(
+            operation="schema-selection",
+            schema_matches=[]
+        )
+
+        assert response.schema_matches == []
+        assert isinstance(response.schema_matches, list)
+
+    def test_response_schema_with_descriptor(self):
+        """Response stores the descriptor as a JSON string that parses back to the mapping"""
+        descriptor = {
+            "mapping": {
+                "field1": "column1",
+                "field2": "column2"
+            }
+        }
+        response = StructuredDataDiagnosisResponse(
+            operation="generate-descriptor",
+            descriptor=json.dumps(descriptor)
+        )
+
+        assert response.descriptor == json.dumps(descriptor)
+        parsed = json.loads(response.descriptor)
+        assert parsed["mapping"]["field1"] == "column1"
+
+    def test_response_schema_with_metadata(self):
+        """Response stores a metadata mapping of string keys to string values as given"""
+        metadata = {
+            "csv_options": json.dumps({"delimiter": ","}),
+            "field_count": "5"
+        }
+        response = StructuredDataDiagnosisResponse(
+            operation="diagnose",
+            metadata=metadata
+        )
+
+        assert response.metadata == metadata
+        assert response.metadata["field_count"] == "5"
+
+    def test_schema_serialization(self):
+        """Request survives a JsonSchema encode/decode round trip with fields intact"""
+        # Build the request that will be round-tripped
+        request = StructuredDataDiagnosisRequest(
+            operation="schema-selection",
+            sample="test data",
+            options={"key": "value"}
+        )
+
+        # Simulate Pulsar JsonSchema serialization
+        schema = JsonSchema(StructuredDataDiagnosisRequest)
+        serialized = schema.encode(request)
+        deserialized = schema.decode(serialized)
+
+        assert deserialized.operation == request.operation
+        assert deserialized.sample == request.sample
+        assert deserialized.options == request.options
+
+    def test_response_serialization_with_schema_matches(self):
+        """Response with schema_matches survives a JsonSchema encode/decode round trip"""
+        response = StructuredDataDiagnosisResponse(
+            operation="schema-selection",
+            schema_matches=["schema1", "schema2"],
+            confidence=0.85
+        )
+
+        # Simulate Pulsar JsonSchema serialization
+        schema = JsonSchema(StructuredDataDiagnosisResponse)
+        serialized = schema.encode(response)
+        deserialized = schema.decode(serialized)
+
+        assert deserialized.operation == response.operation
+        assert deserialized.schema_matches == response.schema_matches
+        assert deserialized.confidence == response.confidence
+
+    def test_backwards_compatibility(self):
+        """Response built without schema_matches leaves it None and keeps the older fields"""
+        # Old response without schema_matches should still work
+        response = StructuredDataDiagnosisResponse(
+            operation="detect-type",
+            detected_type="json",
+            confidence=0.95
+        )
+
+        # Verify default value for new field
+        assert response.schema_matches is None  # Defaults to None when not set
+
+        # Verify old fields still work
+        assert response.detected_type == "json"
+        assert response.confidence == 0.95
+
+    def test_schema_selection_operation_contract(self):
+        """Request, matched-response and error-response shapes for schema-selection"""
+        # Request
+        request = StructuredDataDiagnosisRequest(
+            operation="schema-selection",
+            sample="product_id,name,price\n1,Widget,9.99"
+        )
+
+        assert request.operation == "schema-selection"
+        assert request.sample != ""
+
+        # Response with matches
+        response = StructuredDataDiagnosisResponse(
+            operation="schema-selection",
+            schema_matches=["products", "inventory"]
+        )
+
+        assert response.operation == "schema-selection"
+        assert isinstance(response.schema_matches, list)
+        assert len(response.schema_matches) == 2
+        assert all(isinstance(s, str) for s in response.schema_matches)
+
+        # Response with error
+        from trustgraph.schema.core.primitives import Error
+        error_response = StructuredDataDiagnosisResponse(
+            operation="schema-selection",
+            error=Error(type="PromptServiceError", message="Service unavailable")
+        )
+
+        assert error_response.error is not None
+        assert error_response.schema_matches is None  # Default None when not set
+
+    def test_all_operations_supported(self):
+        """Build a request and a response for each operation from a per-operation field table"""
+        supported_operations = {
+            "detect-type": {
+                "required_request": ["sample"],
+                "expected_response": ["detected_type", "confidence"]
+            },
+            "generate-descriptor": {
+                "required_request": ["sample", "type", "schema_name"],
+                "expected_response": ["descriptor"]
+            },
+            "diagnose": {
+                "required_request": ["sample"],
+                "expected_response": ["detected_type", "confidence", "descriptor"]
+            },
+            "schema-selection": {
+                "required_request": ["sample"],
+                "expected_response": ["schema_matches"]
+            }
+        }
+
+        for operation, contract in supported_operations.items():
+            # Test request creation
+            request_data = {"operation": operation}
+            for field in contract["required_request"]:
+                request_data[field] = "test_value"
+
+            request = StructuredDataDiagnosisRequest(**request_data)
+            assert request.operation == operation
+
+            # Test response creation (NOTE(review): "expected_response" is never asserted)
+            response = StructuredDataDiagnosisResponse(operation=operation)
+            assert response.operation == operation
--- a/tests/unit/test_retrieval/test_structured_diag/test_schema_selection.py
+++ b/tests/unit/test_retrieval/test_structured_diag/test_schema_selection.py
@@ -0,0 +1,361 @@
+"""
+Unit tests for structured-diag service schema-selection operation
+"""
+
+import pytest
+import json
+from unittest.mock import AsyncMock, MagicMock, patch
+from trustgraph.retrieval.structured_diag.service import Processor
+from trustgraph.schema.services.diagnosis import StructuredDataDiagnosisRequest, StructuredDataDiagnosisResponse
+from trustgraph.schema import RowSchema, Field as SchemaField, Error
+
+
+@pytest.fixture
+def mock_schemas():
+    """Return three RowSchema objects keyed by name: products, customers and orders"""
+    schemas = {
+        "products": RowSchema(
+            name="products",
+            description="Product catalog schema",
+            fields=[
+                SchemaField(
+                    name="product_id",
+                    type="string",
+                    description="Product identifier",
+                    required=True,
+                    primary=True,
+                    indexed=True
+                ),
+                SchemaField(
+                    name="name",
+                    type="string",
+                    description="Product name",
+                    required=True
+                ),
+                SchemaField(
+                    name="price",
+                    type="number",
+                    description="Product price",
+                    required=True
+                )
+            ]
+        ),
+        "customers": RowSchema(
+            name="customers",
+            description="Customer database schema",
+            fields=[
+                SchemaField(
+                    name="customer_id",
+                    type="string",
+                    description="Customer identifier",
+                    required=True,
+                    primary=True
+                ),
+                SchemaField(
+                    name="name",
+                    type="string",
+                    description="Customer name",
+                    required=True
+                ),
+                SchemaField(
+                    name="email",
+                    type="string",
+                    description="Customer email",
+                    required=True
+                )
+            ]
+        ),
+        "orders": RowSchema(
+            name="orders",
+            description="Order management schema",
+            fields=[
+                SchemaField(
+                    name="order_id",
+                    type="string",
+                    description="Order identifier",
+                    required=True,
+                    primary=True
+                ),
+                SchemaField(
+                    name="customer_id",
+                    type="string",
+                    description="Customer identifier",
+                    required=True
+                ),
+                SchemaField(
+                    name="total",
+                    type="number",
+                    description="Order total",
+                    required=True
+                )
+            ]
+        )
+    }
+    return schemas
+
+
+@pytest.fixture
+def service(mock_schemas):
+    """Processor built with a MagicMock taskgroup, with its schemas set to mock_schemas"""
+    service = Processor(
+        taskgroup=MagicMock(),
+        id="test-processor"
+    )
+    service.schemas = mock_schemas
+    return service
+
+
+@pytest.fixture
+def mock_flow():
+    """Return (flow, prompt_request_flow); flow(...).request is the AsyncMock"""
+    flow = MagicMock()
+    prompt_request_flow = AsyncMock()
+    flow.return_value.request = prompt_request_flow
+    return flow, prompt_request_flow
+
+
+@pytest.mark.asyncio
+async def test_schema_selection_success(service, mock_flow):
+    """A JSON array in the prompt text becomes schema_matches; all schemas reach the prompt"""
+    flow, prompt_request_flow = mock_flow
+
+    # Mock prompt service response with matching schemas
+    mock_response = MagicMock()
+    mock_response.error = None
+    mock_response.text = '["products", "orders"]'
+    mock_response.object = None  # Explicitly set to None
+    prompt_request_flow.return_value = mock_response
+
+    # Create request
+    request = StructuredDataDiagnosisRequest(
+        operation="schema-selection",
+        sample="product_id,name,price,quantity\nPROD001,Widget,19.99,5"
+    )
+
+    # Execute operation
+    response = await service.schema_selection_operation(request, flow)
+
+    # Verify response
+    assert response.error is None
+    assert response.operation == "schema-selection"
+    assert response.schema_matches == ["products", "orders"]
+
+    # Verify prompt service was called correctly
+    prompt_request_flow.assert_called_once()
+    call_args = prompt_request_flow.call_args[0][0]
+    assert call_args.id == "schema-selection"
+
+    # Check that all schemas were passed to prompt
+    terms = call_args.terms
+    schemas_data = json.loads(terms["schemas"])
+    assert len(schemas_data) == 3  # All 3 schemas
+    assert any(s["name"] == "products" for s in schemas_data)
+    assert any(s["name"] == "customers" for s in schemas_data)
+    assert any(s["name"] == "orders" for s in schemas_data)
+
+
+@pytest.mark.asyncio
+async def test_schema_selection_empty_response(service, mock_flow):
+    """Empty text and object from the prompt service yield a PromptServiceError"""
+    flow, prompt_request_flow = mock_flow
+
+    # Mock empty response from prompt service
+    mock_response = MagicMock()
+    mock_response.error = None
+    mock_response.text = ""
+    mock_response.object = ""  # Both fields empty
+    prompt_request_flow.return_value = mock_response
+
+    # Create request
+    request = StructuredDataDiagnosisRequest(
+        operation="schema-selection",
+        sample="test data"
+    )
+
+    # Execute operation
+    response = await service.schema_selection_operation(request, flow)
+
+    # Verify error response
+    assert response.error is not None
+    assert response.error.type == "PromptServiceError"
+    assert "Empty response" in response.error.message
+    assert response.operation == "schema-selection"
+
+
+@pytest.mark.asyncio
+async def test_schema_selection_prompt_error(service, mock_flow):
+    """An error set on the prompt response is reported as a PromptServiceError"""
+    flow, prompt_request_flow = mock_flow
+
+    # Mock error response from prompt service
+    mock_response = MagicMock()
+    mock_response.error = Error(
+        type="ServiceError",
+        message="Prompt service unavailable"
+    )
+    mock_response.text = None
+    prompt_request_flow.return_value = mock_response
+
+    # Create request
+    request = StructuredDataDiagnosisRequest(
+        operation="schema-selection",
+        sample="test data"
+    )
+
+    # Execute operation
+    response = await service.schema_selection_operation(request, flow)
+
+    # Verify error response
+    assert response.error is not None
+    assert response.error.type == "PromptServiceError"
+    assert "Failed to select schemas" in response.error.message
+    assert response.operation == "schema-selection"
+
+
+@pytest.mark.asyncio
+async def test_schema_selection_invalid_json(service, mock_flow):
+    """Prompt text that is not valid JSON is reported as a ParseError"""
+    flow, prompt_request_flow = mock_flow
+
+    # Mock invalid JSON response
+    mock_response = MagicMock()
+    mock_response.error = None
+    mock_response.text = "not valid json"
+    mock_response.object = None
+    prompt_request_flow.return_value = mock_response
+
+    # Create request
+    request = StructuredDataDiagnosisRequest(
+        operation="schema-selection",
+        sample="test data"
+    )
+
+    # Execute operation
+    response = await service.schema_selection_operation(request, flow)
+
+    # Verify error response
+    assert response.error is not None
+    assert response.error.type == "ParseError"
+    assert "Failed to parse schema selection response" in response.error.message
+    assert response.operation == "schema-selection"
+
+
+@pytest.mark.asyncio
+async def test_schema_selection_non_array_response(service, mock_flow):
+    """Prompt text holding a JSON object instead of an array is reported as a ParseError"""
+    flow, prompt_request_flow = mock_flow
+
+    # Mock non-array JSON response
+    mock_response = MagicMock()
+    mock_response.error = None
+    mock_response.text = '{"schema": "products"}'  # Object instead of array
+    mock_response.object = None
+    prompt_request_flow.return_value = mock_response
+
+    # Create request
+    request = StructuredDataDiagnosisRequest(
+        operation="schema-selection",
+        sample="test data"
+    )
+
+    # Execute operation
+    response = await service.schema_selection_operation(request, flow)
+
+    # Verify error response
+    assert response.error is not None
+    assert response.error.type == "ParseError"
+    assert "Failed to parse schema selection response" in response.error.message
+    assert response.operation == "schema-selection"
+
+
+@pytest.mark.asyncio
+async def test_schema_selection_with_options(service, mock_flow):
+    """Request options reach the prompt as a JSON string under the options term"""
+    flow, prompt_request_flow = mock_flow
+
+    # Mock successful response
+    mock_response = MagicMock()
+    mock_response.error = None
+    mock_response.text = '["products"]'
+    mock_response.object = None
+    prompt_request_flow.return_value = mock_response
+
+    # Create request with options
+    request = StructuredDataDiagnosisRequest(
+        operation="schema-selection",
+        sample="test data",
+        options={"filter": "catalog", "confidence": "high"}
+    )
+
+    # Execute operation
+    response = await service.schema_selection_operation(request, flow)
+
+    # Verify response
+    assert response.error is None
+    assert response.schema_matches == ["products"]
+
+    # Verify options were passed to prompt
+    call_args = prompt_request_flow.call_args[0][0]
+    terms = call_args.terms
+    options = json.loads(terms["options"])
+    assert options["filter"] == "catalog"
+    assert options["confidence"] == "high"
+
+
+@pytest.mark.asyncio
+async def test_schema_selection_exception_handling(service, mock_flow):
+    """An exception raised by the prompt call is reported as a PromptServiceError"""
+    flow, prompt_request_flow = mock_flow
+
+    # Mock exception during prompt service call
+    prompt_request_flow.side_effect = Exception("Unexpected error")
+
+    # Create request
+    request = StructuredDataDiagnosisRequest(
+        operation="schema-selection",
+        sample="test data"
+    )
+
+    # Execute operation
+    response = await service.schema_selection_operation(request, flow)
+
+    # Verify error response
+    assert response.error is not None
+    assert response.error.type == "PromptServiceError"
+    assert "Failed to select schemas" in response.error.message
+    assert response.operation == "schema-selection"
+
+
+@pytest.mark.asyncio
+async def test_schema_selection_empty_schemas(service, mock_flow):
+    """With no schemas configured the prompt is still called, with an empty schemas array"""
+    flow, prompt_request_flow = mock_flow
+
+    # Clear schemas
+    service.schemas = {}
+
+    # Mock response (the prompt is still called; its empty array is the result)
+    mock_response = MagicMock()
+    mock_response.error = None
+    mock_response.text = '[]'
+    mock_response.object = None
+    prompt_request_flow.return_value = mock_response
+
+    # Create request
+    request = StructuredDataDiagnosisRequest(
+        operation="schema-selection",
+        sample="test data"
+    )
+
+    # Execute operation
+    response = await service.schema_selection_operation(request, flow)
+
+    # Should still succeed but with empty schemas array passed to prompt
+    assert response.error is None
+    assert response.schema_matches == []
+
+    # Verify empty schemas array was passed
+    call_args = prompt_request_flow.call_args[0][0]
+    terms = call_args.terms
+    schemas_data = json.loads(terms["schemas"])
+    assert len(schemas_data) == 0
--- a/tests/unit/test_retrieval/test_structured_diag/test_type_detection.py
+++ b/tests/unit/test_retrieval/test_structured_diag/test_type_detection.py
@ -0,0 +1,179 @@
+"""
+Unit tests for simplified type detection in structured-diag service
+"""
+
+import pytest
+from trustgraph.retrieval.structured_diag.type_detector import detect_data_type
+
+
+class TestSimplifiedTypeDetection:
+    """Expected (type, confidence) results from detect_data_type"""
+
+    def test_xml_detection_with_declaration(self):
+        """A document that opens with an XML declaration is XML"""
+        text = '<?xml version="1.0"?><root><item>data</item></root>'
+        detected, score = detect_data_type(text)
+        assert detected == "xml"
+        assert score == 0.9
+
+    def test_xml_detection_without_declaration(self):
+        """Bare elements with closing tags are XML even with no declaration"""
+        text = '<root><item>data</item></root>'
+        detected, score = detect_data_type(text)
+        assert detected == "xml"
+        assert score == 0.9
+
+    def test_xml_detection_truncated(self):
+        """XML that stops mid-element (a clipped sample) is still XML"""
+        text = '''<?xml version="1.0" encoding="UTF-8"?>
+<pieDataset>
+  <pies>
+    <pie id="1">
+      <pieType>Steak &amp; Kidney</pieType>
+      <region>Yorkshire</region>
+      <diameterCm>12.5</diameterCm>
+      <heightCm>4.2'''  # Ends inside the heightCm element
+        detected, score = detect_data_type(text)
+        assert detected == "xml"
+        assert score == 0.9
+
+    def test_json_object_detection(self):
+        """A complete JSON object is JSON"""
+        text = '{"name": "John", "age": 30, "city": "New York"}'
+        detected, score = detect_data_type(text)
+        assert detected == "json"
+        assert score == 0.9
+
+    def test_json_array_detection(self):
+        """A complete JSON array of objects is JSON"""
+        text = '[{"id": 1}, {"id": 2}, {"id": 3}]'
+        detected, score = detect_data_type(text)
+        assert detected == "json"
+        assert score == 0.9
+
+    def test_json_truncated(self):
+        """JSON cut off part-way through a key is still JSON"""
+        text = '{"products": [{"id": 1, "name": "Widget", "price": 19.99}, {"id": 2, "na'
+        detected, score = detect_data_type(text)
+        assert detected == "json"
+        assert score == 0.9
+
+    def test_csv_detection(self):
+        """Multi-line comma-separated rows are CSV, at the lower 0.8 confidence"""
+        text = '''name,age,city
+John,30,New York
+Jane,25,Boston
+Bob,35,Chicago'''
+        detected, score = detect_data_type(text)
+        assert detected == "csv"
+        assert score == 0.8
+
+    def test_csv_detection_single_line(self):
+        """A lone comma-separated line is CSV as well"""
+        text = 'column1,column2,column3'
+        detected, score = detect_data_type(text)
+        assert detected == "csv"
+        assert score == 0.8
+
+    def test_empty_input(self):
+        """An empty string yields no type and zero confidence"""
+        detected, score = detect_data_type("")
+        assert detected is None
+        assert score == 0.0
+
+    def test_whitespace_only(self):
+        """Input made only of whitespace yields no type and zero confidence"""
+        detected, score = detect_data_type("   \n  \t  ")
+        assert detected is None
+        assert score == 0.0
+
+    def test_html_not_xml(self):
+        """Despite this test's name, HTML markup is expected to come back as XML"""
+        text = '<html><body><h1>Title</h1></body></html>'
+        detected, score = detect_data_type(text)
+        assert detected == "xml"  # XML is the expected label for HTML input
+        assert score == 0.9
+
+    def test_malformed_xml_still_detected(self):
+        """XML with a dangling unclosed tag is still XML"""
+        text = '<root><item>data</item><unclosed>'
+        detected, score = detect_data_type(text)
+        assert detected == "xml"
+        assert score == 0.9
+
+    def test_json_with_whitespace(self):
+        """Leading whitespace before a JSON object does not prevent detection"""
+        text = '   \n  {"key": "value"}'
+        detected, score = detect_data_type(text)
+        assert detected == "json"
+        assert score == 0.9
+
+    def test_priority_xml_over_csv(self):
+        """XML wins when the element content also contains commas"""
+        text = '<?xml version="1.0"?>\n<data>a,b,c</data>'
+        detected, score = detect_data_type(text)
+        assert detected == "xml"
+        assert score == 0.9
+
+    def test_priority_json_over_csv(self):
+        """JSON wins when a string value also contains commas"""
+        text = '{"data": "a,b,c"}'
+        detected, score = detect_data_type(text)
+        assert detected == "json"
+        assert score == 0.9
+
+    def test_text_defaults_to_csv(self):
+        """Unstructured prose with no markup or braces is reported as CSV"""
+        text = 'This is just plain text without any structure'
+        detected, score = detect_data_type(text)
+        assert detected == "csv"
+        assert score == 0.8
+
+
+class TestRealWorldSamples:
+    """Detection checks against larger, realistic payloads"""
+
+    def test_uk_pies_xml_sample(self):
+        """UK pies XML, limited to its first 500 characters, is XML"""
+        text = '''<?xml version="1.0" encoding="UTF-8"?>
+<pieDataset>
+  <pies>
+    <pie id="1">
+      <pieType>Steak &amp; Kidney</pieType>
+      <region>Yorkshire</region>
+      <diameterCm>12.5</diameterCm>
+      <heightCm>4.2</heightCm>
+      <weightGrams>285</weightGrams>
+      <crustType>Shortcrust</crustType>
+      <fillingCategory>Meat</fillingCategory>
+      <price>3.50</price>
+      <currency>GBP</currency>
+      <bakeryType>Traditional</bakeryType>
+    </pie>
+    <pie id="2">
+      <pieType>Chicken &amp; Mushroom</pieType>
+      <region>Lancashire</regio'''  # Ends mid-tag; sliced to 500 chars below
+        detected, score = detect_data_type(text[:500])
+        assert detected == "xml"
+        assert score == 0.9
+
+    def test_product_json_sample(self):
+        """A multi-line product catalog object is JSON"""
+        text = '''{"products": [
+  {"id": "PROD001", "name": "Widget", "price": 19.99, "category": "Tools"},
+  {"id": "PROD002", "name": "Gadget", "price": 29.99, "category": "Electronics"},
+  {"id": "PROD003", "name": "Doohickey", "price": 9.99, "category": "Accessories"}
+]}'''
+        detected, score = detect_data_type(text)
+        assert detected == "json"
+        assert score == 0.9
+
+    def test_customer_csv_sample(self):
+        """A customer export with a header row and three records is CSV"""
+        text = '''customer_id,name,email,signup_date,total_orders
+CUST001,John Smith,john@example.com,2023-01-15,5
+CUST002,Jane Doe,jane@example.com,2023-02-20,3
+CUST003,Bob Johnson,bob@example.com,2023-03-10,7'''
+        detected, score = detect_data_type(text)
+        assert detected == "csv"
+        assert score == 0.8