Added XML, JSON, CSV detection (#519)

* Improved XML detection, added schema selection

* Add schema select + tests

* API additions

* More tests

* Fixed tests
This commit is contained in:
cybermaggedon 2025-09-16 23:53:43 +01:00 committed by GitHub
parent 3d783f4bd4
commit 48016d8fb2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 1240 additions and 54 deletions

View file

@ -21,6 +21,7 @@ default_ident = "structured-diag"
default_csv_prompt = "diagnose-csv"
default_json_prompt = "diagnose-json"
default_xml_prompt = "diagnose-xml"
default_schema_selection_prompt = "schema-selection"
class Processor(FlowProcessor):
@ -36,6 +37,7 @@ class Processor(FlowProcessor):
self.csv_prompt = params.get("csv_prompt", default_csv_prompt)
self.json_prompt = params.get("json_prompt", default_json_prompt)
self.xml_prompt = params.get("xml_prompt", default_xml_prompt)
self.schema_selection_prompt = params.get("schema_selection_prompt", default_schema_selection_prompt)
super(Processor, self).__init__(
**params | {
@ -143,10 +145,12 @@ class Processor(FlowProcessor):
response = await self.generate_descriptor_operation(request, flow)
elif request.operation == "diagnose":
response = await self.diagnose_operation(request, flow)
elif request.operation == "schema-selection":
response = await self.schema_selection_operation(request, flow)
else:
error = Error(
type="InvalidOperation",
message=f"Unknown operation: {request.operation}. Supported: detect-type, generate-descriptor, diagnose"
message=f"Unknown operation: {request.operation}. Supported: detect-type, generate-descriptor, diagnose, schema-selection"
)
response = StructuredDataDiagnosisResponse(
error=error,
@ -155,7 +159,7 @@ class Processor(FlowProcessor):
# Send response
await flow("response").send(
id, response, properties={"id": id}
response, properties={"id": id}
)
except Exception as e:
@ -172,7 +176,7 @@ class Processor(FlowProcessor):
)
await flow("response").send(
id, response, properties={"id": id}
response, properties={"id": id}
)
async def detect_type_operation(self, request: StructuredDataDiagnosisRequest, flow) -> StructuredDataDiagnosisResponse:
@ -307,6 +311,102 @@ class Processor(FlowProcessor):
metadata=metadata
)
async def schema_selection_operation(self, request: StructuredDataDiagnosisRequest, flow) -> StructuredDataDiagnosisResponse:
    """Handle the schema-selection operation.

    Renders every configured row schema into the format the prompt
    template expects, asks the prompt service (template
    ``self.schema_selection_prompt``) which schemas match the sample,
    and returns the selected schema identifiers.

    Args:
        request: Incoming diagnosis request; ``request.sample`` carries
            the data sample and ``request.options`` optional settings.
        flow: Flow accessor used to reach the "prompt-request" service.

    Returns:
        StructuredDataDiagnosisResponse with ``schema_matches`` set on
        success, or ``error`` set on any failure (this method never
        raises — all exceptions are converted to error responses).
    """
    logger.info("Processing schema-selection operation")

    # Prepare all schemas for the prompt - match the original config format
    all_schemas = [
        {
            "name": row_schema.name,
            "description": row_schema.description,
            "fields": [
                {
                    "name": f.name,
                    "type": f.type,
                    "description": f.description,
                    "required": f.required,
                    "primary_key": f.primary,
                    "indexed": f.indexed,
                    # Normalise a missing/empty enum to an empty list
                    "enum": f.enum_values if f.enum_values else [],
                    # Not every field object declares a size; default to 0
                    "size": getattr(f, "size", 0),
                }
                for f in row_schema.fields
            ],
        }
        # Keys of self.schemas are unused here; iterate values only
        for row_schema in self.schemas.values()
    ]

    # Create prompt variables - schemas array contains ALL schemas.
    # Note: the prompt template expects 'question', not 'sample'.
    variables = {
        "question": request.sample,
        "schemas": all_schemas,
        "options": request.options or {},
    }

    # Each term is JSON-encoded for the prompt template engine
    terms = {k: json.dumps(v) for k, v in variables.items()}

    prompt_request = PromptRequest(
        id=self.schema_selection_prompt,
        terms=terms,
    )

    try:
        logger.info(f"Calling prompt service for schema selection with template: {self.schema_selection_prompt}")
        response = await flow("prompt-request").request(prompt_request)

        if response.error:
            logger.error(f"Prompt service error: {response.error.message}")
            return StructuredDataDiagnosisResponse(
                error=Error(
                    type="PromptServiceError",
                    message="Failed to select schemas using prompt service",
                ),
                operation=request.operation,
            )

        # Check both text and object fields for the payload; 'object' wins
        response_data = None
        if response.object and response.object.strip():
            response_data = response.object.strip()
            logger.debug(f"Using response from 'object' field: {response_data}")
        elif response.text and response.text.strip():
            response_data = response.text.strip()
            logger.debug(f"Using response from 'text' field: {response_data}")
        else:
            logger.error("Empty response from prompt service (checked both text and object fields)")
            return StructuredDataDiagnosisResponse(
                error=Error(
                    type="PromptServiceError",
                    message="Empty response from prompt service",
                ),
                operation=request.operation,
            )

        # Parse the response as a JSON array of schema IDs
        try:
            schema_matches = json.loads(response_data)
            if not isinstance(schema_matches, list):
                raise ValueError("Response must be an array")
        except (json.JSONDecodeError, ValueError) as e:
            logger.error(f"Failed to parse schema matches response: {e}")
            return StructuredDataDiagnosisResponse(
                error=Error(
                    type="ParseError",
                    message="Failed to parse schema selection response as JSON array",
                ),
                operation=request.operation,
            )

        return StructuredDataDiagnosisResponse(
            error=None,
            operation=request.operation,
            schema_matches=schema_matches,
        )

    except Exception as e:
        # Boundary handler: log with traceback and return an error
        # response rather than letting the exception propagate.
        logger.error(f"Error calling prompt service: {e}", exc_info=True)
        return StructuredDataDiagnosisResponse(
            error=Error(
                type="PromptServiceError",
                message="Failed to select schemas using prompt service",
            ),
            operation=request.operation,
        )
async def generate_descriptor_with_prompt(
self, sample: str, data_type: str, target_schema: RowSchema,
options: Dict[str, str], flow

View file

@ -31,28 +31,13 @@ def detect_data_type(sample: str) -> Tuple[Optional[str], float]:
sample = sample.strip()
# Try each format and calculate confidence scores
json_confidence = _check_json_format(sample)
xml_confidence = _check_xml_format(sample)
csv_confidence = _check_csv_format(sample)
logger.debug(f"Format confidence scores - JSON: {json_confidence}, XML: {xml_confidence}, CSV: {csv_confidence}")
# Find the format with highest confidence
scores = {
"json": json_confidence,
"xml": xml_confidence,
"csv": csv_confidence
}
best_format = max(scores, key=scores.get)
best_confidence = scores[best_format]
# Only return a result if confidence is above threshold
if best_confidence < 0.3:
return None, best_confidence
return best_format, best_confidence
# Simple pattern matching
if sample.startswith('<?xml') or (sample.startswith('<') and '</' in sample):
return 'xml', 0.9
elif sample.startswith(('{', '[')):
return 'json', 0.9
else:
return 'csv', 0.8
def _check_json_format(sample: str) -> float:
@ -83,33 +68,20 @@ def _check_json_format(sample: str) -> float:
def _check_xml_format(sample: str) -> float:
"""Check if sample is valid XML format"""
try:
# Quick heuristic checks first
if not sample.startswith('<'):
return 0.0
if not ('>' in sample and '</' in sample):
return 0.1 # Might be incomplete XML
# Try to parse as XML
root = ET.fromstring(sample)
# Higher confidence for XML with multiple child elements
child_count = len(list(root))
if child_count > 10:
return 0.95
elif child_count > 5:
return 0.9
elif child_count > 0:
return 0.8
# XML declaration or starts with tag
if sample.startswith('<?xml') or sample.startswith('<'):
# Must have closing tags for valid XML
if '</' in sample and '>' in sample:
try:
# Quick parse test
ET.fromstring(sample)
return 0.9 # Valid XML
except ET.ParseError:
return 0.3 # Looks like XML but malformed
else:
return 0.6
return 0.1 # Incomplete XML
except ET.ParseError:
# Check for common XML characteristics even if not well-formed
xml_indicators = ['</', '<?xml', 'xmlns:', '<![CDATA[']
score = sum(0.1 for indicator in xml_indicators if indicator in sample)
return min(score, 0.3) # Max 0.3 for malformed XML
return 0.0 # Not XML
def _check_csv_format(sample: str) -> float: