Added XML, JSON, CSV detection (#519)

* Improved XML detect, added schema selection * Add schema select + tests * API additions * More tests * Fixed tests
2026-04-26 08:56:21 +02:00 · 2025-09-16 23:53:43 +01:00 · 2025-09-16 23:53:43 +01:00 · 48016d8fb2
commit 48016d8fb2
parent 3d783f4bd4
10 changed files with 1240 additions and 54 deletions
--- a/trustgraph-base/trustgraph/api/flow.py
+++ b/trustgraph-base/trustgraph/api/flow.py
@ -492,12 +492,148 @@ class FlowInstance:
            "service/structured-query",
            input
        )
-        
+
        # Check for system-level error
        if "error" in response and response["error"]:
            error_type = response["error"].get("type", "unknown")
            error_message = response["error"].get("message", "Unknown error")
            raise ProtocolException(f"{error_type}: {error_message}")
-        
+
        return response

+    def detect_type(self, sample):
+        """
+        Detect the data type of a structured data sample.
+
+        Args:
+            sample: Data sample to analyze (string content)
+
+        Returns:
+            The "detected-type" field of the structured-diag service response
+        """
+
+        input = {
+            "operation": "detect-type",
+            "sample": sample
+        }
+
+        response = self.request(
+            "service/structured-diag",
+            input
+        )
+
+        # Raise ProtocolException if the response carries a truthy "error"
+        if "error" in response and response["error"]:
+            error_type = response["error"].get("type", "unknown")
+            error_message = response["error"].get("message", "Unknown error")
+            raise ProtocolException(f"{error_type}: {error_message}")
+
+        return response["detected-type"]
+
+    def generate_descriptor(self, sample, data_type, schema_name, options=None):
+        """
+        Generate a descriptor for structured data mapping to a specific schema.
+
+        Args:
+            sample: Data sample to analyze (string content)
+            data_type: Data type (csv, json, xml), sent as the "type" field
+            schema_name: Target schema name, sent as the "schema-name" field
+            options: Optional parameters (e.g., delimiter for CSV); sent only when truthy
+
+        Returns:
+            The "descriptor" field of the structured-diag service response
+        """
+
+        input = {
+            "operation": "generate-descriptor",
+            "sample": sample,
+            "type": data_type,
+            "schema-name": schema_name
+        }
+
+        if options:
+            input["options"] = options
+
+        response = self.request(
+            "service/structured-diag",
+            input
+        )
+
+        # Raise ProtocolException if the response carries a truthy "error"
+        if "error" in response and response["error"]:
+            error_type = response["error"].get("type", "unknown")
+            error_message = response["error"].get("message", "Unknown error")
+            raise ProtocolException(f"{error_type}: {error_message}")
+
+        return response["descriptor"]
+
+    def diagnose_data(self, sample, schema_name=None, options=None):
+        """
+        Run the combined "diagnose" operation of the structured-diag service.
+
+        Args:
+            sample: Data sample to analyze (string content)
+            schema_name: Optional target schema name; sent only when truthy
+            options: Optional parameters (e.g., delimiter for CSV); sent only when truthy
+
+        Returns:
+            The service response, returned unmodified once the error check passes
+        """
+
+        input = {
+            "operation": "diagnose",
+            "sample": sample
+        }
+
+        if schema_name:
+            input["schema-name"] = schema_name
+
+        if options:
+            input["options"] = options
+
+        response = self.request(
+            "service/structured-diag",
+            input
+        )
+
+        # Raise ProtocolException if the response carries a truthy "error"
+        if "error" in response and response["error"]:
+            error_type = response["error"].get("type", "unknown")
+            error_message = response["error"].get("message", "Unknown error")
+            raise ProtocolException(f"{error_type}: {error_message}")
+
+        return response
+
+    def schema_selection(self, sample, options=None):
+        """
+        Request the "schema-selection" operation for a data sample.
+
+        Args:
+            sample: Data sample to analyze (string content)
+            options: Optional parameters; sent only when truthy
+
+        Returns:
+            The "schema-matches" field of the structured-diag service response
+        """
+
+        input = {
+            "operation": "schema-selection",
+            "sample": sample
+        }
+
+        if options:
+            input["options"] = options
+
+        response = self.request(
+            "service/structured-diag",
+            input
+        )
+
+        # Raise ProtocolException if the response carries a truthy "error"
+        if "error" in response and response["error"]:
+            error_type = response["error"].get("type", "unknown")
+            error_message = response["error"].get("message", "Unknown error")
+            raise ProtocolException(f"{error_type}: {error_message}")
+        # NOTE(review): presumably the key is always present here - confirm
+        return response["schema-matches"]
+
--- a/trustgraph-base/trustgraph/messaging/translators/diagnosis.py
+++ b/trustgraph-base/trustgraph/messaging/translators/diagnosis.py
@ -57,6 +57,8 @@ class StructuredDataDiagnosisResponseTranslator(MessageTranslator):
                result["descriptor"] = obj.descriptor
        if obj.metadata:
            result["metadata"] = obj.metadata
+        if obj.schema_matches is not None:
+            result["schema-matches"] = obj.schema_matches

        return result

--- a/trustgraph-base/trustgraph/schema/services/diagnosis.py
+++ b/trustgraph-base/trustgraph/schema/services/diagnosis.py
@ -1,4 +1,4 @@
-from pulsar.schema import Record, String, Map, Double
+from pulsar.schema import Record, String, Map, Double, Array
 from ..core.primitives import Error

 ############################################################################
@ -6,7 +6,7 @@ from ..core.primitives import Error
 # Structured data diagnosis services

 class StructuredDataDiagnosisRequest(Record):
-    operation = String()  # "detect-type", "generate-descriptor", or "diagnose"
+    operation = String()  # "detect-type", "generate-descriptor", "diagnose", or "schema-selection"
    sample = String()     # Data sample to analyze (text content)
    type = String()       # Data type (csv, json, xml) - optional, required for generate-descriptor
    schema_name = String() # Target schema name for descriptor generation - optional
@ -27,4 +27,7 @@ class StructuredDataDiagnosisResponse(Record):
    # JSON encoded additional metadata (e.g., field count, sample records)
    metadata = Map(String())

+    # Array of matching schema IDs (for schema-selection operation) - optional
+    schema_matches = Array(String())
+
 ############################################################################