mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-26 08:56:21 +02:00
Added XML, JSON, CSV detection (#519)
* Improved XML detect, added schema selection
* Add schema select + tests
* API additions
* More tests
* Fixed tests
This commit is contained in:
parent
3d783f4bd4
commit
48016d8fb2
10 changed files with 1240 additions and 54 deletions
|
|
@ -492,12 +492,148 @@ class FlowInstance:
|
|||
"service/structured-query",
|
||||
input
|
||||
)
|
||||
|
||||
|
||||
# Check for system-level error
|
||||
if "error" in response and response["error"]:
|
||||
error_type = response["error"].get("type", "unknown")
|
||||
error_message = response["error"].get("message", "Unknown error")
|
||||
raise ProtocolException(f"{error_type}: {error_message}")
|
||||
|
||||
|
||||
return response
|
||||
|
||||
def detect_type(self, sample):
    """
    Detect the data type of a structured data sample.

    Sends a "detect-type" operation to the "service/structured-diag"
    endpoint through self.request.

    Args:
        sample: Data sample to analyze (string content)

    Returns:
        The value of the "detected-type" field of the service response
        (only that field, not the whole response dict).

    Raises:
        ProtocolException: if the response carries a non-empty "error"
            entry; the message is formatted as "<type>: <message>".
        KeyError: if the response has no "detected-type" field.
    """

    # Named `payload` rather than `input` so the builtin is not shadowed.
    payload = {
        "operation": "detect-type",
        "sample": sample,
    }

    response = self.request("service/structured-diag", payload)

    # Check for system-level error
    if "error" in response and response["error"]:
        error = response["error"]
        error_type = error.get("type", "unknown")
        error_message = error.get("message", "Unknown error")
        raise ProtocolException(f"{error_type}: {error_message}")

    return response["detected-type"]
|
||||
|
||||
def generate_descriptor(self, sample, data_type, schema_name, options=None):
    """
    Generate a descriptor for structured data mapping to a specific schema.

    Sends a "generate-descriptor" operation to the
    "service/structured-diag" endpoint through self.request.

    Args:
        sample: Data sample to analyze (string content)
        data_type: Data type (csv, json, xml)
        schema_name: Target schema name for descriptor generation
        options: Optional parameters (e.g., delimiter for CSV); only
            added to the request when truthy

    Returns:
        The value of the "descriptor" field of the service response
        (only that field, not the whole response dict).

    Raises:
        ProtocolException: if the response carries a non-empty "error"
            entry; the message is formatted as "<type>: <message>".
        KeyError: if the response has no "descriptor" field.
    """

    # Named `payload` rather than `input` so the builtin is not shadowed.
    payload = {
        "operation": "generate-descriptor",
        "sample": sample,
        "type": data_type,
        "schema-name": schema_name,
    }

    if options:
        payload["options"] = options

    response = self.request("service/structured-diag", payload)

    # Check for system-level error
    if "error" in response and response["error"]:
        error = response["error"]
        error_type = error.get("type", "unknown")
        error_message = error.get("message", "Unknown error")
        raise ProtocolException(f"{error_type}: {error_message}")

    return response["descriptor"]
|
||||
|
||||
def diagnose_data(self, sample, schema_name=None, options=None):
    """
    Perform combined data diagnosis: detect type and generate descriptor.

    Sends a "diagnose" operation to the "service/structured-diag"
    endpoint through self.request.

    Args:
        sample: Data sample to analyze (string content)
        schema_name: Optional target schema name for descriptor
            generation; only added to the request when truthy
        options: Optional parameters (e.g., delimiter for CSV); only
            added to the request when truthy

    Returns:
        The complete service response, unmodified.

    Raises:
        ProtocolException: if the response carries a non-empty "error"
            entry; the message is formatted as "<type>: <message>".
    """

    # Named `payload` rather than `input` so the builtin is not shadowed.
    payload = {
        "operation": "diagnose",
        "sample": sample,
    }

    if schema_name:
        payload["schema-name"] = schema_name

    if options:
        payload["options"] = options

    response = self.request("service/structured-diag", payload)

    # Check for system-level error
    if "error" in response and response["error"]:
        error = response["error"]
        error_type = error.get("type", "unknown")
        error_message = error.get("message", "Unknown error")
        raise ProtocolException(f"{error_type}: {error_message}")

    return response
|
||||
|
||||
def schema_selection(self, sample, options=None):
    """
    Select matching schemas for a data sample using prompt analysis.

    Sends a "schema-selection" operation to the
    "service/structured-diag" endpoint through self.request.

    Args:
        sample: Data sample to analyze (string content)
        options: Optional parameters; only added to the request when
            truthy

    Returns:
        The value of the "schema-matches" field of the service response
        (only that field, not the whole response dict), or an empty
        list when the response does not contain that field.

    Raises:
        ProtocolException: if the response carries a non-empty "error"
            entry; the message is formatted as "<type>: <message>".
    """

    # Named `payload` rather than `input` so the builtin is not shadowed.
    payload = {
        "operation": "schema-selection",
        "sample": sample,
    }

    if options:
        payload["options"] = options

    response = self.request("service/structured-diag", payload)

    # Check for system-level error
    error = response.get("error")
    if error:
        error_type = error.get("type", "unknown")
        error_message = error.get("message", "Unknown error")
        raise ProtocolException(f"{error_type}: {error_message}")

    # A response may legitimately lack "schema-matches" (the field is
    # optional on the wire), so report "no matches" instead of raising
    # KeyError.
    return response.get("schema-matches", [])
|
||||
|
||||
|
|
|
|||
|
|
@ -57,6 +57,8 @@ class StructuredDataDiagnosisResponseTranslator(MessageTranslator):
|
|||
result["descriptor"] = obj.descriptor
|
||||
if obj.metadata:
|
||||
result["metadata"] = obj.metadata
|
||||
if obj.schema_matches is not None:
|
||||
result["schema-matches"] = obj.schema_matches
|
||||
|
||||
return result
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
from pulsar.schema import Record, String, Map, Double
|
||||
from pulsar.schema import Record, String, Map, Double, Array
|
||||
from ..core.primitives import Error
|
||||
|
||||
############################################################################
|
||||
|
|
@ -6,7 +6,7 @@ from ..core.primitives import Error
|
|||
# Structured data diagnosis services
|
||||
|
||||
class StructuredDataDiagnosisRequest(Record):
|
||||
operation = String() # "detect-type", "generate-descriptor", or "diagnose"
|
||||
operation = String() # "detect-type", "generate-descriptor", "diagnose", or "schema-selection"
|
||||
sample = String() # Data sample to analyze (text content)
|
||||
type = String() # Data type (csv, json, xml) - optional, required for generate-descriptor
|
||||
schema_name = String() # Target schema name for descriptor generation - optional
|
||||
|
|
@ -27,4 +27,7 @@ class StructuredDataDiagnosisResponse(Record):
|
|||
# JSON encoded additional metadata (e.g., field count, sample records)
|
||||
metadata = Map(String())
|
||||
|
||||
# Array of matching schema IDs (for schema-selection operation) - optional
|
||||
schema_matches = Array(String())
|
||||
|
||||
############################################################################
|
||||
Loading…
Add table
Add a link
Reference in a new issue