Added XML, JSON, CSV detection (#519)

* Improved XML detection, added schema selection

* Add schema select + tests

* API additions

* More tests

* Fixed tests
This commit is contained in:
cybermaggedon 2025-09-16 23:53:43 +01:00 committed by GitHub
parent 3d783f4bd4
commit 48016d8fb2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 1240 additions and 54 deletions

View file

@ -21,6 +21,7 @@ default_ident = "structured-diag"
default_csv_prompt = "diagnose-csv"
default_json_prompt = "diagnose-json"
default_xml_prompt = "diagnose-xml"
default_schema_selection_prompt = "schema-selection"
class Processor(FlowProcessor):
@ -36,6 +37,7 @@ class Processor(FlowProcessor):
self.csv_prompt = params.get("csv_prompt", default_csv_prompt)
self.json_prompt = params.get("json_prompt", default_json_prompt)
self.xml_prompt = params.get("xml_prompt", default_xml_prompt)
self.schema_selection_prompt = params.get("schema_selection_prompt", default_schema_selection_prompt)
super(Processor, self).__init__(
**params | {
@ -143,10 +145,12 @@ class Processor(FlowProcessor):
response = await self.generate_descriptor_operation(request, flow)
elif request.operation == "diagnose":
response = await self.diagnose_operation(request, flow)
elif request.operation == "schema-selection":
response = await self.schema_selection_operation(request, flow)
else:
error = Error(
type="InvalidOperation",
message=f"Unknown operation: {request.operation}. Supported: detect-type, generate-descriptor, diagnose"
message=f"Unknown operation: {request.operation}. Supported: detect-type, generate-descriptor, diagnose, schema-selection"
)
response = StructuredDataDiagnosisResponse(
error=error,
@ -155,7 +159,7 @@ class Processor(FlowProcessor):
# Send response
await flow("response").send(
id, response, properties={"id": id}
response, properties={"id": id}
)
except Exception as e:
@ -172,7 +176,7 @@ class Processor(FlowProcessor):
)
await flow("response").send(
id, response, properties={"id": id}
response, properties={"id": id}
)
async def detect_type_operation(self, request: StructuredDataDiagnosisRequest, flow) -> StructuredDataDiagnosisResponse:
@ -307,6 +311,102 @@ class Processor(FlowProcessor):
metadata=metadata
)
async def schema_selection_operation(self, request: StructuredDataDiagnosisRequest, flow) -> StructuredDataDiagnosisResponse:
    """Handle the schema-selection operation.

    Renders every configured row schema into the format the prompt
    template expects, asks the prompt service (template
    ``self.schema_selection_prompt``) which schemas match the sample,
    and returns the selected schema identifiers.

    Args:
        request: Incoming diagnosis request; ``request.sample`` carries
            the data sample and ``request.options`` optional settings.
        flow: Flow accessor used to reach the "prompt-request" service.

    Returns:
        StructuredDataDiagnosisResponse with ``schema_matches`` set on
        success, or ``error`` set on any failure (this method never
        raises — all exceptions are converted to error responses).
    """
    logger.info("Processing schema-selection operation")

    # Prepare all schemas for the prompt - match the original config format
    all_schemas = [
        {
            "name": row_schema.name,
            "description": row_schema.description,
            "fields": [
                {
                    "name": f.name,
                    "type": f.type,
                    "description": f.description,
                    "required": f.required,
                    "primary_key": f.primary,
                    "indexed": f.indexed,
                    # Normalise a missing/empty enum to an empty list
                    "enum": f.enum_values if f.enum_values else [],
                    # Not every field object declares a size; default to 0
                    "size": getattr(f, "size", 0),
                }
                for f in row_schema.fields
            ],
        }
        # Keys of self.schemas are unused here; iterate values only
        for row_schema in self.schemas.values()
    ]

    # Create prompt variables - schemas array contains ALL schemas.
    # Note: the prompt template expects 'question', not 'sample'.
    variables = {
        "question": request.sample,
        "schemas": all_schemas,
        "options": request.options or {},
    }

    # Each term is JSON-encoded for the prompt template engine
    terms = {k: json.dumps(v) for k, v in variables.items()}

    prompt_request = PromptRequest(
        id=self.schema_selection_prompt,
        terms=terms,
    )

    try:
        logger.info(f"Calling prompt service for schema selection with template: {self.schema_selection_prompt}")
        response = await flow("prompt-request").request(prompt_request)

        if response.error:
            logger.error(f"Prompt service error: {response.error.message}")
            return StructuredDataDiagnosisResponse(
                error=Error(
                    type="PromptServiceError",
                    message="Failed to select schemas using prompt service",
                ),
                operation=request.operation,
            )

        # Check both text and object fields for the payload; 'object' wins
        response_data = None
        if response.object and response.object.strip():
            response_data = response.object.strip()
            logger.debug(f"Using response from 'object' field: {response_data}")
        elif response.text and response.text.strip():
            response_data = response.text.strip()
            logger.debug(f"Using response from 'text' field: {response_data}")
        else:
            logger.error("Empty response from prompt service (checked both text and object fields)")
            return StructuredDataDiagnosisResponse(
                error=Error(
                    type="PromptServiceError",
                    message="Empty response from prompt service",
                ),
                operation=request.operation,
            )

        # Parse the response as a JSON array of schema IDs
        try:
            schema_matches = json.loads(response_data)
            if not isinstance(schema_matches, list):
                raise ValueError("Response must be an array")
        except (json.JSONDecodeError, ValueError) as e:
            logger.error(f"Failed to parse schema matches response: {e}")
            return StructuredDataDiagnosisResponse(
                error=Error(
                    type="ParseError",
                    message="Failed to parse schema selection response as JSON array",
                ),
                operation=request.operation,
            )

        return StructuredDataDiagnosisResponse(
            error=None,
            operation=request.operation,
            schema_matches=schema_matches,
        )

    except Exception as e:
        # Boundary handler: log with traceback and return an error
        # response rather than letting the exception propagate.
        logger.error(f"Error calling prompt service: {e}", exc_info=True)
        return StructuredDataDiagnosisResponse(
            error=Error(
                type="PromptServiceError",
                message="Failed to select schemas using prompt service",
            ),
            operation=request.operation,
        )
async def generate_descriptor_with_prompt(
self, sample: str, data_type: str, target_schema: RowSchema,
options: Dict[str, str], flow

View file

@ -31,28 +31,13 @@ def detect_data_type(sample: str) -> Tuple[Optional[str], float]:
sample = sample.strip()
# Try each format and calculate confidence scores
json_confidence = _check_json_format(sample)
xml_confidence = _check_xml_format(sample)
csv_confidence = _check_csv_format(sample)
logger.debug(f"Format confidence scores - JSON: {json_confidence}, XML: {xml_confidence}, CSV: {csv_confidence}")
# Find the format with highest confidence
scores = {
"json": json_confidence,
"xml": xml_confidence,
"csv": csv_confidence
}
best_format = max(scores, key=scores.get)
best_confidence = scores[best_format]
# Only return a result if confidence is above threshold
if best_confidence < 0.3:
return None, best_confidence
return best_format, best_confidence
# Simple pattern matching
if sample.startswith('<?xml') or (sample.startswith('<') and '</' in sample):
return 'xml', 0.9
elif sample.startswith(('{', '[')):
return 'json', 0.9
else:
return 'csv', 0.8
def _check_json_format(sample: str) -> float:
@ -83,33 +68,20 @@ def _check_json_format(sample: str) -> float:
def _check_xml_format(sample: str) -> float:
"""Check if sample is valid XML format"""
try:
# Quick heuristic checks first
if not sample.startswith('<'):
return 0.0
if not ('>' in sample and '</' in sample):
return 0.1 # Might be incomplete XML
# Try to parse as XML
root = ET.fromstring(sample)
# Higher confidence for XML with multiple child elements
child_count = len(list(root))
if child_count > 10:
return 0.95
elif child_count > 5:
return 0.9
elif child_count > 0:
return 0.8
# XML declaration or starts with tag
if sample.startswith('<?xml') or sample.startswith('<'):
# Must have closing tags for valid XML
if '</' in sample and '>' in sample:
try:
# Quick parse test
ET.fromstring(sample)
return 0.9 # Valid XML
except ET.ParseError:
return 0.3 # Looks like XML but malformed
else:
return 0.6
return 0.1 # Incomplete XML
except ET.ParseError:
# Check for common XML characteristics even if not well-formed
xml_indicators = ['</', '<?xml', 'xmlns:', '<![CDATA[']
score = sum(0.1 for indicator in xml_indicators if indicator in sample)
return min(score, 0.3) # Max 0.3 for malformed XML
return 0.0 # Not XML
def _check_csv_format(sample: str) -> float: