Structured data diagnosis service (#518)

* Import flow tech spec

* Structured diag service

* Plumbed into API gateway

* Type detector

* Diag service

* Added entry point
This commit is contained in:
cybermaggedon 2025-09-16 21:43:23 +01:00 committed by GitHub
parent d73af56690
commit 3d783f4bd4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 1201 additions and 3 deletions

View file

@ -0,0 +1,2 @@
# Structured data diagnosis service
from .service import *

View file

@ -0,0 +1,394 @@
"""
Structured Data Diagnosis Service - analyzes structured data and generates descriptors.
Supports three operations: detect-type, generate-descriptor, and diagnose (combined).
"""
import json
import logging
from typing import Dict, Any, Optional
from ...schema import StructuredDataDiagnosisRequest, StructuredDataDiagnosisResponse
from ...schema import PromptRequest, Error, RowSchema, Field as SchemaField
from ...base import FlowProcessor, ConsumerSpec, ProducerSpec, PromptClientSpec
from .type_detector import detect_data_type, detect_csv_options
# Module logger
logger = logging.getLogger(__name__)
# Default service identifier (overridable via the "id" param)
default_ident = "structured-diag"
# Default prompt template names, one per supported data type
default_csv_prompt = "diagnose-csv"
default_json_prompt = "diagnose-json"
default_xml_prompt = "diagnose-xml"
class Processor(FlowProcessor):
    """
    Structured data diagnosis service.

    Consumes StructuredDataDiagnosisRequest messages and produces
    StructuredDataDiagnosisResponse messages.  Supports three operations:

    - detect-type: algorithmically identify whether a sample is CSV,
      JSON or XML;
    - generate-descriptor: call the prompt service to build a descriptor
      mapping a sample onto a configured schema;
    - diagnose: detect-type followed by generate-descriptor.
    """

    def __init__(self, **params):
        # Service identifier; callers may override via params["id"]
        ident = params.get("id", default_ident)

        # Config key under which schema definitions are published
        self.config_key = params.get("config_type", "schema")

        # Configurable prompt template names, one per data type
        self.csv_prompt = params.get("csv_prompt", default_csv_prompt)
        self.json_prompt = params.get("json_prompt", default_json_prompt)
        self.xml_prompt = params.get("xml_prompt", default_xml_prompt)

        super(Processor, self).__init__(
            **params | {
                "id": ident,
                "config_type": self.config_key,
            }
        )

        # Incoming diagnosis requests
        self.register_specification(
            ConsumerSpec(
                name = "request",
                schema = StructuredDataDiagnosisRequest,
                handler = self.on_message
            )
        )

        # Outgoing diagnosis responses
        self.register_specification(
            ProducerSpec(
                name = "response",
                schema = StructuredDataDiagnosisResponse,
            )
        )

        # Client spec for calling prompt service
        self.register_specification(
            PromptClientSpec(
                request_name = "prompt-request",
                response_name = "prompt-response",
            )
        )

        # Register config handler for schema updates
        self.register_config_handler(self.on_schema_config)

        # Schema storage: name -> RowSchema
        self.schemas: Dict[str, RowSchema] = {}

        logger.info("Structured Data Diagnosis service initialized")

    async def on_schema_config(self, config, version):
        """Handle schema configuration updates.

        Rebuilds self.schemas from the JSON schema definitions published
        under self.config_key.  A schema that fails to parse is logged
        and skipped; the remaining schemas still load.
        """
        logger.info(f"Loading schema configuration version {version}")

        # Clear existing schemas; a config push fully replaces the set
        self.schemas = {}

        # Check if our config type exists
        if self.config_key not in config:
            logger.warning(f"No '{self.config_key}' type in configuration")
            return

        # Get the schemas dictionary for our type
        schemas_config = config[self.config_key]

        # Process each schema in the schemas config
        for schema_name, schema_json in schemas_config.items():
            try:
                # Parse the JSON schema definition
                schema_def = json.loads(schema_json)

                # Create Field objects
                fields = []
                for field_def in schema_def.get("fields", []):
                    field = SchemaField(
                        name=field_def["name"],
                        type=field_def["type"],
                        size=field_def.get("size", 0),
                        primary=field_def.get("primary_key", False),
                        description=field_def.get("description", ""),
                        required=field_def.get("required", False),
                        enum_values=field_def.get("enum", []),
                        indexed=field_def.get("indexed", False)
                    )
                    fields.append(field)

                # Create RowSchema; fall back to the config key as name
                row_schema = RowSchema(
                    name=schema_def.get("name", schema_name),
                    description=schema_def.get("description", ""),
                    fields=fields
                )

                self.schemas[schema_name] = row_schema
                logger.info(f"Loaded schema: {schema_name} with {len(fields)} fields")

            except Exception as e:
                logger.error(f"Failed to parse schema {schema_name}: {e}", exc_info=True)

        logger.info(f"Schema configuration loaded: {len(self.schemas)} schemas")

    async def on_message(self, msg, consumer, flow):
        """Handle incoming structured data diagnosis request.

        Dispatches on request.operation and always sends a response
        (possibly carrying an Error) on the "response" flow.
        """
        # Bind these before the try block so the except handler can
        # reference them even when extracting them from the message
        # fails (otherwise the handler itself raises NameError).
        request = None
        request_id = None

        try:
            request = msg.value()

            # Sender-produced ID, echoed back on the response
            request_id = msg.properties()["id"]

            logger.info(f"Handling structured data diagnosis request {request_id}: operation={request.operation}")

            if request.operation == "detect-type":
                response = await self.detect_type_operation(request, flow)
            elif request.operation == "generate-descriptor":
                response = await self.generate_descriptor_operation(request, flow)
            elif request.operation == "diagnose":
                response = await self.diagnose_operation(request, flow)
            else:
                error = Error(
                    type="InvalidOperation",
                    message=f"Unknown operation: {request.operation}. Supported: detect-type, generate-descriptor, diagnose"
                )
                response = StructuredDataDiagnosisResponse(
                    error=error,
                    operation=request.operation
                )

            # Send response
            await flow("response").send(
                request_id, response, properties={"id": request_id}
            )

        except Exception as e:
            logger.error(f"Error processing diagnosis request: {e}", exc_info=True)

            error = Error(
                type="ProcessingError",
                message=f"Failed to process diagnosis request: {str(e)}"
            )
            response = StructuredDataDiagnosisResponse(
                error=error,
                operation=request.operation if request else "unknown"
            )
            # request_id may still be None here if extraction failed
            await flow("response").send(
                request_id, response, properties={"id": request_id}
            )

    async def detect_type_operation(self, request: StructuredDataDiagnosisRequest, flow) -> StructuredDataDiagnosisResponse:
        """Handle detect-type operation.

        Runs algorithmic type detection on the sample; for CSV also
        attaches detected parsing options in the response metadata.
        """
        logger.info("Processing detect-type operation")

        detected_type, confidence = detect_data_type(request.sample)

        metadata = {}
        if detected_type == "csv":
            csv_options = detect_csv_options(request.sample)
            metadata["csv_options"] = json.dumps(csv_options)

        return StructuredDataDiagnosisResponse(
            error=None,
            operation=request.operation,
            detected_type=detected_type or "",
            confidence=confidence,
            metadata=metadata
        )

    async def generate_descriptor_operation(self, request: StructuredDataDiagnosisRequest, flow) -> StructuredDataDiagnosisResponse:
        """Handle generate-descriptor operation.

        Requires request.type and request.schema_name; resolves the
        target schema and delegates descriptor generation to the
        prompt service.
        """
        logger.info(f"Processing generate-descriptor operation for type: {request.type}")

        if not request.type:
            error = Error(
                type="MissingParameter",
                message="Type parameter is required for generate-descriptor operation"
            )
            return StructuredDataDiagnosisResponse(error=error, operation=request.operation)

        if not request.schema_name:
            error = Error(
                type="MissingParameter",
                message="Schema name parameter is required for generate-descriptor operation"
            )
            return StructuredDataDiagnosisResponse(error=error, operation=request.operation)

        # Get target schema
        if request.schema_name not in self.schemas:
            error = Error(
                type="SchemaNotFound",
                message=f"Schema '{request.schema_name}' not found in configuration"
            )
            return StructuredDataDiagnosisResponse(error=error, operation=request.operation)

        target_schema = self.schemas[request.schema_name]

        # Generate descriptor using prompt service
        descriptor = await self.generate_descriptor_with_prompt(
            request.sample, request.type, target_schema, request.options, flow
        )

        if descriptor is None:
            error = Error(
                type="DescriptorGenerationFailed",
                message="Failed to generate descriptor using prompt service"
            )
            return StructuredDataDiagnosisResponse(error=error, operation=request.operation)

        return StructuredDataDiagnosisResponse(
            error=None,
            operation=request.operation,
            descriptor=json.dumps(descriptor),
            metadata={"schema_name": request.schema_name, "type": request.type}
        )

    async def diagnose_operation(self, request: StructuredDataDiagnosisRequest, flow) -> StructuredDataDiagnosisResponse:
        """Handle combined diagnose operation.

        Detects the data type, resolves a schema (auto-selecting the
        first configured one when none is requested), then generates a
        descriptor via the prompt service.
        """
        logger.info("Processing combined diagnose operation")

        # Step 1: Detect type
        detected_type, confidence = detect_data_type(request.sample)

        if not detected_type:
            error = Error(
                type="TypeDetectionFailed",
                message="Unable to detect data type from sample"
            )
            return StructuredDataDiagnosisResponse(error=error, operation=request.operation)

        # Step 2: Use provided schema name or auto-select first available
        schema_name = request.schema_name
        if not schema_name and self.schemas:
            schema_name = list(self.schemas.keys())[0]
            logger.info(f"Auto-selected schema: {schema_name}")

        if not schema_name:
            error = Error(
                type="NoSchemaAvailable",
                message="No schema specified and no schemas available in configuration"
            )
            return StructuredDataDiagnosisResponse(error=error, operation=request.operation)

        if schema_name not in self.schemas:
            error = Error(
                type="SchemaNotFound",
                message=f"Schema '{schema_name}' not found in configuration"
            )
            return StructuredDataDiagnosisResponse(error=error, operation=request.operation)

        target_schema = self.schemas[schema_name]

        # Step 3: Generate descriptor
        descriptor = await self.generate_descriptor_with_prompt(
            request.sample, detected_type, target_schema, request.options, flow
        )

        if descriptor is None:
            error = Error(
                type="DescriptorGenerationFailed",
                message="Failed to generate descriptor using prompt service"
            )
            return StructuredDataDiagnosisResponse(error=error, operation=request.operation)

        metadata = {
            "schema_name": schema_name,
            "auto_selected_schema": request.schema_name != schema_name
        }

        if detected_type == "csv":
            csv_options = detect_csv_options(request.sample)
            metadata["csv_options"] = json.dumps(csv_options)

        return StructuredDataDiagnosisResponse(
            error=None,
            operation=request.operation,
            detected_type=detected_type,
            confidence=confidence,
            descriptor=json.dumps(descriptor),
            metadata=metadata
        )

    async def generate_descriptor_with_prompt(
        self, sample: str, data_type: str, target_schema: RowSchema,
        options: Dict[str, str], flow
    ) -> Optional[Dict[str, Any]]:
        """Generate descriptor using the appropriate prompt template.

        Returns the parsed descriptor dict, or None on any failure
        (unknown type, prompt service error, unparseable response).
        """
        # Select prompt template based on data type
        prompt_templates = {
            "csv": self.csv_prompt,
            "json": self.json_prompt,
            "xml": self.xml_prompt
        }

        prompt_id = prompt_templates.get(data_type)
        if not prompt_id:
            logger.error(f"No prompt template defined for data type: {data_type}")
            return None

        # Prepare schema information for prompt
        schema_info = {
            "name": target_schema.name,
            "description": target_schema.description,
            "fields": [
                {
                    "name": f.name,
                    "type": f.type,
                    "description": f.description,
                    "required": f.required,
                    "primary_key": f.primary,
                    "indexed": f.indexed,
                    "enum_values": f.enum_values if f.enum_values else []
                }
                for f in target_schema.fields
            ]
        }

        # Create prompt variables
        variables = {
            "sample": sample,
            "schemas": [schema_info],  # Array with single target schema
            "options": options or {}
        }

        # Prompt terms are JSON-encoded strings
        terms = {k: json.dumps(v) for k, v in variables.items()}

        prompt_request = PromptRequest(
            id=prompt_id,
            terms=terms
        )

        try:
            logger.info(f"Calling prompt service with template: {prompt_id}")
            response = await flow("prompt-request").request(prompt_request)

            if response.error:
                logger.error(f"Prompt service error: {response.error.message}")
                return None

            # Parse response: prefer the structured object, fall back to text
            if response.object:
                try:
                    return json.loads(response.object)
                except json.JSONDecodeError as e:
                    logger.error(f"Failed to parse prompt response as JSON: {e}")
                    logger.debug(f"Response object: {response.object}")
                    return None
            elif response.text:
                try:
                    return json.loads(response.text)
                except json.JSONDecodeError as e:
                    logger.error(f"Failed to parse prompt text response as JSON: {e}")
                    logger.debug(f"Response text: {response.text}")
                    return None
            else:
                logger.error("Empty response from prompt service")
                return None

        except Exception as e:
            logger.error(f"Error calling prompt service: {e}", exc_info=True)
            return None
def run():
    """Entry point for structured-diag command"""
    # Delegate to the FlowProcessor launch helper with the default
    # service identifier and this module's docstring as the description.
    Processor.launch(default_ident, __doc__)

View file

@ -0,0 +1,236 @@
"""
Algorithmic data type detection for structured data.
Determines if data is CSV, JSON, or XML based on content analysis.
"""
import json
import xml.etree.ElementTree as ET
import csv
from io import StringIO
import logging
from typing import Dict, Optional, Tuple
# Module logger
logger = logging.getLogger(__name__)
def detect_data_type(sample: str) -> Tuple[Optional[str], float]:
    """
    Determine whether a data sample is CSV, JSON, or XML.

    Args:
        sample: String containing data sample to analyze

    Returns:
        Tuple of (detected_type, confidence_score), where detected_type
        is "csv", "json", "xml", or None when no format can be
        determined, and confidence_score is a float in [0.0, 1.0].
    """
    if not sample or not sample.strip():
        return None, 0.0

    sample = sample.strip()

    # Score each candidate format independently.
    scores = {
        "json": _check_json_format(sample),
        "xml": _check_xml_format(sample),
        "csv": _check_csv_format(sample),
    }

    logger.debug(f"Format confidence scores - JSON: {scores['json']}, XML: {scores['xml']}, CSV: {scores['csv']}")

    # Pick the highest-scoring format (first wins on a tie).
    best_format, best_confidence = max(scores.items(), key=lambda kv: kv[1])

    # Refuse to commit to any format below the confidence threshold.
    if best_confidence < 0.3:
        return None, best_confidence

    return best_format, best_confidence
def _check_json_format(sample: str) -> float:
"""Check if sample is valid JSON format"""
try:
# Must start with { or [
if not (sample.startswith('{') or sample.startswith('[')):
return 0.0
# Try to parse as JSON
data = json.loads(sample)
# Higher confidence for structured data
if isinstance(data, dict):
return 0.95
elif isinstance(data, list) and len(data) > 0:
# Check if it's an array of objects (common for structured data)
if isinstance(data[0], dict):
return 0.9
else:
return 0.7
else:
return 0.6
except (json.JSONDecodeError, ValueError):
return 0.0
def _check_xml_format(sample: str) -> float:
"""Check if sample is valid XML format"""
try:
# Quick heuristic checks first
if not sample.startswith('<'):
return 0.0
if not ('>' in sample and '</' in sample):
return 0.1 # Might be incomplete XML
# Try to parse as XML
root = ET.fromstring(sample)
# Higher confidence for XML with multiple child elements
child_count = len(list(root))
if child_count > 10:
return 0.95
elif child_count > 5:
return 0.9
elif child_count > 0:
return 0.8
else:
return 0.6
except ET.ParseError:
# Check for common XML characteristics even if not well-formed
xml_indicators = ['</', '<?xml', 'xmlns:', '<![CDATA[']
score = sum(0.1 for indicator in xml_indicators if indicator in sample)
return min(score, 0.3) # Max 0.3 for malformed XML
def _check_csv_format(sample: str) -> float:
    """Return a 0.0-1.0 confidence that the sample is delimited text."""
    try:
        # A single line cannot demonstrate tabular structure.
        if len(sample.strip().split('\n')) < 2:
            return 0.0

        # Take the best score over the candidate delimiters.
        return max(
            _check_csv_with_delimiter(sample, d)
            for d in (',', ';', '\t', '|')
        )
    except Exception:
        return 0.0
def _check_csv_with_delimiter(sample: str, delimiter: str) -> float:
"""Check CSV format with specific delimiter"""
try:
reader = csv.reader(StringIO(sample), delimiter=delimiter)
rows = list(reader)
if len(rows) < 2:
return 0.0
# Check consistency of column counts
first_row_cols = len(rows[0])
if first_row_cols < 2:
return 0.0
consistent_rows = 0
for row in rows[1:]:
if len(row) == first_row_cols:
consistent_rows += 1
consistency_ratio = consistent_rows / (len(rows) - 1) if len(rows) > 1 else 0
# Base score on consistency and structure
if consistency_ratio > 0.8:
# Higher score for more columns and rows
column_bonus = min(first_row_cols * 0.05, 0.2)
row_bonus = min(len(rows) * 0.01, 0.1)
return min(0.7 + column_bonus + row_bonus, 0.95)
elif consistency_ratio > 0.6:
return 0.5
else:
return 0.2
except Exception:
return 0.0
def detect_csv_options(sample: str) -> Dict[str, object]:
    """
    Detect CSV-specific options like delimiter and header presence.

    Args:
        sample: CSV data sample

    Returns:
        Dict with detected options: "delimiter" (str), "has_header"
        (bool), and "encoding" (str; always "utf-8" — encoding
        detection is not implemented).
    """
    # Defaults, returned as-is when the sample is too small or
    # detection raises.
    options: Dict[str, object] = {
        "delimiter": ",",
        "has_header": True,
        "encoding": "utf-8"
    }

    try:
        lines = sample.strip().split('\n')
        if len(lines) < 2:
            return options

        # Detect delimiter: keep the candidate whose parse scores best.
        best_delimiter = ","
        best_score = 0.0
        for delimiter in (',', ';', '\t', '|'):
            score = _check_csv_with_delimiter(sample, delimiter)
            if score > best_score:
                best_score = score
                best_delimiter = delimiter

        options["delimiter"] = best_delimiter

        # Detect header (heuristic: a header row is mostly textual while
        # data rows contain comparatively more numeric cells).
        reader = csv.reader(StringIO(sample), delimiter=best_delimiter)
        rows = list(reader)
        if len(rows) >= 2:
            first_row = rows[0]
            second_row = rows[1]

            # Count numeric fields in each row
            first_numeric = sum(1 for cell in first_row if _is_numeric(cell))
            second_numeric = sum(1 for cell in second_row if _is_numeric(cell))

            # First row is likely a header when the second row is more
            # numeric and the first row is not itself mostly numeric.
            options["has_header"] = (
                second_numeric > first_numeric
                and first_numeric < len(first_row) * 0.7
            )

    except Exception as e:
        # Best-effort: on any failure fall back to the defaults.
        logger.debug(f"Error detecting CSV options: {e}")

    return options
def _is_numeric(value: str) -> bool:
"""Check if a string value represents a number"""
try:
float(value.strip())
return True
except (ValueError, AttributeError):
return False