Mirror of https://github.com/trustgraph-ai/trustgraph.git, synced 2026-04-27 17:36:23 +02:00.
Added XML, JSON, CSV detection (#519)
* Improved XML detection and added schema selection
* Added schema-select support plus tests
* API additions
* More tests
* Fixed tests
This commit is contained in:
parent
3d783f4bd4
commit
48016d8fb2
10 changed files with 1240 additions and 54 deletions
|
|
@ -21,6 +21,7 @@ default_ident = "structured-diag"
|
|||
default_csv_prompt = "diagnose-csv"
|
||||
default_json_prompt = "diagnose-json"
|
||||
default_xml_prompt = "diagnose-xml"
|
||||
default_schema_selection_prompt = "schema-selection"
|
||||
|
||||
|
||||
class Processor(FlowProcessor):
|
||||
|
|
@ -36,6 +37,7 @@ class Processor(FlowProcessor):
|
|||
self.csv_prompt = params.get("csv_prompt", default_csv_prompt)
|
||||
self.json_prompt = params.get("json_prompt", default_json_prompt)
|
||||
self.xml_prompt = params.get("xml_prompt", default_xml_prompt)
|
||||
self.schema_selection_prompt = params.get("schema_selection_prompt", default_schema_selection_prompt)
|
||||
|
||||
super(Processor, self).__init__(
|
||||
**params | {
|
||||
|
|
@ -143,10 +145,12 @@ class Processor(FlowProcessor):
|
|||
response = await self.generate_descriptor_operation(request, flow)
|
||||
elif request.operation == "diagnose":
|
||||
response = await self.diagnose_operation(request, flow)
|
||||
elif request.operation == "schema-selection":
|
||||
response = await self.schema_selection_operation(request, flow)
|
||||
else:
|
||||
error = Error(
|
||||
type="InvalidOperation",
|
||||
message=f"Unknown operation: {request.operation}. Supported: detect-type, generate-descriptor, diagnose"
|
||||
message=f"Unknown operation: {request.operation}. Supported: detect-type, generate-descriptor, diagnose, schema-selection"
|
||||
)
|
||||
response = StructuredDataDiagnosisResponse(
|
||||
error=error,
|
||||
|
|
@ -155,7 +159,7 @@ class Processor(FlowProcessor):
|
|||
|
||||
# Send response
|
||||
await flow("response").send(
|
||||
id, response, properties={"id": id}
|
||||
response, properties={"id": id}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
|
|
@ -172,7 +176,7 @@ class Processor(FlowProcessor):
|
|||
)
|
||||
|
||||
await flow("response").send(
|
||||
id, response, properties={"id": id}
|
||||
response, properties={"id": id}
|
||||
)
|
||||
|
||||
async def detect_type_operation(self, request: StructuredDataDiagnosisRequest, flow) -> StructuredDataDiagnosisResponse:
|
||||
|
|
@ -307,6 +311,102 @@ class Processor(FlowProcessor):
|
|||
metadata=metadata
|
||||
)
|
||||
|
||||
async def schema_selection_operation(self, request: StructuredDataDiagnosisRequest, flow) -> StructuredDataDiagnosisResponse:
    """Handle the schema-selection operation.

    Serialises every configured schema, asks the prompt service to pick
    the ones matching the request sample, and returns the selected
    schema identifiers (or an error response on any failure).
    """
    logger.info("Processing schema-selection operation")

    # Describe every configured schema in the same layout the original
    # config uses, so the prompt template sees familiar keys.
    schema_catalogue = []
    for row_schema in self.schemas.values():
        field_entries = [
            {
                "name": fld.name,
                "type": fld.type,
                "description": fld.description,
                "required": fld.required,
                "primary_key": fld.primary,
                "indexed": fld.indexed,
                "enum": fld.enum_values or [],
                "size": getattr(fld, 'size', 0),
            }
            for fld in row_schema.fields
        ]
        schema_catalogue.append({
            "name": row_schema.name,
            "description": row_schema.description,
            "fields": field_entries,
        })

    # NOTE: the prompt template expects the sample under the key
    # 'question'; 'schemas' carries ALL schemas for the model to rank.
    variables = {
        "question": request.sample,
        "schemas": schema_catalogue,
        "options": request.options or {},
    }

    # Every term handed to the prompt service is JSON-encoded text.
    terms = {key: json.dumps(value) for key, value in variables.items()}
    prompt_request = PromptRequest(
        id=self.schema_selection_prompt,
        terms=terms,
    )

    try:
        logger.info(f"Calling prompt service for schema selection with template: {self.schema_selection_prompt}")
        response = await flow("prompt-request").request(prompt_request)

        if response.error:
            logger.error(f"Prompt service error: {response.error.message}")
            return StructuredDataDiagnosisResponse(
                error=Error(
                    type="PromptServiceError",
                    message="Failed to select schemas using prompt service",
                ),
                operation=request.operation,
            )

        # The reply may arrive in either the 'object' or the 'text'
        # field; prefer 'object', then fall back to 'text'.
        if response.object and response.object.strip():
            response_data = response.object.strip()
            logger.debug(f"Using response from 'object' field: {response_data}")
        elif response.text and response.text.strip():
            response_data = response.text.strip()
            logger.debug(f"Using response from 'text' field: {response_data}")
        else:
            logger.error("Empty response from prompt service (checked both text and object fields)")
            return StructuredDataDiagnosisResponse(
                error=Error(
                    type="PromptServiceError",
                    message="Empty response from prompt service",
                ),
                operation=request.operation,
            )

        # The model must answer with a JSON array of schema identifiers.
        try:
            schema_matches = json.loads(response_data)
            if not isinstance(schema_matches, list):
                raise ValueError("Response must be an array")
        except (json.JSONDecodeError, ValueError) as e:
            logger.error(f"Failed to parse schema matches response: {e}")
            return StructuredDataDiagnosisResponse(
                error=Error(
                    type="ParseError",
                    message="Failed to parse schema selection response as JSON array",
                ),
                operation=request.operation,
            )

        return StructuredDataDiagnosisResponse(
            error=None,
            operation=request.operation,
            schema_matches=schema_matches,
        )

    except Exception as e:
        # Catch-all boundary: any transport or unexpected failure is
        # logged and converted into an error response, never raised.
        logger.error(f"Error calling prompt service: {e}", exc_info=True)
        return StructuredDataDiagnosisResponse(
            error=Error(
                type="PromptServiceError",
                message="Failed to select schemas using prompt service",
            ),
            operation=request.operation,
        )
|
||||
|
||||
async def generate_descriptor_with_prompt(
|
||||
self, sample: str, data_type: str, target_schema: RowSchema,
|
||||
options: Dict[str, str], flow
|
||||
|
|
|
|||
|
|
@ -31,28 +31,13 @@ def detect_data_type(sample: str) -> Tuple[Optional[str], float]:
|
|||
|
||||
sample = sample.strip()
|
||||
|
||||
# Try each format and calculate confidence scores
|
||||
json_confidence = _check_json_format(sample)
|
||||
xml_confidence = _check_xml_format(sample)
|
||||
csv_confidence = _check_csv_format(sample)
|
||||
|
||||
logger.debug(f"Format confidence scores - JSON: {json_confidence}, XML: {xml_confidence}, CSV: {csv_confidence}")
|
||||
|
||||
# Find the format with highest confidence
|
||||
scores = {
|
||||
"json": json_confidence,
|
||||
"xml": xml_confidence,
|
||||
"csv": csv_confidence
|
||||
}
|
||||
|
||||
best_format = max(scores, key=scores.get)
|
||||
best_confidence = scores[best_format]
|
||||
|
||||
# Only return a result if confidence is above threshold
|
||||
if best_confidence < 0.3:
|
||||
return None, best_confidence
|
||||
|
||||
return best_format, best_confidence
|
||||
# Simple pattern matching
|
||||
if sample.startswith('<?xml') or (sample.startswith('<') and '</' in sample):
|
||||
return 'xml', 0.9
|
||||
elif sample.startswith(('{', '[')):
|
||||
return 'json', 0.9
|
||||
else:
|
||||
return 'csv', 0.8
|
||||
|
||||
|
||||
def _check_json_format(sample: str) -> float:
|
||||
|
|
@ -83,33 +68,20 @@ def _check_json_format(sample: str) -> float:
|
|||
|
||||
def _check_xml_format(sample: str) -> float:
|
||||
"""Check if sample is valid XML format"""
|
||||
try:
|
||||
# Quick heuristic checks first
|
||||
if not sample.startswith('<'):
|
||||
return 0.0
|
||||
|
||||
if not ('>' in sample and '</' in sample):
|
||||
return 0.1 # Might be incomplete XML
|
||||
|
||||
# Try to parse as XML
|
||||
root = ET.fromstring(sample)
|
||||
|
||||
# Higher confidence for XML with multiple child elements
|
||||
child_count = len(list(root))
|
||||
if child_count > 10:
|
||||
return 0.95
|
||||
elif child_count > 5:
|
||||
return 0.9
|
||||
elif child_count > 0:
|
||||
return 0.8
|
||||
# XML declaration or starts with tag
|
||||
if sample.startswith('<?xml') or sample.startswith('<'):
|
||||
# Must have closing tags for valid XML
|
||||
if '</' in sample and '>' in sample:
|
||||
try:
|
||||
# Quick parse test
|
||||
ET.fromstring(sample)
|
||||
return 0.9 # Valid XML
|
||||
except ET.ParseError:
|
||||
return 0.3 # Looks like XML but malformed
|
||||
else:
|
||||
return 0.6
|
||||
return 0.1 # Incomplete XML
|
||||
|
||||
except ET.ParseError:
|
||||
# Check for common XML characteristics even if not well-formed
|
||||
xml_indicators = ['</', '<?xml', 'xmlns:', '<![CDATA[']
|
||||
score = sum(0.1 for indicator in xml_indicators if indicator in sample)
|
||||
return min(score, 0.3) # Max 0.3 for malformed XML
|
||||
return 0.0 # Not XML
|
||||
|
||||
|
||||
def _check_csv_format(sample: str) -> float:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue