Structured data diagnosis service (#518)

* Import flow tech spec

* Structured diag service

* Plumbed into API gateway

* Type detector

* Diag service

* Added entry point
This commit is contained in:
cybermaggedon 2025-09-16 21:43:23 +01:00 committed by GitHub
parent d73af56690
commit 3d783f4bd4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 1201 additions and 3 deletions

View file

@ -24,6 +24,7 @@ from .translators.embeddings_query import (
from .translators.objects_query import ObjectsQueryRequestTranslator, ObjectsQueryResponseTranslator
from .translators.nlp_query import QuestionToStructuredQueryRequestTranslator, QuestionToStructuredQueryResponseTranslator
from .translators.structured_query import StructuredQueryRequestTranslator, StructuredQueryResponseTranslator
from .translators.diagnosis import StructuredDataDiagnosisRequestTranslator, StructuredDataDiagnosisResponseTranslator
# Register all service translators
TranslatorRegistry.register_service(
@ -123,11 +124,17 @@ TranslatorRegistry.register_service(
)
# Register the request/response translators for the "structured-query" service.
# NOTE(review): the service name and the request translator each appear twice
# in the argument list below; this looks like a diff rendering of a
# whitespace-only change (old and new lines shown together) rather than the
# real call — confirm against the actual file.
TranslatorRegistry.register_service(
"structured-query",
StructuredQueryRequestTranslator(),
"structured-query",
StructuredQueryRequestTranslator(),
StructuredQueryResponseTranslator()
)
# Register the request/response translator pair for the "structured-diag"
# (structured data diagnosis) service.
TranslatorRegistry.register_service(
"structured-diag",
StructuredDataDiagnosisRequestTranslator(),
StructuredDataDiagnosisResponseTranslator()
)
# Register single-direction translators for document loading
TranslatorRegistry.register_request("document", DocumentTranslator())
TranslatorRegistry.register_request("text-document", TextDocumentTranslator())

View file

@ -18,3 +18,4 @@ from .embeddings_query import (
GraphEmbeddingsRequestTranslator, GraphEmbeddingsResponseTranslator
)
from .objects_query import ObjectsQueryRequestTranslator, ObjectsQueryResponseTranslator
from .diagnosis import StructuredDataDiagnosisRequestTranslator, StructuredDataDiagnosisResponseTranslator

View file

@ -0,0 +1,65 @@
from typing import Dict, Any, Tuple
import json
from ...schema import StructuredDataDiagnosisRequest, StructuredDataDiagnosisResponse
from .base import MessageTranslator
class StructuredDataDiagnosisRequestTranslator(MessageTranslator):
    """Map structured-data diagnosis requests between dicts and schema records."""

    def to_pulsar(self, data: Dict[str, Any]) -> StructuredDataDiagnosisRequest:
        """Build a StructuredDataDiagnosisRequest from a request dict.

        The "operation" and "sample" keys are mandatory; a KeyError is raised
        when either is missing. "type" and "schema-name" default to an empty
        string and "options" defaults to an empty dict.
        """
        # Keys are read in the same order as the record's keyword arguments;
        # note the dashed "schema-name" key maps onto the schema_name field.
        record_fields = {
            "operation": data["operation"],
            "sample": data["sample"],
            "type": data.get("type", ""),
            "schema_name": data.get("schema-name", ""),
            "options": data.get("options", {}),
        }
        return StructuredDataDiagnosisRequest(**record_fields)

    def from_pulsar(self, obj: StructuredDataDiagnosisRequest) -> Dict[str, Any]:
        """Convert a request record into a dict.

        "operation" and "sample" are always present in the result; "type",
        "schema-name" and "options" are included only when their value on
        the record is truthy.
        """
        payload = {
            "operation": obj.operation,
            "sample": obj.sample,
        }

        # Optional fields, listed in the order they should appear in the dict.
        optional_fields = (
            ("type", obj.type),
            ("schema-name", obj.schema_name),
            ("options", obj.options),
        )
        for key, value in optional_fields:
            if value:
                payload[key] = value

        return payload
class StructuredDataDiagnosisResponseTranslator(MessageTranslator):
    """Map structured-data diagnosis response records to plain dicts."""

    def to_pulsar(self, data: Dict[str, Any]) -> StructuredDataDiagnosisResponse:
        """Unsupported direction; always raises NotImplementedError."""
        raise NotImplementedError("Response translation to Pulsar not typically needed")

    def from_pulsar(self, obj: StructuredDataDiagnosisResponse) -> Dict[str, Any]:
        """Convert a response record into a dict.

        "operation" is always present. "detected-type", "descriptor" and
        "metadata" are included only when truthy on the record; "confidence"
        is included whenever it is not None, so a value of 0.0 is kept.
        """
        payload: Dict[str, Any] = {"operation": obj.operation}

        detected = obj.detected_type
        if detected:
            payload["detected-type"] = detected

        score = obj.confidence
        if score is not None:
            payload["confidence"] = score

        raw_descriptor = obj.descriptor
        if raw_descriptor:
            # The descriptor is expected to hold JSON text; if it cannot be
            # decoded, pass the raw value through unchanged.
            try:
                decoded = json.loads(raw_descriptor)
            except (json.JSONDecodeError, TypeError):
                decoded = raw_descriptor
            payload["descriptor"] = decoded

        extra = obj.metadata
        if extra:
            payload["metadata"] = extra

        return payload

    def from_response_with_completion(self, obj: StructuredDataDiagnosisResponse) -> Tuple[Dict[str, Any], bool]:
        """Return (response_dict, is_final); is_final is always True here."""
        translated = self.from_pulsar(obj)
        return translated, True

View file

@ -9,4 +9,5 @@ from .library import *
from .lookup import *
from .nlp_query import *
from .structured_query import *
from .objects_query import *
from .objects_query import *
from .diagnosis import *

View file

@ -0,0 +1,30 @@
from pulsar.schema import Record, String, Map, Double
from ..core.primitives import Error
############################################################################
# Structured data diagnosis services
# Request record for the structured data diagnosis service.
# Field declarations are kept in their original order.
class StructuredDataDiagnosisRequest(Record):
operation = String() # "detect-type", "generate-descriptor", or "diagnose"
sample = String() # Data sample to analyze (text content)
type = String() # Data type (csv, json, xml) - optional, required for generate-descriptor
schema_name = String() # Target schema name for descriptor generation - optional
# Options map with string values (e.g., delimiter for CSV).
# NOTE(review): originally described as "JSON encoded"; the field itself is a
# Map(String()), so presumably each value is a (possibly JSON-encoded) string
# — confirm against the service implementation.
options = Map(String())
# Response record for the structured data diagnosis service.
# Field declarations are kept in their original order.
class StructuredDataDiagnosisResponse(Record):
# Error record (Error from ..core.primitives).
# NOTE(review): presumably populated only when the operation fails — confirm.
error = Error()
operation = String() # The operation that was performed
detected_type = String() # Detected data type (for detect-type/diagnose) - optional
confidence = Double() # Confidence score for type detection - optional
# JSON encoded descriptor (for generate-descriptor/diagnose) - optional
descriptor = String()
# Additional metadata map with string values (e.g., field count, sample records).
# NOTE(review): originally described as "JSON encoded"; the field itself is a
# Map(String()), so presumably each value is a (possibly JSON-encoded) string
# — confirm against the service implementation.
metadata = Map(String())
############################################################################