Structured data MVP (#452)

* Structured data tech spec

* Architecture principles

* New schemas

* Updated schemas and specs

* Object extractor

* Add .coveragerc

* New tests

* Cassandra object storage

* Trying to get object extraction working; issues still exist
This commit is contained in:
cybermaggedon 2025-08-07 20:47:20 +01:00 committed by GitHub
parent 5de56c5dbc
commit 83f0c1e7f3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
46 changed files with 5313 additions and 1629 deletions

View file

@ -40,6 +40,13 @@ class PromptClient(RequestResponse):
timeout = timeout,
)
async def extract_objects(self, text, schema, timeout=600):
    """Run the "extract-rows" prompt over *text* for the given *schema*.

    Args:
        text: Source text handed to the prompt as the "text" variable.
        schema: Schema description handed to the prompt as the "schema"
            variable.
        timeout: Forwarded unchanged to ``self.prompt``.

    Returns:
        Whatever ``self.prompt`` returns for this request.
    """
    prompt_variables = {"text": text, "schema": schema}
    response = await self.prompt(
        id="extract-rows",
        variables=prompt_variables,
        timeout=timeout,
    )
    return response
async def kg_prompt(self, query, kg, timeout=600):
return await self.prompt(
id = "kg-prompt",

View file

@ -1,5 +1,5 @@
from .base import Translator, MessageTranslator
from .primitives import ValueTranslator, TripleTranslator, SubgraphTranslator
from .primitives import ValueTranslator, TripleTranslator, SubgraphTranslator, RowSchemaTranslator, FieldTranslator, row_schema_translator, field_translator
from .metadata import DocumentMetadataTranslator, ProcessingMetadataTranslator
from .agent import AgentRequestTranslator, AgentResponseTranslator
from .embeddings import EmbeddingsRequestTranslator, EmbeddingsResponseTranslator

View file

@ -1,5 +1,5 @@
from typing import Dict, Any, List
from ...schema import Value, Triple
from ...schema import Value, Triple, RowSchema, Field
from .base import Translator
@ -44,4 +44,97 @@ class SubgraphTranslator(Translator):
return [self.triple_translator.to_pulsar(t) for t in data]
def from_pulsar(self, obj: List[Triple]) -> List[Dict[str, Any]]:
    """Convert a list of Triple records to a list of dictionaries.

    Each element is converted with ``self.triple_translator.from_pulsar``;
    an empty input yields an empty list.
    """
    # A second, identical return statement used to follow this one; it was
    # unreachable and has been dropped.
    return [self.triple_translator.from_pulsar(t) for t in obj]
class RowSchemaTranslator(Translator):
    """Translator for RowSchema objects.

    Per-field conversion is delegated to FieldTranslator so the mapping
    between a field dictionary and a Field record is defined in one place.
    """

    def to_pulsar(self, data: Dict[str, Any]) -> RowSchema:
        """Convert dict to RowSchema Pulsar object.

        Args:
            data: Mapping with optional "name", "description" and "fields"
                keys. "fields" is a list of field dictionaries in the form
                accepted by ``FieldTranslator.to_pulsar``.

        Returns:
            A RowSchema. A missing "name" or "description" defaults to ""
            and a missing "fields" to an empty list.
        """
        field_translator = FieldTranslator()
        fields = [
            field_translator.to_pulsar(field_data)
            for field_data in data.get("fields", [])
        ]
        return RowSchema(
            name=data.get("name", ""),
            description=data.get("description", ""),
            fields=fields,
        )

    def from_pulsar(self, obj: RowSchema) -> Dict[str, Any]:
        """Convert RowSchema Pulsar object to JSON-serializable dictionary.

        Returns:
            A dict with "name", "description" and "fields"; each entry of
            "fields" is the dictionary produced by
            ``FieldTranslator.from_pulsar`` (so "enum_values" appears only
            when the field has a non-empty list of them).
        """
        field_translator = FieldTranslator()
        return {
            "name": obj.name,
            "description": obj.description,
            "fields": [
                field_translator.from_pulsar(field) for field in obj.fields
            ],
        }
class FieldTranslator(Translator):
    """Converts a single Field between dictionary form and Pulsar record."""

    def to_pulsar(self, data: Dict[str, Any]) -> Field:
        """Build a Field record from *data*, defaulting any absent key.

        Defaults: "" for name and description, "string" for type, 0 for
        size, False for primary/required/indexed, [] for enum_values.
        """
        lookup = data.get
        return Field(
            name=lookup("name", ""),
            type=lookup("type", "string"),
            size=lookup("size", 0),
            primary=lookup("primary", False),
            description=lookup("description", ""),
            required=lookup("required", False),
            indexed=lookup("indexed", False),
            enum_values=lookup("enum_values", []),
        )

    def from_pulsar(self, obj: Field) -> Dict[str, Any]:
        """Render a Field record as a JSON-serializable dictionary.

        The "enum_values" key is emitted only when the record carries a
        non-empty list of enum values.
        """
        always_present = (
            "name",
            "type",
            "size",
            "primary",
            "description",
            "required",
            "indexed",
        )
        payload = {attr: getattr(obj, attr) for attr in always_present}

        # Copy into a plain list so the result does not alias the record.
        if obj.enum_values:
            payload["enum_values"] = list(obj.enum_values)

        return payload
# Shared module-level instances for easy access, so callers can use the
# translators without constructing their own.
row_schema_translator = RowSchemaTranslator()
field_translator = FieldTranslator()

View file

@ -17,11 +17,15 @@ class Triple(Record):
class Field(Record):
# Describes a single field of a row schema.
# Field name
name = String()
# Type name, one of: int, string, long, bool, float, double, timestamp
type = String()
# NOTE(review): meaning/units of size are not visible here -- confirm
size = Integer()
# presumably marks the field as (part of) the primary key -- TODO confirm
primary = Boolean()
# Human-readable description of the field
description = String()
# Fields added for structured data:
required = Boolean() # Whether field is required
enum_values = Array(String()) # Allowed values, for enum type fields
indexed = Boolean() # Whether field should be indexed
class RowSchema(Record):
name = String()

View file

@ -3,4 +3,6 @@ from .document import *
from .embeddings import *
from .knowledge import *
from .nlp import *
from .rows import *
from .rows import *
from .structured import *
from .object import *

View file

@ -40,4 +40,17 @@ class ObjectEmbeddings(Record):
vectors = Array(Array(Double()))
name = String()
key_name = String()
id = String()
id = String()
############################################################################
# Structured object embeddings: vectors for an object of a named schema,
# plus a map of embeddings for individual fields.
# NOTE(review): this class uses Map; confirm it is imported at the top of
# this module (the import block is not visible in this hunk).
class StructuredObjectEmbedding(Record):
metadata = Metadata()
vectors = Array(Array(Double())) # List of embedding vectors
schema_name = String() # Name of the schema the object belongs to
object_id = String() # Primary key value
field_embeddings = Map(Array(Double())) # Per-field embeddings; presumably keyed by field name -- confirm
############################################################################

View file

@ -0,0 +1,17 @@
from pulsar.schema import Record, String, Map, Double
from ..core.metadata import Metadata
from ..core.topic import topic
############################################################################
# Extracted object from text processing: one set of field values belonging
# to a named schema, together with the text span it was found in.
class ExtractedObject(Record):
metadata = Metadata()
schema_name = String() # Which schema this object belongs to
values = Map(String()) # Field name -> value (values are carried as strings)
confidence = Double() # Extraction confidence; presumably 0.0-1.0 -- TODO confirm range
source_span = String() # Text span where object was found
############################################################################

View file

@ -0,0 +1,17 @@
from pulsar.schema import Record, String, Bytes, Map
from ..core.metadata import Metadata
from ..core.topic import topic
############################################################################
# Structured data submission for fire-and-forget processing: raw data in a
# declared format, to be ingested against a named schema.
class StructuredDataSubmission(Record):
metadata = Metadata()
format = String() # "json", "csv", "xml"
schema_name = String() # Reference to schema in config
data = Bytes() # Raw data to ingest
options = Map(String()) # Format-specific options (string keys to string values)
############################################################################

View file

@ -6,4 +6,6 @@ from .flow import *
from .prompt import *
from .config import *
from .library import *
from .lookup import *
from .lookup import *
from .nlp_query import *
from .structured_query import *

View file

@ -0,0 +1,22 @@
from pulsar.schema import Record, String, Array, Map, Integer, Double
from ..core.primitives import Error
from ..core.topic import topic
############################################################################
# NLP to Structured Query Service - converts natural language to GraphQL

# Request: the natural-language question to convert.
class NLPToStructuredQueryRequest(Record):
natural_language_query = String() # Question text to convert
max_results = Integer() # NOTE(review): presumably a cap on returned results; behavior when unset is not visible here
context_hints = Map(String()) # Optional context for query generation
class NLPToStructuredQueryResponse(Record):
# Response of the natural-language-to-GraphQL conversion.
error = Error() # Error record; presumably unset on success -- confirm
graphql_query = String() # Generated GraphQL query
variables = Map(String()) # GraphQL variables if any
detected_schemas = Array(String()) # Which schemas the query targets
confidence = Double() # Confidence in the generated query; range not visible here -- TODO confirm
############################################################################

View file

@ -0,0 +1,20 @@
from pulsar.schema import Record, String, Map, Array
from ..core.primitives import Error
from ..core.topic import topic
############################################################################
# Structured Query Service - executes GraphQL queries

# Request: a GraphQL document plus its variables.
class StructuredQueryRequest(Record):
query = String() # GraphQL query
variables = Map(String()) # GraphQL variables (string keys to string values)
operation_name = String() # Optional operation name for multi-operation documents
class StructuredQueryResponse(Record):
# Response of a GraphQL query execution.
error = Error() # Error record; presumably unset on success -- confirm
data = String() # JSON-encoded GraphQL response data
errors = Array(String()) # GraphQL errors if any
############################################################################