Structured data MVP (#452)

* Structured data tech spec

* Architecture principles

* New schemas

* Updated schemas and specs

* Object extractor

* Add .coveragerc

* New tests

* Cassandra object storage

* Trying to get object extraction working; issues still exist
This commit is contained in:
cybermaggedon 2025-08-07 20:47:20 +01:00 committed by GitHub
parent 5de56c5dbc
commit 83f0c1e7f3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
46 changed files with 5313 additions and 1629 deletions

View file

@ -1,5 +1,5 @@
from .base import Translator, MessageTranslator
from .primitives import ValueTranslator, TripleTranslator, SubgraphTranslator
from .primitives import ValueTranslator, TripleTranslator, SubgraphTranslator, RowSchemaTranslator, FieldTranslator, row_schema_translator, field_translator
from .metadata import DocumentMetadataTranslator, ProcessingMetadataTranslator
from .agent import AgentRequestTranslator, AgentResponseTranslator
from .embeddings import EmbeddingsRequestTranslator, EmbeddingsResponseTranslator

View file

@ -1,5 +1,5 @@
from typing import Dict, Any, List
from ...schema import Value, Triple
from ...schema import Value, Triple, RowSchema, Field
from .base import Translator
@ -44,4 +44,97 @@ class SubgraphTranslator(Translator):
return [self.triple_translator.to_pulsar(t) for t in data]
def from_pulsar(self, obj: List[Triple]) -> List[Dict[str, Any]]:
return [self.triple_translator.from_pulsar(t) for t in obj]
return [self.triple_translator.from_pulsar(t) for t in obj]
class RowSchemaTranslator(Translator):
    """Translate row schemas between plain dictionaries and RowSchema objects."""

    @staticmethod
    def _build_field(spec: Dict[str, Any]) -> Field:
        """Create a Field from one field dictionary, defaulting absent keys."""
        return Field(
            name=spec.get("name", ""),
            type=spec.get("type", "string"),
            size=spec.get("size", 0),
            primary=spec.get("primary", False),
            description=spec.get("description", ""),
            required=spec.get("required", False),
            indexed=spec.get("indexed", False),
            enum_values=spec.get("enum_values", []),
        )

    @staticmethod
    def _field_as_dict(field: Field) -> Dict[str, Any]:
        """Flatten one Field into a dictionary of its attributes."""
        entry = {
            "name": field.name,
            "type": field.type,
            "size": field.size,
            "primary": field.primary,
            "description": field.description,
            "required": field.required,
            "indexed": field.indexed,
        }

        # "enum_values" is only emitted when the field carries a non-empty
        # collection; it is copied into a plain list.
        if field.enum_values:
            entry["enum_values"] = list(field.enum_values)

        return entry

    def to_pulsar(self, data: Dict[str, Any]) -> RowSchema:
        """Build a RowSchema from a dictionary.

        Args:
            data: Dictionary that may hold "name", "description" and a
                "fields" list of per-field dictionaries. Absent keys fall
                back to empty strings / an empty field list.

        Returns:
            A RowSchema whose fields are Field objects created from the
            entries of ``data["fields"]``.
        """
        converted = [
            self._build_field(spec) for spec in data.get("fields", [])
        ]

        return RowSchema(
            name=data.get("name", ""),
            description=data.get("description", ""),
            fields=converted,
        )

    def from_pulsar(self, obj: RowSchema) -> Dict[str, Any]:
        """Convert a RowSchema into a dictionary.

        Args:
            obj: The RowSchema to convert.

        Returns:
            A dictionary with "name", "description" and "fields", where
            "fields" is a list holding one dictionary per schema field.
        """
        return {
            "name": obj.name,
            "description": obj.description,
            "fields": [self._field_as_dict(field) for field in obj.fields],
        }
class FieldTranslator(Translator):
    """Translate a single field between a plain dictionary and a Field object."""

    def to_pulsar(self, data: Dict[str, Any]) -> Field:
        """Build a Field from a dictionary.

        Args:
            data: Dictionary of field attributes. Absent keys fall back to
                defaults: empty strings for "name" and "description",
                "string" for "type", 0 for "size", False for the boolean
                flags and an empty list for "enum_values".

        Returns:
            The constructed Field.
        """
        lookup = data.get
        return Field(
            name=lookup("name", ""),
            type=lookup("type", "string"),
            size=lookup("size", 0),
            primary=lookup("primary", False),
            description=lookup("description", ""),
            required=lookup("required", False),
            indexed=lookup("indexed", False),
            enum_values=lookup("enum_values", []),
        )

    def from_pulsar(self, obj: Field) -> Dict[str, Any]:
        """Convert a Field into a dictionary.

        Args:
            obj: The Field to convert.

        Returns:
            A dictionary of the field's attributes. "enum_values" is present
            only when the field has a non-empty collection of values.
        """
        serialized = {
            "name": obj.name,
            "type": obj.type,
            "size": obj.size,
            "primary": obj.primary,
            "description": obj.description,
            "required": obj.required,
            "indexed": obj.indexed,
        }

        # Copy the enum values into a plain list, skipping the key entirely
        # when there are none.
        if obj.enum_values:
            serialized["enum_values"] = list(obj.enum_values)

        return serialized
# Module-level translator instances, created once at import time so callers
# can use them directly instead of constructing their own.
row_schema_translator = RowSchemaTranslator()
field_translator = FieldTranslator()