mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-26 00:46:22 +02:00
Structure data mvp (#452)
* Structured data tech spec * Architecture principles * New schemas * Updated schemas and specs * Object extractor * Add .coveragerc * New tests * Cassandra object storage * Trying to get object extraction working; issues still exist
This commit is contained in:
parent
5de56c5dbc
commit
83f0c1e7f3
46 changed files with 5313 additions and 1629 deletions
|
|
@ -40,6 +40,13 @@ class PromptClient(RequestResponse):
|
|||
timeout = timeout,
|
||||
)
|
||||
|
||||
async def extract_objects(self, text, schema, timeout=600):
    """Run the "extract-rows" prompt over ``text`` with ``schema``.

    ``text`` and ``schema`` are passed to ``self.prompt`` as the prompt
    variables, ``timeout`` is forwarded unchanged, and whatever
    ``self.prompt`` produces is returned as-is.
    """
    prompt_variables = {"text": text, "schema": schema}
    response = await self.prompt(
        id="extract-rows",
        variables=prompt_variables,
        timeout=timeout,
    )
    return response
|
||||
|
||||
async def kg_prompt(self, query, kg, timeout=600):
|
||||
return await self.prompt(
|
||||
id = "kg-prompt",
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
from .base import Translator, MessageTranslator
|
||||
from .primitives import ValueTranslator, TripleTranslator, SubgraphTranslator
|
||||
from .primitives import ValueTranslator, TripleTranslator, SubgraphTranslator, RowSchemaTranslator, FieldTranslator, row_schema_translator, field_translator
|
||||
from .metadata import DocumentMetadataTranslator, ProcessingMetadataTranslator
|
||||
from .agent import AgentRequestTranslator, AgentResponseTranslator
|
||||
from .embeddings import EmbeddingsRequestTranslator, EmbeddingsResponseTranslator
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
from typing import Dict, Any, List
|
||||
from ...schema import Value, Triple
|
||||
from ...schema import Value, Triple, RowSchema, Field
|
||||
from .base import Translator
|
||||
|
||||
|
||||
|
|
@ -44,4 +44,97 @@ class SubgraphTranslator(Translator):
|
|||
return [self.triple_translator.to_pulsar(t) for t in data]
|
||||
|
||||
def from_pulsar(self, obj: List[Triple]) -> List[Dict[str, Any]]:
    """Convert a list of Triple Pulsar objects to a list of dictionaries.

    Each element is converted with ``self.triple_translator.from_pulsar``;
    the order of ``obj`` is preserved in the result.
    """
    return [self.triple_translator.from_pulsar(t) for t in obj]
|
||||
|
||||
|
||||
class RowSchemaTranslator(Translator):
    """Translator for RowSchema objects.

    Per-field conversion is delegated to the module-level
    ``field_translator`` so the dict <-> Field mapping lives in exactly one
    place (FieldTranslator) instead of being repeated here.
    """

    def to_pulsar(self, data: Dict[str, Any]) -> RowSchema:
        """Convert dict to RowSchema Pulsar object.

        Missing keys fall back to defaults: ``""`` for ``name`` and
        ``description``, and an empty field list when ``fields`` is absent
        or None. Each entry of ``fields`` is a dict in the form accepted by
        ``FieldTranslator.to_pulsar``.
        """
        # ``field_translator`` is assigned further down in this module; the
        # name is only looked up when this method runs, by which time the
        # module has finished importing.
        return RowSchema(
            name=data.get("name", ""),
            description=data.get("description", ""),
            # ``or []`` also covers an explicit ``"fields": None``, which
            # would otherwise raise TypeError when iterated.
            fields=[
                field_translator.to_pulsar(field_data)
                for field_data in (data.get("fields") or [])
            ],
        )

    def from_pulsar(self, obj: RowSchema) -> Dict[str, Any]:
        """Convert RowSchema Pulsar object to JSON-serializable dictionary.

        The result has the keys ``name``, ``description`` and ``fields``;
        each field entry is produced by ``FieldTranslator.from_pulsar``, so
        ``enum_values`` only appears on fields where it is non-empty.
        """
        return {
            "name": obj.name,
            "description": obj.description,
            # Guard against ``obj.fields`` being None rather than a list.
            # NOTE(review): confirm whether Pulsar can actually deliver None
            # here for a record built without fields.
            "fields": [
                field_translator.from_pulsar(field)
                for field in (obj.fields or [])
            ],
        }
|
||||
|
||||
|
||||
class FieldTranslator(Translator):
    """Translates between plain dictionaries and Field Pulsar objects."""

    def to_pulsar(self, data: Dict[str, Any]) -> Field:
        """Build a Field from ``data``, substituting a default for any missing key.

        Defaults: ``""`` for name/description, ``"string"`` for type, ``0``
        for size, ``False`` for primary/required/indexed and ``[]`` for
        enum_values.
        """
        lookup = data.get
        return Field(
            name=lookup("name", ""),
            type=lookup("type", "string"),
            size=lookup("size", 0),
            primary=lookup("primary", False),
            description=lookup("description", ""),
            required=lookup("required", False),
            indexed=lookup("indexed", False),
            enum_values=lookup("enum_values", []),
        )

    def from_pulsar(self, obj: Field) -> Dict[str, Any]:
        """Return a JSON-serializable dictionary describing ``obj``.

        ``enum_values`` is included (copied into a list) only when it is
        non-empty; the other attributes are always present.
        """
        always_present = (
            "name",
            "type",
            "size",
            "primary",
            "description",
            "required",
            "indexed",
        )
        field_dict = {attr: getattr(obj, attr) for attr in always_present}

        # Empty or unset enum_values is left out of the output entirely.
        enum_values = obj.enum_values
        if enum_values:
            field_dict["enum_values"] = list(enum_values)

        return field_dict
|
||||
|
||||
|
||||
# Shared module-level instances, created once when this module is imported,
# so callers can use the translators without constructing their own.
row_schema_translator = RowSchemaTranslator()
field_translator = FieldTranslator()
|
||||
|
|
@ -17,11 +17,15 @@ class Triple(Record):
|
|||
|
||||
class Field(Record):
    # Definition of a single field within a schema.
    name = String()
    # Type name, one of: int, string, long, bool, float, double, timestamp
    type = String()
    size = Integer()
    # NOTE(review): presumably marks the field as (part of) the primary
    # key - confirm against the storage code.
    primary = Boolean()
    description = String()
    # Fields added for structured data:
    required = Boolean()  # Whether field is required
    enum_values = Array(String())  # For enum type fields
    indexed = Boolean()  # Whether field should be indexed
|
||||
|
||||
class RowSchema(Record):
|
||||
name = String()
|
||||
|
|
|
|||
|
|
@ -3,4 +3,6 @@ from .document import *
|
|||
from .embeddings import *
|
||||
from .knowledge import *
|
||||
from .nlp import *
|
||||
from .rows import *
|
||||
from .rows import *
|
||||
from .structured import *
|
||||
from .object import *
|
||||
|
|
|
|||
|
|
@ -40,4 +40,17 @@ class ObjectEmbeddings(Record):
|
|||
vectors = Array(Array(Double()))
|
||||
name = String()
|
||||
key_name = String()
|
||||
id = String()
|
||||
id = String()
|
||||
|
||||
############################################################################
|
||||
|
||||
# Structured object embeddings with enhanced capabilities
|
||||
|
||||
class StructuredObjectEmbedding(Record):
    # Embeddings for one structured object, with optional per-field vectors.
    metadata = Metadata()
    vectors = Array(Array(Double()))  # List of vectors, each a list of doubles
    # NOTE(review): presumably the name of the schema the object belongs
    # to - confirm against the producer.
    schema_name = String()
    object_id = String()  # Primary key value
    field_embeddings = Map(Array(Double()))  # Per-field embeddings
|
||||
|
||||
############################################################################
|
||||
17
trustgraph-base/trustgraph/schema/knowledge/object.py
Normal file
17
trustgraph-base/trustgraph/schema/knowledge/object.py
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
from pulsar.schema import Record, String, Map, Double
|
||||
|
||||
from ..core.metadata import Metadata
|
||||
from ..core.topic import topic
|
||||
|
||||
############################################################################
|
||||
|
||||
# Extracted object from text processing
|
||||
|
||||
class ExtractedObject(Record):
    # One object extracted from text processing.
    metadata = Metadata()
    schema_name = String()  # Which schema this object belongs to
    values = Map(String())  # Field name -> value (values are strings)
    # NOTE(review): range/scale of the confidence score is not defined
    # here - confirm with the extractor that populates it.
    confidence = Double()
    source_span = String()  # Text span where object was found
|
||||
|
||||
############################################################################
|
||||
17
trustgraph-base/trustgraph/schema/knowledge/structured.py
Normal file
17
trustgraph-base/trustgraph/schema/knowledge/structured.py
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
from pulsar.schema import Record, String, Bytes, Map
|
||||
|
||||
from ..core.metadata import Metadata
|
||||
from ..core.topic import topic
|
||||
|
||||
############################################################################
|
||||
|
||||
# Structured data submission for fire-and-forget processing
|
||||
|
||||
class StructuredDataSubmission(Record):
    # Structured data submitted for fire-and-forget processing.
    metadata = Metadata()
    format = String()  # "json", "csv", "xml"
    schema_name = String()  # Reference to schema in config
    data = Bytes()  # Raw data to ingest
    options = Map(String())  # Format-specific options (string values)
|
||||
|
||||
############################################################################
|
||||
|
|
@ -6,4 +6,6 @@ from .flow import *
|
|||
from .prompt import *
|
||||
from .config import *
|
||||
from .library import *
|
||||
from .lookup import *
|
||||
from .lookup import *
|
||||
from .nlp_query import *
|
||||
from .structured_query import *
|
||||
22
trustgraph-base/trustgraph/schema/services/nlp_query.py
Normal file
22
trustgraph-base/trustgraph/schema/services/nlp_query.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
from pulsar.schema import Record, String, Array, Map, Integer, Double
|
||||
|
||||
from ..core.primitives import Error
|
||||
from ..core.topic import topic
|
||||
|
||||
############################################################################
|
||||
|
||||
# NLP to Structured Query Service - converts natural language to GraphQL
|
||||
|
||||
class NLPToStructuredQueryRequest(Record):
    # Request to convert a natural-language query into a GraphQL query.
    natural_language_query = String()
    # NOTE(review): presumably caps the number of results the generated
    # query should return - confirm with the service implementation.
    max_results = Integer()
    context_hints = Map(String())  # Optional context for query generation
|
||||
|
||||
class NLPToStructuredQueryResponse(Record):
    # Result of converting a natural-language query into GraphQL.
    error = Error()
    graphql_query = String()  # Generated GraphQL query
    variables = Map(String())  # GraphQL variables if any
    detected_schemas = Array(String())  # Which schemas the query targets
    # NOTE(review): range/scale of the confidence score is not defined
    # here - confirm with the service that populates it.
    confidence = Double()
|
||||
|
||||
############################################################################
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
from pulsar.schema import Record, String, Map, Array
|
||||
|
||||
from ..core.primitives import Error
|
||||
from ..core.topic import topic
|
||||
|
||||
############################################################################
|
||||
|
||||
# Structured Query Service - executes GraphQL queries
|
||||
|
||||
class StructuredQueryRequest(Record):
    # Request to execute a GraphQL query against the structured query service.
    query = String()  # GraphQL query
    variables = Map(String())  # GraphQL variables (string values)
    operation_name = String()  # Optional operation name for multi-operation documents
|
||||
|
||||
class StructuredQueryResponse(Record):
    # Result of executing a GraphQL query.
    error = Error()
    data = String()  # JSON-encoded GraphQL response data
    errors = Array(String())  # GraphQL errors if any
|
||||
|
||||
############################################################################
|
||||
Loading…
Add table
Add a link
Reference in a new issue