Structured data 2 (#645)

* Structured data refactor - multi-index tables, remove need for manual mods to the Cassandra tables

* Tech spec updated to track implementation
This commit is contained in:
cybermaggedon 2026-02-23 15:56:29 +00:00 committed by GitHub
parent 5ffad92345
commit 1809c1f56d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
87 changed files with 5233 additions and 3235 deletions

View file

@ -101,7 +101,7 @@ from .exceptions import (
LoadError,
LookupError,
NLPQueryError,
ObjectsQueryError,
RowsQueryError,
RequestError,
StructuredQueryError,
UnexpectedError,
@ -161,7 +161,7 @@ __all__ = [
"LoadError",
"LookupError",
"NLPQueryError",
"ObjectsQueryError",
"RowsQueryError",
"RequestError",
"StructuredQueryError",
"UnexpectedError",

View file

@ -115,15 +115,15 @@ class AsyncBulkClient:
async for raw_message in websocket:
yield json.loads(raw_message)
async def import_objects(self, flow: str, objects: AsyncIterator[Dict[str, Any]], **kwargs: Any) -> None:
"""Bulk import objects via WebSocket"""
ws_url = f"{self.url}/api/v1/flow/{flow}/import/objects"
async def import_rows(self, flow: str, rows: AsyncIterator[Dict[str, Any]], **kwargs: Any) -> None:
"""Bulk import rows via WebSocket"""
ws_url = f"{self.url}/api/v1/flow/{flow}/import/rows"
if self.token:
ws_url = f"{ws_url}?token={self.token}"
async with websockets.connect(ws_url, ping_interval=20, ping_timeout=self.timeout) as websocket:
async for obj in objects:
await websocket.send(json.dumps(obj))
async for row in rows:
await websocket.send(json.dumps(row))
async def aclose(self) -> None:
"""Close connections"""

View file

@ -708,18 +708,18 @@ class AsyncFlowInstance:
return await self.request("triples", request_data)
async def objects_query(self, query: str, user: str, collection: str, variables: Optional[Dict] = None,
operation_name: Optional[str] = None, **kwargs: Any):
async def rows_query(self, query: str, user: str, collection: str, variables: Optional[Dict] = None,
operation_name: Optional[str] = None, **kwargs: Any):
"""
Execute a GraphQL query on stored objects.
Execute a GraphQL query on stored rows.
Queries structured data objects using GraphQL syntax. Supports complex
Queries structured data rows using GraphQL syntax. Supports complex
queries with variables and named operations.
Args:
query: GraphQL query string
user: User identifier
collection: Collection identifier containing objects
collection: Collection identifier containing rows
variables: Optional GraphQL query variables
operation_name: Optional operation name for multi-operation queries
**kwargs: Additional service-specific parameters
@ -743,7 +743,7 @@ class AsyncFlowInstance:
}
'''
result = await flow.objects_query(
result = await flow.rows_query(
query=query,
user="trustgraph",
collection="users",
@ -765,4 +765,4 @@ class AsyncFlowInstance:
request_data["operationName"] = operation_name
request_data.update(kwargs)
return await self.request("objects", request_data)
return await self.request("rows", request_data)

View file

@ -320,9 +320,9 @@ class AsyncSocketFlowInstance:
return await self.client._send_request("triples", self.flow_id, request)
async def objects_query(self, query: str, user: str, collection: str, variables: Optional[Dict] = None,
operation_name: Optional[str] = None, **kwargs):
"""GraphQL query"""
async def rows_query(self, query: str, user: str, collection: str, variables: Optional[Dict] = None,
operation_name: Optional[str] = None, **kwargs):
"""GraphQL query against structured rows"""
request = {
"query": query,
"user": user,
@ -334,7 +334,7 @@ class AsyncSocketFlowInstance:
request["operationName"] = operation_name
request.update(kwargs)
return await self.client._send_request("objects", self.flow_id, request)
return await self.client._send_request("rows", self.flow_id, request)
async def mcp_tool(self, name: str, parameters: Dict[str, Any], **kwargs):
"""Execute MCP tool"""

View file

@ -530,45 +530,45 @@ class BulkClient:
async for raw_message in websocket:
yield json.loads(raw_message)
def import_objects(self, flow: str, objects: Iterator[Dict[str, Any]], **kwargs: Any) -> None:
def import_rows(self, flow: str, rows: Iterator[Dict[str, Any]], **kwargs: Any) -> None:
"""
Bulk import structured objects into a flow.
Bulk import structured rows into a flow.
Efficiently uploads structured data objects via WebSocket streaming
Efficiently uploads structured data rows via WebSocket streaming
for use in GraphQL queries.
Args:
flow: Flow identifier
objects: Iterator yielding object dictionaries
rows: Iterator yielding row dictionaries
**kwargs: Additional parameters (reserved for future use)
Example:
```python
bulk = api.bulk()
# Generate objects to import
def object_generator():
yield {"id": "obj1", "name": "Object 1", "value": 100}
yield {"id": "obj2", "name": "Object 2", "value": 200}
# ... more objects
# Generate rows to import
def row_generator():
yield {"id": "row1", "name": "Row 1", "value": 100}
yield {"id": "row2", "name": "Row 2", "value": 200}
# ... more rows
bulk.import_objects(
bulk.import_rows(
flow="default",
objects=object_generator()
rows=row_generator()
)
```
"""
self._run_async(self._import_objects_async(flow, objects))
self._run_async(self._import_rows_async(flow, rows))
async def _import_objects_async(self, flow: str, objects: Iterator[Dict[str, Any]]) -> None:
"""Async implementation of objects import"""
ws_url = f"{self.url}/api/v1/flow/{flow}/import/objects"
async def _import_rows_async(self, flow: str, rows: Iterator[Dict[str, Any]]) -> None:
"""Async implementation of rows import"""
ws_url = f"{self.url}/api/v1/flow/{flow}/import/rows"
if self.token:
ws_url = f"{ws_url}?token={self.token}"
async with websockets.connect(ws_url, ping_interval=20, ping_timeout=self.timeout) as websocket:
for obj in objects:
await websocket.send(json.dumps(obj))
for row in rows:
await websocket.send(json.dumps(row))
def close(self) -> None:
"""Close connections"""

View file

@ -71,8 +71,8 @@ class NLPQueryError(TrustGraphException):
pass
class ObjectsQueryError(TrustGraphException):
"""Objects query service error"""
class RowsQueryError(TrustGraphException):
"""Rows query service error"""
pass
@ -103,7 +103,7 @@ ERROR_TYPE_MAPPING = {
"load-error": LoadError,
"lookup-error": LookupError,
"nlp-query-error": NLPQueryError,
"objects-query-error": ObjectsQueryError,
"rows-query-error": RowsQueryError,
"request-error": RequestError,
"structured-query-error": StructuredQueryError,
"unexpected-error": UnexpectedError,

View file

@ -1001,12 +1001,12 @@ class FlowInstance:
input
)
def objects_query(
def rows_query(
self, query, user="trustgraph", collection="default",
variables=None, operation_name=None
):
"""
Execute a GraphQL query against structured objects in the knowledge graph.
Execute a GraphQL query against structured rows in the knowledge graph.
Queries structured data using GraphQL syntax, allowing complex queries
with filtering, aggregation, and relationship traversal.
@ -1038,7 +1038,7 @@ class FlowInstance:
}
}
'''
result = flow.objects_query(
result = flow.rows_query(
query=query,
user="trustgraph",
collection="scientists"
@ -1053,7 +1053,7 @@ class FlowInstance:
}
}
'''
result = flow.objects_query(
result = flow.rows_query(
query=query,
variables={"name": "Marie Curie"}
)
@ -1074,7 +1074,7 @@ class FlowInstance:
input["operation_name"] = operation_name
response = self.request(
"service/objects",
"service/rows",
input
)

View file

@ -789,7 +789,7 @@ class SocketFlowInstance:
return self.client._send_request_sync("triples", self.flow_id, request, False)
def objects_query(
def rows_query(
self,
query: str,
user: str,
@ -799,7 +799,7 @@ class SocketFlowInstance:
**kwargs: Any
) -> Dict[str, Any]:
"""
Execute a GraphQL query against structured objects.
Execute a GraphQL query against structured rows.
Args:
query: GraphQL query string
@ -826,7 +826,7 @@ class SocketFlowInstance:
}
}
'''
result = flow.objects_query(
result = flow.rows_query(
query=query,
user="trustgraph",
collection="scientists"
@ -844,7 +844,7 @@ class SocketFlowInstance:
request["operationName"] = operation_name
request.update(kwargs)
return self.client._send_request_sync("objects", self.flow_id, request, False)
return self.client._send_request_sync("rows", self.flow_id, request, False)
def mcp_tool(
self,

View file

@ -21,7 +21,7 @@ from .translators.embeddings_query import (
DocumentEmbeddingsRequestTranslator, DocumentEmbeddingsResponseTranslator,
GraphEmbeddingsRequestTranslator, GraphEmbeddingsResponseTranslator
)
from .translators.objects_query import ObjectsQueryRequestTranslator, ObjectsQueryResponseTranslator
from .translators.rows_query import RowsQueryRequestTranslator, RowsQueryResponseTranslator
from .translators.nlp_query import QuestionToStructuredQueryRequestTranslator, QuestionToStructuredQueryResponseTranslator
from .translators.structured_query import StructuredQueryRequestTranslator, StructuredQueryResponseTranslator
from .translators.diagnosis import StructuredDataDiagnosisRequestTranslator, StructuredDataDiagnosisResponseTranslator
@ -113,9 +113,9 @@ TranslatorRegistry.register_service(
)
TranslatorRegistry.register_service(
"objects-query",
ObjectsQueryRequestTranslator(),
ObjectsQueryResponseTranslator()
"rows-query",
RowsQueryRequestTranslator(),
RowsQueryResponseTranslator()
)
TranslatorRegistry.register_service(

View file

@ -17,5 +17,5 @@ from .embeddings_query import (
DocumentEmbeddingsRequestTranslator, DocumentEmbeddingsResponseTranslator,
GraphEmbeddingsRequestTranslator, GraphEmbeddingsResponseTranslator
)
from .objects_query import ObjectsQueryRequestTranslator, ObjectsQueryResponseTranslator
from .rows_query import RowsQueryRequestTranslator, RowsQueryResponseTranslator
from .diagnosis import StructuredDataDiagnosisRequestTranslator, StructuredDataDiagnosisResponseTranslator

View file

@ -1,44 +1,44 @@
from typing import Dict, Any, Tuple, Optional
from ...schema import ObjectsQueryRequest, ObjectsQueryResponse
from ...schema import RowsQueryRequest, RowsQueryResponse
from .base import MessageTranslator
import json
class ObjectsQueryRequestTranslator(MessageTranslator):
"""Translator for ObjectsQueryRequest schema objects"""
def to_pulsar(self, data: Dict[str, Any]) -> ObjectsQueryRequest:
return ObjectsQueryRequest(
class RowsQueryRequestTranslator(MessageTranslator):
    """Translator for RowsQueryRequest schema objects.

    Converts between the plain-dict form of a rows (GraphQL) query request
    and the RowsQueryRequest schema object.
    """

    def to_pulsar(self, data: Dict[str, Any]) -> RowsQueryRequest:
        """Build a RowsQueryRequest from a request dictionary.

        Missing keys fall back to defaults: user "trustgraph", collection
        "default", an empty query string, empty variables and no operation
        name.

        The operation name is accepted under either "operation_name" or the
        GraphQL-style "operationName" key, because callers use both
        spellings; "operation_name" wins when both are present.

        Args:
            data: Request dictionary.

        Returns:
            The populated RowsQueryRequest.
        """
        operation_name = data.get("operation_name")
        if operation_name is None:
            # Fall back to the camelCase spelling so the value is not
            # silently dropped for callers that send "operationName".
            operation_name = data.get("operationName")

        return RowsQueryRequest(
            user=data.get("user", "trustgraph"),
            collection=data.get("collection", "default"),
            query=data.get("query", ""),
            variables=data.get("variables", {}),
            operation_name=operation_name,
        )

    def from_pulsar(self, obj: RowsQueryRequest) -> Dict[str, Any]:
        """Convert a RowsQueryRequest back into a request dictionary.

        "operation_name" is only included when the request carries a
        non-empty operation name; "variables" is always present and is a
        fresh dict (empty when the request has none).

        Args:
            obj: The schema object to convert.

        Returns:
            Dictionary with "user", "collection", "query", "variables" and,
            optionally, "operation_name".
        """
        result = {
            "user": obj.user,
            "collection": obj.collection,
            "query": obj.query,
            "variables": dict(obj.variables) if obj.variables else {},
        }

        if obj.operation_name:
            result["operation_name"] = obj.operation_name

        return result
class ObjectsQueryResponseTranslator(MessageTranslator):
"""Translator for ObjectsQueryResponse schema objects"""
def to_pulsar(self, data: Dict[str, Any]) -> ObjectsQueryResponse:
class RowsQueryResponseTranslator(MessageTranslator):
"""Translator for RowsQueryResponse schema objects"""
def to_pulsar(self, data: Dict[str, Any]) -> RowsQueryResponse:
raise NotImplementedError("Response translation to Pulsar not typically needed")
def from_pulsar(self, obj: ObjectsQueryResponse) -> Dict[str, Any]:
def from_pulsar(self, obj: RowsQueryResponse) -> Dict[str, Any]:
result = {}
# Handle GraphQL response data
if obj.data:
try:
@ -47,7 +47,7 @@ class ObjectsQueryResponseTranslator(MessageTranslator):
result["data"] = obj.data
else:
result["data"] = None
# Handle GraphQL errors
if obj.errors:
result["errors"] = []
@ -60,20 +60,20 @@ class ObjectsQueryResponseTranslator(MessageTranslator):
if error.extensions:
error_dict["extensions"] = dict(error.extensions)
result["errors"].append(error_dict)
# Handle extensions
if obj.extensions:
result["extensions"] = dict(obj.extensions)
# Handle system-level error
if obj.error:
result["error"] = {
"type": obj.error.type,
"message": obj.error.message
}
return result
def from_response_with_completion(self, obj: ObjectsQueryResponse) -> Tuple[Dict[str, Any], bool]:
def from_response_with_completion(self, obj: RowsQueryResponse) -> Tuple[Dict[str, Any], bool]:
"""Returns (response_dict, is_final)"""
return self.from_pulsar(obj), True
return self.from_pulsar(obj), True

View file

@ -60,3 +60,23 @@ class StructuredObjectEmbedding:
field_embeddings: dict[str, list[float]] = field(default_factory=dict) # Per-field embeddings
############################################################################
# Row embeddings are embeddings associated with indexed field values
# in structured row data. Each index gets embedded separately.
@dataclass
class RowIndexEmbedding:
    """Single row's embedding for one index.

    Every field has a default, so an instance can be built with no
    arguments. List fields use ``default_factory`` so each instance gets
    its own list rather than sharing one.
    """
    index_name: str = ""  # The indexed field name(s)
    index_value: list[str] = field(default_factory=list)  # The field value(s)
    text: str = ""  # Text that was embedded
    # NOTE(review): presumably the embedding vector(s) produced for `text`
    # -- confirm against the producer.
    vectors: list[list[float]] = field(default_factory=list)
@dataclass
class RowEmbeddings:
    """Batched row embeddings for a schema.

    Groups a list of RowIndexEmbedding entries under one schema name,
    together with optional message metadata. All fields have defaults.
    """
    metadata: Metadata | None = None  # Optional metadata; None when absent
    schema_name: str = ""  # Name of the schema the embeddings belong to
    # Each instance gets its own list via default_factory.
    embeddings: list[RowIndexEmbedding] = field(default_factory=list)
############################################################################

View file

@ -9,7 +9,7 @@ from .library import *
from .lookup import *
from .nlp_query import *
from .structured_query import *
from .objects_query import *
from .rows_query import *
from .diagnosis import *
from .collection import *
from .storage import *

View file

@ -59,4 +59,39 @@ document_embeddings_request_queue = topic(
)
document_embeddings_response_queue = topic(
"document-embeddings-response", qos='q0', tenant='trustgraph', namespace='flow'
)
############################################################################
# Row embeddings query - for semantic/fuzzy matching on row index values
@dataclass
class RowIndexMatch:
    """A single matching row index from a semantic search.

    Every field has a default, so an instance can be built with no
    arguments; the list field uses ``default_factory`` so instances do not
    share a list.
    """
    index_name: str = ""  # The indexed field(s)
    index_value: list[str] = field(default_factory=list)  # The index values
    text: str = ""  # The text that was embedded
    score: float = 0.0  # Similarity score
@dataclass
class RowEmbeddingsRequest:
"""Request for row embeddings semantic search"""
vectors: list[list[float]] = field(default_factory=list) # Query vectors
limit: int = 10 # Max results to return
user: str = "" # User/keyspace
collection: str = "" # Collection name
schema_name: str = "" # Schema name to search within
index_name: str | None = None # Optional: filter to specific index
@dataclass
class RowEmbeddingsResponse:
    """Response from row embeddings semantic search.

    Carries either an error or a list of matches; both fields have
    defaults (no error, empty match list).
    """
    error: Error | None = None  # None when no error is reported
    # Each instance gets its own list via default_factory.
    matches: list[RowIndexMatch] = field(default_factory=list)
# Request and response queue definitions for the row-embeddings query,
# built with topic(). Both use qos 'q0' under the 'trustgraph' tenant and
# the 'flow' namespace.
# NOTE(review): presumably these carry RowEmbeddingsRequest and
# RowEmbeddingsResponse messages respectively -- confirm against the
# service wiring.
row_embeddings_request_queue = topic(
"row-embeddings-request", qos='q0', tenant='trustgraph', namespace='flow'
)
row_embeddings_response_queue = topic(
"row-embeddings-response", qos='q0', tenant='trustgraph', namespace='flow'
)

View file

@ -6,7 +6,7 @@ from ..core.topic import topic
############################################################################
# Objects Query Service - executes GraphQL queries against structured data
# Rows Query Service - executes GraphQL queries against structured data
@dataclass
class GraphQLError:
@ -15,7 +15,7 @@ class GraphQLError:
extensions: dict[str, str] = field(default_factory=dict) # Additional error metadata
@dataclass
class ObjectsQueryRequest:
class RowsQueryRequest:
user: str = "" # Cassandra keyspace (follows pattern from TriplesQueryRequest)
collection: str = "" # Data collection identifier (required for partition key)
query: str = "" # GraphQL query string
@ -23,7 +23,7 @@ class ObjectsQueryRequest:
operation_name: Optional[str] = None # Operation to execute for multi-operation documents
@dataclass
class ObjectsQueryResponse:
class RowsQueryResponse:
error: Error | None = None # System-level error (connection, timeout, etc.)
data: str = "" # JSON-encoded GraphQL response data
errors: list[GraphQLError] = field(default_factory=list) # GraphQL field-level errors