mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-06-12 08:15:14 +02:00
Introduces `workspace` as the isolation boundary for config, flows,
library, and knowledge data. Removes `user` as a schema-level field
throughout the code, API specs, and tests; workspace provides the
same separation more cleanly at the trusted flow.workspace layer
rather than through client-supplied message fields.
Design
------
- IAM tech spec (docs/tech-specs/iam.md) documents current state,
proposed auth/access model, and migration direction.
- Data ownership model (docs/tech-specs/data-ownership-model.md)
captures the workspace/collection/flow hierarchy.
Schema + messaging
------------------
- Drop `user` field from AgentRequest/Step, GraphRagQuery,
DocumentRagQuery, Triples/Graph/Document/Row EmbeddingsRequest,
Sparql/Rows/Structured QueryRequest, ToolServiceRequest.
- Keep collection/workspace routing via flow.workspace at the
service layer.
- Translators updated to not serialise/deserialise user.
API specs
---------
- OpenAPI schemas and path examples cleaned of user fields.
- Websocket async-api messages updated.
- Removed the unused parameters/User.yaml.
Services + base
---------------
- Librarian, collection manager, knowledge, config: all operations
scoped by workspace. Config client API takes workspace as first
positional arg.
- `flow.workspace` is set at flow start time by the infrastructure;
it is no longer passed through from clients.
- Tool service drops user-personalisation passthrough.
CLI + SDK
---------
- tg-init-workspace and workspace-aware import/export.
- All tg-* commands drop user args; accept --workspace.
- Python API/SDK (flow, socket_client, async_*, explainability,
library) drop user kwargs from every method signature.
MCP server
----------
- All tool endpoints drop user parameters; socket_manager no longer
keyed per user.
Flow service
------------
- Closure-based topic cleanup on flow stop: only delete topics
whose blueprint template was parameterised AND no remaining
live flow (across all workspaces) still resolves to that topic.
Three scopes fall out naturally from template analysis:
* {id} -> per-flow, deleted on stop
* {blueprint} -> per-blueprint, kept while any flow of the
same blueprint exists
* {workspace} -> per-workspace, kept while any flow in the
workspace exists
* literal -> global, never deleted (e.g. tg.request.librarian)
Fixes a bug where stopping a flow silently destroyed the global
librarian exchange, wedging all library operations until manual
restart.
RabbitMQ backend
----------------
- heartbeat=60, blocked_connection_timeout=300. Catches silently
dead connections (broker restart, orphaned channels, network
partitions) within ~2 heartbeat windows, so the consumer
reconnects and re-binds its queue rather than sitting forever
on a zombie connection.
Tests
-----
- Full test refresh: unit, integration, contract, provenance.
- Dropped user-field assertions and constructor kwargs across
~100 test files.
- Renamed user-collection isolation tests to workspace-collection.
576 lines
20 KiB
Python
576 lines
20 KiB
Python
"""
|
|
TrustGraph Synchronous Bulk Operations Client
|
|
|
|
This module provides synchronous bulk import/export operations via WebSocket
|
|
for efficient transfer of large datasets including triples, embeddings,
|
|
entity contexts, and objects.
|
|
"""
|
|
|
|
import asyncio
import json
from concurrent.futures import ThreadPoolExecutor
from typing import Optional, Iterator, Dict, Any, Coroutine, AsyncIterator
from urllib.parse import quote

import websockets

from . types import Triple
from . exceptions import ProtocolException
|
|
|
|
|
|
def _string_to_term(value: str) -> Dict[str, Any]:
|
|
"""Convert a string value to Term format for the gateway."""
|
|
# Treat URIs as IRI type, otherwise as literal
|
|
if value.startswith("http://") or value.startswith("https://") or "://" in value:
|
|
return {"t": "i", "i": value}
|
|
else:
|
|
return {"t": "l", "v": value}
|
|
|
|
|
|
class BulkClient:
    """
    Synchronous bulk operations client for import/export.

    Provides efficient bulk data transfer via WebSocket for large datasets.
    Wraps async WebSocket operations with synchronous generators for ease of use.

    Every operation opens its own WebSocket connection and drives it on a
    private event loop which is closed once the call (or the iterator it
    returned) finishes. The thread's current event loop is never replaced.
    When the calling thread is already running an event loop, the work is
    handed to a worker thread, because a second loop cannot run on a thread
    that is already running one.

    Note: For true async support, use AsyncBulkClient instead.
    """

    def __init__(self, url: str, timeout: int, token: Optional[str]) -> None:
        """
        Initialize synchronous bulk client.

        Args:
            url: Base URL for TrustGraph API (HTTP/HTTPS will be converted to WS/WSS)
            timeout: WebSocket timeout in seconds (used as the ping timeout)
            token: Optional bearer token for authentication
        """
        self.url: str = self._convert_to_ws_url(url)
        self.timeout: int = timeout
        self.token: Optional[str] = token

    def _convert_to_ws_url(self, url: str) -> str:
        """Convert HTTP URL to WebSocket URL.

        ws:// and wss:// URLs are returned unchanged; a URL with no
        recognised scheme is prefixed with ws://.
        """
        if url.startswith("http://"):
            return url.replace("http://", "ws://", 1)
        elif url.startswith("https://"):
            return url.replace("https://", "wss://", 1)
        elif url.startswith("ws://") or url.startswith("wss://"):
            return url
        else:
            return f"ws://{url}"

    # ------------------------------------------------------------------
    # Internal helpers: URL building, connection, event-loop plumbing
    # ------------------------------------------------------------------

    def _endpoint_url(self, flow: str, operation: str) -> str:
        """Build the WebSocket URL of a flow bulk endpoint.

        Args:
            flow: Flow identifier
            operation: Endpoint suffix, e.g. "import/triples"

        Returns:
            str: Endpoint URL, with the token as a query parameter if set
        """
        ws_url = f"{self.url}/api/v1/flow/{flow}/{operation}"
        if self.token:
            # Percent-encode the token: "+", "/", "=", "&" and similar
            # characters would otherwise be misread when the query string
            # is parsed on the receiving side.
            ws_url = f"{ws_url}?token={quote(self.token, safe='')}"
        return ws_url

    def _connect(self, flow: str, operation: str) -> Any:
        """Return a websockets connection context manager for an endpoint."""
        return websockets.connect(
            self._endpoint_url(flow, operation),
            ping_interval=20,
            ping_timeout=self.timeout,
        )

    @staticmethod
    def _in_running_loop() -> bool:
        """Return True if the calling thread is currently running an event loop."""
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            return False
        return True

    @staticmethod
    def _run_on_private_loop(coro: Coroutine[Any, Any, Any]) -> Any:
        """Run a coroutine on a fresh event loop and close the loop afterwards."""
        loop = asyncio.new_event_loop()
        try:
            return loop.run_until_complete(coro)
        finally:
            try:
                loop.run_until_complete(loop.shutdown_asyncgens())
            finally:
                loop.close()

    def _run_async(self, coro: Coroutine[Any, Any, Any]) -> Any:
        """Run async coroutine synchronously and return its result.

        Exceptions raised by the coroutine propagate to the caller.
        """
        if self._in_running_loop():
            # run_until_complete() raises RuntimeError on a thread that is
            # already running a loop, so use a short-lived worker thread.
            with ThreadPoolExecutor(max_workers=1) as pool:
                return pool.submit(self._run_on_private_loop, coro).result()
        return self._run_on_private_loop(coro)

    def _iterate_async(self, async_gen: AsyncIterator[Any]) -> Iterator[Any]:
        """Expose an async generator as a synchronous generator.

        The async generator is advanced one item at a time on a private
        event loop. When the consumer stops iterating (exhaustion, break,
        exception or garbage collection) the async generator is closed and
        the loop is released.
        """
        # A single worker thread guarantees that the private loop is always
        # created, run and closed on the same thread.
        pool = ThreadPoolExecutor(max_workers=1) if self._in_running_loop() else None

        def call(fn: Any, *args: Any) -> Any:
            if pool is None:
                return fn(*args)
            return pool.submit(fn, *args).result()

        loop = call(asyncio.new_event_loop)
        try:
            while True:
                try:
                    item = call(loop.run_until_complete, async_gen.__anext__())
                except StopAsyncIteration:
                    break
                yield item
        finally:
            try:
                call(loop.run_until_complete, async_gen.aclose())
            except Exception:
                # Closing is best-effort: a failure while tearing down the
                # connection must not mask the outcome of the iteration.
                pass
            finally:
                call(loop.close)
                if pool is not None:
                    pool.shutdown(wait=False)

    async def _send_batches(
        self, websocket: Any, items: Iterator[Any], key: str,
        metadata: Dict[str, Any], batch_size: int
    ) -> None:
        """Send items as {"metadata": ..., key: [...]} messages of batch_size items.

        A final, shorter batch is sent for any remaining items. A batch is
        flushed as soon as it holds at least batch_size items, so a
        batch_size below 1 results in one message per item.
        """
        batch = []
        for item in items:
            batch.append(item)
            if len(batch) >= batch_size:
                await websocket.send(json.dumps({"metadata": metadata, key: batch}))
                batch = []
        # Send remaining items
        if batch:
            await websocket.send(json.dumps({"metadata": metadata, key: batch}))

    async def _send_each(self, flow: str, operation: str, items: Iterator[Dict[str, Any]]) -> None:
        """Send every item as its own JSON message to the given endpoint."""
        async with self._connect(flow, operation) as websocket:
            for item in items:
                await websocket.send(json.dumps(item))

    # ------------------------------------------------------------------
    # Triples
    # ------------------------------------------------------------------

    def import_triples(
        self, flow: str, triples: Iterator[Triple],
        metadata: Optional[Dict[str, Any]] = None,
        batch_size: int = 100,
        **kwargs: Any
    ) -> None:
        """
        Bulk import RDF triples into a flow.

        Efficiently uploads large numbers of triples via WebSocket streaming.

        Args:
            flow: Flow identifier
            triples: Iterator yielding Triple objects
            metadata: Metadata dict with id, metadata, collection
            batch_size: Number of triples per batch (default 100)
            **kwargs: Additional parameters (reserved for future use)

        Example:
            ```python
            from trustgraph.api import Triple

            bulk = api.bulk()

            # Generate triples to import
            def triple_generator():
                yield Triple(s="subj1", p="pred", o="obj1")
                yield Triple(s="subj2", p="pred", o="obj2")
                # ... more triples

            # Import triples
            bulk.import_triples(
                flow="default",
                triples=triple_generator(),
                metadata={"id": "doc1", "metadata": [], "collection": "default"}
            )
            ```
        """
        self._run_async(self._import_triples_async(flow, triples, metadata, batch_size))

    async def _import_triples_async(
        self, flow: str, triples: Iterator[Triple],
        metadata: Optional[Dict[str, Any]], batch_size: int
    ) -> None:
        """Async implementation of triple import"""
        if metadata is None:
            metadata = {"id": "", "metadata": [], "collection": "default"}

        # Lazily convert each Triple into the gateway's Term representation.
        terms = (
            {
                "s": _string_to_term(triple.s),
                "p": _string_to_term(triple.p),
                "o": _string_to_term(triple.o),
            }
            for triple in triples
        )

        async with self._connect(flow, "import/triples") as websocket:
            await self._send_batches(websocket, terms, "triples", metadata, batch_size)

    def export_triples(self, flow: str, **kwargs: Any) -> Iterator[Triple]:
        """
        Bulk export RDF triples from a flow.

        Efficiently downloads all triples via WebSocket streaming.

        Args:
            flow: Flow identifier
            **kwargs: Additional parameters (reserved for future use)

        Returns:
            Iterator[Triple]: Stream of Triple objects

        Example:
            ```python
            bulk = api.bulk()

            # Export and process triples
            for triple in bulk.export_triples(flow="default"):
                print(f"{triple.s} -> {triple.p} -> {triple.o}")
            ```
        """
        yield from self._iterate_async(self._export_triples_async(flow))

    async def _export_triples_async(self, flow: str) -> AsyncIterator[Triple]:
        """Async implementation of triple export"""
        async with self._connect(flow, "export/triples") as websocket:
            async for raw_message in websocket:
                data = json.loads(raw_message)
                # NOTE(review): this reads one flat triple ("s"/"p"/"o") per
                # message, whereas import sends batched messages of Term
                # dicts - confirm the export wire format against the gateway.
                yield Triple(
                    s=data.get("s", ""),
                    p=data.get("p", ""),
                    o=data.get("o", "")
                )

    # ------------------------------------------------------------------
    # Graph embeddings
    # ------------------------------------------------------------------

    def import_graph_embeddings(self, flow: str, embeddings: Iterator[Dict[str, Any]], **kwargs: Any) -> None:
        """
        Bulk import graph embeddings into a flow.

        Efficiently uploads graph entity embeddings via WebSocket streaming.

        Args:
            flow: Flow identifier
            embeddings: Iterator yielding embedding dictionaries
            **kwargs: Additional parameters (reserved for future use)

        Example:
            ```python
            bulk = api.bulk()

            # Generate embeddings to import
            def embedding_generator():
                yield {"entity": "entity1", "embedding": [0.1, 0.2, ...]}
                yield {"entity": "entity2", "embedding": [0.3, 0.4, ...]}
                # ... more embeddings

            bulk.import_graph_embeddings(
                flow="default",
                embeddings=embedding_generator()
            )
            ```
        """
        self._run_async(self._import_graph_embeddings_async(flow, embeddings))

    async def _import_graph_embeddings_async(self, flow: str, embeddings: Iterator[Dict[str, Any]]) -> None:
        """Async implementation of graph embeddings import"""
        await self._send_each(flow, "import/graph-embeddings", embeddings)

    def export_graph_embeddings(self, flow: str, **kwargs: Any) -> Iterator[Dict[str, Any]]:
        """
        Bulk export graph embeddings from a flow.

        Efficiently downloads all graph entity embeddings via WebSocket streaming.

        Args:
            flow: Flow identifier
            **kwargs: Additional parameters (reserved for future use)

        Returns:
            Iterator[Dict[str, Any]]: Stream of embedding dictionaries

        Example:
            ```python
            bulk = api.bulk()

            # Export and process embeddings
            for embedding in bulk.export_graph_embeddings(flow="default"):
                entity = embedding.get("entity")
                vector = embedding.get("embedding")
                print(f"{entity}: {len(vector)} dimensions")
            ```
        """
        yield from self._iterate_async(self._export_graph_embeddings_async(flow))

    async def _export_graph_embeddings_async(self, flow: str) -> AsyncIterator[Dict[str, Any]]:
        """Async implementation of graph embeddings export"""
        async with self._connect(flow, "export/graph-embeddings") as websocket:
            async for raw_message in websocket:
                yield json.loads(raw_message)

    # ------------------------------------------------------------------
    # Document embeddings
    # ------------------------------------------------------------------

    def import_document_embeddings(self, flow: str, embeddings: Iterator[Dict[str, Any]], **kwargs: Any) -> None:
        """
        Bulk import document embeddings into a flow.

        Efficiently uploads document chunk embeddings via WebSocket streaming
        for use in document RAG queries.

        Args:
            flow: Flow identifier
            embeddings: Iterator yielding embedding dictionaries
            **kwargs: Additional parameters (reserved for future use)

        Example:
            ```python
            bulk = api.bulk()

            # Generate document embeddings to import
            def doc_embedding_generator():
                yield {"chunk_id": "doc1/p0/c0", "embedding": [0.1, 0.2, ...]}
                yield {"chunk_id": "doc1/p0/c1", "embedding": [0.3, 0.4, ...]}
                # ... more embeddings

            bulk.import_document_embeddings(
                flow="default",
                embeddings=doc_embedding_generator()
            )
            ```
        """
        self._run_async(self._import_document_embeddings_async(flow, embeddings))

    async def _import_document_embeddings_async(self, flow: str, embeddings: Iterator[Dict[str, Any]]) -> None:
        """Async implementation of document embeddings import"""
        await self._send_each(flow, "import/document-embeddings", embeddings)

    def export_document_embeddings(self, flow: str, **kwargs: Any) -> Iterator[Dict[str, Any]]:
        """
        Bulk export document embeddings from a flow.

        Efficiently downloads all document chunk embeddings via WebSocket streaming.

        Args:
            flow: Flow identifier
            **kwargs: Additional parameters (reserved for future use)

        Returns:
            Iterator[Dict[str, Any]]: Stream of embedding dictionaries

        Example:
            ```python
            bulk = api.bulk()

            # Export and process document embeddings
            for embedding in bulk.export_document_embeddings(flow="default"):
                chunk_id = embedding.get("chunk_id")
                vector = embedding.get("embedding")
                print(f"{chunk_id}: {len(vector)} dimensions")
            ```
        """
        yield from self._iterate_async(self._export_document_embeddings_async(flow))

    async def _export_document_embeddings_async(self, flow: str) -> AsyncIterator[Dict[str, Any]]:
        """Async implementation of document embeddings export"""
        async with self._connect(flow, "export/document-embeddings") as websocket:
            async for raw_message in websocket:
                yield json.loads(raw_message)

    # ------------------------------------------------------------------
    # Entity contexts
    # ------------------------------------------------------------------

    def import_entity_contexts(
        self, flow: str, contexts: Iterator[Dict[str, Any]],
        metadata: Optional[Dict[str, Any]] = None,
        batch_size: int = 100,
        **kwargs: Any
    ) -> None:
        """
        Bulk import entity contexts into a flow.

        Efficiently uploads entity context information via WebSocket streaming.
        Entity contexts provide additional textual context about graph entities
        for improved RAG performance.

        Args:
            flow: Flow identifier
            contexts: Iterator yielding context dictionaries
            metadata: Metadata dict with id, metadata, collection
            batch_size: Number of contexts per batch (default 100)
            **kwargs: Additional parameters (reserved for future use)

        Example:
            ```python
            bulk = api.bulk()

            # Generate entity contexts to import
            def context_generator():
                yield {"entity": {"v": "entity1", "e": True}, "context": "Description..."}
                yield {"entity": {"v": "entity2", "e": True}, "context": "Description..."}
                # ... more contexts

            bulk.import_entity_contexts(
                flow="default",
                contexts=context_generator(),
                metadata={"id": "doc1", "metadata": [], "collection": "default"}
            )
            ```
        """
        self._run_async(self._import_entity_contexts_async(flow, contexts, metadata, batch_size))

    async def _import_entity_contexts_async(
        self, flow: str, contexts: Iterator[Dict[str, Any]],
        metadata: Optional[Dict[str, Any]], batch_size: int
    ) -> None:
        """Async implementation of entity contexts import"""
        if metadata is None:
            metadata = {"id": "", "metadata": [], "collection": "default"}

        async with self._connect(flow, "import/entity-contexts") as websocket:
            await self._send_batches(websocket, contexts, "entities", metadata, batch_size)

    def export_entity_contexts(self, flow: str, **kwargs: Any) -> Iterator[Dict[str, Any]]:
        """
        Bulk export entity contexts from a flow.

        Efficiently downloads all entity context information via WebSocket streaming.

        Args:
            flow: Flow identifier
            **kwargs: Additional parameters (reserved for future use)

        Returns:
            Iterator[Dict[str, Any]]: Stream of context dictionaries

        Example:
            ```python
            bulk = api.bulk()

            # Export and process entity contexts
            for context in bulk.export_entity_contexts(flow="default"):
                entity = context.get("entity")
                text = context.get("context")
                print(f"{entity}: {text[:100]}...")
            ```
        """
        yield from self._iterate_async(self._export_entity_contexts_async(flow))

    async def _export_entity_contexts_async(self, flow: str) -> AsyncIterator[Dict[str, Any]]:
        """Async implementation of entity contexts export"""
        async with self._connect(flow, "export/entity-contexts") as websocket:
            async for raw_message in websocket:
                yield json.loads(raw_message)

    # ------------------------------------------------------------------
    # Rows
    # ------------------------------------------------------------------

    def import_rows(self, flow: str, rows: Iterator[Dict[str, Any]], **kwargs: Any) -> None:
        """
        Bulk import structured rows into a flow.

        Efficiently uploads structured data rows via WebSocket streaming
        for use in GraphQL queries.

        Args:
            flow: Flow identifier
            rows: Iterator yielding row dictionaries
            **kwargs: Additional parameters (reserved for future use)

        Example:
            ```python
            bulk = api.bulk()

            # Generate rows to import
            def row_generator():
                yield {"id": "row1", "name": "Row 1", "value": 100}
                yield {"id": "row2", "name": "Row 2", "value": 200}
                # ... more rows

            bulk.import_rows(
                flow="default",
                rows=row_generator()
            )
            ```
        """
        self._run_async(self._import_rows_async(flow, rows))

    async def _import_rows_async(self, flow: str, rows: Iterator[Dict[str, Any]]) -> None:
        """Async implementation of rows import"""
        await self._send_each(flow, "import/rows", rows)

    def close(self) -> None:
        """Close connections.

        This is a no-op: every operation opens and closes its own
        connection through a context manager, so nothing is held open.
        """
        pass
|