Feature/pkgsplit (#83)

* Starting to spawn base package
* More package hacking
* Bedrock and VertexAI
* Parquet split
* Updated templates
* Utils
This commit is contained in:
cybermaggedon 2024-09-30 19:36:09 +01:00 committed by GitHub
parent 3fb75c617b
commit 9b91d5eee3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
262 changed files with 630 additions and 420 deletions

View file

@ -0,0 +1,6 @@
from . base_processor import BaseProcessor
from . consumer import Consumer
from . producer import Producer
from . consumer_producer import ConsumerProducer

View file

@ -0,0 +1,119 @@
import os
import argparse
import pulsar
import _pulsar
import time
from prometheus_client import start_http_server, Info
from .. log_level import LogLevel
class BaseProcessor:
    """Base class for Pulsar-connected processors.

    Handles Pulsar client construction, Prometheus metric registration,
    and the command-line / retry-loop boilerplate shared by processors.
    """

    default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')

    def __init__(self, **params):

        # Set early so __del__ is safe even if construction fails below
        self.client = None

        if not hasattr(__class__, "params_metric"):
            __class__.params_metric = Info(
                'params', 'Parameters configuration'
            )

        # FIXME: Maybe outputs information it should not
        __class__.params_metric.info({
            k: str(params[k])
            for k in params
        })

        pulsar_host = params.get("pulsar_host", self.default_pulsar_host)
        log_level = params.get("log_level", LogLevel.INFO)

        self.pulsar_host = pulsar_host

        self.client = pulsar.Client(
            pulsar_host,
            logger=pulsar.ConsoleLogger(log_level.to_pulsar())
        )

    def __del__(self):
        # client is None if __init__ failed before the Pulsar connection
        if self.client:
            self.client.close()

    @staticmethod
    def add_args(parser):
        """Register the command-line arguments shared by all processors."""

        parser.add_argument(
            '-p', '--pulsar-host',
            default=__class__.default_pulsar_host,
            help=f'Pulsar host (default: {__class__.default_pulsar_host})',
        )

        parser.add_argument(
            '-l', '--log-level',
            type=LogLevel,
            default=LogLevel.INFO,
            choices=list(LogLevel),
            help='Log level (default: info)'
        )

        parser.add_argument(
            '--metrics',
            action=argparse.BooleanOptionalAction,
            default=True,
            help='Metrics enabled (default: true)',
        )

        parser.add_argument(
            '-P', '--metrics-port',
            type=int,
            default=8000,
            help='Metrics port (default: 8000)',
        )

    def run(self):
        # Subclasses must implement the processing loop
        raise RuntimeError("Something should have implemented the run method")

    @classmethod
    def start(cls, prog, doc):
        """Parse arguments, optionally start metrics, and run forever.

        Restarts the processor after unexpected exceptions; exits cleanly
        on keyboard or Pulsar interrupt.
        """

        parser = argparse.ArgumentParser(
            prog=prog,
            description=doc
        )

        cls.add_args(parser)

        args = parser.parse_args()
        args = vars(args)

        print(args)

        if args["metrics"]:
            start_http_server(args["metrics_port"])

        while True:

            try:
                p = cls(**args)
                p.run()
            except KeyboardInterrupt:
                print("Keyboard interrupt.")
                return
            except _pulsar.Interrupted:
                print("Pulsar Interrupted.")
                return
            except Exception as e:
                print(type(e))
                print("Exception:", e, flush=True)
                print("Will retry...", flush=True)
                time.sleep(4)

View file

@ -0,0 +1,107 @@
from pulsar.schema import JsonSchema
from prometheus_client import Histogram, Info, Counter, Enum
import time
from . base_processor import BaseProcessor
from .. exceptions import TooManyRequests
class Consumer(BaseProcessor):
    """Processor which subscribes to one input queue.

    Subclasses implement handle(msg).
    """

    def __init__(self, **params):

        if not hasattr(__class__, "state_metric"):
            __class__.state_metric = Enum(
                'processor_state', 'Processor state',
                states=['starting', 'running', 'stopped']
            )

        # Was previously invoked twice; once is sufficient
        __class__.state_metric.state('starting')

        super(Consumer, self).__init__(**params)

        input_queue = params.get("input_queue")
        subscriber = params.get("subscriber")
        input_schema = params.get("input_schema")

        if input_schema is None:
            raise RuntimeError("input_schema must be specified")

        if not hasattr(__class__, "request_metric"):
            __class__.request_metric = Histogram(
                'request_latency', 'Request latency (seconds)'
            )

        if not hasattr(__class__, "pubsub_metric"):
            __class__.pubsub_metric = Info(
                'pubsub', 'Pub/sub configuration'
            )

        if not hasattr(__class__, "processing_metric"):
            __class__.processing_metric = Counter(
                'processing_count', 'Processing count', ["status"]
            )

        __class__.pubsub_metric.info({
            "input_queue": input_queue,
            "subscriber": subscriber,
            "input_schema": input_schema.__name__,
        })

        self.consumer = self.client.subscribe(
            input_queue, subscriber,
            schema=JsonSchema(input_schema),
        )

    def run(self):
        """Consume forever, passing each message to handle()."""

        __class__.state_metric.state('running')

        while True:

            msg = self.consumer.receive()

            try:

                with __class__.request_metric.time():
                    self.handle(msg)

                # Acknowledge successful processing of the message
                self.consumer.acknowledge(msg)
                __class__.processing_metric.labels(status="success").inc()

            except TooManyRequests:
                # Back off and let the broker redeliver the message
                self.consumer.negative_acknowledge(msg)
                print("TooManyRequests: will retry")
                __class__.processing_metric.labels(status="rate-limit").inc()
                time.sleep(5)
                continue

            except Exception as e:
                print("Exception:", e, flush=True)

                # Message failed to be processed; request redelivery
                self.consumer.negative_acknowledge(msg)
                __class__.processing_metric.labels(status="error").inc()

    @staticmethod
    def add_args(parser, default_input_queue, default_subscriber):
        """Register input-queue and subscriber arguments."""

        BaseProcessor.add_args(parser)

        parser.add_argument(
            '-i', '--input-queue',
            default=default_input_queue,
            help=f'Input queue (default: {default_input_queue})'
        )

        parser.add_argument(
            '-s', '--subscriber',
            default=default_subscriber,
            help=f'Queue subscriber name (default: {default_subscriber})'
        )

View file

@ -0,0 +1,139 @@
from pulsar.schema import JsonSchema
from prometheus_client import Histogram, Info, Counter, Enum
import time
from . base_processor import BaseProcessor
from .. exceptions import TooManyRequests
# FIXME: Derive from consumer? And producer?
class ConsumerProducer(BaseProcessor):
    """Processor which consumes from one queue and produces to another.

    Subclasses implement handle(msg).
    """

    def __init__(self, **params):

        if not hasattr(__class__, "state_metric"):
            __class__.state_metric = Enum(
                'processor_state', 'Processor state',
                states=['starting', 'running', 'stopped']
            )

        # Was previously invoked twice; once is sufficient
        __class__.state_metric.state('starting')

        input_queue = params.get("input_queue")
        output_queue = params.get("output_queue")
        subscriber = params.get("subscriber")
        input_schema = params.get("input_schema")
        output_schema = params.get("output_schema")

        # Validate before the schema __name__ attributes are used below;
        # previously a missing schema surfaced as AttributeError
        if input_schema is None:
            raise RuntimeError("input_schema must be specified")

        if output_schema is None:
            raise RuntimeError("output_schema must be specified")

        if not hasattr(__class__, "request_metric"):
            __class__.request_metric = Histogram(
                'request_latency', 'Request latency (seconds)'
            )

        if not hasattr(__class__, "output_metric"):
            __class__.output_metric = Counter(
                'output_count', 'Output items created'
            )

        if not hasattr(__class__, "pubsub_metric"):
            __class__.pubsub_metric = Info(
                'pubsub', 'Pub/sub configuration'
            )

        if not hasattr(__class__, "processing_metric"):
            __class__.processing_metric = Counter(
                'processing_count', 'Processing count', ["status"]
            )

        __class__.pubsub_metric.info({
            "input_queue": input_queue,
            "output_queue": output_queue,
            "subscriber": subscriber,
            "input_schema": input_schema.__name__,
            "output_schema": output_schema.__name__,
        })

        super(ConsumerProducer, self).__init__(**params)

        self.producer = self.client.create_producer(
            topic=output_queue,
            schema=JsonSchema(output_schema),
        )

        self.consumer = self.client.subscribe(
            input_queue, subscriber,
            schema=JsonSchema(input_schema),
        )

    def run(self):
        """Consume forever, passing each message to handle()."""

        __class__.state_metric.state('running')

        while True:

            msg = self.consumer.receive()

            try:

                with __class__.request_metric.time():
                    resp = self.handle(msg)

                # Acknowledge successful processing of the message
                self.consumer.acknowledge(msg)
                __class__.processing_metric.labels(status="success").inc()

            except TooManyRequests:
                # Back off and let the broker redeliver the message
                self.consumer.negative_acknowledge(msg)
                print("TooManyRequests: will retry")
                __class__.processing_metric.labels(status="rate-limit").inc()
                time.sleep(5)
                continue

            except Exception as e:
                print("Exception:", e, flush=True)

                # Message failed to be processed; request redelivery
                self.consumer.negative_acknowledge(msg)
                __class__.processing_metric.labels(status="error").inc()

    def send(self, msg, properties=None):
        """Publish msg to the output queue with optional properties."""
        # properties=None avoids the shared-mutable-default pitfall
        self.producer.send(msg, properties or {})
        __class__.output_metric.inc()

    @staticmethod
    def add_args(
            parser, default_input_queue, default_subscriber,
            default_output_queue,
    ):
        """Register input/output queue and subscriber arguments."""

        BaseProcessor.add_args(parser)

        parser.add_argument(
            '-i', '--input-queue',
            default=default_input_queue,
            help=f'Input queue (default: {default_input_queue})'
        )

        parser.add_argument(
            '-s', '--subscriber',
            default=default_subscriber,
            help=f'Queue subscriber name (default: {default_subscriber})'
        )

        parser.add_argument(
            '-o', '--output-queue',
            default=default_output_queue,
            help=f'Output queue (default: {default_output_queue})'
        )

View file

@ -0,0 +1,55 @@
from pulsar.schema import JsonSchema
from prometheus_client import Info, Counter
from . base_processor import BaseProcessor
class Producer(BaseProcessor):
    """Processor which only produces messages to an output queue."""

    def __init__(self, **params):

        output_queue = params.get("output_queue")
        output_schema = params.get("output_schema")

        # Validate before output_schema.__name__ is used below; previously
        # a missing schema surfaced as AttributeError, not RuntimeError
        if output_schema is None:
            raise RuntimeError("output_schema must be specified")

        if not hasattr(__class__, "output_metric"):
            __class__.output_metric = Counter(
                'output_count', 'Output items created'
            )

        if not hasattr(__class__, "pubsub_metric"):
            __class__.pubsub_metric = Info(
                'pubsub', 'Pub/sub configuration'
            )

        __class__.pubsub_metric.info({
            "output_queue": output_queue,
            "output_schema": output_schema.__name__,
        })

        super(Producer, self).__init__(**params)

        self.producer = self.client.create_producer(
            topic=output_queue,
            schema=JsonSchema(output_schema),
        )

    def send(self, msg, properties=None):
        """Publish msg to the output queue with optional properties."""
        # properties=None avoids the shared-mutable-default pitfall
        self.producer.send(msg, properties or {})
        __class__.output_metric.inc()

    @staticmethod
    def add_args(
            parser, default_input_queue, default_subscriber,
            default_output_queue,
    ):
        """Register the output-queue argument.

        default_input_queue / default_subscriber are unused here but kept
        for signature compatibility with the other processor classes.
        """

        BaseProcessor.add_args(parser)

        parser.add_argument(
            '-o', '--output-queue',
            default=default_output_queue,
            help=f'Output queue (default: {default_output_queue})'
        )

View file

@ -0,0 +1,125 @@
import pulsar
import _pulsar
import hashlib
import uuid
import time
from pulsar.schema import JsonSchema
from .. exceptions import *
# Default timeout for a request/response. In seconds.
DEFAULT_TIMEOUT=300

# Ugly: aliases for the native Pulsar logger levels, so callers of this
# module need not import _pulsar themselves
ERROR=_pulsar.LoggerLevel.Error
WARN=_pulsar.LoggerLevel.Warn
INFO=_pulsar.LoggerLevel.Info
DEBUG=_pulsar.LoggerLevel.Debug
class BaseClient:
    """Synchronous request/response client over a pair of Pulsar queues.

    Requests are sent on the input queue and responses correlated back by
    a per-call "id" message property on the output queue.
    """

    def __init__(
            self, log_level=ERROR,
            subscriber=None,
            input_queue=None,
            output_queue=None,
            input_schema=None,
            output_schema=None,
            pulsar_host="pulsar://pulsar:6650",
    ):

        if input_queue is None: raise RuntimeError("Need input_queue")
        if output_queue is None: raise RuntimeError("Need output_queue")
        if input_schema is None: raise RuntimeError("Need input_schema")
        if output_schema is None: raise RuntimeError("Need output_schema")

        # Anonymous clients each get their own subscription
        if subscriber is None:
            subscriber = str(uuid.uuid4())

        self.client = pulsar.Client(
            pulsar_host,
            logger=pulsar.ConsoleLogger(log_level),
        )

        self.producer = self.client.create_producer(
            topic=input_queue,
            schema=JsonSchema(input_schema),
            chunking_enabled=True,
        )

        self.consumer = self.client.subscribe(
            output_queue, subscriber,
            schema=JsonSchema(output_schema),
        )

        self.input_schema = input_schema
        self.output_schema = output_schema

    def call(self, **args):
        """Send a request and block until the matching response arrives.

        A "timeout" keyword (seconds, default DEFAULT_TIMEOUT) bounds the
        wait.  Raises LlmError / TooManyRequests / ParseError /
        RuntimeError for error responses, TimeoutError on timeout.
        """

        timeout = args.pop("timeout", DEFAULT_TIMEOUT)

        # Correlation ID carried in message properties
        request_id = str(uuid.uuid4())

        r = self.input_schema(**args)

        end_time = time.time() + timeout

        self.producer.send(r, properties={ "id": request_id })

        while time.time() < end_time:

            try:
                msg = self.consumer.receive(timeout_millis=2500)
            except pulsar.exceptions.Timeout:
                continue

            if msg.properties()["id"] != request_id:
                # Not our response; discard and keep waiting
                self.consumer.acknowledge(msg)
                continue

            # Deserialize once; the original called msg.value() twice
            value = msg.value()
            self.consumer.acknowledge(msg)

            if value.error:

                if value.error.type == "llm-error":
                    raise LlmError(value.error.message)
                elif value.error.type == "too-many-requests":
                    raise TooManyRequests(value.error.message)
                elif value.error.type == "ParseError":
                    raise ParseError(value.error.message)
                else:
                    raise RuntimeError(
                        f"{value.error.type}: {value.error.message}"
                    )

            return value

        raise TimeoutError("Timed out waiting for response")

    def __del__(self):

        # Attributes may be missing if __init__ failed part-way
        if hasattr(self, "consumer"):
            self.consumer.close()

        if hasattr(self, "producer"):
            self.producer.flush()
            self.producer.close()

        self.client.close()

View file

@ -0,0 +1,45 @@
import _pulsar
from .. schema import DocumentEmbeddingsRequest, DocumentEmbeddingsResponse
from .. schema import document_embeddings_request_queue
from .. schema import document_embeddings_response_queue
from . base import BaseClient
# Ugly: aliases for the native Pulsar logger levels
ERROR=_pulsar.LoggerLevel.Error
WARN=_pulsar.LoggerLevel.Warn
INFO=_pulsar.LoggerLevel.Info
DEBUG=_pulsar.LoggerLevel.Debug

class DocumentEmbeddingsClient(BaseClient):
    """Request/response client for the document embeddings query service."""

    def __init__(
            self, log_level=ERROR,
            subscriber=None,
            input_queue=None,
            output_queue=None,
            pulsar_host="pulsar://pulsar:6650",
    ):

        if input_queue is None:
            input_queue = document_embeddings_request_queue

        if output_queue is None:
            output_queue = document_embeddings_response_queue

        super().__init__(
            log_level=log_level,
            subscriber=subscriber,
            input_queue=input_queue,
            output_queue=output_queue,
            pulsar_host=pulsar_host,
            input_schema=DocumentEmbeddingsRequest,
            output_schema=DocumentEmbeddingsResponse,
        )

    def request(self, vectors, limit=10, timeout=300):
        """Return up to `limit` documents near the supplied vectors."""
        return self.call(
            vectors=vectors, limit=limit, timeout=timeout
        ).documents

View file

@ -0,0 +1,46 @@
import _pulsar
from .. schema import DocumentRagQuery, DocumentRagResponse
from .. schema import document_rag_request_queue, document_rag_response_queue
from . base import BaseClient
# Ugly: aliases for the native Pulsar logger levels
ERROR=_pulsar.LoggerLevel.Error
WARN=_pulsar.LoggerLevel.Warn
INFO=_pulsar.LoggerLevel.Info
DEBUG=_pulsar.LoggerLevel.Debug

class DocumentRagClient(BaseClient):
    """Request/response client for the document RAG retrieval service."""

    def __init__(
            self,
            log_level=ERROR,
            subscriber=None,
            input_queue=None,
            output_queue=None,
            pulsar_host="pulsar://pulsar:6650",
    ):

        if input_queue is None:
            input_queue = document_rag_request_queue

        if output_queue is None:
            output_queue = document_rag_response_queue

        super().__init__(
            log_level=log_level,
            subscriber=subscriber,
            input_queue=input_queue,
            output_queue=output_queue,
            pulsar_host=pulsar_host,
            input_schema=DocumentRagQuery,
            output_schema=DocumentRagResponse,
        )

    def request(self, query, timeout=500):
        """Answer `query` using document RAG; returns the response text."""
        return self.call(
            query=query, timeout=timeout
        ).response

View file

@ -0,0 +1,44 @@
from pulsar.schema import JsonSchema
from .. schema import EmbeddingsRequest, EmbeddingsResponse
from .. schema import embeddings_request_queue, embeddings_response_queue
from . base import BaseClient
import _pulsar
# Ugly: aliases for the native Pulsar logger levels
ERROR=_pulsar.LoggerLevel.Error
WARN=_pulsar.LoggerLevel.Warn
INFO=_pulsar.LoggerLevel.Info
DEBUG=_pulsar.LoggerLevel.Debug

class EmbeddingsClient(BaseClient):
    """Request/response client for the text embeddings service."""

    def __init__(
            self, log_level=ERROR,
            input_queue=None,
            output_queue=None,
            subscriber=None,
            pulsar_host="pulsar://pulsar:6650",
    ):

        if input_queue is None:
            input_queue = embeddings_request_queue

        if output_queue is None:
            output_queue = embeddings_response_queue

        super().__init__(
            log_level=log_level,
            subscriber=subscriber,
            input_queue=input_queue,
            output_queue=output_queue,
            pulsar_host=pulsar_host,
            input_schema=EmbeddingsRequest,
            output_schema=EmbeddingsResponse,
        )

    def request(self, text, timeout=300):
        """Return embeddings vectors for `text`."""
        return self.call(text=text, timeout=timeout).vectors

View file

@ -0,0 +1,45 @@
import _pulsar
from .. schema import GraphEmbeddingsRequest, GraphEmbeddingsResponse
from .. schema import graph_embeddings_request_queue
from .. schema import graph_embeddings_response_queue
from . base import BaseClient
# Ugly: aliases for the native Pulsar logger levels
ERROR=_pulsar.LoggerLevel.Error
WARN=_pulsar.LoggerLevel.Warn
INFO=_pulsar.LoggerLevel.Info
DEBUG=_pulsar.LoggerLevel.Debug

class GraphEmbeddingsClient(BaseClient):
    """Request/response client for the graph embeddings query service."""

    def __init__(
            self, log_level=ERROR,
            subscriber=None,
            input_queue=None,
            output_queue=None,
            pulsar_host="pulsar://pulsar:6650",
    ):

        if input_queue is None:
            input_queue = graph_embeddings_request_queue

        if output_queue is None:
            output_queue = graph_embeddings_response_queue

        super().__init__(
            log_level=log_level,
            subscriber=subscriber,
            input_queue=input_queue,
            output_queue=output_queue,
            pulsar_host=pulsar_host,
            input_schema=GraphEmbeddingsRequest,
            output_schema=GraphEmbeddingsResponse,
        )

    def request(self, vectors, limit=10, timeout=300):
        """Return up to `limit` graph entities near the supplied vectors."""
        return self.call(
            vectors=vectors, limit=limit, timeout=timeout
        ).entities

View file

@ -0,0 +1,46 @@
import _pulsar
from .. schema import GraphRagQuery, GraphRagResponse
from .. schema import graph_rag_request_queue, graph_rag_response_queue
from . base import BaseClient
# Ugly: aliases for the native Pulsar logger levels
ERROR=_pulsar.LoggerLevel.Error
WARN=_pulsar.LoggerLevel.Warn
INFO=_pulsar.LoggerLevel.Info
DEBUG=_pulsar.LoggerLevel.Debug

class GraphRagClient(BaseClient):
    """Request/response client for the graph RAG retrieval service."""

    def __init__(
            self,
            log_level=ERROR,
            subscriber=None,
            input_queue=None,
            output_queue=None,
            pulsar_host="pulsar://pulsar:6650",
    ):

        if input_queue is None:
            input_queue = graph_rag_request_queue

        if output_queue is None:
            output_queue = graph_rag_response_queue

        super().__init__(
            log_level=log_level,
            subscriber=subscriber,
            input_queue=input_queue,
            output_queue=output_queue,
            pulsar_host=pulsar_host,
            input_schema=GraphRagQuery,
            output_schema=GraphRagResponse,
        )

    def request(self, query, timeout=500):
        """Answer `query` using graph RAG; returns the response text."""
        return self.call(
            query=query, timeout=timeout
        ).response

View file

@ -0,0 +1,40 @@
import _pulsar
from .. schema import TextCompletionRequest, TextCompletionResponse
from .. schema import text_completion_request_queue
from .. schema import text_completion_response_queue
from . base import BaseClient
# Ugly
ERROR=_pulsar.LoggerLevel.Error
WARN=_pulsar.LoggerLevel.Warn
INFO=_pulsar.LoggerLevel.Info
DEBUG=_pulsar.LoggerLevel.Debug

class LlmClient(BaseClient):
    """Request/response client for the LLM text-completion service."""

    def __init__(
            self, log_level=ERROR,
            subscriber=None,
            input_queue=None,
            output_queue=None,
            pulsar_host="pulsar://pulsar:6650",
    ):

        # Fall back to the well-known queues when none are given
        if input_queue is None:
            input_queue = text_completion_request_queue

        if output_queue is None:
            output_queue = text_completion_response_queue

        super().__init__(
            log_level=log_level,
            subscriber=subscriber,
            input_queue=input_queue,
            output_queue=output_queue,
            pulsar_host=pulsar_host,
            input_schema=TextCompletionRequest,
            output_schema=TextCompletionResponse,
        )

    def request(self, prompt, timeout=300):
        """Return the completion text for `prompt`."""
        reply = self.call(prompt=prompt, timeout=timeout)
        return reply.response

View file

@ -0,0 +1,100 @@
import _pulsar
from .. schema import PromptRequest, PromptResponse, Fact, RowSchema, Field
from .. schema import prompt_request_queue
from .. schema import prompt_response_queue
from . base import BaseClient
# Ugly: aliases for the native Pulsar logger levels
ERROR=_pulsar.LoggerLevel.Error
WARN=_pulsar.LoggerLevel.Warn
INFO=_pulsar.LoggerLevel.Info
DEBUG=_pulsar.LoggerLevel.Debug

class PromptClient(BaseClient):
    """Request/response client for the prompt service.

    Each request_* method maps onto a prompt-service "kind" and returns
    the corresponding field of the response.
    """

    def __init__(
            self, log_level=ERROR,
            subscriber=None,
            input_queue=None,
            output_queue=None,
            pulsar_host="pulsar://pulsar:6650",
    ):

        if input_queue is None:
            input_queue = prompt_request_queue

        if output_queue is None:
            output_queue = prompt_response_queue

        super().__init__(
            log_level=log_level,
            subscriber=subscriber,
            input_queue=input_queue,
            output_queue=output_queue,
            pulsar_host=pulsar_host,
            input_schema=PromptRequest,
            output_schema=PromptResponse,
        )

    def request_definitions(self, chunk, timeout=300):
        """Extract term definitions from a text chunk."""
        return self.call(
            kind="extract-definitions", chunk=chunk,
            timeout=timeout
        ).definitions

    def request_topics(self, chunk, timeout=300):
        """Extract topics from a text chunk."""
        return self.call(
            kind="extract-topics", chunk=chunk,
            timeout=timeout
        ).topics

    def request_relationships(self, chunk, timeout=300):
        """Extract subject/predicate/object relationships from a chunk."""
        return self.call(
            kind="extract-relationships", chunk=chunk,
            timeout=timeout
        ).relationships

    def request_rows(self, schema, chunk, timeout=300):
        """Extract rows conforming to `schema` from a text chunk."""
        return self.call(
            kind="extract-rows", chunk=chunk,
            # Convert the local schema object into the wire representation
            row_schema=RowSchema(
                name=schema.name,
                description=schema.description,
                fields=[
                    Field(
                        name=f.name, type=str(f.type), size=f.size,
                        primary=f.primary, description=f.description,
                    )
                    for f in schema.fields
                ]
            ),
            timeout=timeout
        ).rows

    def request_kg_prompt(self, query, kg, timeout=300):
        """Answer `query` using knowledge-graph triples as context.

        kg is an iterable of (s, p, o) triples.
        """
        return self.call(
            kind="kg-prompt",
            query=query,
            kg=[
                Fact(s=v[0], p=v[1], o=v[2])
                for v in kg
            ],
            timeout=timeout
        ).answer

    def request_document_prompt(self, query, documents, timeout=300):
        """Answer `query` using the supplied documents as context."""
        return self.call(
            kind="document-prompt",
            query=query,
            documents=documents,
            timeout=timeout
        ).answer

View file

@ -0,0 +1,59 @@
#!/usr/bin/env python3
import _pulsar
from .. schema import TriplesQueryRequest, TriplesQueryResponse, Value
from .. schema import triples_request_queue
from .. schema import triples_response_queue
from . base import BaseClient
# Ugly: aliases for the native Pulsar logger levels
ERROR=_pulsar.LoggerLevel.Error
WARN=_pulsar.LoggerLevel.Warn
INFO=_pulsar.LoggerLevel.Info
DEBUG=_pulsar.LoggerLevel.Debug

class TriplesQueryClient(BaseClient):
    """Request/response client for the triples query service."""

    def __init__(
            self, log_level=ERROR,
            subscriber=None,
            input_queue=None,
            output_queue=None,
            pulsar_host="pulsar://pulsar:6650",
    ):

        if input_queue is None:
            input_queue = triples_request_queue

        if output_queue is None:
            output_queue = triples_response_queue

        super().__init__(
            log_level=log_level,
            subscriber=subscriber,
            input_queue=input_queue,
            output_queue=output_queue,
            pulsar_host=pulsar_host,
            input_schema=TriplesQueryRequest,
            output_schema=TriplesQueryResponse,
        )

    def create_value(self, ent):
        """Wrap `ent` in a Value; None passes through as a wildcard."""
        if ent is None: return None
        # Anything with an HTTP(S) scheme is treated as a URI
        if ent.startswith(("http://", "https://")):
            return Value(value=ent, is_uri=True)
        return Value(value=ent, is_uri=False)

    def request(self, s, p, o, limit=10, timeout=60):
        """Return triples matching the s/p/o pattern (None = wildcard)."""
        return self.call(
            s=self.create_value(s),
            p=self.create_value(p),
            o=self.create_value(o),
            limit=limit,
            timeout=timeout,
        ).triples

View file

@ -0,0 +1,14 @@
class TooManyRequests(Exception):
    """A service reported rate-limiting; the caller should back off and retry."""
    pass

class LlmError(Exception):
    """The LLM service reported an error processing the request."""
    pass

class ParseError(Exception):
    """A service response could not be parsed."""
    pass

View file

@ -0,0 +1,20 @@
from enum import Enum
import _pulsar
class LogLevel(Enum):
    """Command-line log level, convertible to the native Pulsar level."""

    DEBUG = 'debug'
    INFO = 'info'
    WARN = 'warn'
    ERROR = 'error'

    def __str__(self):
        # argparse renders choices via str(); show the bare value
        return self.value

    def to_pulsar(self):
        """Return the corresponding _pulsar.LoggerLevel."""
        mapping = {
            LogLevel.DEBUG: _pulsar.LoggerLevel.Debug,
            LogLevel.INFO: _pulsar.LoggerLevel.Info,
            LogLevel.WARN: _pulsar.LoggerLevel.Warn,
            LogLevel.ERROR: _pulsar.LoggerLevel.Error,
        }
        level = mapping.get(self)
        if level is None:
            raise RuntimeError("Log level mismatch")
        return level

View file

@ -0,0 +1,72 @@
from dataclasses import dataclass
from enum import Enum
class FieldType(Enum):
    """Data types a schema field may take."""

    STRING = 0
    INT = 1
    LONG = 2
    BOOL = 3
    FLOAT = 4
    DOUBLE = 5

    def __str__(self):
        # Serialized form is the lower-case type name, e.g. "string"
        return self.name.lower()

@dataclass
class Field:
    """A single schema field, serialized as name:type:size:pri:description."""

    name: str
    size: int = -1
    primary: bool = False
    type: str = "undefined"
    description: str = ""

    @staticmethod
    def parse(defn):
        """Parse a "name:type:size:pri:description" definition string.

        Missing trailing parts default to type "string", size 0,
        non-primary, and an empty description.  The description may
        itself contain colons.

        Raises RuntimeError on an empty definition or unknown type.
        """

        if defn == "" or defn is None:
            raise RuntimeError("Field definition cannot be empty")

        # Split into at most 5 parts so colons in the description survive
        parts = defn.split(":", 4)

        # Fill in defaults for any missing trailing parts
        if len(parts) == 1: parts.append("string")
        if len(parts) == 2: parts.append("0")
        if len(parts) == 3: parts.append("")
        if len(parts) == 4: parts.append("")

        name, type, size, pri, description = parts

        size = int(size)

        try:
            type = FieldType[type.upper()]
        except KeyError:
            raise RuntimeError(f"Field type {type} is not known") from None

        return Field(
            name=name, type=type, size=size, primary=(pri == "pri"),
            description=description
        )

    def __repr__(self):
        pri = "pri" if self.primary else ""
        return f"{self.name}:{self.type}:{self.size}:{pri}:{self.description}"

    # str() and repr() intentionally share the parseable serialized form
    __str__ = __repr__

View file

@ -0,0 +1,8 @@
class Schema:
    """A named collection of fields with a human-readable description."""

    def __init__(self, name, description, fields):
        # Plain data holder; no validation is performed here
        self.name, self.description, self.fields = name, description, fields

View file

@ -0,0 +1,6 @@
# Well-known RDF-Schema / SKOS predicate URIs used when emitting triples
RDF_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"
DEFINITION = "http://www.w3.org/2004/02/skos/core#definition"

# URI namespace under which TrustGraph entities are minted
TRUSTGRAPH_ENTITIES = "http://trustgraph.ai/e/"

View file

@ -0,0 +1,12 @@
from . types import *
from . prompt import *
from . documents import *
from . models import *
from . object import *
from . topic import *
from . graph import *
from . retrieval import *

View file

@ -0,0 +1,68 @@
from pulsar.schema import Record, Bytes, String, Boolean, Integer, Array, Double
from . topic import topic
from . types import Error
class Source(Record):
    """Provenance of a piece of content."""
    source = String()  # originating source name
    id = String()      # identifier within that source
    title = String()   # human-readable title

############################################################################
# PDF docs etc.

class Document(Record):
    """A raw (binary) document, e.g. a PDF."""
    source = Source()
    data = Bytes()

document_ingest_queue = topic('document-load')

############################################################################
# Text documents / text from PDF

class TextDocument(Record):
    """A text document, or text extracted from a binary document."""
    source = Source()
    text = Bytes()

text_ingest_queue = topic('text-document-load')

############################################################################
# Chunks of text

class Chunk(Record):
    """A chunk of text split out of a larger document."""
    source = Source()
    chunk = Bytes()

chunk_ingest_queue = topic('chunk-load')

############################################################################
# Chunk embeddings are an embeddings associated with a text chunk

class ChunkEmbeddings(Record):
    source = Source()
    vectors = Array(Array(Double()))
    chunk = Bytes()

chunk_embeddings_ingest_queue = topic('chunk-embeddings-load')

############################################################################
# Doc embeddings query

class DocumentEmbeddingsRequest(Record):
    """Request: documents near these vectors, up to limit."""
    vectors = Array(Array(Double()))
    limit = Integer()

class DocumentEmbeddingsResponse(Record):
    error = Error()             # set on failure
    documents = Array(Bytes())  # matching document content

# Request/response queues are non-persistent; replies are only of use
# to a currently-waiting caller
document_embeddings_request_queue = topic(
    'doc-embeddings', kind='non-persistent', namespace='request'
)
document_embeddings_response_queue = topic(
    'doc-embeddings-response', kind='non-persistent', namespace='response',
)

View file

@ -0,0 +1,69 @@
from pulsar.schema import Record, Bytes, String, Boolean, Integer, Array, Double
from . documents import Source
from . types import Error, Value
from . topic import topic
############################################################################
# Graph embeddings are embeddings associated with a graph entity

class GraphEmbeddings(Record):
    """Embeddings vectors associated with a single graph entity."""
    source = Source()
    vectors = Array(Array(Double()))
    entity = Value()

graph_embeddings_store_queue = topic('graph-embeddings-store')

############################################################################
# Graph embeddings query

class GraphEmbeddingsRequest(Record):
    """Request: entities near these vectors, up to limit."""
    vectors = Array(Array(Double()))
    limit = Integer()

class GraphEmbeddingsResponse(Record):
    error = Error()            # set on failure
    entities = Array(Value())  # matching entities

# Request/response queues are non-persistent; replies are only of use
# to a currently-waiting caller
graph_embeddings_request_queue = topic(
    'graph-embeddings', kind='non-persistent', namespace='request'
)
graph_embeddings_response_queue = topic(
    'graph-embeddings-response', kind='non-persistent', namespace='response',
)

############################################################################
# Graph triples

class Triple(Record):
    """A subject/predicate/object triple with provenance."""
    source = Source()
    s = Value()
    p = Value()
    o = Value()

triples_store_queue = topic('triples-store')

############################################################################
# Triples query

class TriplesQueryRequest(Record):
    """Request: triples matching the s/p/o pattern, up to limit."""
    s = Value()
    p = Value()
    o = Value()
    limit = Integer()

class TriplesQueryResponse(Record):
    error = Error()            # set on failure
    triples = Array(Triple())  # matching triples

triples_request_queue = topic(
    'triples', kind='non-persistent', namespace='request'
)
triples_response_queue = topic(
    'triples-response', kind='non-persistent', namespace='response',
)

View file

@ -0,0 +1,44 @@
from pulsar.schema import Record, String, Array, Double, Integer
from . topic import topic
from . types import Error
############################################################################
# LLM text completion

class TextCompletionRequest(Record):
    prompt = String()

class TextCompletionResponse(Record):
    error = Error()       # set on failure
    response = String()   # completion text
    in_token = Integer()  # input/output token counts (per field names)
    out_token = Integer()
    model = String()      # model identifier

text_completion_request_queue = topic(
    'text-completion', kind='non-persistent', namespace='request'
)
text_completion_response_queue = topic(
    'text-completion-response', kind='non-persistent', namespace='response',
)

############################################################################
# Embeddings

class EmbeddingsRequest(Record):
    text = String()

class EmbeddingsResponse(Record):
    error = Error()                   # set on failure
    vectors = Array(Array(Double()))  # embeddings for the text

embeddings_request_queue = topic(
    'embeddings', kind='non-persistent', namespace='request'
)
embeddings_response_queue = topic(
    'embeddings-response', kind='non-persistent', namespace='response'
)

View file

@ -0,0 +1,33 @@
from pulsar.schema import Record, Bytes, String, Boolean, Integer, Array
from pulsar.schema import Double, Map
from . documents import Source
from . types import Value, RowSchema
from . topic import topic
############################################################################
# Object embeddings are embeddings associated with the primary key of an
# object

class ObjectEmbeddings(Record):
    """Embeddings vectors keyed by an object's primary key."""
    source = Source()
    vectors = Array(Array(Double()))
    name = String()      # presumably the object/schema name — confirm
    key_name = String()  # name of the primary-key field
    id = String()        # primary-key value

object_embeddings_store_queue = topic('object-embeddings-store')

############################################################################
# Stores rows of information

class Rows(Record):
    """A batch of extracted rows plus the schema they conform to."""
    source = Source()
    row_schema = RowSchema()
    rows = Array(Map(String()))  # each row is a field-name -> value map

rows_store_queue = topic('rows-store')

View file

@ -0,0 +1,65 @@
from pulsar.schema import Record, Bytes, String, Boolean, Array, Map, Integer
from . topic import topic
from . types import Error, RowSchema
############################################################################
# Prompt services, abstract the prompt generation

class Definition(Record):
    """A term and its definition, extracted from text."""
    name = String()
    definition = String()

class Topic(Record):
    """A topic and its description, extracted from text."""
    name = String()
    definition = String()

class Relationship(Record):
    """An extracted subject/predicate/object relationship."""
    s = String()
    p = String()
    o = String()
    o_entity = Boolean()

class Fact(Record):
    """A subject/predicate/object triple used as prompt context."""
    s = String()
    p = String()
    o = String()

# Request kinds and their input -> output fields:
# extract-definitions:
#     chunk -> definitions
# extract-relationships:
#     chunk -> relationships
# kg-prompt:
#     query, triples -> answer
# document-prompt:
#     query, documents -> answer
# extract-rows
#     schema, chunk -> rows

class PromptRequest(Record):
    """Prompt-service request; 'kind' selects the operation and determines
    which of the remaining fields are read (see table above)."""
    kind = String()
    chunk = String()
    query = String()
    kg = Array(Fact())
    documents = Array(Bytes())
    row_schema = RowSchema()

class PromptResponse(Record):
    """Prompt-service response; the populated field depends on the
    request kind (see table above)."""
    error = Error()
    answer = String()
    definitions = Array(Definition())
    topics = Array(Topic())
    relationships = Array(Relationship())
    rows = Array(Map(String()))

prompt_request_queue = topic(
    'prompt', kind='non-persistent', namespace='request'
)
prompt_response_queue = topic(
    'prompt-response', kind='non-persistent', namespace='response'
)

############################################################################

View file

@ -0,0 +1,40 @@
from pulsar.schema import Record, Bytes, String, Boolean, Integer, Array, Double
from . topic import topic
from . types import Error, Value
############################################################################
# Graph RAG text retrieval

class GraphRagQuery(Record):
    query = String()

class GraphRagResponse(Record):
    error = Error()      # set on failure
    response = String()  # answer text

graph_rag_request_queue = topic(
    'graph-rag', kind='non-persistent', namespace='request'
)
graph_rag_response_queue = topic(
    'graph-rag-response', kind='non-persistent', namespace='response'
)

############################################################################
# Document RAG text retrieval

class DocumentRagQuery(Record):
    query = String()

class DocumentRagResponse(Record):
    error = Error()      # set on failure
    response = String()  # answer text

document_rag_request_queue = topic(
    'doc-rag', kind='non-persistent', namespace='request'
)
document_rag_response_queue = topic(
    'doc-rag-response', kind='non-persistent', namespace='response'
)

View file

@ -0,0 +1,4 @@
def topic(topic, kind='persistent', tenant='tg', namespace='flow'):
    """Build a fully-qualified Pulsar topic name.

    Produces "<kind>://<tenant>/<namespace>/<topic>".
    """
    prefix = f"{kind}://{tenant}"
    return "/".join((prefix, namespace, topic))

View file

@ -0,0 +1,25 @@
from pulsar.schema import Record, String, Boolean, Array, Integer
class Error(Record):
    """Error details carried in service responses."""
    type = String()     # machine-readable kind, e.g. "llm-error"
    message = String()  # human-readable detail

class Value(Record):
    """A graph term: a URI or a literal string."""
    value = String()
    is_uri = Boolean()
    type = String()

class Field(Record):
    """One field of a row schema."""
    name = String()
    # int, string, long, bool, float, double
    type = String()
    size = Integer()
    primary = Boolean()  # true if part of the primary key
    description = String()

class RowSchema(Record):
    """A named, described collection of fields."""
    name = String()
    description = String()
    fields = Array(Field())