Refactor names (#4)

- Downsize embeddings model to mini-lm in docker-compose files
- Rename for structure
- Default queues defined in schema file
- Standardize naming: graph embeddings, chunk embeddings, triples
This commit is contained in:
cybermaggedon 2024-07-23 21:34:03 +01:00 committed by GitHub
parent cbddf197ad
commit 3947920ee8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
71 changed files with 764 additions and 585 deletions

View file

@ -1,6 +1,6 @@
# VERSION=$(shell git describe | sed 's/^v//')
VERSION=0.4.2
VERSION=0.5.1
all: container

View file

@ -119,7 +119,7 @@ services:
restart: on-failure:100
pdf-decoder:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "pdf-decoder"
- "-p"
@ -127,7 +127,7 @@ services:
restart: on-failure:100
chunker:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "chunker-recursive"
- "-p"
@ -135,7 +135,7 @@ services:
restart: on-failure:100
vectorize:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "embeddings-vectorize"
- "-p"
@ -143,17 +143,17 @@ services:
restart: on-failure:100
embeddings:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "embeddings-hf"
- "-p"
- "pulsar://pulsar:6650"
- "-m"
- "mixedbread-ai/mxbai-embed-large-v1"
# - "-m"
# - "mixedbread-ai/mxbai-embed-large-v1"
restart: on-failure:100
kg-extract-definitions:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "kg-extract-definitions"
- "-p"
@ -161,37 +161,37 @@ services:
restart: on-failure:100
kg-extract-relationships:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "kg-extract-relationships"
- "-p"
- "pulsar://pulsar:6650"
restart: on-failure:100
vector-write:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
store-graph-embeddings:
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "vector-write-milvus"
- "ge-write-milvus"
- "-p"
- "pulsar://pulsar:6650"
- "-t"
- "http://milvus:19530"
restart: on-failure:100
graph-write:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
store-triples:
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "graph-write-cassandra"
- "triples-write-cassandra"
- "-p"
- "pulsar://pulsar:6650"
- "-g"
- "cassandra"
restart: on-failure:100
llm:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
text-completion:
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "llm-azure-text"
- "text-completion-azure"
- "-p"
- "pulsar://pulsar:6650"
- "-k"
@ -201,7 +201,7 @@ services:
restart: on-failure:100
graph-rag:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "graph-rag"
- "-p"

View file

@ -119,7 +119,7 @@ services:
restart: on-failure:100
pdf-decoder:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "pdf-decoder"
- "-p"
@ -127,7 +127,7 @@ services:
restart: on-failure:100
chunker:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "chunker-recursive"
- "-p"
@ -135,7 +135,7 @@ services:
restart: on-failure:100
vectorize:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "embeddings-vectorize"
- "-p"
@ -143,17 +143,17 @@ services:
restart: on-failure:100
embeddings:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "embeddings-hf"
- "-p"
- "pulsar://pulsar:6650"
- "-m"
- "mixedbread-ai/mxbai-embed-large-v1"
# - "-m"
# - "mixedbread-ai/mxbai-embed-large-v1"
restart: on-failure:100
kg-extract-definitions:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "kg-extract-definitions"
- "-p"
@ -161,37 +161,37 @@ services:
restart: on-failure:100
kg-extract-relationships:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "kg-extract-relationships"
- "-p"
- "pulsar://pulsar:6650"
restart: on-failure:100
vector-write:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
store-graph-embeddings:
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "vector-write-milvus"
- "ge-write-milvus"
- "-p"
- "pulsar://pulsar:6650"
- "-t"
- "http://milvus:19530"
restart: on-failure:100
graph-write:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
store-triples:
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "graph-write-cassandra"
- "triples-write-cassandra"
- "-p"
- "pulsar://pulsar:6650"
- "-g"
- "cassandra"
restart: on-failure:100
llm:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
text-completion:
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "llm-claude-text"
- "text-completion-claude"
- "-p"
- "pulsar://pulsar:6650"
- "-k"
@ -199,7 +199,7 @@ services:
restart: on-failure:100
graph-rag:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "graph-rag"
- "-p"

View file

@ -119,7 +119,7 @@ services:
restart: on-failure:100
pdf-decoder:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "pdf-decoder"
- "-p"
@ -127,7 +127,7 @@ services:
restart: on-failure:100
chunker:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "chunker-recursive"
- "-p"
@ -135,7 +135,7 @@ services:
restart: on-failure:100
vectorize:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "embeddings-vectorize"
- "-p"
@ -143,17 +143,17 @@ services:
restart: on-failure:100
embeddings:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "embeddings-hf"
- "-p"
- "pulsar://pulsar:6650"
- "-m"
- "mixedbread-ai/mxbai-embed-large-v1"
# - "-m"
# - "mixedbread-ai/mxbai-embed-large-v1"
restart: on-failure:100
kg-extract-definitions:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "kg-extract-definitions"
- "-p"
@ -161,37 +161,37 @@ services:
restart: on-failure:100
kg-extract-relationships:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "kg-extract-relationships"
- "-p"
- "pulsar://pulsar:6650"
restart: on-failure:100
vector-write:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
store-graph-embeddings:
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "vector-write-milvus"
- "ge-write-milvus"
- "-p"
- "pulsar://pulsar:6650"
- "-t"
- "http://milvus:19530"
restart: on-failure:100
graph-write:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
store-triples:
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "graph-write-cassandra"
- "triples-write-cassandra"
- "-p"
- "pulsar://pulsar:6650"
- "-g"
- "cassandra"
restart: on-failure:100
llm:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
text-completion:
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "llm-ollama-text"
- "text-completion-ollama"
- "-p"
- "pulsar://pulsar:6650"
- "-r"
@ -199,7 +199,7 @@ services:
restart: on-failure:100
graph-rag:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "graph-rag"
- "-p"

View file

@ -119,7 +119,7 @@ services:
restart: on-failure:100
pdf-decoder:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "pdf-decoder"
- "-p"
@ -127,7 +127,7 @@ services:
restart: on-failure:100
chunker:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "chunker-recursive"
- "-p"
@ -135,7 +135,7 @@ services:
restart: on-failure:100
vectorize:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "embeddings-vectorize"
- "-p"
@ -143,17 +143,17 @@ services:
restart: on-failure:100
embeddings:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "embeddings-hf"
- "-p"
- "pulsar://pulsar:6650"
- "-m"
- "mixedbread-ai/mxbai-embed-large-v1"
# - "-m"
# - "mixedbread-ai/mxbai-embed-large-v1"
restart: on-failure:100
kg-extract-definitions:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "kg-extract-definitions"
- "-p"
@ -161,37 +161,37 @@ services:
restart: on-failure:100
kg-extract-relationships:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "kg-extract-relationships"
- "-p"
- "pulsar://pulsar:6650"
restart: on-failure:100
vector-write:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
store-graph-embeddings:
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "vector-write-milvus"
- "ge-write-milvus"
- "-p"
- "pulsar://pulsar:6650"
- "-t"
- "http://milvus:19530"
restart: on-failure:100
graph-write:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
store-triples:
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "graph-write-cassandra"
- "triples-write-cassandra"
- "-p"
- "pulsar://pulsar:6650"
- "-g"
- "cassandra"
restart: on-failure:100
llm:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
text-completion:
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "llm-vertexai-text"
- "text-completion-ollama"
- "-p"
- "pulsar://pulsar:6650"
- "-k"
@ -203,7 +203,7 @@ services:
restart: on-failure:100
graph-rag:
image: docker.io/trustgraph/trustgraph-flow:0.4.2
image: docker.io/trustgraph/trustgraph-flow:0.5.1
command:
- "graph-rag"
- "-p"

View file

@ -1,6 +1,6 @@
#!/usr/bin/env python3
from trustgraph.chunker.recursive import run
from trustgraph.chunking.recursive import run
run()

6
scripts/ge-write-milvus Executable file
View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.storage.graph_embeddings.milvus import run
run()

View file

@ -1,6 +1,6 @@
#!/usr/bin/env python3
from trustgraph.rag.graph import run
from trustgraph.retrieval.graph_rag import run
run()

View file

@ -1,6 +0,0 @@
#!/usr/bin/env python3
from trustgraph.graph.cassandra_write import run
run()

View file

@ -1,6 +0,0 @@
#!/usr/bin/env python3
from trustgraph.llm.azure_text import run
run()

View file

@ -1,6 +0,0 @@
#!/usr/bin/env python3
from trustgraph.llm.claude_text import run
run()

View file

@ -1,6 +0,0 @@
#!/usr/bin/env python3
from trustgraph.llm.ollama_text import run
run()

View file

@ -1,6 +0,0 @@
#!/usr/bin/env python3
from trustgraph.llm.vertexai_text import run
run()

View file

@ -1,6 +1,6 @@
#!/usr/bin/env python3
from trustgraph.decoder.pdf import run
from trustgraph.decoding.pdf import run
run()

6
scripts/text-completion-azure Executable file
View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.model.text_completion.azure import run
run()

6
scripts/text-completion-claude Executable file
View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.model.text_completion.claude import run
run()

6
scripts/text-completion-ollama Executable file
View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.model.text_completion.ollama import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.model.text_completion.vertexai import run
run()

View file

@ -0,0 +1,6 @@
#!/usr/bin/env python3
from trustgraph.storage.triples.cassandra import run
run()

View file

@ -1,6 +0,0 @@
#!/usr/bin/env python3
from trustgraph.vector.milvus_write import run
run()

View file

@ -4,7 +4,7 @@ import os
with open("README.md", "r") as fh:
long_description = fh.read()
version = "0.4.2"
version = "0.5.1"
setuptools.setup(
name="trustgraph",
@ -50,21 +50,21 @@ setuptools.setup(
"scripts/embeddings-hf",
"scripts/embeddings-ollama",
"scripts/embeddings-vectorize",
"scripts/ge-write-milvus",
"scripts/graph-rag",
"scripts/graph-show",
"scripts/graph-to-turtle",
"scripts/graph-write-cassandra",
"scripts/init-pulsar-manager",
"scripts/kg-extract-definitions",
"scripts/kg-extract-relationships",
"scripts/llm-azure-text",
"scripts/llm-claude-text",
"scripts/llm-ollama-text",
"scripts/llm-vertexai-text",
"scripts/loader",
"scripts/pdf-decoder",
"scripts/query",
"scripts/run-processing",
"scripts/vector-write-milvus",
"scripts/text-completion-azure",
"scripts/text-completion-claude",
"scripts/text-completion-ollama",
"scripts/text-completion-vertexai",
"scripts/triples-write-cassandra",
]
)

View file

@ -1,3 +1,6 @@
from . processor import *
from . base_processor import BaseProcessor
from . consumer import Consumer
from . producer import Producer
from . consumer_producer import ConsumerProducer

View file

@ -0,0 +1,117 @@
import os
import argparse
import pulsar
import _pulsar
import time
from prometheus_client import start_http_server, Info
from .. log_level import LogLevel
class BaseProcessor:
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
def __init__(self, **params):
self.client = None
if not hasattr(__class__, "params_metric"):
__class__.params_metric = Info(
'params', 'Parameters configuration'
)
# FIXME: Maybe outputs information it should not
__class__.params_metric.info({
k: str(params[k])
for k in params
})
pulsar_host = params.get("pulsar_host", self.default_pulsar_host)
log_level = params.get("log_level", LogLevel.INFO)
self.pulsar_host = pulsar_host
self.client = pulsar.Client(
pulsar_host,
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
)
def __del__(self):
if self.client:
self.client.close()
@staticmethod
def add_args(parser):
parser.add_argument(
'-p', '--pulsar-host',
default=__class__.default_pulsar_host,
help=f'Pulsar host (default: {__class__.default_pulsar_host})',
)
parser.add_argument(
'-l', '--log-level',
type=LogLevel,
default=LogLevel.INFO,
choices=list(LogLevel),
help=f'Output queue (default: info)'
)
parser.add_argument(
'-M', '--metrics-enabled',
type=bool,
default=True,
help=f'Pulsar host (default: true)',
)
parser.add_argument(
'-P', '--metrics-port',
type=int,
default=8000,
help=f'Pulsar host (default: 8000)',
)
def run(self):
raise RuntimeError("Something should have implemented the run method")
@classmethod
def start(cls, prog, doc):
while True:
parser = argparse.ArgumentParser(
prog=prog,
description=doc
)
cls.add_args(parser)
args = parser.parse_args()
args = vars(args)
if args["metrics_enabled"]:
start_http_server(args["metrics_port"])
try:
p = cls(**args)
p.run()
except KeyboardInterrupt:
print("Keyboard interrupt.")
return
except _pulsar.Interrupted:
print("Pulsar Interrupted.")
return
except Exception as e:
print(type(e))
print("Exception:", e, flush=True)
print("Will retry...", flush=True)
time.sleep(10)

View file

@ -0,0 +1,87 @@
from pulsar.schema import JsonSchema
from prometheus_client import start_http_server, Histogram, Info, Counter
from . base_processor import BaseProcessor
class Consumer(BaseProcessor):
def __init__(self, **params):
super(Consumer, self).__init__(**params)
input_queue = params.get("input_queue")
subscriber = params.get("subscriber")
input_schema = params.get("input_schema")
if input_schema == None:
raise RuntimeError("input_schema must be specified")
if not hasattr(__class__, "request_metric"):
__class__.request_metric = Histogram(
'request_latency', 'Request latency (seconds)'
)
if not hasattr(__class__, "pubsub_metric"):
__class__.pubsub_metric = Info(
'pubsub', 'Pub/sub configuration'
)
if not hasattr(__class__, "processing_metric"):
__class__.processing_metric = Counter(
'processing_count', 'Processing count', ["status"]
)
__class__.pubsub_metric.info({
"input_queue": input_queue,
"subscriber": subscriber,
"input_schema": input_schema.__name__,
})
self.consumer = self.client.subscribe(
input_queue, subscriber,
schema=JsonSchema(input_schema),
)
def run(self):
while True:
msg = self.consumer.receive()
try:
with __class__.request_metric.time():
self.handle(msg)
# Acknowledge successful processing of the message
self.consumer.acknowledge(msg)
__class__.processing_metric.labels(status="success").inc()
except Exception as e:
print("Exception:", e, flush=True)
# Message failed to be processed
self.consumer.negative_acknowledge(msg)
__class__.processing_metric.labels(status="error").inc()
@staticmethod
def add_args(parser, default_input_queue, default_subscriber):
BaseProcessor.add_args(parser)
parser.add_argument(
'-i', '--input-queue',
default=default_input_queue,
help=f'Input queue (default: {default_input_queue})'
)
parser.add_argument(
'-s', '--subscriber',
default=default_subscriber,
help=f'Queue subscriber name (default: {default_subscriber})'
)

View file

@ -0,0 +1,168 @@
from pulsar.schema import JsonSchema
from prometheus_client import Histogram, Info, Counter
from . base_processor import BaseProcessor
# FIXME: Derive from consumer? And producer?
class ConsumerProducer(BaseProcessor):
def __init__(self, **params):
input_queue = params.get("input_queue")
output_queue = params.get("output_queue")
subscriber = params.get("subscriber")
input_schema = params.get("input_schema")
output_schema = params.get("output_schema")
if not hasattr(__class__, "request_metric"):
__class__.request_metric = Histogram(
'request_latency', 'Request latency (seconds)'
)
if not hasattr(__class__, "output_metric"):
__class__.output_metric = Counter(
'output_count', 'Output items created'
)
if not hasattr(__class__, "pubsub_metric"):
__class__.pubsub_metric = Info(
'pubsub', 'Pub/sub configuration'
)
if not hasattr(__class__, "processing_metric"):
__class__.processing_metric = Counter(
'processing_count', 'Processing count', ["status"]
)
__class__.pubsub_metric.info({
"input_queue": input_queue,
"output_queue": output_queue,
"subscriber": subscriber,
"input_schema": input_schema.__name__,
"output_schema": output_schema.__name__,
})
super(ConsumerProducer, self).__init__(**params)
if input_schema == None:
raise RuntimeError("input_schema must be specified")
if output_schema == None:
raise RuntimeError("output_schema must be specified")
self.consumer = self.client.subscribe(
input_queue, subscriber,
schema=JsonSchema(input_schema),
)
self.producer = self.client.create_producer(
topic=output_queue,
schema=JsonSchema(output_schema),
)
def run(self):
while True:
msg = self.consumer.receive()
try:
with __class__.request_metric.time():
resp = self.handle(msg)
# Acknowledge successful processing of the message
self.consumer.acknowledge(msg)
__class__.processing_metric.labels(status="success").inc()
except Exception as e:
print("Exception:", e, flush=True)
# Message failed to be processed
self.consumer.negative_acknowledge(msg)
__class__.processing_metric.labels(status="error").inc()
def send(self, msg, properties={}):
self.producer.send(msg, properties)
__class__.output_metric.inc()
@staticmethod
def add_args(
parser, default_input_queue, default_subscriber,
default_output_queue,
):
BaseProcessor.add_args(parser)
parser.add_argument(
'-i', '--input-queue',
default=default_input_queue,
help=f'Input queue (default: {default_input_queue})'
)
parser.add_argument(
'-s', '--subscriber',
default=default_subscriber,
help=f'Queue subscriber name (default: {default_subscriber})'
)
parser.add_argument(
'-o', '--output-queue',
default=default_output_queue,
help=f'Output queue (default: {default_output_queue})'
)
class Producer(BaseProcessor):
def __init__(self, **params):
output_queue = params.get("output_queue")
output_schema = params.get("output_schema")
if not hasattr(__class__, "output_metric"):
__class__.output_metric = Counter(
'output_count', 'Output items created'
)
if not hasattr(__class__, "pubsub_metric"):
__class__.pubsub_metric = Info(
'pubsub', 'Pub/sub configuration'
)
__class__.pubsub_metric.info({
"output_queue": output_queue,
"output_schema": output_schema.__name__,
})
super(Producer, self).__init__(**params)
if output_schema == None:
raise RuntimeError("output_schema must be specified")
self.producer = self.client.create_producer(
topic=output_queue,
schema=JsonSchema(output_schema),
)
def send(self, msg, properties={}):
self.producer.send(msg, properties)
__class__.output_metric.inc()
@staticmethod
def add_args(
parser, default_input_queue, default_subscriber,
default_output_queue,
):
BaseProcessor.add_args(parser)
parser.add_argument(
'-o', '--output-queue',
default=default_output_queue,
help=f'Output queue (default: {default_output_queue})'
)

View file

@ -1,360 +0,0 @@
import os
import argparse
import pulsar
import _pulsar
import time
from pulsar.schema import JsonSchema
from prometheus_client import start_http_server, Histogram, Info, Counter
from .. log_level import LogLevel
class BaseProcessor:
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
def __init__(self, **params):
self.client = None
if not hasattr(__class__, "params_metric"):
__class__.params_metric = Info(
'params', 'Parameters configuration'
)
# FIXME: Maybe outputs information it should not
__class__.params_metric.info({
k: str(params[k])
for k in params
})
pulsar_host = params.get("pulsar_host", self.default_pulsar_host)
log_level = params.get("log_level", LogLevel.INFO)
self.pulsar_host = pulsar_host
self.client = pulsar.Client(
pulsar_host,
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
)
def __del__(self):
if self.client:
self.client.close()
@staticmethod
def add_args(parser):
parser.add_argument(
'-p', '--pulsar-host',
default=__class__.default_pulsar_host,
help=f'Pulsar host (default: {__class__.default_pulsar_host})',
)
parser.add_argument(
'-l', '--log-level',
type=LogLevel,
default=LogLevel.INFO,
choices=list(LogLevel),
help=f'Output queue (default: info)'
)
parser.add_argument(
'-M', '--metrics-enabled',
type=bool,
default=True,
help=f'Pulsar host (default: true)',
)
parser.add_argument(
'-P', '--metrics-port',
type=int,
default=8000,
help=f'Pulsar host (default: 8000)',
)
def run(self):
raise RuntimeError("Something should have implemented the run method")
@classmethod
def start(cls, prog, doc):
while True:
parser = argparse.ArgumentParser(
prog=prog,
description=doc
)
cls.add_args(parser)
args = parser.parse_args()
args = vars(args)
if args["metrics_enabled"]:
start_http_server(args["metrics_port"])
try:
p = cls(**args)
p.run()
except KeyboardInterrupt:
print("Keyboard interrupt.")
return
except _pulsar.Interrupted:
print("Pulsar Interrupted.")
return
except Exception as e:
print(type(e))
print("Exception:", e, flush=True)
print("Will retry...", flush=True)
time.sleep(10)
class Consumer(BaseProcessor):
def __init__(self, **params):
super(Consumer, self).__init__(**params)
input_queue = params.get("input_queue")
subscriber = params.get("subscriber")
input_schema = params.get("input_schema")
if input_schema == None:
raise RuntimeError("input_schema must be specified")
if not hasattr(__class__, "request_metric"):
__class__.request_metric = Histogram(
'request_latency', 'Request latency (seconds)'
)
if not hasattr(__class__, "pubsub_metric"):
__class__.pubsub_metric = Info(
'pubsub', 'Pub/sub configuration'
)
if not hasattr(__class__, "processing_metric"):
__class__.processing_metric = Counter(
'processing_count', 'Processing count', ["status"]
)
__class__.pubsub_metric.info({
"input_queue": input_queue,
"subscriber": subscriber,
"input_schema": input_schema.__name__,
})
self.consumer = self.client.subscribe(
input_queue, subscriber,
schema=JsonSchema(input_schema),
)
def run(self):
while True:
msg = self.consumer.receive()
try:
with __class__.request_metric.time():
self.handle(msg)
# Acknowledge successful processing of the message
self.consumer.acknowledge(msg)
__class__.processing_metric.labels(status="success").inc()
except Exception as e:
print("Exception:", e, flush=True)
# Message failed to be processed
self.consumer.negative_acknowledge(msg)
__class__.processing_metric.labels(status="error").inc()
@staticmethod
def add_args(parser, default_input_queue, default_subscriber):
BaseProcessor.add_args(parser)
parser.add_argument(
'-i', '--input-queue',
default=default_input_queue,
help=f'Input queue (default: {default_input_queue})'
)
parser.add_argument(
'-s', '--subscriber',
default=default_subscriber,
help=f'Queue subscriber name (default: {default_subscriber})'
)
class ConsumerProducer(BaseProcessor):
def __init__(self, **params):
input_queue = params.get("input_queue")
output_queue = params.get("output_queue")
subscriber = params.get("subscriber")
input_schema = params.get("input_schema")
output_schema = params.get("output_schema")
if not hasattr(__class__, "request_metric"):
__class__.request_metric = Histogram(
'request_latency', 'Request latency (seconds)'
)
if not hasattr(__class__, "output_metric"):
__class__.output_metric = Counter(
'output_count', 'Output items created'
)
if not hasattr(__class__, "pubsub_metric"):
__class__.pubsub_metric = Info(
'pubsub', 'Pub/sub configuration'
)
if not hasattr(__class__, "processing_metric"):
__class__.processing_metric = Counter(
'processing_count', 'Processing count', ["status"]
)
__class__.pubsub_metric.info({
"input_queue": input_queue,
"output_queue": output_queue,
"subscriber": subscriber,
"input_schema": input_schema.__name__,
"output_schema": output_schema.__name__,
})
super(ConsumerProducer, self).__init__(**params)
if input_schema == None:
raise RuntimeError("input_schema must be specified")
if output_schema == None:
raise RuntimeError("output_schema must be specified")
self.consumer = self.client.subscribe(
input_queue, subscriber,
schema=JsonSchema(input_schema),
)
self.producer = self.client.create_producer(
topic=output_queue,
schema=JsonSchema(output_schema),
)
def run(self):
while True:
msg = self.consumer.receive()
try:
with __class__.request_metric.time():
resp = self.handle(msg)
# Acknowledge successful processing of the message
self.consumer.acknowledge(msg)
__class__.processing_metric.labels(status="success").inc()
except Exception as e:
print("Exception:", e, flush=True)
# Message failed to be processed
self.consumer.negative_acknowledge(msg)
__class__.processing_metric.labels(status="error").inc()
def send(self, msg, properties={}):
self.producer.send(msg, properties)
__class__.output_metric.inc()
@staticmethod
def add_args(
parser, default_input_queue, default_subscriber,
default_output_queue,
):
BaseProcessor.add_args(parser)
parser.add_argument(
'-i', '--input-queue',
default=default_input_queue,
help=f'Input queue (default: {default_input_queue})'
)
parser.add_argument(
'-s', '--subscriber',
default=default_subscriber,
help=f'Queue subscriber name (default: {default_subscriber})'
)
parser.add_argument(
'-o', '--output-queue',
default=default_output_queue,
help=f'Output queue (default: {default_output_queue})'
)
class Producer(BaseProcessor):
def __init__(self, **params):
output_queue = params.get("output_queue")
output_schema = params.get("output_schema")
if not hasattr(__class__, "output_metric"):
__class__.output_metric = Counter(
'output_count', 'Output items created'
)
if not hasattr(__class__, "pubsub_metric"):
__class__.pubsub_metric = Info(
'pubsub', 'Pub/sub configuration'
)
__class__.pubsub_metric.info({
"output_queue": output_queue,
"output_schema": output_schema.__name__,
})
super(Producer, self).__init__(**params)
if output_schema == None:
raise RuntimeError("output_schema must be specified")
self.producer = self.client.create_producer(
topic=output_queue,
schema=JsonSchema(output_schema),
)
def send(self, msg, properties={}):
self.producer.send(msg, properties)
__class__.output_metric.inc()
@staticmethod
def add_args(
parser, default_input_queue, default_subscriber,
default_output_queue,
):
BaseProcessor.add_args(parser)
parser.add_argument(
'-o', '--output-queue',
default=default_output_queue,
help=f'Output queue (default: {default_output_queue})'
)

View file

@ -0,0 +1,55 @@
from pulsar.schema import JsonSchema
from prometheus_client import Info, Counter
from . base_processor import BaseProcessor
class Producer(BaseProcessor):
def __init__(self, **params):
output_queue = params.get("output_queue")
output_schema = params.get("output_schema")
if not hasattr(__class__, "output_metric"):
__class__.output_metric = Counter(
'output_count', 'Output items created'
)
if not hasattr(__class__, "pubsub_metric"):
__class__.pubsub_metric = Info(
'pubsub', 'Pub/sub configuration'
)
__class__.pubsub_metric.info({
"output_queue": output_queue,
"output_schema": output_schema.__name__,
})
super(Producer, self).__init__(**params)
if output_schema == None:
raise RuntimeError("output_schema must be specified")
self.producer = self.client.create_producer(
topic=output_queue,
schema=JsonSchema(output_schema),
)
def send(self, msg, properties={}):
self.producer.send(msg, properties)
__class__.output_metric.inc()
@staticmethod
def add_args(
parser, default_input_queue, default_subscriber,
default_output_queue,
):
BaseProcessor.add_args(parser)
parser.add_argument(
'-o', '--output-queue',
default=default_output_queue,
help=f'Output queue (default: {default_output_queue})'
)

View file

@ -8,12 +8,15 @@ from langchain_text_splitters import RecursiveCharacterTextSplitter
from ... schema import TextDocument, Chunk, Source
from ... schema import text_ingest_queue, chunk_ingest_queue
from ... log_level import LogLevel
from ... base import ConsumerProducer
default_input_queue = 'text-doc-load'
default_output_queue = 'chunk-load'
default_subscriber = 'chunker-recursive'
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = text_ingest_queue
default_output_queue = chunk_ingest_queue
default_subscriber = module
class Processor(ConsumerProducer):
@ -92,5 +95,5 @@ class Processor(ConsumerProducer):
def run():
Processor.start('chunker', __doc__)
Processor.start(module, __doc__)

View file

@ -9,12 +9,15 @@ import base64
from langchain_community.document_loaders import PyPDFLoader
from ... schema import Document, TextDocument, Source
from ... schema import document_ingest_queue, text_ingest_queue
from ... log_level import LogLevel
from ... base import ConsumerProducer
default_input_queue = 'document-load'
default_output_queue = 'text-doc-load'
default_subscriber = 'pdf-decoder'
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = document_ingest_queue
default_output_queue = text_ingest_queue
default_subscriber = module
class Processor(ConsumerProducer):
@ -80,5 +83,5 @@ class Processor(ConsumerProducer):
def run():
Processor.start("pdf-decoder", __doc__)
Processor.start(module, __doc__)

View file

@ -7,12 +7,15 @@ Input is text, output is embeddings vector.
from langchain_huggingface import HuggingFaceEmbeddings
from ... schema import EmbeddingsRequest, EmbeddingsResponse
from ... schema import embeddings_request_queue, embeddings_response_queue
from ... log_level import LogLevel
from ... base import ConsumerProducer
default_input_queue = 'embeddings'
default_output_queue = 'embeddings-response'
default_subscriber = 'embeddings-hf'
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = embeddings_request_queue
default_output_queue = embeddings_response_queue
default_subscriber = module
default_model="all-MiniLM-L6-v2"
class Processor(ConsumerProducer):
@ -70,5 +73,5 @@ class Processor(ConsumerProducer):
def run():
Processor.start("embeddings-hf", __doc__)
Processor.start(module, __doc__)

View file

@ -6,12 +6,15 @@ Input is text, output is embeddings vector.
from langchain_community.embeddings import OllamaEmbeddings
from ... schema import EmbeddingsRequest, EmbeddingsResponse
from ... schema import embeddings_request_queue, embeddings_response_queue
from ... log_level import LogLevel
from ... base import ConsumerProducer
default_input_queue = 'embeddings'
default_output_queue = 'embeddings-response'
default_subscriber = 'embeddings-ollama'
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = embeddings_request_queue
default_output_queue = embeddings_response_queue
default_subscriber = module
default_model="mxbai-embed-large"
default_ollama = 'http://localhost:11434'
@ -77,5 +80,5 @@ class Processor(ConsumerProducer):
def run():
Processor.start('embeddings-ollama', __doc__)
Processor.start(module, __doc__)

View file

@ -4,14 +4,17 @@ Vectorizer, calls the embeddings service to get embeddings for a chunk.
Input is text chunk, output is chunk and vectors.
"""
from ... schema import Chunk, VectorsChunk
from ... schema import Chunk, ChunkEmbeddings
from ... schema import chunk_ingest_queue, chunk_embeddings_ingest_queue
from ... embeddings_client import EmbeddingsClient
from ... log_level import LogLevel
from ... base import ConsumerProducer
default_input_queue = 'chunk-load'
default_output_queue = 'vectors-chunk-load'
default_subscriber = 'embeddings-vectorizer'
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = chunk_ingest_queue
default_output_queue = chunk_embeddings_ingest_queue
default_subscriber = module
class Processor(ConsumerProducer):
@ -27,7 +30,7 @@ class Processor(ConsumerProducer):
"output_queue": output_queue,
"subscriber": subscriber,
"input_schema": Chunk,
"output_schema": VectorsChunk,
"output_schema": ChunkEmbeddings,
}
)
@ -35,7 +38,7 @@ class Processor(ConsumerProducer):
def emit(self, source, chunk, vectors):
r = VectorsChunk(source=source, chunk=chunk, vectors=vectors)
r = ChunkEmbeddings(source=source, chunk=chunk, vectors=vectors)
self.producer.send(r)
def handle(self, msg):
@ -70,5 +73,5 @@ class Processor(ConsumerProducer):
def run():
Processor.start("embeddings-vectorize", __doc__)
Processor.start(module, __doc__)

View file

@ -3,7 +3,8 @@
import pulsar
import _pulsar
from pulsar.schema import JsonSchema
from trustgraph.schema import EmbeddingsRequest, EmbeddingsResponse
from . schema import EmbeddingsRequest, EmbeddingsResponse
from . schema import embeddings_request_queue, embeddings_response_queue
import hashlib
import uuid
@ -31,13 +32,13 @@ class EmbeddingsClient:
)
self.producer = self.client.create_producer(
topic='embeddings',
topic=embeddings_request_queue,
schema=JsonSchema(EmbeddingsRequest),
chunking_enabled=True,
)
self.consumer = self.client.subscribe(
'embeddings-response', client_id,
embeddings_response_queue, client_id,
schema=JsonSchema(EmbeddingsResponse),
)

View file

@ -3,7 +3,9 @@
import pulsar
import _pulsar
from pulsar.schema import JsonSchema
from trustgraph.schema import GraphRagQuery, GraphRagResponse
from . schema import GraphRagQuery, GraphRagResponse
from . schema import graph_rag_request_queue, graph_rag_response_queue
import hashlib
import uuid
@ -29,13 +31,13 @@ class GraphRagClient:
)
self.producer = self.client.create_producer(
topic='graph-rag-query',
topic=graph_rag_request_queue,
schema=JsonSchema(GraphRagQuery),
chunking_enabled=True,
)
self.consumer = self.client.subscribe(
'graph-rag-response', client_id,
graph_rag_response_queue, client_id,
schema=JsonSchema(GraphRagResponse),
)

View file

@ -1,13 +1,14 @@
"""
Simple decoder, accepts vector+text chunks input, applies entity analysis to
Simple decoder, accepts embeddings+text chunks input, applies entity analysis to
get entity definitions which are output as graph edges.
"""
import urllib.parse
import json
from ... schema import VectorsChunk, Triple, Source, Value
from ... schema import ChunkEmbeddings, Triple, Source, Value
from ... schema import chunk_embeddings_ingest_queue, triples_store_queue
from ... log_level import LogLevel
from ... llm_client import LlmClient
from ... prompts import to_definitions
@ -16,9 +17,11 @@ from ... base import ConsumerProducer
DEFINITION_VALUE = Value(value=DEFINITION, is_uri=True)
default_input_queue = 'vectors-chunk-load'
default_output_queue = 'graph-load'
default_subscriber = 'kg-extract-definitions'
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = chunk_embeddings_ingest_queue
default_output_queue = triples_store_queue
default_subscriber = module
class Processor(ConsumerProducer):
@ -33,7 +36,7 @@ class Processor(ConsumerProducer):
"input_queue": input_queue,
"output_queue": output_queue,
"subscriber": subscriber,
"input_schema": VectorsChunk,
"input_schema": ChunkEmbeddings,
"output_schema": Triple,
}
)
@ -101,5 +104,5 @@ class Processor(ConsumerProducer):
def run():
Processor.start("kg-extract-definitions", __doc__)
Processor.start(module, __doc__)

View file

@ -10,7 +10,8 @@ import json
import os
from pulsar.schema import JsonSchema
from ... schema import VectorsChunk, Triple, VectorsAssociation, Source, Value
from ... schema import ChunkEmbeddings, Triple, GraphEmbeddings, Source, Value
from ... schema import chunk_embeddings_ingest_queue, triples_store_queue, graph_embeddings_store_queue
from ... log_level import LogLevel
from ... llm_client import LlmClient
from ... prompts import to_relationships
@ -19,10 +20,12 @@ from ... base import ConsumerProducer
RDF_LABEL_VALUE = Value(value=RDF_LABEL, is_uri=True)
default_input_queue = 'vectors-chunk-load'
default_output_queue = 'graph-load'
default_subscriber = 'kg-extract-relationships'
default_vector_queue='vectors-load'
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = chunk_embeddings_ingest_queue
default_output_queue = triples_store_queue
default_vector_queue = graph_embeddings_store_queue
default_subscriber = module
class Processor(ConsumerProducer):
@ -38,14 +41,14 @@ class Processor(ConsumerProducer):
"input_queue": input_queue,
"output_queue": output_queue,
"subscriber": subscriber,
"input_schema": VectorsChunk,
"input_schema": ChunkEmbeddings,
"output_schema": Triple,
}
)
self.vec_prod = self.client.create_producer(
topic=vector_queue,
schema=JsonSchema(VectorsAssociation),
schema=JsonSchema(GraphEmbeddings),
)
__class__.pubsub_metric.info({
@ -53,9 +56,9 @@ class Processor(ConsumerProducer):
"output_queue": output_queue,
"vector_queue": vector_queue,
"subscriber": subscriber,
"input_schema": VectorsChunk.__name__,
"input_schema": ChunkEmbeddings.__name__,
"output_schema": Triple.__name__,
"vector_schema": VectorsAssociation.__name__,
"vector_schema": GraphEmbeddings.__name__,
})
self.llm = LlmClient(pulsar_host=self.pulsar_host)
@ -84,7 +87,7 @@ class Processor(ConsumerProducer):
def emit_vec(self, ent, vec):
r = VectorsAssociation(entity=ent, vectors=vec)
r = GraphEmbeddings(entity=ent, vectors=vec)
self.vec_prod.send(r)
def handle(self, msg):
@ -171,5 +174,5 @@ class Processor(ConsumerProducer):
def run():
Processor.start("kg-extract-relationships", __doc__)
Processor.start(module, __doc__)

View file

@ -3,10 +3,13 @@
import pulsar
import _pulsar
from pulsar.schema import JsonSchema
from trustgraph.schema import TextCompletionRequest, TextCompletionResponse
import hashlib
import uuid
from . schema import TextCompletionRequest, TextCompletionResponse
from . schema import text_completion_request_queue
from . schema import text_completion_response_queue
# Ugly
ERROR=_pulsar.LoggerLevel.Error
WARN=_pulsar.LoggerLevel.Warn
@ -29,13 +32,13 @@ class LlmClient:
)
self.producer = self.client.create_producer(
topic='llm-complete-text',
topic=text_completion_request_queue,
schema=JsonSchema(TextCompletionRequest),
chunking_enabled=True,
)
self.consumer = self.client.subscribe(
'llm-complete-text-response', client_id,
text_completion_response_queue, client_id,
schema=JsonSchema(TextCompletionResponse),
)

View file

@ -7,13 +7,17 @@ serverless endpoint service. Input is prompt, output is response.
import requests
import json
from ... schema import TextCompletionRequest, TextCompletionResponse
from ... log_level import LogLevel
from ... base import ConsumerProducer
from .... schema import TextCompletionRequest, TextCompletionResponse
from .... schema import text_completion_request_queue
from .... schema import text_completion_response_queue
from .... log_level import LogLevel
from .... base import ConsumerProducer
default_input_queue = 'llm-complete-text'
default_output_queue = 'llm-complete-text-response'
default_subscriber = 'llm-azure-text'
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = text_completion_request_queue
default_output_queue = text_completion_response_queue
default_subscriber = module
class Processor(ConsumerProducer):
@ -121,4 +125,4 @@ class Processor(ConsumerProducer):
def run():
Processor.start("llm-azure-text", __doc__)
Processor.start(module, __doc__)

View file

@ -6,13 +6,17 @@ Input is prompt, output is response.
import anthropic
from ... schema import TextCompletionRequest, TextCompletionResponse
from ... log_level import LogLevel
from ... base import ConsumerProducer
from .... schema import TextCompletionRequest, TextCompletionResponse
from .... schema import text_completion_request_queue
from .... schema import text_completion_response_queue
from .... log_level import LogLevel
from .... base import ConsumerProducer
default_input_queue = 'llm-complete-text'
default_output_queue = 'llm-complete-text-response'
default_subscriber = 'llm-claude-text'
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = text_completion_request_queue
default_output_queue = text_completion_response_queue
default_subscriber = module
default_model = 'claude-3-5-sonnet-20240620'
class Processor(ConsumerProducer):
@ -101,6 +105,6 @@ class Processor(ConsumerProducer):
def run():
Processor.start("llm-claude-text", __doc__)
Processor.start(module, __doc__)

View file

@ -7,13 +7,17 @@ Input is prompt, output is response.
from langchain_community.llms import Ollama
from prometheus_client import Histogram, Info, Counter
from ... schema import TextCompletionRequest, TextCompletionResponse
from ... log_level import LogLevel
from ... base import ConsumerProducer
from .... schema import TextCompletionRequest, TextCompletionResponse
from .... schema import text_completion_request_queue
from .... schema import text_completion_response_queue
from .... log_level import LogLevel
from .... base import ConsumerProducer
default_input_queue = 'llm-complete-text'
default_output_queue = 'llm-complete-text-response'
default_subscriber = 'llm-ollama-text'
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = text_completion_request_queue
default_output_queue = text_completion_response_queue
default_subscriber = module
default_model = 'gemma2'
default_ollama = 'http://localhost:11434'
@ -93,6 +97,6 @@ class Processor(ConsumerProducer):
def run():
Processor.start("llm-ollama-text", __doc__)
Processor.start(module, __doc__)

View file

@ -21,13 +21,17 @@ from vertexai.preview.generative_models import (
Tool,
)
from ... schema import TextCompletionRequest, TextCompletionResponse
from ... log_level import LogLevel
from ... base import ConsumerProducer
from .... schema import TextCompletionRequest, TextCompletionResponse
from .... schema import text_completion_request_queue
from .... schema import text_completion_response_queue
from .... log_level import LogLevel
from .... base import ConsumerProducer
default_input_queue = 'llm-complete-text'
default_output_queue = 'llm-complete-text-response'
default_subscriber = 'llm-vertexai-text'
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = text_completion_request_queue
default_output_queue = text_completion_response_queue
default_subscriber = module
class Processor(ConsumerProducer):
@ -169,5 +173,5 @@ class Processor(ConsumerProducer):
def run():
Processor.start("llm-vertexai-text", __doc__)
Processor.start(module, __doc__)

View file

@ -5,13 +5,16 @@ Input is query, output is response.
"""
from ... schema import GraphRagQuery, GraphRagResponse
from ... schema import graph_rag_request_queue, graph_rag_response_queue
from ... log_level import LogLevel
from ... graph_rag import GraphRag
from ... base import ConsumerProducer
default_input_queue = 'graph-rag-query'
default_output_queue = 'graph-rag-response'
default_subscriber = 'graph-rag'
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = graph_rag_request_queue
default_output_queue = graph_rag_response_queue
default_subscriber = module
default_graph_hosts = 'localhost'
default_vector_store = 'http://localhost:19530'
@ -112,5 +115,5 @@ class Processor(ConsumerProducer):
def run():
Processor.start('graph-rag', __doc__)
Processor.start(module, __doc__)

View file

@ -3,11 +3,7 @@ from pulsar.schema import Record, Bytes, String, Boolean, Integer, Array, Double
from enum import Enum
#class Command(Enum):
# reindex = 1
#class IndexCommand(Record):
# command = Command
############################################################################
class Value(Record):
value = String()
@ -19,49 +15,111 @@ class Source(Record):
id = String()
title = String()
############################################################################
# PDF docs etc.
class Document(Record):
source = Source()
data = Bytes()
document_ingest_queue = 'document-load'
############################################################################
# Text documents / text from PDF
class TextDocument(Record):
source = Source()
text = Bytes()
text_ingest_queue = 'text-document-load'
############################################################################
# Chunks of text
class Chunk(Record):
source = Source()
chunk = Bytes()
class VectorsChunk(Record):
chunk_ingest_queue = 'chunk-load'
############################################################################
# Chunk embeddings are an embeddings associated with a text chunk
class ChunkEmbeddings(Record):
source = Source()
vectors = Array(Array(Double()))
chunk = Bytes()
class VectorsAssociation(Record):
chunk_embeddings_ingest_queue = 'chunk-embeddings-load'
############################################################################
# Graph embeddings are embeddings associated with a graph entity
class GraphEmbeddings(Record):
source = Source()
vectors = Array(Array(Double()))
entity = Value()
graph_embeddings_store_queue = 'graph-embeddings-store'
############################################################################
# Graph triples
class Triple(Record):
source = Source()
s = Value()
p = Value()
o = Value()
triples_store_queue = 'triples-store'
############################################################################
# chunk_embeddings_store_queue = 'chunk-embeddings-store'
############################################################################
# LLM text completion
class TextCompletionRequest(Record):
prompt = String()
class TextCompletionResponse(Record):
response = String()
text_completion_request_queue = 'text-completion'
text_completion_response_queue = 'text-completion-response'
############################################################################
# Embeddings
class EmbeddingsRequest(Record):
text = String()
class EmbeddingsResponse(Record):
vectors = Array(Array(Double()))
embeddings_request_queue = 'embeddings'
embeddings_response_queue = 'embeddings-response'
############################################################################
# Graph RAG text retrieval
class GraphRagQuery(Record):
query = String()
class GraphRagResponse(Record):
response = String()
graph_rag_request_queue = 'graph-rag'
graph_rag_response_queue = 'graph-rag-response'
############################################################################

View file

@ -3,13 +3,16 @@
Accepts entity/vector pairs and writes them to a Milvus store.
"""
from ... schema import VectorsAssociation
from ... log_level import LogLevel
from ... triple_vectors import TripleVectors
from ... base import Consumer
from .... schema import GraphEmbeddings
from .... schema import graph_embeddings_store_queue
from .... log_level import LogLevel
from .... triple_vectors import TripleVectors
from .... base import Consumer
default_input_queue = 'vectors-load'
default_subscriber = 'vector-write-milvus'
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = graph_embeddings_store_queue
default_subscriber = module
default_store_uri = 'http://localhost:19530'
class Processor(Consumer):
@ -24,7 +27,7 @@ class Processor(Consumer):
**params | {
"input_queue": input_queue,
"subscriber": subscriber,
"input_schema": VectorsAssociation,
"input_schema": GraphEmbeddings,
"store_uri": store_uri,
}
)
@ -54,6 +57,5 @@ class Processor(Consumer):
def run():
Processor.start("vector-write-milvus", __doc__)
Processor.start(module, __doc__)

View file

View file

@ -9,13 +9,16 @@ import os
import argparse
import time
from ... trustgraph import TrustGraph
from ... schema import Triple
from ... log_level import LogLevel
from ... base import Consumer
from .... trustgraph import TrustGraph
from .... schema import Triple
from .... schema import triples_store_queue
from .... log_level import LogLevel
from .... base import Consumer
default_input_queue = 'graph-load'
default_subscriber = 'graph-write-cassandra'
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = triples_store_queue
default_subscriber = module
default_graph_host='localhost'
class Processor(Consumer):
@ -61,5 +64,5 @@ class Processor(Consumer):
def run():
Processor.start("graph-write-cassandra", __doc__)
Processor.start(module, __doc__)