Processor model prototype

This commit is contained in:
Cyber MacGeddon 2024-07-15 17:17:04 +01:00
parent 3565e5c5a9
commit bc34f8b99f
22 changed files with 541 additions and 172 deletions

View file

@ -16,17 +16,24 @@ import time
from ... schema import TextDocument, Chunk, Source
from ... log_level import LogLevel
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'text-doc-load'
default_output_queue = 'chunk-load'
default_subscriber = 'chunker-recursive'
class Processor:
def __init__(
self,
pulsar_host,
input_queue,
output_queue,
subscriber,
log_level,
pulsar_host=default_pulsar_host,
input_queue=default_input_queue,
output_queue=default_output_queue,
subscriber=default_subscriber,
log_level=LogLevel.INFO,
):
self.client = None
self.client = pulsar.Client(
pulsar_host,
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
@ -49,11 +56,16 @@ class Processor:
is_separator_regex=False,
)
print("Chunker inited")
def run(self):
print("Chunker running")
while True:
msg = self.consumer.receive()
print("Chunker message received")
try:
@ -91,7 +103,9 @@ class Processor:
self.consumer.negative_acknowledge(msg)
def __del__(self):
self.client.close()
if self.client:
self.client.close()
def run():
@ -100,11 +114,6 @@ def run():
description=__doc__,
)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'text-doc-load'
default_output_queue = 'chunk-load'
default_subscriber = 'chunker-recursive'
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,

View file

@ -16,17 +16,24 @@ import time
from ... schema import Document, TextDocument, Source
from ... log_level import LogLevel
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'document-load'
default_output_queue = 'text-doc-load'
default_subscriber = 'pdf-decoder'
class Processor:
def __init__(
self,
pulsar_host,
input_queue,
output_queue,
subscriber,
log_level,
pulsar_host=default_pulsar_host,
input_queue=default_input_queue,
output_queue=default_output_queue,
subscriber=default_subscriber,
log_level=LogLevel.INFO,
):
self.client = None
self.client = pulsar.Client(
pulsar_host,
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
@ -42,12 +49,23 @@ class Processor:
schema=JsonSchema(TextDocument),
)
print("PDF inited")
print("Pulsar", pulsar_host)
print("Input", input_queue)
print("Output", output_queue)
print("Subscriber", subscriber)
def run(self):
print("PDF running")
while True:
msg = self.consumer.receive()
print("PDF message received")
try:
v = msg.value()
@ -89,7 +107,9 @@ class Processor:
self.consumer.negative_acknowledge(msg)
def __del__(self):
self.client.close()
if self.client:
self.client.close()
def run():
@ -98,11 +118,6 @@ def run():
description=__doc__,
)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'document-load'
default_output_queue = 'text-doc-load'
default_subscriber = 'pdf-decoder'
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,

View file

@ -16,18 +16,26 @@ import time
from ... schema import EmbeddingsRequest, EmbeddingsResponse
from ... log_level import LogLevel
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'embeddings'
default_output_queue = 'embeddings-response'
default_subscriber = 'embeddings-hf'
default_model="all-MiniLM-L6-v2"
class Processor:
def __init__(
self,
pulsar_host,
input_queue,
output_queue,
subscriber,
log_level,
model,
pulsar_host=default_pulsar_host,
input_queue=default_input_queue,
output_queue=default_output_queue,
subscriber=default_subscriber,
log_level=LogLevel.INFO,
model=default_model,
):
self.client = None
self.client = pulsar.Client(
pulsar_host,
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
@ -81,8 +89,9 @@ class Processor:
self.consumer.negative_acknowledge(msg)
def __del__(self):
print("Closing", flush=True)
self.client.close()
if self.client:
self.client.close()
def run():
@ -91,11 +100,6 @@ def run():
description=__doc__,
)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'embeddings'
default_output_queue = 'embeddings-response'
default_subscriber = 'embeddings-hf'
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,

View file

@ -16,18 +16,24 @@ from ... schema import Chunk, VectorsChunk
from ... embeddings_client import EmbeddingsClient
from ... log_level import LogLevel
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'chunk-load'
default_output_queue = 'vectors-chunk-load'
default_subscriber = 'embeddings-vectorizer'
class Processor:
def __init__(
self,
pulsar_host,
input_queue,
output_queue,
subscriber,
log_level,
model,
pulsar_host=default_pulsar_host,
input_queue=default_input_queue,
output_queue=default_output_queue,
subscriber=default_subscriber,
log_level=LogLevel.INFO,
):
self.client = None
self.client = pulsar.Client(
pulsar_host,
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
@ -89,7 +95,9 @@ class Processor:
self.consumer.negative_acknowledge(msg)
def __del__(self):
self.client.close()
if self.client:
self.client.close()
def run():
@ -98,11 +106,6 @@ def run():
description=__doc__,
)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'chunk-load'
default_output_queue = 'vectors-chunk-load'
default_subscriber = 'embeddings-vectorizer'
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,
@ -135,12 +138,6 @@ def run():
help=f'Output queue (default: info)'
)
parser.add_argument(
'-m', '--model',
default="all-MiniLM-L6-v2",
help=f'LLM model (default: all-MiniLM-L6-v2)'
)
args = parser.parse_args()
while True:
@ -153,7 +150,6 @@ def run():
output_queue=args.output_queue,
subscriber=args.subscriber,
log_level=args.log_level,
model=args.model,
)
p.run()

View file

@ -15,17 +15,24 @@ from ... trustgraph import TrustGraph
from ... schema import Triple
from ... log_level import LogLevel
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'graph-load'
default_subscriber = 'graph-write-cassandra'
default_graph_host='localhost'
class Processor:
def __init__(
self,
pulsar_host,
input_queue,
subscriber,
log_level,
graph_host,
pulsar_host=default_pulsar_host,
input_queue=default_input_queue,
subscriber=default_subscriber,
graph_host=default_graph_host,
log_level=LogLevel.INFO,
):
self.client = None
self.client = pulsar.Client(
pulsar_host,
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
@ -72,7 +79,9 @@ class Processor:
self.consumer.negative_acknowledge(msg)
def __del__(self):
self.client.close()
if self.client:
self.client.close()
def run():
@ -81,10 +90,6 @@ def run():
description=__doc__,
)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'graph-load'
default_subscriber = 'graph-write-cassandra'
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,

View file

@ -24,17 +24,24 @@ from ... rdf import TRUSTGRAPH_ENTITIES, DEFINITION
DEFINITION_VALUE = Value(value=DEFINITION, is_uri=True)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'vectors-chunk-load'
default_output_queue = 'graph-load'
default_subscriber = 'kg-extract-definitions'
class Processor:
def __init__(
self,
pulsar_host,
input_queue,
output_queue,
subscriber,
log_level,
pulsar_host=default_pulsar_host,
input_queue=default_input_queue,
output_queue=default_output_queue,
subscriber=default_subscriber,
log_level=LogLevel.INFO,
):
self.client = None
self.client = pulsar.Client(
pulsar_host,
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
@ -122,7 +129,9 @@ class Processor:
self.consumer.negative_acknowledge(msg)
def __del__(self):
self.client.close()
if self.client:
self.client.close()
def run():
@ -131,11 +140,6 @@ def run():
description=__doc__,
)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'vectors-chunk-load'
default_output_queue = 'graph-load'
default_subscriber = 'kg-extract-definitions'
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,

View file

@ -25,16 +25,22 @@ from ... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES
RDF_LABEL_VALUE = Value(value=RDF_LABEL, is_uri=True)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'vectors-chunk-load'
default_output_queue = 'graph-load'
default_subscriber = 'kg-extract-relationships'
default_vector_queue='vectors-load'
class Processor:
def __init__(
self,
pulsar_host,
input_queue,
output_queue,
vec_queue,
subscriber,
log_level,
pulsar_host=default_pulsar_host,
input_queue=default_input_queue,
vector_queue=default_vector_queue,
output_queue=default_output_queue,
subscriber=default_subscriber,
log_level=LogLevel.INFO,
):
self.client = pulsar.Client(
@ -53,7 +59,7 @@ class Processor:
)
self.vec_prod = self.client.create_producer(
topic=vec_queue,
topic=vector_queue,
schema=JsonSchema(VectorsAssociation),
)
@ -182,12 +188,6 @@ def run():
description=__doc__,
)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'vectors-chunk-load'
default_output_queue = 'graph-load'
default_subscriber = 'kg-extract-relationships'
default_vector_queue='vectors-load'
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,
@ -236,7 +236,7 @@ def run():
pulsar_host=args.pulsar_host,
input_queue=args.input_queue,
output_queue=args.output_queue,
vec_queue=args.vector_queue,
vector_queue=args.vector_queue,
subscriber=args.subscriber,
log_level=args.log_level,
)

View file

@ -18,15 +18,20 @@ import json
from ... schema import TextCompletionRequest, TextCompletionResponse
from ... log_level import LogLevel
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'llm-complete-text'
default_output_queue = 'llm-complete-text-response'
default_subscriber = 'llm-azure-text'
class Processor:
def __init__(
self,
pulsar_host,
input_queue,
output_queue,
subscriber,
log_level,
pulsar_host=default_pulsar_host,
input_queue=default_input_queue,
output_queue=default_output_queue,
subscriber=default_subscriber,
log_level=LogLevel.INFO,
endpoint,
token,
):
@ -138,11 +143,6 @@ def run():
description=__doc__,
)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'llm-complete-text'
default_output_queue = 'llm-complete-text-response'
default_subscriber = 'llm-ollama-text'
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,

View file

@ -16,19 +16,27 @@ import time
from ... schema import TextCompletionRequest, TextCompletionResponse
from ... log_level import LogLevel
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'llm-complete-text'
default_output_queue = 'llm-complete-text-response'
default_subscriber = 'llm-claude-text'
default_model = 'claude-3-5-sonnet-20240620'
class Processor:
def __init__(
self,
pulsar_host,
input_queue,
output_queue,
subscriber,
log_level,
model,
pulsar_host=default_pulsar_host,
input_queue=default_input_queue,
output_queue=default_output_queue,
subscriber=default_subscriber,
log_level=LogLevel.INFO,
model=default_model,
api_key,
):
self.client = None
self.client = pulsar.Client(
pulsar_host,
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
@ -114,11 +122,6 @@ def run():
description=__doc__,
)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'llm-complete-text'
default_output_queue = 'llm-complete-text-response'
default_subscriber = 'llm-claude-text'
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,
@ -187,4 +190,3 @@ def run():
time.sleep(10)

View file

@ -16,19 +16,28 @@ import time
from ... schema import TextCompletionRequest, TextCompletionResponse
from ... log_level import LogLevel
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'llm-complete-text'
default_output_queue = 'llm-complete-text-response'
default_subscriber = 'llm-ollama-text'
default_model = 'gemma2'
default_ollama = 'http://localhost:11434'
class Processor:
def __init__(
self,
pulsar_host,
input_queue,
output_queue,
subscriber,
log_level,
model,
ollama,
pulsar_host=default_pulsar_host,
input_queue=default_input_queue,
output_queue=default_output_queue,
subscriber=default_subscriber,
log_level=LogLevel.INFO,
model=default_model,
ollama=default_ollama,
):
self.client = None
self.client = pulsar.Client(
pulsar_host,
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
@ -82,8 +91,9 @@ class Processor:
self.consumer.negative_acknowledge(msg)
def __del__(self):
print("Closing")
self.client.close()
if self.client:
self.client.close()
def run():
@ -92,11 +102,6 @@ def run():
description=__doc__,
)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'llm-complete-text'
default_output_queue = 'llm-complete-text-response'
default_subscriber = 'llm-ollama-text'
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,

View file

@ -30,20 +30,27 @@ from vertexai.preview.generative_models import (
from ... schema import TextCompletionRequest, TextCompletionResponse
from ... log_level import LogLevel
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'llm-complete-text'
default_output_queue = 'llm-complete-text-response'
default_subscriber = 'llm-vertexai-text'
class Processor:
def __init__(
self,
pulsar_host,
input_queue,
output_queue,
subscriber,
log_level,
pulsar_host=default_pulsar_host,
input_queue=default_input_queue,
output_queue=default_output_queue,
subscriber=default_subscriber,
log_level=LogLevel.INFO,
region="us-west1",
model="gemini-1.0-pro-001",
credentials,
region,
model,
):
self.client = None
self.client = pulsar.Client(
pulsar_host,
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
@ -155,7 +162,9 @@ class Processor:
self.consumer.negative_acknowledge(msg)
def __del__(self):
self.client.close()
if self.client:
self.client.close()
def run():
@ -164,11 +173,6 @@ def run():
description=__doc__,
)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'llm-complete-text'
default_output_queue = 'llm-complete-text-response'
default_subscriber = 'llm-vertexai-text'
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,

View file

@ -0,0 +1,3 @@
from . processing import *

View file

@ -0,0 +1,7 @@
#!/usr/bin/env python3
from . processing import run
if __name__ == '__main__':
run()

View file

@ -0,0 +1,171 @@
import argparse
import time
import os
from yaml import load, Loader
import json
import multiprocessing
from multiprocessing.connection import wait
import importlib
from .. log_level import LogLevel
def fn(module_name, class_name, params, w):
print(f"Starting {module_name}...")
if "log_level" in params:
params["log_level"] = LogLevel(params["log_level"])
while True:
try:
print(f"Starting {class_name} using {module_name}...")
module = importlib.import_module(module_name)
class_object = getattr(module, class_name)
processor = class_object(**params)
processor.run()
print(f"{module_name} stopped.")
except Exception as e:
print("Exception:", e)
print("Restarting in 10...")
time.sleep(10)
print("Closing")
w.close()
class Processing:
def __init__(
self,
pulsar_host,
log_level,
file,
):
self.pulsar_host = pulsar_host
self.log_level = log_level
self.file = file
self.defs = load(open(file, "r"), Loader=Loader)
def run(self):
procs = []
readers = []
services = []
for service in self.defs["services"]:
sdef = self.defs["services"][service]
params = {
"pulsar_host": self.pulsar_host,
"log_level": str(self.log_level),
}
if "parameters" in sdef:
for par in sdef["parameters"]:
params[par] = sdef["parameters"][par]
module_name = sdef["module"]
class_name = sdef.get("class", "Processor")
r, w = multiprocessing.Pipe(duplex=False)
process = multiprocessing.Process(
target=fn,
args=(module_name, class_name, params, w)
)
process.start()
w.close()
procs.append(process)
services.append(service)
readers.append(r)
wait_for = len(readers)
while wait_for > 0:
ret = wait(readers)
for r in ret:
try:
msg = r.recv()
except EOFError:
readers.remove(r)
wait_for -= 1
print("All processes exited")
for p in procs:
p.join()
def __del__(self):
pass
def run():
# Seems not to work.
# multiprocessing.set_start_method('spawn')
parser = argparse.ArgumentParser(
prog='run-processing',
description=__doc__,
)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,
help=f'Pulsar host (default: {default_pulsar_host})',
)
parser.add_argument(
'-l', '--log-level',
type=LogLevel,
default=LogLevel.INFO,
choices=list(LogLevel),
help=f'Output queue (default: info)'
)
parser.add_argument(
'-f', '--file',
default="processing.yaml",
help=f'Processing definition file (default: processing.yaml)'
)
args = parser.parse_args()
while True:
try:
p = Processing(
pulsar_host=args.pulsar_host,
file=args.file,
log_level=args.log_level,
)
p.run()
print("Finished.")
break
except Exception as e:
print("Exception:", e, flush=True)
print("Will retry...", flush=True)
time.sleep(10)

View file

@ -16,19 +16,28 @@ from ... schema import GraphRagQuery, GraphRagResponse
from ... log_level import LogLevel
from ... graph_rag import GraphRag
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'graph-rag-query'
default_output_queue = 'graph-rag-response'
default_subscriber = 'graph-rag'
default_graph_hosts = [ 'localhost' ]
default_vector_store = 'http://localhost:19530'
class Processor:
def __init__(
self,
pulsar_host,
input_queue,
output_queue,
subscriber,
log_level,
graph_hosts,
vector_store,
pulsar_host=default_pulsar_host,
input_queue=default_input_queue,
output_queue=default_output_queue,
subscriber=default_subscriber,
log_level=LogLevel.INFO,
graph_hosts=default_graph_hosts,
vector_store=default_vector_store,
):
self.client = None
self.client = pulsar.Client(
pulsar_host,
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
@ -86,8 +95,9 @@ class Processor:
self.consumer.negative_acknowledge(msg)
def __del__(self):
print("Closing", flush=True)
self.client.close()
if self.client:
self.client.close()
def run():
@ -96,11 +106,6 @@ def run():
description=__doc__,
)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'graph-rag-query'
default_output_queue = 'graph-rag-response'
default_subscriber = 'graph-rag'
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,

View file

@ -16,17 +16,24 @@ from ... schema import VectorsAssociation
from ... log_level import LogLevel
from ... edge_map import VectorStore
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'vectors-load'
default_subscriber = 'vector-write-milvus'
default_store_uri = 'http://localhost:19530'
class Processor:
def __init__(
self,
pulsar_host,
input_queue,
subscriber,
store_uri,
log_level,
pulsar_host=default_pulsar_host,
input_queue=default_input_queue,
subscriber=default_subscriber,
store_uri=default_store_uri,
log_level=LogLevel.INFO,
):
self.client = None
self.client = pulsar.Client(
pulsar_host,
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
@ -64,7 +71,9 @@ class Processor:
self.consumer.negative_acknowledge(msg)
def __del__(self):
self.client.close()
if self.client:
self.client.close()
def run():
@ -73,10 +82,6 @@ def run():
description=__doc__,
)
default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')
default_input_queue = 'vectors-load'
default_subscriber = 'vector-write-milvus'
parser.add_argument(
'-p', '--pulsar-host',
default=default_pulsar_host,
@ -105,8 +110,8 @@ def run():
parser.add_argument(
'-t', '--store-uri',
default="http://localhost:19530",
help=f'Milvus store URI (default: http://localhost:19530)'
default="http://milvus:19530",
help=f'Milvus store URI (default: http://milvus:19530)'
)
args = parser.parse_args()