Librarian (#304)

This commit is contained in:
cybermaggedon 2025-02-11 16:01:03 +00:00 committed by GitHub
parent e99c0ac238
commit a0bf2362f6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
32 changed files with 922 additions and 66 deletions

View file

@ -3,4 +3,6 @@ from . base_processor import BaseProcessor
from . consumer import Consumer
from . producer import Producer
from . consumer_producer import ConsumerProducer
from . publisher import Publisher
from . subscriber import Subscriber

View file

@ -28,15 +28,19 @@ class BaseProcessor:
})
pulsar_host = params.get("pulsar_host", self.default_pulsar_host)
pulsar_listener = params.get("pulsar_listener", None)
log_level = params.get("log_level", LogLevel.INFO)
self.pulsar_host = pulsar_host
self.client = pulsar.Client(
pulsar_host,
listener_name=pulsar_listener,
logger=pulsar.ConsoleLogger(log_level.to_pulsar())
)
self.pulsar_listener = pulsar_listener
def __del__(self):
if hasattr(self, "client"):
@ -52,6 +56,11 @@ class BaseProcessor:
help=f'Pulsar host (default: {__class__.default_pulsar_host})',
)
parser.add_argument(
'--pulsar-listener',
help=f'Pulsar listener (default: none)',
)
parser.add_argument(
'-l', '--log-level',
type=LogLevel,

View file

@ -0,0 +1,67 @@
import queue
import time
import pulsar
import threading
class Publisher:
    """Queue-backed publisher that sends messages to a Pulsar topic.

    Messages passed to ``send`` are placed on a bounded in-memory queue.
    A worker thread (see ``start``) creates its own Pulsar client and
    producer, drains the queue and sends each item.  If anything in the
    worker raises, the client is closed and, after a short pause, a fresh
    client and producer are created.
    """

    def __init__(self, pulsar_host, topic, schema=None, max_size=10,
                 chunking_enabled=True, listener=None):
        """Store connection settings and create the outgoing queue.

        Args:
            pulsar_host: Service URL handed to ``pulsar.Client``.
            topic: Topic the producer is created on.
            schema: Schema handed to ``create_producer``.
            max_size: Capacity of the outgoing queue.
            chunking_enabled: Passed through to ``create_producer``.
            listener: Passed to ``pulsar.Client`` as ``listener_name``.
        """
        self.pulsar_host = pulsar_host
        self.topic = topic
        self.schema = schema
        self.q = queue.Queue(maxsize=max_size)
        self.chunking_enabled = chunking_enabled
        self.listener_name = listener
        self.running = True

    def start(self):
        """Start the worker thread running ``run``."""
        self.task = threading.Thread(target=self.run)
        self.task.start()

    def stop(self):
        """Ask the worker loop to exit; it notices within the poll timeout."""
        self.running = False

    def join(self):
        """Stop the worker and wait for its thread to finish."""
        self.stop()
        self.task.join()

    def run(self):
        """Worker loop: connect, drain the queue, reconnect on failure."""
        while self.running:

            client = None

            try:

                client = pulsar.Client(
                    self.pulsar_host, listener_name=self.listener_name
                )

                producer = client.create_producer(
                    topic=self.topic,
                    schema=self.schema,
                    chunking_enabled=self.chunking_enabled,
                )

                while self.running:

                    # Poll with a timeout so a stop request is noticed
                    # even when nothing is being published.
                    try:
                        msg_id, item = self.q.get(timeout=0.5)
                    except queue.Empty:
                        continue

                    # NOTE: if send raises, the dequeued item is not
                    # re-queued and is therefore lost.
                    if msg_id:
                        producer.send(item, {"id": msg_id})
                    else:
                        producer.send(item)

            except Exception as e:
                print("Exception:", e, flush=True)

            finally:
                # Always release the client so that retries do not pile
                # up open connections.
                self._close_client(client)

            # If the handler dropped out, pause before retrying; there is
            # no point in waiting when a stop has been requested.
            if self.running:
                time.sleep(2)

    @staticmethod
    def _close_client(client):
        """Close a Pulsar client, reporting (not raising) close errors."""
        if client is None:
            return
        try:
            client.close()
        except Exception as e:
            print("Exception:", e, flush=True)

    def send(self, id, msg):
        """Queue ``msg`` for publication.

        A truthy ``id`` is attached to the message as the ``"id"``
        property.  Blocks while the queue is full.
        """
        self.q.put((id, msg))

View file

@ -0,0 +1,120 @@
import queue
import pulsar
import threading
import time
class Subscriber:
    """Pulsar consumer that fans received messages out to local queues.

    A worker thread (see ``start``) subscribes to a topic and, for each
    message received, puts the message value on:

    * the queue registered with ``subscribe`` under the message's ``"id"``
      property, if there is one, and
    * every queue registered with ``subscribe_all``.

    If anything in the worker raises, the client is closed and, after a
    short pause, a fresh client and consumer are created.
    """

    def __init__(self, pulsar_host, topic, subscription, consumer_name,
                 schema=None, max_size=100, listener=None):
        """Store connection settings and create the empty queue tables.

        Args:
            pulsar_host: Service URL handed to ``pulsar.Client``.
            topic: Topic to subscribe to.
            subscription: Passed to ``subscribe`` as ``subscription_name``.
            consumer_name: Passed to ``subscribe`` as ``consumer_name``.
            schema: Schema handed to ``subscribe``.
            max_size: Capacity of each queue handed out to subscribers.
            listener: Passed to ``pulsar.Client`` as ``listener_name``.
        """
        self.pulsar_host = pulsar_host
        self.topic = topic
        self.subscription = subscription
        self.consumer_name = consumer_name
        self.schema = schema
        self.q = {}        # id -> queue for messages carrying that id
        self.full = {}     # id -> queue receiving every message
        self.max_size = max_size
        self.lock = threading.Lock()
        self.listener_name = listener
        self.running = True

    def start(self):
        """Start the worker thread running ``run``."""
        self.task = threading.Thread(target=self.run)
        self.task.start()

    def stop(self):
        """Ask the worker loop to exit; it notices within the poll timeout."""
        self.running = False

    def join(self):
        """Wait for the worker thread to finish.

        This does not call ``stop``; call it first if the worker should
        exit.
        """
        self.task.join()

    def run(self):
        """Worker loop: connect, receive and dispatch, reconnect on failure."""
        while self.running:

            client = None

            try:

                client = pulsar.Client(
                    self.pulsar_host,
                    listener_name=self.listener_name,
                )

                consumer = client.subscribe(
                    topic=self.topic,
                    subscription_name=self.subscription,
                    consumer_name=self.consumer_name,
                    schema=self.schema,
                )

                while self.running:

                    # Receive with a timeout so that a stop request is
                    # noticed even when the topic is idle; an unbounded
                    # receive would block the thread indefinitely.
                    try:
                        msg = consumer.receive(timeout_millis=500)
                    except pulsar.Timeout:
                        continue

                    # Acknowledge successful reception of the message
                    consumer.acknowledge(msg)

                    self._dispatch(msg)

            except Exception as e:
                print("Exception:", e, flush=True)

            finally:
                # Always release the client so that retries do not pile
                # up open connections.
                self._close_client(client)

            # If the handler dropped out, pause before retrying; there is
            # no point in waiting when a stop has been requested.
            if self.running:
                time.sleep(2)

    def _dispatch(self, msg):
        """Put the value of ``msg`` on every queue interested in it."""
        try:
            msg_id = msg.properties()["id"]
        except KeyError:
            msg_id = None

        value = msg.value()

        with self.lock:

            targets = []
            if msg_id in self.q:
                targets.append(self.q[msg_id])
            targets.extend(self.full.values())

            for target in targets:
                try:
                    # FIXME: Timeout means data goes missing
                    target.put(value, timeout=0.5)
                except queue.Full:
                    # Best effort: a subscriber that is not keeping up
                    # loses this message rather than stalling the rest.
                    pass

    @staticmethod
    def _close_client(client):
        """Close a Pulsar client, reporting (not raising) close errors."""
        if client is None:
            return
        try:
            client.close()
        except Exception as e:
            print("Exception:", e, flush=True)

    def subscribe(self, id):
        """Register and return a queue for messages whose id is ``id``.

        Registering the same ``id`` again replaces the previous queue.
        """
        with self.lock:
            q = queue.Queue(maxsize=self.max_size)
            self.q[id] = q
        return q

    def unsubscribe(self, id):
        """Remove the queue registered for ``id``, if any."""
        with self.lock:
            if id in self.q:
                del self.q[id]

    def subscribe_all(self, id):
        """Register and return a queue that receives every message.

        ``id`` only names the registration so that it can be removed with
        ``unsubscribe_all``.
        """
        with self.lock:
            q = queue.Queue(maxsize=self.max_size)
            self.full[id] = q
        return q

    def unsubscribe_all(self, id):
        """Remove the receive-everything queue registered as ``id``, if any."""
        with self.lock:
            if id in self.full:
                del self.full[id]

View file

@ -8,3 +8,6 @@ class LlmError(Exception):
class ParseError(Exception):
pass
class RequestError(Exception):
pass

View file

@ -10,5 +10,6 @@ from . retrieval import *
from . metadata import *
from . agent import *
from . lookup import *
from . library import *

View file

@ -0,0 +1,56 @@
from pulsar.schema import Record, Bytes, String, Array
from . types import Triple
from . topic import topic
from . types import Error
from . metadata import Metadata
from . documents import Document, TextDocument
# add(Metadata, Bytes) : error?
# copy(id, user, collection)
# move(id, user, collection)
# delete(id)
# get(id) : Bytes
# reindex(id)
# list(user, collection) : id[]
# info(id[]) : DocumentInfo[]
# search(<key,op,value>[]) : id[]
class DocumentPackage(Record):
    """A document together with its describing data.

    Holds the raw document bytes, a list of metadata triples, a kind
    string, and the user and collection strings it is filed under.
    """

    # Field order is kept as declared; it is part of the record layout.
    metadata = Array(Triple())
    document = Bytes()
    kind = String()
    user = String()
    collection = String()
class DocumentInfo(Record):
    """Describing data for a document, without the document bytes.

    Carries the same metadata, kind, user and collection fields as
    DocumentPackage but has no ``document`` field.
    """

    # Field order is kept as declared; it is part of the record layout.
    metadata = Array(Triple())
    kind = String()
    user = String()
    collection = String()
class Criteria(Record):
    """A single key / operator / value condition, all held as strings.

    Presumably one term of the ``search(<key,op,value>[])`` operation
    listed in the comment at the top of this module -- confirm against
    the service implementation.
    """

    # Field order is kept as declared; it is part of the record layout.
    key = String()
    value = String()
    operator = String()
class LibrarianRequest(Record):
    """Request message for the librarian service.

    ``operation`` presumably names one of the operations listed in the
    comment at the top of this module (add, copy, move, delete, get,
    reindex, list, info, search); which of the remaining fields are
    meaningful for a given operation is not defined here -- confirm
    against the service implementation.
    """

    # Field order is kept as declared; it is part of the record layout.
    operation = String()
    id = String()
    document = DocumentPackage()
    user = String()
    collection = String()
    criteria = Array(Criteria())
class LibrarianResponse(Record):
    """Response message from the librarian service.

    Carries an error record, a document package and a list of document
    info records; which of them is populated for a given request is not
    defined here.
    """

    # Field order is kept as declared; it is part of the record layout.
    error = Error()
    document = DocumentPackage()
    info = Array(DocumentInfo())
# Topic names for the librarian service, built with the project's topic()
# helper: both are non-persistent, one in the 'request' namespace and one
# in the 'response' namespace.
librarian_request_queue = topic(
    'librarian',
    kind='non-persistent',
    namespace='request',
)

librarian_response_queue = topic(
    'librarian',
    kind='non-persistent',
    namespace='response',
)