Feature/librarian (#307)

* Bring QDrant up-to-date

* Tables for data from queue outputs

- Pass single Pulsar client to everything in gateway & librarian
- Pulsar listener-name support in gateway
- PDF and text load working in librarian

* Complete Cassandra schema

* Add librarian support to templates
This commit is contained in:
cybermaggedon 2025-02-12 23:39:24 +00:00 committed by GitHub
parent f350abb415
commit f7df2df266
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
35 changed files with 500 additions and 145 deletions

View file

@ -7,6 +7,7 @@ from functools import partial
import asyncio
import threading
import queue
import base64
from pulsar.schema import JsonSchema
@ -94,23 +95,38 @@ class Processor(ConsumerProducer):
)
self.document_load = Publisher(
self.pulsar_host, document_load_queue, JsonSchema(Document),
listener=self.pulsar_listener,
self.client, document_load_queue, JsonSchema(Document),
)
self.text_load = Publisher(
self.pulsar_host, text_load_queue, JsonSchema(TextDocument),
listener=self.pulsar_listener,
self.client, text_load_queue, JsonSchema(TextDocument),
)
self.triples_load = Subscriber(
self.pulsar_host, triples_store_queue,
self.triples_brk = Subscriber(
self.client, triples_store_queue,
"librarian", "librarian",
schema=JsonSchema(Triples),
listener=self.pulsar_listener,
)
self.graph_embeddings_brk = Subscriber(
self.client, graph_embeddings_store_queue,
"librarian", "librarian",
schema=JsonSchema(GraphEmbeddings),
)
self.document_embeddings_brk = Subscriber(
self.client, document_embeddings_store_queue,
"librarian", "librarian",
schema=JsonSchema(DocumentEmbeddings),
)
self.triples_reader = threading.Thread(target=self.receive_triples)
self.triples_reader = threading.Thread(
target=self.receive_triples
)
self.graph_embeddings_reader = threading.Thread(
target=self.receive_graph_embeddings
)
self.document_embeddings_reader = threading.Thread(
target=self.receive_document_embeddings
)
self.librarian = Librarian(
cassandra_host = cassandra_host.split(","),
@ -131,34 +147,23 @@ class Processor(ConsumerProducer):
self.document_load.start()
self.text_load.start()
self.triples_load.start()
self.triples_sub = self.triples_load.subscribe_all("x")
self.triples_brk.start()
self.graph_embeddings_brk.start()
self.document_embeddings_brk.start()
self.triples_sub = self.triples_brk.subscribe_all("x")
self.graph_embeddings_sub = self.graph_embeddings_brk.subscribe_all("x")
self.document_embeddings_sub = self.document_embeddings_brk.subscribe_all("x")
self.triples_reader.start()
def receive_triples(self):
print("Receive triples!")
while self.running:
try:
msg = self.triples_sub.get(timeout=1)
except queue.Empty:
print("Tick")
continue
print(msg)
print("BYE")
self.graph_embeddings_reader.start()
self.document_embeddings_reader.start()
def __del__(self):
self.running = False
if hasattr(self, "triples_sub"):
self.triples_sub.unsubscribe_all("x")
if hasattr(self, "document_load"):
self.document_load.stop()
self.document_load.join()
@ -167,9 +172,56 @@ class Processor(ConsumerProducer):
self.text_load.stop()
self.text_load.join()
if hasattr(self, "triples_load"):
self.triples_load.stop()
self.triples_load.join()
if hasattr(self, "triples_sub"):
self.triples_sub.unsubscribe_all("x")
if hasattr(self, "graph_embeddings_sub"):
self.graph_embeddings_sub.unsubscribe_all("x")
if hasattr(self, "document_embeddings_sub"):
self.document_embeddings_sub.unsubscribe_all("x")
if hasattr(self, "triples_brk"):
self.triples_brk.stop()
self.triples_brk.join()
if hasattr(self, "graph_embeddings_brk"):
self.graph_embeddings_brk.stop()
self.graph_embeddings_brk.join()
if hasattr(self, "document_embeddings_brk"):
self.document_embeddings_brk.stop()
self.document_embeddings_brk.join()
def receive_triples(self):
while self.running:
try:
msg = self.triples_sub.get(timeout=1)
except queue.Empty:
continue
self.librarian.handle_triples(msg)
def receive_graph_embeddings(self):
while self.running:
try:
msg = self.graph_embeddings_sub.get(timeout=1)
except queue.Empty:
continue
self.librarian.handle_graph_embeddings(msg)
def receive_document_embeddings(self):
while self.running:
try:
msg = self.document_embeddings_sub.get(timeout=1)
except queue.Empty:
continue
self.librarian.handle_document_embeddings(msg)
async def load_document(self, id, document):
@ -187,6 +239,9 @@ class Processor(ConsumerProducer):
async def load_text(self, id, document):
text = base64.b64decode(document.document)
text = text.decode("utf-8")
doc = TextDocument(
metadata = Metadata(
id = id,
@ -194,7 +249,7 @@ class Processor(ConsumerProducer):
user = document.user,
collection = document.collection
),
text = document.document
text = text,
)
self.text_load.send(None, doc)