mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-07-02 02:58:10 +02:00
Feature/librarian (#307)
* Bring Qdrant up to date
* Tables for data from queue outputs
  - Pass a single Pulsar client to everything in gateway & librarian
  - Pulsar listener-name support in gateway
  - PDF and text load working in librarian
* Complete Cassandra schema
* Add librarian support to templates
This commit is contained in:
parent
f350abb415
commit
f7df2df266
35 changed files with 500 additions and 145 deletions
|
|
@ -53,4 +53,22 @@ class Librarian:
|
|||
info = None,
|
||||
)
|
||||
|
||||
def handle_triples(self, m):
    """Hand a triples message `m` to the table store for persistence."""
    self.table_store.add_triples(m)

def handle_graph_embeddings(self, m):
    """Hand a graph-embeddings message `m` to the table store for persistence."""
    self.table_store.add_graph_embeddings(m)

def handle_document_embeddings(self, m):
    """Hand a document-embeddings message `m` to the table store for persistence."""
    self.table_store.add_document_embeddings(m)
|
||||
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ from functools import partial
|
|||
import asyncio
|
||||
import threading
|
||||
import queue
|
||||
import base64
|
||||
|
||||
from pulsar.schema import JsonSchema
|
||||
|
||||
|
|
@ -94,23 +95,38 @@ class Processor(ConsumerProducer):
|
|||
)
|
||||
|
||||
self.document_load = Publisher(
|
||||
self.pulsar_host, document_load_queue, JsonSchema(Document),
|
||||
listener=self.pulsar_listener,
|
||||
self.client, document_load_queue, JsonSchema(Document),
|
||||
)
|
||||
|
||||
self.text_load = Publisher(
|
||||
self.pulsar_host, text_load_queue, JsonSchema(TextDocument),
|
||||
listener=self.pulsar_listener,
|
||||
self.client, text_load_queue, JsonSchema(TextDocument),
|
||||
)
|
||||
|
||||
self.triples_load = Subscriber(
|
||||
self.pulsar_host, triples_store_queue,
|
||||
self.triples_brk = Subscriber(
|
||||
self.client, triples_store_queue,
|
||||
"librarian", "librarian",
|
||||
schema=JsonSchema(Triples),
|
||||
listener=self.pulsar_listener,
|
||||
)
|
||||
self.graph_embeddings_brk = Subscriber(
|
||||
self.client, graph_embeddings_store_queue,
|
||||
"librarian", "librarian",
|
||||
schema=JsonSchema(GraphEmbeddings),
|
||||
)
|
||||
self.document_embeddings_brk = Subscriber(
|
||||
self.client, document_embeddings_store_queue,
|
||||
"librarian", "librarian",
|
||||
schema=JsonSchema(DocumentEmbeddings),
|
||||
)
|
||||
|
||||
self.triples_reader = threading.Thread(target=self.receive_triples)
|
||||
self.triples_reader = threading.Thread(
|
||||
target=self.receive_triples
|
||||
)
|
||||
self.graph_embeddings_reader = threading.Thread(
|
||||
target=self.receive_graph_embeddings
|
||||
)
|
||||
self.document_embeddings_reader = threading.Thread(
|
||||
target=self.receive_document_embeddings
|
||||
)
|
||||
|
||||
self.librarian = Librarian(
|
||||
cassandra_host = cassandra_host.split(","),
|
||||
|
|
@ -131,34 +147,23 @@ class Processor(ConsumerProducer):
|
|||
|
||||
self.document_load.start()
|
||||
self.text_load.start()
|
||||
self.triples_load.start()
|
||||
|
||||
self.triples_sub = self.triples_load.subscribe_all("x")
|
||||
self.triples_brk.start()
|
||||
self.graph_embeddings_brk.start()
|
||||
self.document_embeddings_brk.start()
|
||||
|
||||
self.triples_sub = self.triples_brk.subscribe_all("x")
|
||||
self.graph_embeddings_sub = self.graph_embeddings_brk.subscribe_all("x")
|
||||
self.document_embeddings_sub = self.document_embeddings_brk.subscribe_all("x")
|
||||
|
||||
self.triples_reader.start()
|
||||
|
||||
def receive_triples(self):
|
||||
|
||||
print("Receive triples!")
|
||||
|
||||
while self.running:
|
||||
try:
|
||||
msg = self.triples_sub.get(timeout=1)
|
||||
except queue.Empty:
|
||||
print("Tick")
|
||||
continue
|
||||
|
||||
print(msg)
|
||||
|
||||
print("BYE")
|
||||
self.graph_embeddings_reader.start()
|
||||
self.document_embeddings_reader.start()
|
||||
|
||||
def __del__(self):
|
||||
|
||||
self.running = False
|
||||
|
||||
if hasattr(self, "triples_sub"):
|
||||
self.triples_sub.unsubscribe_all("x")
|
||||
|
||||
if hasattr(self, "document_load"):
|
||||
self.document_load.stop()
|
||||
self.document_load.join()
|
||||
|
|
@ -167,9 +172,56 @@ class Processor(ConsumerProducer):
|
|||
self.text_load.stop()
|
||||
self.text_load.join()
|
||||
|
||||
if hasattr(self, "triples_load"):
|
||||
self.triples_load.stop()
|
||||
self.triples_load.join()
|
||||
if hasattr(self, "triples_sub"):
|
||||
self.triples_sub.unsubscribe_all("x")
|
||||
|
||||
if hasattr(self, "graph_embeddings_sub"):
|
||||
self.graph_embeddings_sub.unsubscribe_all("x")
|
||||
|
||||
if hasattr(self, "document_embeddings_sub"):
|
||||
self.document_embeddings_sub.unsubscribe_all("x")
|
||||
|
||||
if hasattr(self, "triples_brk"):
|
||||
self.triples_brk.stop()
|
||||
self.triples_brk.join()
|
||||
|
||||
if hasattr(self, "graph_embeddings_brk"):
|
||||
self.graph_embeddings_brk.stop()
|
||||
self.graph_embeddings_brk.join()
|
||||
|
||||
if hasattr(self, "document_embeddings_brk"):
|
||||
self.document_embeddings_brk.stop()
|
||||
self.document_embeddings_brk.join()
|
||||
|
||||
def receive_triples(self):
    """Thread target: feed messages from `self.triples_sub` to the librarian."""
    self._receive_loop(self.triples_sub, self.librarian.handle_triples)

def receive_graph_embeddings(self):
    """Thread target: feed messages from `self.graph_embeddings_sub` to the librarian."""
    self._receive_loop(
        self.graph_embeddings_sub, self.librarian.handle_graph_embeddings
    )

def receive_document_embeddings(self):
    """Thread target: feed messages from `self.document_embeddings_sub` to the librarian."""
    self._receive_loop(
        self.document_embeddings_sub,
        self.librarian.handle_document_embeddings,
    )

def _receive_loop(self, subscription, handler):
    """Pass each message from `subscription` to `handler` while `self.running`.

    Polls with a one-second timeout so that clearing `self.running` is
    noticed promptly even when no messages arrive.

    Args:
        subscription: object with a `get(timeout=...)` method that raises
            `queue.Empty` when nothing is available.
        handler: callable invoked with each received message.
    """
    while self.running:

        try:
            msg = subscription.get(timeout=1)
        except queue.Empty:
            # Nothing arrived in this poll interval; re-check self.running.
            continue

        try:
            handler(msg)
        except Exception as e:
            # One failing message must not end the reader thread, otherwise
            # nothing would drain this subscription again.  Report the
            # failure and carry on with the next message.
            print("Exception:", type(e))
            print(f"{e}, message dropped", flush=True)
|
||||
|
||||
async def load_document(self, id, document):
|
||||
|
||||
|
|
@ -187,6 +239,9 @@ class Processor(ConsumerProducer):
|
|||
|
||||
async def load_text(self, id, document):
|
||||
|
||||
text = base64.b64decode(document.document)
|
||||
text = text.decode("utf-8")
|
||||
|
||||
doc = TextDocument(
|
||||
metadata = Metadata(
|
||||
id = id,
|
||||
|
|
@ -194,7 +249,7 @@ class Processor(ConsumerProducer):
|
|||
user = document.user,
|
||||
collection = document.collection
|
||||
),
|
||||
text = document.document
|
||||
text = text,
|
||||
)
|
||||
|
||||
self.text_load.send(None, doc)
|
||||
|
|
|
|||
|
|
@ -36,11 +36,7 @@ class TableStore:
|
|||
|
||||
self.ensure_cassandra_schema()
|
||||
|
||||
self.insert_document_stmt = self.cassandra.prepare("""
|
||||
insert into document
|
||||
(id, user, collection, kind, object_id, metadata)
|
||||
values (?, ?, ?, ?, ?, ?)
|
||||
""")
|
||||
self.prepare_statements()
|
||||
|
||||
def ensure_cassandra_schema(self):
|
||||
|
||||
|
|
@ -62,10 +58,13 @@ class TableStore:
|
|||
print("document table...", flush=True)
|
||||
|
||||
self.cassandra.execute("""
|
||||
create table if not exists document (
|
||||
CREATE TABLE IF NOT EXISTS document (
|
||||
user text,
|
||||
collection text,
|
||||
id uuid,
|
||||
time timestamp,
|
||||
title text,
|
||||
comments text,
|
||||
kind text,
|
||||
object_id uuid,
|
||||
metadata list<tuple<
|
||||
|
|
@ -78,12 +77,113 @@ class TableStore:
|
|||
print("object index...", flush=True)
|
||||
|
||||
self.cassandra.execute("""
|
||||
create index if not exists document_object
|
||||
on document ( object_id)
|
||||
CREATE INDEX IF NOT EXISTS document_object
|
||||
ON document (object_id)
|
||||
""");
|
||||
|
||||
print("triples table...", flush=True)
|
||||
|
||||
self.cassandra.execute("""
|
||||
CREATE TABLE IF NOT EXISTS triples (
|
||||
user text,
|
||||
collection text,
|
||||
document_id text,
|
||||
id uuid,
|
||||
time timestamp,
|
||||
metadata list<tuple<
|
||||
text, boolean, text, boolean, text, boolean
|
||||
>>,
|
||||
triples list<tuple<
|
||||
text, boolean, text, boolean, text, boolean
|
||||
>>,
|
||||
PRIMARY KEY (user, collection, document_id, id)
|
||||
);
|
||||
""");
|
||||
|
||||
print("graph_embeddings table...", flush=True)
|
||||
|
||||
self.cassandra.execute("""
|
||||
create table if not exists graph_embeddings (
|
||||
user text,
|
||||
collection text,
|
||||
document_id text,
|
||||
id uuid,
|
||||
time timestamp,
|
||||
metadata list<tuple<
|
||||
text, boolean, text, boolean, text, boolean
|
||||
>>,
|
||||
entity_embeddings list<
|
||||
tuple<
|
||||
tuple<text, boolean>,
|
||||
list<list<double>>
|
||||
>
|
||||
>,
|
||||
PRIMARY KEY (user, collection, document_id, id)
|
||||
);
|
||||
""");
|
||||
|
||||
print("document_embeddings table...", flush=True)
|
||||
|
||||
self.cassandra.execute("""
|
||||
create table if not exists document_embeddings (
|
||||
user text,
|
||||
collection text,
|
||||
document_id text,
|
||||
id uuid,
|
||||
time timestamp,
|
||||
metadata list<tuple<
|
||||
text, boolean, text, boolean, text, boolean
|
||||
>>,
|
||||
chunks list<
|
||||
tuple<
|
||||
blob,
|
||||
list<list<double>>
|
||||
>
|
||||
>,
|
||||
PRIMARY KEY (user, collection, document_id, id)
|
||||
);
|
||||
""");
|
||||
|
||||
print("Cassandra schema OK.", flush=True)
|
||||
|
||||
def prepare_statements(self):
    """Prepare the four INSERT statements used by this store.

    Each statement is prepared through `self.cassandra.prepare` and kept on
    the instance as `insert_document_stmt`, `insert_triples_stmt`,
    `insert_graph_embeddings_stmt` and `insert_document_embeddings_stmt`.
    """

    prepare = self.cassandra.prepare

    # Document catalogue row.
    document_cql = """
        INSERT INTO document
        (
            id, user, collection, kind, object_id, time, title, comments,
            metadata
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    """
    self.insert_document_stmt = prepare(document_cql)

    # The three queue-output tables share the same leading columns and
    # differ only in the final payload column.
    triples_cql = """
        INSERT INTO triples
        (
            id, user, collection, document_id, time,
            metadata, triples
        )
        VALUES (?, ?, ?, ?, ?, ?, ?)
    """
    self.insert_triples_stmt = prepare(triples_cql)

    graph_embeddings_cql = """
        INSERT INTO graph_embeddings
        (
            id, user, collection, document_id, time,
            metadata, entity_embeddings
        )
        VALUES (?, ?, ?, ?, ?, ?, ?)
    """
    self.insert_graph_embeddings_stmt = prepare(graph_embeddings_cql)

    document_embeddings_cql = """
        INSERT INTO document_embeddings
        (
            id, user, collection, document_id, time,
            metadata, chunks
        )
        VALUES (?, ?, ?, ?, ?, ?, ?)
    """
    self.insert_document_embeddings_stmt = prepare(document_embeddings_cql)
|
||||
|
||||
def add(self, object_id, document):
|
||||
|
||||
if document.kind not in (
|
||||
|
|
@ -93,6 +193,7 @@ class TableStore:
|
|||
|
||||
# Create random doc ID
|
||||
doc_id = uuid.uuid4()
|
||||
when = int(time.time() * 1000)
|
||||
|
||||
print("Adding", object_id, doc_id)
|
||||
|
||||
|
|
@ -104,6 +205,8 @@ class TableStore:
|
|||
for v in document.metadata
|
||||
]
|
||||
|
||||
# FIXME: doc_id should be the user-supplied ID???
|
||||
|
||||
while True:
|
||||
|
||||
try:
|
||||
|
|
@ -111,8 +214,10 @@ class TableStore:
|
|||
resp = self.cassandra.execute(
|
||||
self.insert_document_stmt,
|
||||
(
|
||||
doc_id, document.user, document.collection,
|
||||
document.kind, object_id, metadata
|
||||
doc_id, document.user, document.collection,
|
||||
document.kind, object_id, when,
|
||||
document.title, document.comments,
|
||||
metadata
|
||||
)
|
||||
)
|
||||
|
||||
|
|
@ -126,6 +231,136 @@ class TableStore:
|
|||
|
||||
print("Add complete", flush=True)
|
||||
|
||||
def add_triples(self, m):
    """Insert the triples carried by message `m` using `insert_triples_stmt`.

    `m.triples` is flattened into six-field tuples
    (s value, s is_uri, p value, p is_uri, o value, o is_uri).
    Blocks until the insert succeeds.
    """
    self._insert_with_retry(
        self.insert_triples_stmt, m, self._triple_rows(m.triples),
    )

def add_graph_embeddings(self, m):
    """Insert the entity embeddings carried by message `m`.

    Each element of `m.entities` becomes
    ((entity value, entity is_uri), vectors).
    Blocks until the insert succeeds.
    """
    entities = [
        ((v.entity.value, v.entity.is_uri), v.vectors)
        for v in m.entities
    ]
    self._insert_with_retry(self.insert_graph_embeddings_stmt, m, entities)

def add_document_embeddings(self, m):
    """Insert the chunk embeddings carried by message `m`.

    Each element of `m.chunks` becomes (chunk, vectors).
    Blocks until the insert succeeds.
    """
    chunks = [(v.chunk, v.vectors) for v in m.chunks]
    self._insert_with_retry(self.insert_document_embeddings_stmt, m, chunks)

def _triple_rows(self, triples):
    """Flatten objects with `.s`, `.p`, `.o` terms into six-field tuples."""
    return [
        (
            t.s.value, t.s.is_uri, t.p.value, t.p.is_uri,
            t.o.value, t.o.is_uri,
        )
        for t in triples
    ]

def _insert_with_retry(self, stmt, m, payload, delay=1):
    """Execute `stmt` for message `m`, retrying until it succeeds.

    The bound row is (new uuid, user, collection, metadata id,
    current time in ms, flattened metadata triples, payload).

    Args:
        stmt: prepared statement taking the seven values above.
        m: message whose `metadata` supplies user, collection, id and an
            optional list of metadata triples.
        payload: value for the statement's final column.
        delay: seconds to sleep between attempts.
    """
    # Build the row once, outside the retry loop.  Re-using the same id on
    # every attempt means a write that timed out but actually landed is
    # overwritten by the retry instead of being duplicated under a new id.
    row = (
        uuid.uuid4(), m.metadata.user,
        m.metadata.collection, m.metadata.id, int(time.time() * 1000),
        self._triple_rows(m.metadata.metadata or []), payload,
    )

    while True:

        try:
            self.cassandra.execute(stmt, row)
            return
        except Exception as e:
            print("Exception:", type(e))
            print(f"{e}, retry...", flush=True)
            time.sleep(delay)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue