mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-07-01 17:39:39 +02:00
Librarian (#304)
This commit is contained in:
parent
e99c0ac238
commit
a0bf2362f6
32 changed files with 922 additions and 66 deletions
3
trustgraph-flow/trustgraph/librarian/__init__.py
Normal file
3
trustgraph-flow/trustgraph/librarian/__init__.py
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
|
||||
from . service import *
|
||||
|
||||
7
trustgraph-flow/trustgraph/librarian/__main__.py
Executable file
7
trustgraph-flow/trustgraph/librarian/__main__.py
Executable file
|
|
@ -0,0 +1,7 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from . service import run
|
||||
|
||||
# Only launch the service when this file is executed as a program; importing
# it must have no side effects.
if __name__ == "__main__":
    run()
|
||||
|
||||
51
trustgraph-flow/trustgraph/librarian/blob_store.py
Normal file
51
trustgraph-flow/trustgraph/librarian/blob_store.py
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
from .. schema import LibrarianRequest, LibrarianResponse, Error
|
||||
from .. knowledge import hash
|
||||
from .. exceptions import RequestError
|
||||
|
||||
from minio import Minio
|
||||
import time
|
||||
import io
|
||||
|
||||
class BlobStore:
    """
    Thin wrapper around a MinIO client which stores document blobs in a
    single bucket, under object names of the form "doc/<object_id>".
    """

    def __init__(
            self,
            minio_host, minio_access_key, minio_secret_key, bucket_name,
            secure=False,
    ):
        """
        Create the MinIO client and make sure the target bucket exists.

        minio_host: MinIO endpoint passed straight to the Minio client.
        minio_access_key: access key / username.
        minio_secret_key: secret key / password.
        bucket_name: bucket in which all blobs are stored.
        secure: passed to the Minio client's `secure` option.  Defaults
            to False, which is the value that used to be hard-coded.
        """

        self.minio = Minio(
            minio_host,
            access_key=minio_access_key,
            secret_key=minio_secret_key,
            secure=secure,
        )

        self.bucket_name = bucket_name

        print("Connected to minio", flush=True)

        self.ensure_bucket()

    def ensure_bucket(self):
        """Create the configured bucket if it does not already exist."""

        if self.minio.bucket_exists(self.bucket_name):
            print("Bucket", self.bucket_name, "already exists", flush=True)
            return

        self.minio.make_bucket(self.bucket_name)
        print("Created bucket", self.bucket_name, flush=True)

    def add(self, object_id, blob, kind):
        """
        Store a blob under "doc/<object_id>".

        object_id: identifier of the object; converted with str().
        blob: the content to store; its len() is sent as the object
            length and it is wrapped in io.BytesIO, so it must be
            bytes-like.
        kind: stored as the object's content type.
        """

        # FIXME: Loop retry
        self.minio.put_object(
            bucket_name=self.bucket_name,
            object_name="doc/" + str(object_id),
            length=len(blob),
            data=io.BytesIO(blob),
            content_type=kind,
        )

        print("Add blob complete", flush=True)
|
||||
55
trustgraph-flow/trustgraph/librarian/librarian.py
Normal file
55
trustgraph-flow/trustgraph/librarian/librarian.py
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
from .. schema import LibrarianRequest, LibrarianResponse, Error
|
||||
from .. knowledge import hash
|
||||
from .. exceptions import RequestError
|
||||
from . table_store import TableStore
|
||||
from . blob_store import BlobStore
|
||||
|
||||
import uuid
|
||||
|
||||
class Librarian:
    """
    Coordinates document storage: the raw content goes to a BlobStore,
    the document record goes to a TableStore, and the document is then
    handed to a loader callback chosen by its kind.
    """

    def __init__(
            self,
            cassandra_host, cassandra_user, cassandra_password,
            minio_host, minio_access_key, minio_secret_key,
            bucket_name, keyspace, load_document, load_text,
    ):
        """
        Build the blob and table stores and remember the loader callbacks.

        load_document and load_text are callables invoked as
        callback(id, document) for PDF and plain-text documents
        respectively.
        """

        # Callbacks which push a stored document on for processing
        self.load_document = load_document
        self.load_text = load_text

        # The blob store is constructed before the table store
        self.blob_store = BlobStore(
            minio_host=minio_host,
            minio_access_key=minio_access_key,
            minio_secret_key=minio_secret_key,
            bucket_name=bucket_name,
        )

        self.table_store = TableStore(
            cassandra_host=cassandra_host,
            cassandra_user=cassandra_user,
            cassandra_password=cassandra_password,
            keyspace=keyspace,
        )

    def add(self, id, document):
        """
        Store a document and submit it for loading.

        id: identifier passed through to the loader callback.
        document: object with kind, document, and whatever fields the
            table store and loader callbacks read.

        Returns a LibrarianResponse with all fields set to None.
        Raises RequestError if the document kind is not supported.
        """

        kind = document.kind

        if kind not in ("text/plain", "application/pdf"):
            raise RequestError("Invalid document kind: " + kind)

        # Object ID is derived from a hash of the document content.
        # NOTE(review): hash is presumably the helper imported by this
        # module rather than the builtin, returning something uuid.UUID
        # accepts -- confirm.
        object_id = uuid.UUID(hash(document.document))

        self.blob_store.add(object_id, document.document, kind)
        self.table_store.add(object_id, document)

        # Only two kinds pass validation, so anything that is not a PDF
        # is plain text
        if kind == "application/pdf":
            loader = self.load_document
        else:
            loader = self.load_text

        loader(id, document)

        print("Add complete", flush=True)

        return LibrarianResponse(error=None, document=None, info=None)
|
||||
|
||||
352
trustgraph-flow/trustgraph/librarian/service.py
Executable file
352
trustgraph-flow/trustgraph/librarian/service.py
Executable file
|
|
@ -0,0 +1,352 @@
|
|||
|
||||
"""
|
||||
Librarian service, manages documents in collections
|
||||
"""
|
||||
|
||||
from functools import partial
|
||||
import asyncio
|
||||
import threading
|
||||
import queue
|
||||
|
||||
from pulsar.schema import JsonSchema
|
||||
|
||||
from .. schema import LibrarianRequest, LibrarianResponse, Error
|
||||
from .. schema import librarian_request_queue, librarian_response_queue
|
||||
|
||||
from .. schema import GraphEmbeddings
|
||||
from .. schema import graph_embeddings_store_queue
|
||||
from .. schema import Triples
|
||||
from .. schema import triples_store_queue
|
||||
from .. schema import DocumentEmbeddings
|
||||
from .. schema import document_embeddings_store_queue
|
||||
|
||||
from .. schema import Document, Metadata
|
||||
from .. schema import document_ingest_queue
|
||||
from .. schema import TextDocument, Metadata
|
||||
from .. schema import text_ingest_queue
|
||||
|
||||
from .. base import Publisher
|
||||
from .. base import Subscriber
|
||||
|
||||
from .. log_level import LogLevel
|
||||
from .. base import ConsumerProducer
|
||||
from .. exceptions import RequestError
|
||||
|
||||
from . librarian import Librarian
|
||||
|
||||
# Module identity: the dotted module path with its first and last components
# dropped.
module = ".".join(__name__.split(".")[1:-1])

# Queue and subscription defaults
default_input_queue = librarian_request_queue
default_output_queue = librarian_response_queue
default_subscriber = module

# MinIO (blob store) defaults
default_minio_host = "minio:9000"
default_minio_access_key = "minioadmin"
default_minio_secret_key = "minioadmin"
bucket_name = "library"

# Cassandra (table store) defaults
default_cassandra_host = "cassandra"

# FIXME: How to ensure this doesn't conflict with other usage?
keyspace = "librarian"
|
||||
|
||||
class Processor(ConsumerProducer):
    """
    Librarian request processor.

    Consumes LibrarianRequest messages, dispatches each one to a Librarian
    instance, and publishes a LibrarianResponse carrying the same "id"
    message property as the request.
    """

    def __init__(self, **params):
        """
        Set up the publishers, the triples subscriber and the Librarian.

        Recognised optional keys in params: input_queue, output_queue,
        subscriber, minio_host, minio_access_key, minio_secret_key,
        cassandra_host (comma-separated host list), cassandra_user,
        cassandra_password, triples_queue, document_load_queue and
        text_load_queue.  Anything else is passed through to the base
        class unchanged.
        """

        # Read by the triples reader thread; cleared in __del__
        self.running = True

        input_queue = params.get("input_queue", default_input_queue)
        output_queue = params.get("output_queue", default_output_queue)
        subscriber = params.get("subscriber", default_subscriber)

        minio_host = params.get("minio_host", default_minio_host)
        minio_access_key = params.get(
            "minio_access_key", default_minio_access_key
        )
        minio_secret_key = params.get(
            "minio_secret_key", default_minio_secret_key
        )

        cassandra_host = params.get("cassandra_host", default_cassandra_host)
        cassandra_user = params.get("cassandra_user")
        cassandra_password = params.get("cassandra_password")

        # Fall back to the same defaults that add_args advertises, so that
        # constructing a Processor without command-line arguments does not
        # end up with queue names of None.
        triples_queue = params.get("triples_queue", triples_store_queue)
        document_load_queue = params.get(
            "document_load_queue", document_ingest_queue
        )
        text_load_queue = params.get("text_load_queue", text_ingest_queue)

        # NOTE: graph_embeddings_queue and document_embeddings_queue are
        # accepted by add_args but nothing in this class consumes them yet.

        # The secret key and Cassandra password are deliberately left out
        # of the settings handed to the base class, as in the original.
        super(Processor, self).__init__(
            **params | {
                "input_queue": input_queue,
                "output_queue": output_queue,
                "subscriber": subscriber,
                "input_schema": LibrarianRequest,
                "output_schema": LibrarianResponse,
                "minio_host": minio_host,
                "minio_access_key": minio_access_key,
                "cassandra_host": cassandra_host,
                "cassandra_user": cassandra_user,
            }
        )

        self.document_load = Publisher(
            self.pulsar_host, document_load_queue, JsonSchema(Document),
            listener=self.pulsar_listener,
        )

        self.text_load = Publisher(
            self.pulsar_host, text_load_queue, JsonSchema(TextDocument),
            listener=self.pulsar_listener,
        )

        # Uses the configured triples queue; previously the parameter was
        # read but the module constant was always used here.
        self.triples_load = Subscriber(
            self.pulsar_host, triples_queue,
            "librarian", "librarian",
            schema=JsonSchema(Triples),
            listener=self.pulsar_listener,
        )

        self.document_load.start()
        self.text_load.start()
        self.triples_load.start()

        self.triples_sub = self.triples_load.subscribe_all("x")

        # Daemon thread: the thread target is a bound method, so a
        # non-daemon thread would keep this object alive, __del__ would
        # never clear self.running, and the process could not exit.
        self.triples_reader = threading.Thread(
            target=self.receive_triples, daemon=True,
        )
        self.triples_reader.start()

        self.librarian = Librarian(
            cassandra_host=cassandra_host.split(","),
            cassandra_user=cassandra_user,
            cassandra_password=cassandra_password,
            minio_host=minio_host,
            minio_access_key=minio_access_key,
            minio_secret_key=minio_secret_key,
            bucket_name=bucket_name,
            keyspace=keyspace,
            load_document=self.load_document,
            load_text=self.load_text,
        )

        print("Initialised.", flush=True)

    def receive_triples(self):
        """
        Thread body: print every message arriving on the triples
        subscription until self.running is cleared.
        """

        print("Receive triples!")

        while self.running:

            try:
                # Short timeout so the running flag is re-checked regularly
                msg = self.triples_sub.get(timeout=1)
            except queue.Empty:
                print("Tick")
                continue

            print(msg)

        print("BYE")

    def __del__(self):
        """Stop the reader loop and shut down whatever was started."""

        self.running = False

        # Each attribute is checked because __init__ may have failed
        # part-way through.
        if hasattr(self, "triples_sub"):
            # subscribe_all was called on the Subscriber, so the matching
            # unsubscribe belongs there too; triples_sub itself is only
            # the object messages are read from.
            self.triples_load.unsubscribe_all("x")

        if hasattr(self, "document_load"):
            self.document_load.stop()
            self.document_load.join()

        if hasattr(self, "text_load"):
            self.text_load.stop()
            self.text_load.join()

        if hasattr(self, "triples_load"):
            self.triples_load.stop()
            self.triples_load.join()

    def load_document(self, id, document):
        """Publish the document as a Document on the document load queue."""

        doc = Document(
            metadata=Metadata(
                id=id,
                metadata=document.metadata,
                user=document.user,
                collection=document.collection,
            ),
            data=document.document,
        )

        self.document_load.send(None, doc)

    def load_text(self, id, document):
        """Publish the document as a TextDocument on the text load queue."""

        doc = TextDocument(
            metadata=Metadata(
                id=id,
                metadata=document.metadata,
                user=document.user,
                collection=document.collection,
            ),
            text=document.document,
        )

        self.text_load.send(None, doc)

    def parse_request(self, v):
        """
        Validate a request and return a zero-argument callable which
        performs it.

        Raises RequestError for a missing operation, an unknown operation,
        or an "add" request lacking any of id, document, document metadata,
        document content or document kind.
        """

        if v.operation is None:
            raise RequestError("Null operation")

        if v.operation == "add":

            print(v)

            complete = (
                v.id and v.document and v.document.metadata and
                v.document.document and v.document.kind
            )

            if not complete:
                raise RequestError("Invalid call")

            return partial(
                self.librarian.add,
                id=v.id,
                document=v.document,
            )

        raise RequestError("Invalid operation: " + v.operation)

    @staticmethod
    def _error_response(kind, message):
        """Build a LibrarianResponse carrying only an Error."""

        return LibrarianResponse(
            error=Error(
                type=kind,
                message=message,
            )
        )

    def handle(self, msg):
        """
        Process one request message and always send exactly one response.

        RequestError (from parsing or from executing the request) is
        reported as a "request-error" response; any other exception raised
        while executing the request is reported as "processing-error".
        """

        v = msg.value()

        # Sender-produced ID, echoed back on the response
        request_id = msg.properties()["id"]
        properties = {"id": request_id}

        print(f"Handling input {request_id}...", flush=True)

        try:
            func = self.parse_request(v)
        except RequestError as e:
            resp = self._error_response("request-error", str(e))
            self.producer.send(resp, properties=properties)
            return

        try:
            resp = func()
        except RequestError as e:
            resp = self._error_response("request-error", str(e))
            self.producer.send(resp, properties=properties)
            return
        except Exception as e:
            print("Exception:", e, flush=True)
            resp = self._error_response(
                "processing-error", "Unhandled error: " + str(e)
            )
            self.producer.send(resp, properties=properties)
            return

        print("Send response...", flush=True)

        self.producer.send(resp, properties=properties)

        print("Done.", flush=True)

    @staticmethod
    def add_args(parser):
        """Register the librarian's command-line arguments on parser."""

        ConsumerProducer.add_args(
            parser, default_input_queue, default_subscriber,
            default_output_queue,
        )

        parser.add_argument(
            '--minio-host',
            default=default_minio_host,
            help=f'Minio hostname (default: {default_minio_host})',
        )

        parser.add_argument(
            '--minio-access-key',
            default=default_minio_access_key,
            help='Minio access key / username '
            f'(default: {default_minio_access_key})',
        )

        parser.add_argument(
            '--minio-secret-key',
            default=default_minio_secret_key,
            help='Minio secret key / password '
            f'(default: {default_minio_secret_key})',
        )

        parser.add_argument(
            '--cassandra-host',
            default=default_cassandra_host,
            help=f'Cassandra host (default: {default_cassandra_host})'
        )

        parser.add_argument(
            '--cassandra-user',
            default=None,
            help='Cassandra user'
        )

        parser.add_argument(
            '--cassandra-password',
            default=None,
            help='Cassandra password'
        )

        parser.add_argument(
            '--triples-queue',
            default=triples_store_queue,
            help=f'Triples queue (default: {triples_store_queue})'
        )

        parser.add_argument(
            '--graph-embeddings-queue',
            default=graph_embeddings_store_queue,
            help='Graph embeddings queue '
            f'(default: {graph_embeddings_store_queue})'
        )

        parser.add_argument(
            '--document-embeddings-queue',
            default=document_embeddings_store_queue,
            help='Document embeddings queue '
            f'(default: {document_embeddings_store_queue})'
        )

        parser.add_argument(
            '--document-load-queue',
            default=document_ingest_queue,
            help='Document load queue '
            f'(default: {document_ingest_queue})'
        )

        parser.add_argument(
            '--text-load-queue',
            default=text_ingest_queue,
            help='Text ingest queue '
            f'(default: {text_ingest_queue})'
        )
|
||||
|
||||
def run():
    """Start the librarian Processor, passing the module name and the
    module-level docstring to Processor.start."""

    # __doc__ resolves to the module global here, not this function's
    # own docstring.
    Processor.start(module, __doc__)
|
||||
|
||||
131
trustgraph-flow/trustgraph/librarian/table_store.py
Normal file
131
trustgraph-flow/trustgraph/librarian/table_store.py
Normal file
|
|
@ -0,0 +1,131 @@
|
|||
from .. schema import LibrarianRequest, LibrarianResponse, Error
|
||||
from .. knowledge import hash
|
||||
from .. exceptions import RequestError
|
||||
|
||||
from cassandra.cluster import Cluster
|
||||
from cassandra.auth import PlainTextAuthProvider
|
||||
from cassandra.query import BatchStatement
|
||||
import uuid
|
||||
import time
|
||||
|
||||
class TableStore:
    """
    Cassandra-backed table of document records.

    On construction it connects to the cluster, creates the keyspace,
    the document table and its object_id index if they are missing, and
    prepares the insert statement used by add().
    """

    def __init__(
            self,
            cassandra_host, cassandra_user, cassandra_password, keyspace,
    ):
        """
        Connect to Cassandra and ensure the schema exists.

        cassandra_host: contact points, passed straight to Cluster().
        cassandra_user / cassandra_password: used for plain-text
            authentication only when both are set.
        keyspace: keyspace to create (if needed) and use.
        """

        self.keyspace = keyspace

        print("Connecting to Cassandra...", flush=True)

        if cassandra_user and cassandra_password:
            auth_provider = PlainTextAuthProvider(
                username=cassandra_user, password=cassandra_password
            )
            self.cluster = Cluster(
                cassandra_host,
                auth_provider=auth_provider
            )
        else:
            self.cluster = Cluster(cassandra_host)

        self.cassandra = self.cluster.connect()

        print("Connected.", flush=True)

        # Must run before prepare(): it selects the keyspace and creates
        # the table the statement refers to.
        self.ensure_cassandra_schema()

        self.insert_document_stmt = self.cassandra.prepare("""
            insert into document
            (id, user, collection, kind, object_id, metadata)
            values (?, ?, ?, ?, ?, ?)
        """)

    def ensure_cassandra_schema(self):
        """Create the keyspace, document table and index if missing."""

        print("Ensure Cassandra schema...", flush=True)

        print("Keyspace...", flush=True)

        # FIXME: Replication factor should be configurable
        self.cassandra.execute(f"""
            create keyspace if not exists {self.keyspace}
                with replication = {{
                   'class' : 'SimpleStrategy',
                   'replication_factor' : 1
                }};
        """)

        self.cassandra.set_keyspace(self.keyspace)

        print("document table...", flush=True)

        self.cassandra.execute("""
            create table if not exists document (
                user text,
                collection text,
                id uuid,
                kind text,
                object_id uuid,
                metadata list<tuple<
                    text, boolean, text, boolean, text, boolean
                >>,
                PRIMARY KEY (user, collection, id)
            );
        """)

        print("object index...", flush=True)

        self.cassandra.execute("""
            create index if not exists document_object
            on document ( object_id)
        """)

        print("Cassandra schema OK.", flush=True)

    def add(self, object_id, document, max_retries=None, retry_delay=1):
        """
        Insert a document record under a freshly generated random ID.

        object_id: identifier of the stored blob, written to the
            object_id column.
        document: object with kind, user, collection and metadata; each
            metadata item must expose s, p and o, each with value and
            is_uri.
        max_retries: how many times a failed insert is retried before
            the last exception is re-raised.  None (the default) retries
            indefinitely, which is the historical behaviour.
        retry_delay: seconds to sleep between attempts (default 1).

        Raises RequestError if the document kind is not supported.
        """

        if document.kind not in ("text/plain", "application/pdf"):
            raise RequestError("Invalid document kind: " + document.kind)

        # Create random doc ID.  It is generated once, so every retry
        # writes the same row.
        doc_id = uuid.uuid4()

        print("Adding", object_id, doc_id)

        # Flatten each (s, p, o) item into the 6-tuple the table expects
        metadata = [
            (
                v.s.value, v.s.is_uri, v.p.value, v.p.is_uri,
                v.o.value, v.o.is_uri
            )
            for v in document.metadata
        ]

        row = (
            doc_id, document.user, document.collection,
            document.kind, object_id, metadata
        )

        failures = 0

        while True:

            try:
                self.cassandra.execute(self.insert_document_stmt, row)
                break

            except Exception as e:

                failures += 1

                print("Exception:", type(e))

                # Give up once the caller's retry budget is spent
                if max_retries is not None and failures > max_retries:
                    raise

                print(f"{e}, retry...", flush=True)
                time.sleep(retry_delay)

        print("Add complete", flush=True)
|
||||
|
||||
|
||||
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue