Feature/flow librarian (#361)

* Update librarian to new API

* Implementing new schema with document + processing objects
This commit is contained in:
cybermaggedon 2025-05-04 22:26:19 +01:00 committed by GitHub
parent 6bf485788a
commit ff28d26f4d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 1323 additions and 428 deletions

View file

@ -4,20 +4,25 @@ import requests
import json import json
import sys import sys
import base64 import base64
import time
url = "http://localhost:8088/api/v1/" url = "http://localhost:8088/api/v1/"
############################################################################ ############################################################################
id = "http://trustgraph.ai/doc/12345678" id = "http://trustgraph.ai/doc/9fdee98b-b259-40ac-bcb9-8e82ccedeb04"
with open("docs/README.cats") as f: with open("docs/README.cats", "rb") as f:
doc = base64.b64encode(f.read().encode("utf-8")).decode("utf-8") doc = base64.b64encode(f.read()).decode("utf-8")
input = { input = {
"operation": "add", "operation": "add-document",
"document": { "document-metadata": {
"id": id, "id": id,
"time": int(time.time()),
"kind": "text/plain",
"title": "Mark's cats",
"comments": "Test doc taken from the TrustGraph repo",
"metadata": [ "metadata": [
{ {
"s": { "s": {
@ -46,13 +51,10 @@ input = {
}, },
}, },
], ],
"document": doc,
"kind": "text/plain",
"user": "trustgraph", "user": "trustgraph",
"collection": "default", "tags": ["mark", "cats"],
"title": "Mark's cats", },
"comments": "Test doc taken from the TrustGraph repo", "content": doc,
}
} }
resp = requests.post( resp = requests.post(

View file

@ -4,12 +4,13 @@ import requests
import json import json
import sys import sys
import base64 import base64
import time
url = "http://localhost:8088/api/v1/" url = "http://localhost:8088/api/v1/"
############################################################################ ############################################################################
id = "http://trustgraph.ai/doc/12345678" id = "http://trustgraph.ai/doc/6d034da9-2759-45c2-af24-14db7f4c44c2"
source = "../sources/20160001634.pdf" source = "../sources/20160001634.pdf"
@ -17,9 +18,13 @@ with open(source, "rb") as f:
doc = base64.b64encode(f.read()).decode("utf-8") doc = base64.b64encode(f.read()).decode("utf-8")
input = { input = {
"operation": "add", "operation": "add-document",
"id": id, "document-metadata": {
"document": { "id": id,
"time": int(time.time()),
"kind": "application/pdf",
"title": "Application of SAE ARP4754A to Flight Critical Systems",
"comments": "Application of federal safety standards to NASA spacecraft",
"metadata": [ "metadata": [
{ {
"s": { "s": {
@ -61,11 +66,10 @@ input = {
}, },
}, },
], ],
"document": doc,
"kind": "application/pdf",
"user": "trustgraph", "user": "trustgraph",
"collection": "default", "tags": ["nasa", "safety-engineering"],
} },
"content": doc,
} }
resp = requests.post( resp = requests.post(

View file

@ -0,0 +1,50 @@
#!/usr/bin/env python3
"""Register a processing record for an existing document with the librarian API."""

import requests
import json
import sys
import base64
import time

url = "http://localhost:8088/api/v1/"

############################################################################

doc_id = "http://trustgraph.ai/doc/9fdee98b-b259-40ac-bcb9-8e82ccedeb04"
proc_id = "2714fc72-44ab-45f2-94dd-6773fc336535"

# Named 'request_body' rather than 'input' so the builtin input() is not shadowed.
request_body = {
    "operation": "add-processing",
    "processing-metadata": {
        "id": proc_id,
        "document-id": doc_id,
        "time": int(time.time()),
        "flow": "0000",
        "user": "trustgraph",
        "collection": "default",
        "tags": ["test"],
    }
}

resp = requests.post(
    f"{url}librarian",
    json=request_body,
)

resp = resp.json()

# The librarian API reports failures in an "error" field of the JSON body
# rather than via HTTP status, so check the decoded body explicitly.
if "error" in resp:
    print(f"Error: {resp['error']}")
    sys.exit(1)

print(resp)

sys.exit(0)

############################################################################

View file

@ -0,0 +1,41 @@
#!/usr/bin/env python3
"""Fetch a document's content from the librarian service and print it as text."""

import requests
import json
import sys
import base64

url = "http://localhost:8088/api/v1/"

############################################################################

# 'doc_id' rather than 'id': avoid shadowing the builtin id().
doc_id = "http://trustgraph.ai/doc/9fdee98b-b259-40ac-bcb9-8e82ccedeb04"
user = "trustgraph"

# Named 'request_body' rather than 'input' so the builtin input() is not shadowed.
request_body = {
    "operation": "get-document-content",
    "user": user,
    "document-id": doc_id,
}

resp = requests.post(
    f"{url}librarian",
    json=request_body,
)

resp = resp.json()

if "error" in resp:
    print(f"Error: {resp['error']}")
    sys.exit(1)

# Content is returned base64-encoded; decode it to UTF-8 text for display.
content = base64.b64decode(resp["content"]).decode("utf-8")

print(content)

sys.exit(0)

############################################################################

View file

@ -0,0 +1,42 @@
#!/usr/bin/env python3
"""Fetch a document's metadata record from the librarian service."""

import requests
import json
import sys
import base64

url = "http://localhost:8088/api/v1/"

############################################################################

# 'doc_id' rather than 'id': avoid shadowing the builtin id().
doc_id = "http://trustgraph.ai/doc/9fdee98b-b259-40ac-bcb9-8e82ccedeb04"
user = "trustgraph"

# Named 'request_body' rather than 'input' so the builtin input() is not shadowed.
request_body = {
    "operation": "get-document-metadata",
    "user": user,
    "document-id": doc_id,
}

resp = requests.post(
    f"{url}librarian",
    json=request_body,
)

resp = resp.json()

# The librarian API reports failures in an "error" field of the JSON body
# rather than via HTTP status, so check the decoded body explicitly.
if "error" in resp:
    print(f"Error: {resp['error']}")
    sys.exit(1)

print(resp)

sys.exit(0)

############################################################################

View file

@ -12,7 +12,7 @@ url = "http://localhost:8088/api/v1/"
user = "trustgraph" user = "trustgraph"
input = { input = {
"operation": "list", "operation": "list-documents",
"user": user, "user": user,
} }

View file

@ -0,0 +1,38 @@
#!/usr/bin/env python3
"""List all document metadata records held by the librarian for one user."""

import requests
import json
import sys
import base64
import time

url = "http://localhost:8088/api/v1/"

############################################################################

# Named 'request_body' rather than 'input' so the builtin input() is not shadowed.
request_body = {
    "operation": "list-documents",
    "user": "trustgraph",
}

resp = requests.post(
    f"{url}librarian",
    json=request_body,
)

resp = resp.json()

# The librarian API reports failures in an "error" field of the JSON body
# rather than via HTTP status, so check the decoded body explicitly.
if "error" in resp:
    print(f"Error: {resp['error']}")
    sys.exit(1)

print(resp)

sys.exit(0)

############################################################################

View file

@ -0,0 +1,38 @@
#!/usr/bin/env python3
# List every processing record held by the librarian for the given user.

import requests
import json
import sys
import base64
import time

url = "http://localhost:8088/api/v1/"

############################################################################

payload = {
    "operation": "list-processing",
    "user": "trustgraph",
}

response = requests.post(f"{url}librarian", json=payload)

# Show the raw body first, then the decoded form, exactly as received.
print(response.text)

decoded = response.json()
print(decoded)

if "error" in decoded:
    print(f"Error: {decoded['error']}")
    sys.exit(1)

print(decoded)
sys.exit(0)

############################################################################

View file

@ -0,0 +1,41 @@
#!/usr/bin/env python3
"""Remove the 'cats' test document from the librarian service by document ID."""

import requests
import json
import sys
import base64
import time

url = "http://localhost:8088/api/v1/"

############################################################################

# 'doc_id' rather than 'id': avoid shadowing the builtin id().
doc_id = "http://trustgraph.ai/doc/9fdee98b-b259-40ac-bcb9-8e82ccedeb04"

# Named 'request_body' rather than 'input' so the builtin input() is not shadowed.
request_body = {
    "operation": "remove-document",
    "user": "trustgraph",
    "document-id": doc_id,
}

resp = requests.post(
    f"{url}librarian",
    json=request_body,
)

resp = resp.json()

# The librarian API reports failures in an "error" field of the JSON body
# rather than via HTTP status, so check the decoded body explicitly.
if "error" in resp:
    print(f"Error: {resp['error']}")
    sys.exit(1)

print(resp)

sys.exit(0)

############################################################################

View file

@ -0,0 +1,41 @@
#!/usr/bin/env python3
"""Remove the NASA PDF test document from the librarian service by document ID."""

import requests
import json
import sys
import base64
import time

url = "http://localhost:8088/api/v1/"

############################################################################

# 'doc_id' rather than 'id': avoid shadowing the builtin id().
doc_id = "http://trustgraph.ai/doc/6d034da9-2759-45c2-af24-14db7f4c44c2"

# Named 'request_body' rather than 'input' so the builtin input() is not shadowed.
request_body = {
    "operation": "remove-document",
    "user": "trustgraph",
    "document-id": doc_id,
}

resp = requests.post(
    f"{url}librarian",
    json=request_body,
)

resp = resp.json()

# The librarian API reports failures in an "error" field of the JSON body
# rather than via HTTP status, so check the decoded body explicitly.
if "error" in resp:
    print(f"Error: {resp['error']}")
    sys.exit(1)

print(resp)

sys.exit(0)

############################################################################

View file

@ -0,0 +1,41 @@
#!/usr/bin/env python3
"""Remove a processing record from the librarian service by processing ID."""

import requests
import json
import sys
import base64
import time

url = "http://localhost:8088/api/v1/"

############################################################################

proc_id = "2714fc72-44ab-45f2-94dd-6773fc336535"

# Named 'request_body' rather than 'input' so the builtin input() is not shadowed.
request_body = {
    "operation": "remove-processing",
    "user": "trustgraph",
    "processing-id": proc_id,
}

resp = requests.post(
    f"{url}librarian",
    json=request_body,
)

resp = resp.json()

# The librarian API reports failures in an "error" field of the JSON body
# rather than via HTTP status, so check the decoded body explicitly.
if "error" in resp:
    print(f"Error: {resp['error']}")
    sys.exit(1)

print(resp)

sys.exit(0)

############################################################################

View file

@ -0,0 +1,75 @@
#!/usr/bin/env python3
"""Update an existing document's metadata (title, comments, triples, tags)."""

import requests
import json
import sys
import base64
import time

url = "http://localhost:8088/api/v1/"

############################################################################

# 'doc_id' rather than 'id': avoid shadowing the builtin id().
doc_id = "http://trustgraph.ai/doc/9fdee98b-b259-40ac-bcb9-8e82ccedeb04"

# Named 'request_body' rather than 'input' so the builtin input() is not shadowed.
request_body = {
    "operation": "update-document",
    "document-metadata": {
        "id": doc_id,
        "time": int(time.time()),
        "title": "Mark's cats - a story",
        "comments": "Information about Mark's cats",
        # RDF-style triples: each of s/p/o carries a value "v" and a flag "e"
        # marking whether the term is an entity/URI (True) or a literal (False).
        "metadata": [
            {
                "s": {"v": doc_id, "e": True},
                "p": {
                    "v": "http://www.w3.org/2000/01/rdf-schema#label",
                    "e": True,
                },
                "o": {"v": "Mark's pets", "e": False},
            },
            {
                "s": {"v": doc_id, "e": True},
                "p": {"v": "https://schema.org/keywords", "e": True},
                "o": {"v": "cats", "e": False},
            },
        ],
        "user": "trustgraph",
        "tags": ["mark", "cats", "pets"],
    },
}

resp = requests.post(
    f"{url}librarian",
    json=request_body,
)

resp = resp.json()

# The librarian API reports failures in an "error" field of the JSON body
# rather than via HTTP status, so check the decoded body explicitly.
if "error" in resp:
    print(f"Error: {resp['error']}")
    sys.exit(1)

print(resp)

sys.exit(0)

############################################################################

View file

@ -6,16 +6,52 @@ from . types import Error
from . metadata import Metadata from . metadata import Metadata
from . documents import Document, TextDocument from . documents import Document, TextDocument
# add # add-document
# -> (id, document) # -> (document_id, document_metadata, content)
# <- () # <- ()
# <- (error) # <- (error)
# list # remove-document
# -> (user, collection?) # -> (document_id)
# <- (info) # <- ()
# <- (error) # <- (error)
# update-document
# -> (document_id, document_metadata)
# <- ()
# <- (error)
# get-document-metadata
# -> (document_id)
# <- (document_metadata)
# <- (error)
# get-document-content
# -> (document_id)
# <- (content)
# <- (error)
# add-processing
# -> (processing_id, processing_metadata)
# <- ()
# <- (error)
# remove-processing
# -> (processing_id)
# <- ()
# <- (error)
# list-documents
# -> (user, collection?)
# <- (document_metadata[])
# <- (error)
# list-processing
# -> (user, collection?)
# <- (processing_metadata[])
# <- (error)
# OLD:
# add(Metadata, Bytes) : error? # add(Metadata, Bytes) : error?
# copy(id, user, collection) # copy(id, user, collection)
# move(id, user, collection) # move(id, user, collection)
@ -26,26 +62,24 @@ from . documents import Document, TextDocument
# info(id[]) : DocumentInfo[] # info(id[]) : DocumentInfo[]
# search(<key,op,value>[]) : id[] # search(<key,op,value>[]) : id[]
class DocumentPackage(Record): class DocumentMetadata(Record):
id = String() id = String()
document = Bytes() time = Long()
kind = String() kind = String()
user = String()
collection = String()
title = String() title = String()
comments = String() comments = String()
time = Long()
metadata = Array(Triple()) metadata = Array(Triple())
user = String()
tags = Array(String())
class DocumentInfo(Record): class ProcessingMetadata(Record):
id = String() id = String()
kind = String() document_id = String()
time = Long()
flow = String()
user = String() user = String()
collection = String() collection = String()
title = String() tags = Array(String())
comments = String()
time = Long()
metadata = Array(Triple())
class Criteria(Record): class Criteria(Record):
key = String() key = String()
@ -53,17 +87,43 @@ class Criteria(Record):
operator = String() operator = String()
class LibrarianRequest(Record): class LibrarianRequest(Record):
# add-document, remove-document, update-document, get-document-metadata,
# get-document-content, add-processing, remove-processing, list-documents,
# list-processing
operation = String() operation = String()
id = String()
document = DocumentPackage() # add-document, remove-document, update-document, get-document-metadata,
# get-document-content
document_id = String()
# add-processing, remove-processing
processing_id = String()
# add-document, update-document
document_metadata = DocumentMetadata()
# add-processing
processing_metadata = ProcessingMetadata()
# add-document
content = Bytes()
# list-documents, list-processing
user = String() user = String()
# list-documents?, list-processing?
collection = String() collection = String()
#
criteria = Array(Criteria()) criteria = Array(Criteria())
class LibrarianResponse(Record): class LibrarianResponse(Record):
error = Error() error = Error()
document = DocumentPackage() document_metadata = DocumentMetadata()
info = Array(DocumentInfo()) content = Bytes()
document_metadatas = Array(DocumentMetadata())
processing_metadatas = Array(ProcessingMetadata())
librarian_request_queue = topic( librarian_request_queue = topic(
'librarian', kind='non-persistent', namespace='request' 'librarian', kind='non-persistent', namespace='request'

View file

@ -3,8 +3,6 @@
Config service. Manages system global configuration state Config service. Manages system global configuration state
""" """
from pulsar.schema import JsonSchema
from trustgraph.schema import Error from trustgraph.schema import Error
from trustgraph.schema import ConfigRequest, ConfigResponse, ConfigPush from trustgraph.schema import ConfigRequest, ConfigResponse, ConfigPush
@ -14,7 +12,6 @@ from trustgraph.schema import config_push_queue
from trustgraph.schema import FlowRequest, FlowResponse from trustgraph.schema import FlowRequest, FlowResponse
from trustgraph.schema import flow_request_queue, flow_response_queue from trustgraph.schema import flow_request_queue, flow_response_queue
from trustgraph.log_level import LogLevel
from trustgraph.base import AsyncProcessor, Consumer, Producer from trustgraph.base import AsyncProcessor, Consumer, Producer
from . config import Configuration from . config import Configuration

View file

@ -1,11 +1,15 @@
import base64
from ... schema import LibrarianRequest, LibrarianResponse from ... schema import LibrarianRequest, LibrarianResponse
from ... schema import librarian_request_queue from ... schema import librarian_request_queue
from ... schema import librarian_response_queue from ... schema import librarian_response_queue
from . requestor import ServiceRequestor from . requestor import ServiceRequestor
from . serialize import serialize_document_package, serialize_document_info from . serialize import serialize_document_metadata
from . serialize import to_document_package, to_document_info, to_criteria from . serialize import serialize_processing_metadata
from . serialize import to_document_metadata, to_processing_metadata
from . serialize import to_criteria
class LibrarianRequestor(ServiceRequestor): class LibrarianRequestor(ServiceRequestor):
def __init__(self, pulsar_client, consumer, subscriber, timeout=120): def __init__(self, pulsar_client, consumer, subscriber, timeout=120):
@ -23,20 +27,37 @@ class LibrarianRequestor(ServiceRequestor):
def to_request(self, body): def to_request(self, body):
if "document" in body: # Content gets base64 decoded & encoded again. It at least makes
dp = to_document_package(body["document"]) # sure payload is valid base64.
if "document-metadata" in body:
dm = to_document_metadata(body["document-metadata"])
else: else:
dp = None dm = None
if "processing-metadata" in body:
pm = to_processing_metadata(body["processing-metadata"])
else:
pm = None
if "criteria" in body: if "criteria" in body:
criteria = to_criteria(body["criteria"]) criteria = to_criteria(body["criteria"])
else: else:
criteria = None criteria = None
if "content" in body:
content = base64.b64decode(body["content"].encode("utf-8"))
content = base64.b64encode(content).decode("utf-8")
else:
content = None
return LibrarianRequest( return LibrarianRequest(
operation = body.get("operation", None), operation = body.get("operation", None),
id = body.get("id", None), document_id = body.get("document-id", None),
document = dp, processing_id = body.get("processing-id", None),
document_metadata = dm,
processing_metadata = pm,
content = content,
user = body.get("user", None), user = body.get("user", None),
collection = body.get("collection", None), collection = body.get("collection", None),
criteria = criteria, criteria = criteria,
@ -44,15 +65,28 @@ class LibrarianRequestor(ServiceRequestor):
def from_response(self, message): def from_response(self, message):
print(message)
response = {} response = {}
if message.document: if message.document_metadata:
response["document"] = serialize_document_package(message.document) response["document-metadata"] = serialize_document_metadata(
message.document_metadata
)
if message.info: if message.content:
response["info"] = [ response["content"] = message.content.decode("utf-8")
serialize_document_info(v)
for v in message.info if message.document_metadatas != None:
response["document-metadatas"] = [
serialize_document_metadata(v)
for v in message.document_metadatas
]
if message.processing_metadatas != None:
response["processing-metadatas"] = [
serialize_processing_metadata(v)
for v in message.processing_metadatas
] ]
return response, True return response, True

View file

@ -1,7 +1,7 @@
import base64 import base64
from ... schema import Value, Triple, DocumentPackage, DocumentInfo from ... schema import Value, Triple, DocumentMetadata, ProcessingMetadata
def to_value(x): def to_value(x):
return Value(value=x["v"], is_uri=x["e"]) return Value(value=x["v"], is_uri=x["e"])
@ -80,88 +80,86 @@ def serialize_document_embeddings(message):
], ],
} }
def serialize_document_package(message): def serialize_document_metadata(message):
ret = {} ret = {}
if message.id: if message.id:
ret["id"] = message.id ret["id"] = message.id
if message.metadata: if message.time:
ret["metadata"] = serialize_subgraph(message.metdata) ret["time"] = message.time
if message.document:
blob = base64.b64encode(
message.document.encode("utf-8")
).decode("utf-8")
ret["document"] = blob
if message.kind: if message.kind:
ret["kind"] = message.kind ret["kind"] = message.kind
if message.user:
ret["user"] = message.user
if message.collection:
ret["collection"] = message.collection
return ret
def serialize_document_info(message):
ret = {}
if message.id:
ret["id"] = message.id
if message.kind:
ret["kind"] = message.kind
if message.user:
ret["user"] = message.user
if message.collection:
ret["collection"] = message.collection
if message.title: if message.title:
ret["title"] = message.title ret["title"] = message.title
if message.comments: if message.comments:
ret["comments"] = message.comments ret["comments"] = message.comments
if message.time:
ret["time"] = message.time
if message.metadata: if message.metadata:
ret["metadata"] = serialize_subgraph(message.metadata) ret["metadata"] = serialize_subgraph(message.metadata)
if message.user:
ret["user"] = message.user
if message.tags:
ret["tags"] = message.tags
return ret return ret
def to_document_package(x): def serialize_processing_metadata(message):
return DocumentPackage( ret = {}
if message.id:
ret["id"] = message.id
if message.id:
ret["document-id"] = message.document_id
if message.time:
ret["time"] = message.time
if message.flow:
ret["flow"] = message.flow
if message.user:
ret["user"] = message.user
if message.collection:
ret["collection"] = message.collection
if message.tags:
ret["tags"] = message.tags
return ret
def to_document_metadata(x):
return DocumentMetadata(
id = x.get("id", None), id = x.get("id", None),
time = x.get("time", None),
kind = x.get("kind", None), kind = x.get("kind", None),
user = x.get("user", None),
collection = x.get("collection", None),
title = x.get("title", None), title = x.get("title", None),
comments = x.get("comments", None), comments = x.get("comments", None),
time = x.get("time", None),
document = x.get("document", None),
metadata = to_subgraph(x["metadata"]), metadata = to_subgraph(x["metadata"]),
user = x.get("user", None),
tags = x.get("tags", None),
) )
def to_document_info(x): def to_processing_metadata(x):
return DocumentInfo( return ProcessingMetadata(
id = x.get("id", None), id = x.get("id", None),
kind = x.get("kind", None), document_id = x.get("document-id", None),
time = x.get("time", None),
flow = x.get("flow", None),
user = x.get("user", None), user = x.get("user", None),
collection = x.get("collection", None), collection = x.get("collection", None),
title = x.get("title", None), tags = x.get("tags", None),
comments = x.get("comments", None),
time = x.get("time", None),
metadata = to_subgraph(x["metadata"]),
) )
def to_criteria(x): def to_criteria(x):
@ -169,3 +167,4 @@ def to_criteria(x):
Critera(v["key"], v["value"], v["operator"]) Critera(v["key"], v["value"], v["operator"])
for v in x for v in x
] ]

View file

@ -95,7 +95,6 @@ class Api:
await self.config_receiver.start() await self.config_receiver.start()
for ep in self.endpoints: for ep in self.endpoints:
ep.add_routes(self.app) ep.add_routes(self.app)

View file

@ -37,7 +37,7 @@ class BlobStore:
else: else:
print("Bucket", self.bucket_name, "already exists", flush=True) print("Bucket", self.bucket_name, "already exists", flush=True)
def add(self, object_id, blob, kind): async def add(self, object_id, blob, kind):
# FIXME: Loop retry # FIXME: Loop retry
self.minio.put_object( self.minio.put_object(
@ -49,3 +49,25 @@ class BlobStore:
) )
print("Add blob complete", flush=True) print("Add blob complete", flush=True)
async def remove(self, object_id):
# FIXME: Loop retry
self.minio.remove_object(
bucket_name = self.bucket_name,
object_name = "doc/" + str(object_id),
)
print("Remove blob complete", flush=True)
async def get(self, object_id):
# FIXME: Loop retry
resp = self.minio.get_object(
bucket_name = self.bucket_name,
object_name = "doc/" + str(object_id),
)
return resp.read()

View file

@ -1,8 +1,10 @@
from .. schema import LibrarianRequest, LibrarianResponse, Error, Triple from .. schema import LibrarianRequest, LibrarianResponse, Error, Triple
from .. knowledge import hash from .. knowledge import hash
from .. exceptions import RequestError from .. exceptions import RequestError
from . table_store import TableStore from . table_store import TableStore
from . blob_store import BlobStore from . blob_store import BlobStore
import base64
import uuid import uuid
@ -26,63 +28,240 @@ class Librarian:
self.load_document = load_document self.load_document = load_document
self.load_text = load_text self.load_text = load_text
async def add(self, document): async def add_document(self, request):
if document.kind not in ( if request.document_metadata.kind not in (
"text/plain", "application/pdf" "text/plain", "application/pdf"
): ):
raise RequestError("Invalid document kind: " + document.kind) raise RequestError(
"Invalid document kind: " + request.document_metadata.kind
)
# Create object ID as a hash of the document if await self.table_store.document_exists(
object_id = uuid.UUID(hash(document.document)) request.document_metadata.user,
request.document_metadata.id
):
raise RuntimeError("Document already exists")
self.blob_store.add(object_id, document.document, document.kind) # Create object ID for blob
object_id = uuid.uuid4()
self.table_store.add(object_id, document) print("Add blob...")
if document.kind == "application/pdf": await self.blob_store.add(
await self.load_document(document) object_id, base64.b64decode(request.content),
elif document.kind == "text/plain": request.document_metadata.kind
await self.load_text(document) )
print("Add table...")
await self.table_store.add_document(
request.document_metadata, object_id
)
print("Add complete", flush=True) print("Add complete", flush=True)
return LibrarianResponse( return LibrarianResponse(
error = None, error = None,
document = None, document_metadata = None,
info = None, content = None,
document_metadatas = None,
processing_metadatas = None,
) )
async def list(self, user, collection): async def remove_document(self, request):
print("list") print("Removing doc...")
info = self.table_store.list(user, collection) if not await self.table_store.document_exists(
request.user,
request.document_id,
):
raise RuntimeError("Document does not exist")
print(">>", info) object_id = await self.table_store.get_document_object_id(
request.user,
request.document_id
)
# Remove blob...
await self.blob_store.remove(object_id)
# Remove doc table row
await self.table_store.remove_document(
request.user,
request.document_id
)
print("Remove complete", flush=True)
return LibrarianResponse( return LibrarianResponse(
error = None, error = None,
document = None, document_metadata = None,
info = info, content = None,
document_metadatas = None,
processing_metadatas = None,
) )
def handle_triples(self, m): async def update_document(self, request):
self.table_store.add_triples(m)
def handle_graph_embeddings(self, m): print("Updating doc...")
self.table_store.add_graph_embeddings(m)
def handle_document_embeddings(self, m): # You can't update the document ID, user or kind.
self.table_store.add_document_embeddings(m)
if not await self.table_store.document_exists(
request.document_metadata.user,
request.document_metadata.id
):
raise RuntimeError("Document does not exist")
await self.table_store.update_document(request.document_metadata)
print("Update complete", flush=True)
return LibrarianResponse(
error = None,
document_metadata = None,
content = None,
document_metadatas = None,
processing_metadatas = None,
)
async def get_document_metadata(self, request):
print("Get doc...")
doc = await self.table_store.get_document(
request.user,
request.document_id
)
print("Get complete", flush=True)
return LibrarianResponse(
error = None,
document_metadata = doc,
content = None,
document_metadatas = None,
processing_metadatas = None,
)
async def get_document_content(self, request):
print("Get doc content...")
object_id = await self.table_store.get_document_object_id(
request.user,
request.document_id
)
content = await self.blob_store.get(
object_id
)
print("Get complete", flush=True)
return LibrarianResponse(
error = None,
document_metadata = None,
content = base64.b64encode(content),
document_metadatas = None,
processing_metadatas = None,
)
async def add_processing(self, request):
print("Add processing")
if await self.table_store.processing_exists(
request.processing_metadata.user,
request.processing_metadata.id
):
raise RuntimeError("Processing already exists")
doc = await self.table_store.get_document(
request.processing_metadata.user,
request.processing_metadata.document_id
)
object_id = await self.table_store.get_document_object_id(
request.processing_metadata.user,
request.processing_metadata.document_id
)
content = await self.blob_store.get(
object_id
)
print("Got content")
print("Add processing...")
await self.table_store.add_processing(request.processing_metadata)
print("Add complete", flush=True)
return LibrarianResponse(
error = None,
document_metadata = None,
content = None,
document_metadatas = None,
processing_metadatas = None,
)
def handle_triples(self, m): # if document.kind == "application/pdf":
self.table_store.add_triples(m) # await self.load_document(document)
# elif document.kind == "text/plain":
# await self.load_text(document)
def handle_graph_embeddings(self, m): async def remove_processing(self, request):
self.table_store.add_graph_embeddings(m)
def handle_document_embeddings(self, m): print("Removing processing...")
self.table_store.add_document_embeddings(m)
if not await self.table_store.processing_exists(
request.user,
request.processing_id,
):
raise RuntimeError("Processing object does not exist")
# Remove doc table row
await self.table_store.remove_processing(
request.user,
request.processing_id
)
print("Remove complete", flush=True)
return LibrarianResponse(
error = None,
document_metadata = None,
content = None,
document_metadatas = None,
processing_metadatas = None,
)
async def list_documents(self, request):
docs = await self.table_store.list_documents(request.user)
return LibrarianResponse(
error = None,
document_metadata = None,
content = None,
document_metadatas = docs,
processing_metadatas = None,
)
async def list_processing(self, request):
procs = await self.table_store.list_processing(request.user)
return LibrarianResponse(
error = None,
document_metadata = None,
content = None,
document_metadatas = None,
processing_metadatas = procs,
)

View file

@ -5,41 +5,27 @@ Librarian service, manages documents in collections
from functools import partial from functools import partial
import asyncio import asyncio
import threading
import queue
import base64 import base64
import json
from pulsar.schema import JsonSchema from .. base import AsyncProcessor, Consumer, Producer, Publisher, Subscriber
from .. base import ConsumerMetrics, ProducerMetrics
from .. schema import LibrarianRequest, LibrarianResponse, Error from .. schema import LibrarianRequest, LibrarianResponse, Error
from .. schema import librarian_request_queue, librarian_response_queue from .. schema import librarian_request_queue, librarian_response_queue
from .. schema import GraphEmbeddings
from .. schema import graph_embeddings_store_queue
from .. schema import Triples
from .. schema import triples_store_queue
from .. schema import DocumentEmbeddings
from .. schema import document_embeddings_store_queue
from .. schema import Document, Metadata from .. schema import Document, Metadata
from .. schema import document_ingest_queue
from .. schema import TextDocument, Metadata from .. schema import TextDocument, Metadata
from .. schema import text_ingest_queue
from .. base import Publisher
from .. base import Subscriber
from .. log_level import LogLevel
from .. base import ConsumerProducer
from .. exceptions import RequestError from .. exceptions import RequestError
from . librarian import Librarian from . librarian import Librarian
module = "librarian" default_ident = "librarian"
default_librarian_request_queue = librarian_request_queue
default_librarian_response_queue = librarian_response_queue
default_input_queue = librarian_request_queue
default_output_queue = librarian_response_queue
default_subscriber = module
default_minio_host = "minio:9000" default_minio_host = "minio:9000"
default_minio_access_key = "minioadmin" default_minio_access_key = "minioadmin"
default_minio_secret_key = "minioadmin" default_minio_secret_key = "minioadmin"
@ -50,15 +36,21 @@ bucket_name = "library"
# FIXME: How to ensure this doesn't conflict with other usage? # FIXME: How to ensure this doesn't conflict with other usage?
keyspace = "librarian" keyspace = "librarian"
class Processor(ConsumerProducer): class Processor(AsyncProcessor):
def __init__(self, **params): def __init__(self, **params):
self.running = True id = params.get("id")
input_queue = params.get("input_queue", default_input_queue) # self.running = True
output_queue = params.get("output_queue", default_output_queue)
subscriber = params.get("subscriber", default_subscriber) librarian_request_queue = params.get(
"librarian_request_queue", default_librarian_request_queue
)
librarian_response_queue = params.get(
"librarian_response_queue", default_librarian_response_queue
)
minio_host = params.get("minio_host", default_minio_host) minio_host = params.get("minio_host", default_minio_host)
minio_access_key = params.get( minio_access_key = params.get(
@ -74,19 +66,10 @@ class Processor(ConsumerProducer):
cassandra_user = params.get("cassandra_user") cassandra_user = params.get("cassandra_user")
cassandra_password = params.get("cassandra_password") cassandra_password = params.get("cassandra_password")
triples_queue = params.get("triples_queue")
graph_embeddings_queue = params.get("graph_embeddings_queue")
document_embeddings_queue = params.get("document_embeddings_queue")
document_load_queue = params.get("document_load_queue")
text_load_queue = params.get("text_load_queue")
super(Processor, self).__init__( super(Processor, self).__init__(
**params | { **params | {
"input_queue": input_queue, "librarian_request_queue": librarian_request_queue,
"output_queue": output_queue, "librarian_response_queue": librarian_response_queue,
"subscriber": subscriber,
"input_schema": LibrarianRequest,
"output_schema": LibrarianResponse,
"minio_host": minio_host, "minio_host": minio_host,
"minio_access_key": minio_access_key, "minio_access_key": minio_access_key,
"cassandra_host": cassandra_host, "cassandra_host": cassandra_host,
@ -94,38 +77,30 @@ class Processor(ConsumerProducer):
} }
) )
self.document_load = Publisher( librarian_request_metrics = ConsumerMetrics(
self.client, document_load_queue, JsonSchema(Document), processor = self.id, flow = None, name = "librarian-request"
) )
self.text_load = Publisher( librarian_response_metrics = ProducerMetrics(
self.client, text_load_queue, JsonSchema(TextDocument), processor = self.id, flow = None, name = "librarian-response"
) )
self.triples_brk = Subscriber( self.librarian_request_consumer = Consumer(
self.client, triples_store_queue, taskgroup = self.taskgroup,
"librarian", "librarian", client = self.pulsar_client,
schema=JsonSchema(Triples), flow = None,
) topic = librarian_request_queue,
self.graph_embeddings_brk = Subscriber( subscriber = id,
self.client, graph_embeddings_store_queue, schema = LibrarianRequest,
"librarian", "librarian", handler = self.on_librarian_request,
schema=JsonSchema(GraphEmbeddings), metrics = librarian_request_metrics,
)
self.document_embeddings_brk = Subscriber(
self.client, document_embeddings_store_queue,
"librarian", "librarian",
schema=JsonSchema(DocumentEmbeddings),
) )
self.triples_reader = threading.Thread( self.librarian_response_producer = Producer(
target=self.receive_triples client = self.pulsar_client,
) topic = librarian_response_queue,
self.graph_embeddings_reader = threading.Thread( schema = LibrarianResponse,
target=self.receive_graph_embeddings metrics = librarian_response_metrics,
)
self.document_embeddings_reader = threading.Thread(
target=self.receive_document_embeddings
) )
self.librarian = Librarian( self.librarian = Librarian(
@ -141,87 +116,34 @@ class Processor(ConsumerProducer):
load_text = self.load_text, load_text = self.load_text,
) )
self.register_config_handler(self.on_librarian_config)
self.flows = {}
print("Initialised.", flush=True) print("Initialised.", flush=True)
async def start(self): async def start(self):
self.document_load.start()
self.text_load.start()
self.triples_brk.start() await super(Processor, self).start()
self.graph_embeddings_brk.start() await self.librarian_request_consumer.start()
self.document_embeddings_brk.start() await self.librarian_response_producer.start()
self.triples_sub = self.triples_brk.subscribe_all("x") async def on_librarian_config(self, config, version):
self.graph_embeddings_sub = self.graph_embeddings_brk.subscribe_all("x")
self.document_embeddings_sub = self.document_embeddings_brk.subscribe_all("x")
self.triples_reader.start() print("config version", version)
self.graph_embeddings_reader.start()
self.document_embeddings_reader.start() if "flows" in config:
self.flows = {
k: json.loads(v)
for k, v in config["flows"].items()
}
print(self.flows)
def __del__(self): def __del__(self):
self.running = False pass
if hasattr(self, "document_load"):
self.document_load.stop()
self.document_load.join()
if hasattr(self, "text_load"):
self.text_load.stop()
self.text_load.join()
if hasattr(self, "triples_sub"):
self.triples_sub.unsubscribe_all("x")
if hasattr(self, "graph_embeddings_sub"):
self.graph_embeddings_sub.unsubscribe_all("x")
if hasattr(self, "document_embeddings_sub"):
self.document_embeddings_sub.unsubscribe_all("x")
if hasattr(self, "triples_brk"):
self.triples_brk.stop()
self.triples_brk.join()
if hasattr(self, "graph_embeddings_brk"):
self.graph_embeddings_brk.stop()
self.graph_embeddings_brk.join()
if hasattr(self, "document_embeddings_brk"):
self.document_embeddings_brk.stop()
self.document_embeddings_brk.join()
def receive_triples(self):
while self.running:
try:
msg = self.triples_sub.get(timeout=1)
except queue.Empty:
continue
self.librarian.handle_triples(msg)
def receive_graph_embeddings(self):
while self.running:
try:
msg = self.graph_embeddings_sub.get(timeout=1)
except queue.Empty:
continue
self.librarian.handle_graph_embeddings(msg)
def receive_document_embeddings(self):
while self.running:
try:
msg = self.document_embeddings_sub.get(timeout=1)
except queue.Empty:
continue
self.librarian.handle_document_embeddings(msg)
async def load_document(self, document): async def load_document(self, document):
@ -235,6 +157,8 @@ class Processor(ConsumerProducer):
data = document.document data = document.document
) )
self.document_load.send(None, doc) self.document_load.send(None, doc)
async def load_text(self, document): async def load_text(self, document):
@ -254,41 +178,31 @@ class Processor(ConsumerProducer):
self.text_load.send(None, doc) self.text_load.send(None, doc)
def parse_request(self, v): async def process_request(self, v):
if v.operation is None: if v.operation is None:
raise RequestError("Null operation") raise RequestError("Null operation")
print("op", v.operation) print("requets", v.operation)
if v.operation == "add": impls = {
if ( "add-document": self.librarian.add_document,
v.document and v.document.id and v.document.metadata and "remove-document": self.librarian.remove_document,
v.document.document and v.document.kind "update-document": self.librarian.update_document,
): "get-document-metadata": self.librarian.get_document_metadata,
return partial( "get-document-content": self.librarian.get_document_content,
self.librarian.add, "add-processing": self.librarian.add_processing,
document = v.document, "remove-processing": self.librarian.remove_processing,
) "list-documents": self.librarian.list_documents,
else: "list-processing": self.librarian.list_processing,
raise RequestError("Invalid call") }
if v.operation == "list": if v.operation not in impls:
print("list", v) raise RequestError(f"Invalid operation: {v.operation}")
print(v.user)
if v.user:
return partial(
self.librarian.list,
user = v.user,
collection = v.collection,
)
else:
print("BROK")
raise RequestError("Invalid call")
raise RequestError("Invalid operation: " + v.operation) return await impls[v.operation](v)
async def handle(self, msg): async def on_librarian_request(self, msg, consumer, flow):
v = msg.value() v = msg.value()
@ -299,20 +213,15 @@ class Processor(ConsumerProducer):
print(f"Handling input {id}...", flush=True) print(f"Handling input {id}...", flush=True)
try: try:
func = self.parse_request(v)
except RequestError as e: resp = await self.process_request(v)
resp = LibrarianResponse(
error = Error( await self.librarian_response_producer.send(
type = "request-error", resp, properties={"id": id}
message = str(e),
)
) )
await self.send(resp, properties={"id": id})
return return
try:
resp = await func()
print("->", resp)
except RequestError as e: except RequestError as e:
resp = LibrarianResponse( resp = LibrarianResponse(
error = Error( error = Error(
@ -320,31 +229,43 @@ class Processor(ConsumerProducer):
message = str(e), message = str(e),
) )
) )
await self.send(resp, properties={"id": id})
await self.librarian_response_producer.send(
resp, properties={"id": id}
)
return return
except Exception as e: except Exception as e:
print("Exception:", e, flush=True)
resp = LibrarianResponse( resp = LibrarianResponse(
error = Error( error = Error(
type = "processing-error", type = "unexpected-error",
message = "Unhandled error: " + str(e), message = str(e),
) )
) )
await self.send(resp, properties={"id": id})
await self.librarian_response_producer.send(
resp, properties={"id": id}
)
return return
print("Send response..!.", flush=True)
await self.send(resp, properties={"id": id})
print("Done.", flush=True) print("Done.", flush=True)
@staticmethod @staticmethod
def add_args(parser): def add_args(parser):
ConsumerProducer.add_args( AsyncProcessor.add_args(parser)
parser, default_input_queue, default_subscriber,
default_output_queue, parser.add_argument(
'--librarian-request-queue',
default=default_librarian_request_queue,
help=f'Config request queue (default: {default_librarian_request_queue})'
)
parser.add_argument(
'--librarian-response-queue',
default=default_librarian_response_queue,
help=f'Config response queue {default_librarian_response_queue}',
) )
parser.add_argument( parser.add_argument(
@ -385,40 +306,7 @@ class Processor(ConsumerProducer):
help=f'Cassandra password' help=f'Cassandra password'
) )
parser.add_argument(
'--triples-queue',
default=triples_store_queue,
help=f'Triples queue (default: {triples_store_queue})'
)
parser.add_argument(
'--graph-embeddings-queue',
default=graph_embeddings_store_queue,
help=f'Graph embeddings queue (default: {triples_store_queue})'
)
parser.add_argument(
'--document-embeddings-queue',
default=document_embeddings_store_queue,
help='Document embeddings queue '
f'(default: {document_embeddings_store_queue})'
)
parser.add_argument(
'--document-load-queue',
default=document_ingest_queue,
help='Document load queue '
f'(default: {document_ingest_queue})'
)
parser.add_argument(
'--text-load-queue',
default=text_ingest_queue,
help='Text ingest queue '
f'(default: {text_ingest_queue})'
)
def run(): def run():
Processor.launch(module, __doc__) Processor.launch(default_ident, __doc__)

View file

@ -1,5 +1,7 @@
from .. schema import LibrarianRequest, LibrarianResponse from .. schema import LibrarianRequest, LibrarianResponse
from .. schema import DocumentInfo, Error, Triple, Value from .. schema import DocumentMetadata, ProcessingMetadata
from .. schema import Error, Triple, Value
from .. knowledge import hash from .. knowledge import hash
from .. exceptions import RequestError from .. exceptions import RequestError
@ -7,8 +9,10 @@ from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider from cassandra.auth import PlainTextAuthProvider
from cassandra.query import BatchStatement from cassandra.query import BatchStatement
from ssl import SSLContext, PROTOCOL_TLSv1_2 from ssl import SSLContext, PROTOCOL_TLSv1_2
import uuid import uuid
import time import time
import asyncio
class TableStore: class TableStore:
@ -63,18 +67,18 @@ class TableStore:
self.cassandra.execute(""" self.cassandra.execute("""
CREATE TABLE IF NOT EXISTS document ( CREATE TABLE IF NOT EXISTS document (
user text,
collection text,
id text, id text,
user text,
time timestamp, time timestamp,
kind text,
title text, title text,
comments text, comments text,
kind text,
object_id uuid,
metadata list<tuple< metadata list<tuple<
text, boolean, text, boolean, text, boolean text, boolean, text, boolean, text, boolean
>>, >>,
PRIMARY KEY (user, collection, id) tags list<text>,
object_id uuid,
PRIMARY KEY (user, id)
); );
"""); """);
@ -85,6 +89,23 @@ class TableStore:
ON document (object_id) ON document (object_id)
"""); """);
print("processing table...", flush=True)
self.cassandra.execute("""
CREATE TABLE IF NOT EXISTS processing (
id text,
document_id text,
time timestamp,
flow text,
user text,
collection text,
tags list<text>,
PRIMARY KEY (user, id)
);
""");
return
print("triples table...", flush=True) print("triples table...", flush=True)
self.cassandra.execute(""" self.cassandra.execute("""
@ -155,26 +176,84 @@ class TableStore:
self.insert_document_stmt = self.cassandra.prepare(""" self.insert_document_stmt = self.cassandra.prepare("""
INSERT INTO document INSERT INTO document
( (
id, user, collection, kind, object_id, time, title, comments, id, user, time,
metadata kind, title, comments,
metadata, tags, object_id
) )
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
""") """)
self.update_document_stmt = self.cassandra.prepare("""
UPDATE document
SET time = ?, title = ?, comments = ?,
metadata = ?, tags = ?
WHERE user = ? AND id = ?
""")
self.get_document_stmt = self.cassandra.prepare("""
SELECT time, kind, title, comments, metadata, tags, object_id
FROM document
WHERE user = ? AND id = ?
""")
self.delete_document_stmt = self.cassandra.prepare("""
DELETE FROM document
WHERE user = ? AND id = ?
""")
self.test_document_exists_stmt = self.cassandra.prepare("""
SELECT id
FROM document
WHERE user = ? AND id = ?
LIMIT 1
""")
self.list_document_stmt = self.cassandra.prepare(""" self.list_document_stmt = self.cassandra.prepare("""
SELECT SELECT
id, kind, user, collection, title, comments, time, metadata id, time, kind, title, comments, metadata, tags, object_id
FROM document FROM document
WHERE user = ? WHERE user = ?
""") """)
self.list_document_by_collection_stmt = self.cassandra.prepare(""" self.list_document_by_tag_stmt = self.cassandra.prepare("""
SELECT SELECT
id, kind, user, collection, title, comments, time, metadata id, time, kind, title, comments, metadata, tags, object_id
FROM document FROM document
WHERE user = ? AND collection = ? WHERE user = ? AND tags CONTAINS ?
ALLOW FILTERING
""") """)
self.insert_processing_stmt = self.cassandra.prepare("""
INSERT INTO processing
(
id, document_id, time,
flow, user, collection,
tags
)
VALUES (?, ?, ?, ?, ?, ?, ?)
""")
self.delete_processing_stmt = self.cassandra.prepare("""
DELETE FROM processing
WHERE user = ? AND id = ?
""")
self.test_processing_exists_stmt = self.cassandra.prepare("""
SELECT id
FROM processing
WHERE user = ? AND id = ?
LIMIT 1
""")
self.list_processing_stmt = self.cassandra.prepare("""
SELECT
id, document_id, time, flow, collection, tags
FROM processing
WHERE user = ?
""")
return
self.insert_triples_stmt = self.cassandra.prepare(""" self.insert_triples_stmt = self.cassandra.prepare("""
INSERT INTO triples INSERT INTO triples
( (
@ -202,17 +281,24 @@ class TableStore:
VALUES (?, ?, ?, ?, ?, ?, ?) VALUES (?, ?, ?, ?, ?, ?, ?)
""") """)
def add(self, object_id, document): async def document_exists(self, user, id):
if document.kind not in ( resp = self.cassandra.execute(
"text/plain", "application/pdf" self.test_document_exists_stmt,
): ( user, id )
raise RequestError("Invalid document kind: " + document.kind) )
# Create random doc ID # If a row exists, document exists. It's a cursor, can't just
when = int(time.time() * 1000) # count the length
print("Adding", document.id, object_id) for row in resp:
return True
return False
async def add_document(self, document, object_id):
print("Adding document", document.id, object_id)
metadata = [ metadata = [
( (
@ -229,10 +315,9 @@ class TableStore:
resp = self.cassandra.execute( resp = self.cassandra.execute(
self.insert_document_stmt, self.insert_document_stmt,
( (
document.id, document.user, document.collection, document.id, document.user, int(document.time * 1000),
document.kind, object_id, when, document.kind, document.title, document.comments,
document.title, document.comments, metadata, document.tags, object_id
metadata
) )
) )
@ -242,11 +327,71 @@ class TableStore:
print("Exception:", type(e)) print("Exception:", type(e))
print(f"{e}, retry...", flush=True) print(f"{e}, retry...", flush=True)
time.sleep(1) await asyncio.sleep(1)
print("Add complete", flush=True) print("Add complete", flush=True)
def add_triples(self, m): async def update_document(self, document):
print("Updating document", document.id)
metadata = [
(
v.s.value, v.s.is_uri, v.p.value, v.p.is_uri,
v.o.value, v.o.is_uri
)
for v in document.metadata
]
while True:
try:
resp = self.cassandra.execute(
self.update_document_stmt,
(
int(document.time * 1000), document.title,
document.comments, metadata, document.tags,
document.user, document.id
)
)
break
except Exception as e:
print("Exception:", type(e))
print(f"{e}, retry...", flush=True)
await asyncio.sleep(1)
print("Update complete", flush=True)
async def remove_document(self, user, document_id):
print("Removing document", document_id)
while True:
try:
resp = self.cassandra.execute(
self.delete_document_stmt,
(
user, document_id
)
)
break
except Exception as e:
print("Exception:", type(e))
print(f"{e}, retry...", flush=True)
await asyncio.sleep(1)
print("Delete complete", flush=True)
async def add_triples(self, m):
when = int(time.time() * 1000) when = int(time.time() * 1000)
@ -288,76 +433,235 @@ class TableStore:
print("Exception:", type(e)) print("Exception:", type(e))
print(f"{e}, retry...", flush=True) print(f"{e}, retry...", flush=True)
time.sleep(1) await asyncio.sleep(1)
def list(self, user, collection=None): async def list_documents(self, user):
print("List documents...")
print("LIST")
while True: while True:
print("TRY")
print(self.list_document_stmt)
try: try:
if collection: resp = self.cassandra.execute(
resp = self.cassandra.execute( self.list_document_stmt,
self.list_document_by_collection_stmt, (user,)
(user, collection) )
)
else:
resp = self.cassandra.execute(
self.list_document_stmt,
(user,)
)
break
print("OK") break
except Exception as e: except Exception as e:
print("Exception:", type(e)) print("Exception:", type(e))
print(f"{e}, retry...", flush=True) print(f"{e}, retry...", flush=True)
time.sleep(1) await asyncio.sleep(1)
print("OK2")
info = [ lst = [
DocumentInfo( DocumentMetadata(
id = row[0], id = row[0],
kind = row[1], user = user,
user = row[2], time = int(time.mktime(row[1].timetuple())),
collection = row[3], kind = row[2],
title = row[4], title = row[3],
comments = row[5], comments = row[4],
time = int(1000 * row[6].timestamp()),
metadata = [ metadata = [
Triple( Triple(
s=Value(value=m[0], is_uri=m[1]), s=Value(value=m[0], is_uri=m[1]),
p=Value(value=m[2], is_uri=m[3]), p=Value(value=m[2], is_uri=m[3]),
o=Value(value=m[4], is_uri=m[5]) o=Value(value=m[4], is_uri=m[5])
) )
for m in row[7] for m in row[5]
], ],
tags = row[6],
object_id = row[7],
) )
for row in resp for row in resp
] ]
print("OK3") print("Done")
print(info[0]) return lst
print(info[0].user) async def get_document(self, user, id):
print(info[0].time)
print(info[0].kind)
print(info[0].collection)
print(info[0].title)
print(info[0].comments)
print(info[0].metadata)
print(info[0].metadata)
return info print("Get document")
def add_graph_embeddings(self, m): while True:
try:
resp = self.cassandra.execute(
self.get_document_stmt,
(user, id)
)
break
except Exception as e:
print("Exception:", type(e))
print(f"{e}, retry...", flush=True)
await asyncio.sleep(1)
for row in resp:
doc = DocumentMetadata(
id = id,
user = user,
time = int(time.mktime(row[0].timetuple())),
kind = row[1],
title = row[2],
comments = row[3],
metadata = [
Triple(
s=Value(value=m[0], is_uri=m[1]),
p=Value(value=m[2], is_uri=m[3]),
o=Value(value=m[4], is_uri=m[5])
)
for m in row[4]
],
tags = row[5],
object_id = row[6],
)
print("Done")
return doc
raise RuntimeError("No such document row?")
async def get_document_object_id(self, user, id):
print("Get document obj ID")
while True:
try:
resp = self.cassandra.execute(
self.get_document_stmt,
(user, id)
)
break
except Exception as e:
print("Exception:", type(e))
print(f"{e}, retry...", flush=True)
await asyncio.sleep(1)
for row in resp:
print("Done")
return row[6]
raise RuntimeError("No such document row?")
async def processing_exists(self, user, id):
resp = self.cassandra.execute(
self.test_processing_exists_stmt,
( user, id )
)
# If a row exists, document exists. It's a cursor, can't just
# count the length
for row in resp:
return True
return False
async def add_processing(self, processing):
print("Adding processing", processing.id)
while True:
try:
resp = self.cassandra.execute(
self.insert_processing_stmt,
(
processing.id, processing.document_id,
int(processing.time * 1000), processing.flow,
processing.user, processing.collection,
processing.tags
)
)
break
except Exception as e:
print("Exception:", type(e))
print(f"{e}, retry...", flush=True)
await asyncio.sleep(1)
print("Add complete", flush=True)
async def remove_processing(self, user, processing_id):
print("Removing processing", processing_id)
while True:
try:
resp = self.cassandra.execute(
self.delete_processing_stmt,
(
user, processing_id
)
)
break
except Exception as e:
print("Exception:", type(e))
print(f"{e}, retry...", flush=True)
await asyncio.sleep(1)
print("Delete complete", flush=True)
async def list_processing(self, user):
print("List processing objects")
while True:
try:
resp = self.cassandra.execute(
self.list_processing_stmt,
(user,)
)
break
except Exception as e:
print("Exception:", type(e))
print(f"{e}, retry...", flush=True)
await asyncio.sleep(1)
lst = [
ProcessingMetadata(
id = row[0],
document_id = row[1],
time = int(time.mktime(row[2].timetuple())),
flow = row[3],
user = user,
collection = row[4],
tags = row[5],
)
for row in resp
]
print("Done")
return lst
async def add_graph_embeddings(self, m):
when = int(time.time() * 1000) when = int(time.time() * 1000)
@ -399,9 +703,9 @@ class TableStore:
print("Exception:", type(e)) print("Exception:", type(e))
print(f"{e}, retry...", flush=True) print(f"{e}, retry...", flush=True)
time.sleep(1) await asyncio.sleep(1)
def add_document_embeddings(self, m): async def add_document_embeddings(self, m):
when = int(time.time() * 1000) when = int(time.time() * 1000)
@ -443,6 +747,6 @@ class TableStore:
print("Exception:", type(e)) print("Exception:", type(e))
print(f"{e}, retry...", flush=True) print(f"{e}, retry...", flush=True)
time.sleep(1) await asyncio.sleep(1)