mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
Feature/flow librarian (#361)
* Update librarian to new API * Implementing new schema with document + processing objects
This commit is contained in:
parent
6bf485788a
commit
ff28d26f4d
21 changed files with 1323 additions and 428 deletions
|
|
@ -4,20 +4,25 @@ import requests
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
import base64
|
import base64
|
||||||
|
import time
|
||||||
|
|
||||||
url = "http://localhost:8088/api/v1/"
|
url = "http://localhost:8088/api/v1/"
|
||||||
|
|
||||||
############################################################################
|
############################################################################
|
||||||
|
|
||||||
id = "http://trustgraph.ai/doc/12345678"
|
id = "http://trustgraph.ai/doc/9fdee98b-b259-40ac-bcb9-8e82ccedeb04"
|
||||||
|
|
||||||
with open("docs/README.cats") as f:
|
with open("docs/README.cats", "rb") as f:
|
||||||
doc = base64.b64encode(f.read().encode("utf-8")).decode("utf-8")
|
doc = base64.b64encode(f.read()).decode("utf-8")
|
||||||
|
|
||||||
input = {
|
input = {
|
||||||
"operation": "add",
|
"operation": "add-document",
|
||||||
"document": {
|
"document-metadata": {
|
||||||
"id": id,
|
"id": id,
|
||||||
|
"time": int(time.time()),
|
||||||
|
"kind": "text/plain",
|
||||||
|
"title": "Mark's cats",
|
||||||
|
"comments": "Test doc taken from the TrustGraph repo",
|
||||||
"metadata": [
|
"metadata": [
|
||||||
{
|
{
|
||||||
"s": {
|
"s": {
|
||||||
|
|
@ -46,13 +51,10 @@ input = {
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
"document": doc,
|
|
||||||
"kind": "text/plain",
|
|
||||||
"user": "trustgraph",
|
"user": "trustgraph",
|
||||||
"collection": "default",
|
"tags": ["mark", "cats"],
|
||||||
"title": "Mark's cats",
|
},
|
||||||
"comments": "Test doc taken from the TrustGraph repo",
|
"content": doc,
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
resp = requests.post(
|
resp = requests.post(
|
||||||
|
|
|
||||||
|
|
@ -4,12 +4,13 @@ import requests
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
import base64
|
import base64
|
||||||
|
import time
|
||||||
|
|
||||||
url = "http://localhost:8088/api/v1/"
|
url = "http://localhost:8088/api/v1/"
|
||||||
|
|
||||||
############################################################################
|
############################################################################
|
||||||
|
|
||||||
id = "http://trustgraph.ai/doc/12345678"
|
id = "http://trustgraph.ai/doc/6d034da9-2759-45c2-af24-14db7f4c44c2"
|
||||||
|
|
||||||
source = "../sources/20160001634.pdf"
|
source = "../sources/20160001634.pdf"
|
||||||
|
|
||||||
|
|
@ -17,9 +18,13 @@ with open(source, "rb") as f:
|
||||||
doc = base64.b64encode(f.read()).decode("utf-8")
|
doc = base64.b64encode(f.read()).decode("utf-8")
|
||||||
|
|
||||||
input = {
|
input = {
|
||||||
"operation": "add",
|
"operation": "add-document",
|
||||||
"id": id,
|
"document-metadata": {
|
||||||
"document": {
|
"id": id,
|
||||||
|
"time": int(time.time()),
|
||||||
|
"kind": "application/pdf",
|
||||||
|
"title": "Application of SAE ARP4754A to Flight Critical Systems",
|
||||||
|
"comments": "Application of federal safety standards to NASA spacecraft",
|
||||||
"metadata": [
|
"metadata": [
|
||||||
{
|
{
|
||||||
"s": {
|
"s": {
|
||||||
|
|
@ -61,11 +66,10 @@ input = {
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
"document": doc,
|
|
||||||
"kind": "application/pdf",
|
|
||||||
"user": "trustgraph",
|
"user": "trustgraph",
|
||||||
"collection": "default",
|
"tags": ["nasa", "safety-engineering"],
|
||||||
}
|
},
|
||||||
|
"content": doc,
|
||||||
}
|
}
|
||||||
|
|
||||||
resp = requests.post(
|
resp = requests.post(
|
||||||
|
|
|
||||||
50
test-api/test-library-add-processing
Executable file
50
test-api/test-library-add-processing
Executable file
|
|
@ -0,0 +1,50 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import base64
|
||||||
|
import time
|
||||||
|
|
||||||
|
url = "http://localhost:8088/api/v1/"
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
|
doc_id = "http://trustgraph.ai/doc/9fdee98b-b259-40ac-bcb9-8e82ccedeb04"
|
||||||
|
|
||||||
|
proc_id = "2714fc72-44ab-45f2-94dd-6773fc336535"
|
||||||
|
|
||||||
|
input = {
|
||||||
|
"operation": "add-processing",
|
||||||
|
"processing-metadata": {
|
||||||
|
"id": proc_id,
|
||||||
|
"document-id": doc_id,
|
||||||
|
"time": int(time.time()),
|
||||||
|
"flow": "0000",
|
||||||
|
"user": "trustgraph",
|
||||||
|
"collection": "default",
|
||||||
|
"tags": ["test"],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
resp = requests.post(
|
||||||
|
f"{url}librarian",
|
||||||
|
json=input,
|
||||||
|
)
|
||||||
|
|
||||||
|
print(resp.text)
|
||||||
|
resp = resp.json()
|
||||||
|
|
||||||
|
print(resp)
|
||||||
|
|
||||||
|
if "error" in resp:
|
||||||
|
print(f"Error: {resp['error']}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# print(resp["response"])
|
||||||
|
print(resp)
|
||||||
|
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
41
test-api/test-library-get-document-content
Executable file
41
test-api/test-library-get-document-content
Executable file
|
|
@ -0,0 +1,41 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import base64
|
||||||
|
|
||||||
|
url = "http://localhost:8088/api/v1/"
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
|
id = "http://trustgraph.ai/doc/9fdee98b-b259-40ac-bcb9-8e82ccedeb04"
|
||||||
|
|
||||||
|
user = "trustgraph"
|
||||||
|
|
||||||
|
input = {
|
||||||
|
"operation": "get-document-content",
|
||||||
|
"user": user,
|
||||||
|
"document-id": id,
|
||||||
|
}
|
||||||
|
|
||||||
|
resp = requests.post(
|
||||||
|
f"{url}librarian",
|
||||||
|
json=input,
|
||||||
|
)
|
||||||
|
|
||||||
|
resp = resp.json()
|
||||||
|
|
||||||
|
if "error" in resp:
|
||||||
|
print(f"Error: {resp['error']}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
content = base64.b64decode(resp["content"]).decode("utf-8")
|
||||||
|
|
||||||
|
print(content)
|
||||||
|
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
42
test-api/test-library-get-document-metadata
Executable file
42
test-api/test-library-get-document-metadata
Executable file
|
|
@ -0,0 +1,42 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import base64
|
||||||
|
|
||||||
|
url = "http://localhost:8088/api/v1/"
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
|
id = "http://trustgraph.ai/doc/9fdee98b-b259-40ac-bcb9-8e82ccedeb04"
|
||||||
|
|
||||||
|
user = "trustgraph"
|
||||||
|
|
||||||
|
input = {
|
||||||
|
"operation": "get-document-metadata",
|
||||||
|
"user": user,
|
||||||
|
"document-id": id,
|
||||||
|
}
|
||||||
|
|
||||||
|
resp = requests.post(
|
||||||
|
f"{url}librarian",
|
||||||
|
json=input,
|
||||||
|
)
|
||||||
|
|
||||||
|
print(resp.text)
|
||||||
|
resp = resp.json()
|
||||||
|
|
||||||
|
print(resp)
|
||||||
|
|
||||||
|
if "error" in resp:
|
||||||
|
print(f"Error: {resp['error']}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# print(resp["response"])
|
||||||
|
print(resp)
|
||||||
|
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
|
|
@ -12,7 +12,7 @@ url = "http://localhost:8088/api/v1/"
|
||||||
user = "trustgraph"
|
user = "trustgraph"
|
||||||
|
|
||||||
input = {
|
input = {
|
||||||
"operation": "list",
|
"operation": "list-documents",
|
||||||
"user": user,
|
"user": user,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
38
test-api/test-library-list-documents
Executable file
38
test-api/test-library-list-documents
Executable file
|
|
@ -0,0 +1,38 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import base64
|
||||||
|
import time
|
||||||
|
|
||||||
|
url = "http://localhost:8088/api/v1/"
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
|
input = {
|
||||||
|
"operation": "list-documents",
|
||||||
|
"user": "trustgraph",
|
||||||
|
}
|
||||||
|
|
||||||
|
resp = requests.post(
|
||||||
|
f"{url}librarian",
|
||||||
|
json=input,
|
||||||
|
)
|
||||||
|
|
||||||
|
print(resp.text)
|
||||||
|
resp = resp.json()
|
||||||
|
|
||||||
|
print(resp)
|
||||||
|
|
||||||
|
if "error" in resp:
|
||||||
|
print(f"Error: {resp['error']}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# print(resp["response"])
|
||||||
|
print(resp)
|
||||||
|
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
38
test-api/test-library-list-processing
Executable file
38
test-api/test-library-list-processing
Executable file
|
|
@ -0,0 +1,38 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import base64
|
||||||
|
import time
|
||||||
|
|
||||||
|
url = "http://localhost:8088/api/v1/"
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
|
input = {
|
||||||
|
"operation": "list-processing",
|
||||||
|
"user": "trustgraph",
|
||||||
|
}
|
||||||
|
|
||||||
|
resp = requests.post(
|
||||||
|
f"{url}librarian",
|
||||||
|
json=input,
|
||||||
|
)
|
||||||
|
|
||||||
|
print(resp.text)
|
||||||
|
resp = resp.json()
|
||||||
|
|
||||||
|
print(resp)
|
||||||
|
|
||||||
|
if "error" in resp:
|
||||||
|
print(f"Error: {resp['error']}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# print(resp["response"])
|
||||||
|
print(resp)
|
||||||
|
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
41
test-api/test-library-remove-document
Executable file
41
test-api/test-library-remove-document
Executable file
|
|
@ -0,0 +1,41 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import base64
|
||||||
|
import time
|
||||||
|
|
||||||
|
url = "http://localhost:8088/api/v1/"
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
|
id = "http://trustgraph.ai/doc/9fdee98b-b259-40ac-bcb9-8e82ccedeb04"
|
||||||
|
|
||||||
|
input = {
|
||||||
|
"operation": "remove-document",
|
||||||
|
"user": "trustgraph",
|
||||||
|
"document-id": id
|
||||||
|
}
|
||||||
|
|
||||||
|
resp = requests.post(
|
||||||
|
f"{url}librarian",
|
||||||
|
json=input,
|
||||||
|
)
|
||||||
|
|
||||||
|
print(resp.text)
|
||||||
|
resp = resp.json()
|
||||||
|
|
||||||
|
print(resp)
|
||||||
|
|
||||||
|
if "error" in resp:
|
||||||
|
print(f"Error: {resp['error']}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# print(resp["response"])
|
||||||
|
print(resp)
|
||||||
|
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
41
test-api/test-library-remove-document2
Executable file
41
test-api/test-library-remove-document2
Executable file
|
|
@ -0,0 +1,41 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import base64
|
||||||
|
import time
|
||||||
|
|
||||||
|
url = "http://localhost:8088/api/v1/"
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
|
id = "http://trustgraph.ai/doc/6d034da9-2759-45c2-af24-14db7f4c44c2"
|
||||||
|
|
||||||
|
input = {
|
||||||
|
"operation": "remove-document",
|
||||||
|
"user": "trustgraph",
|
||||||
|
"document-id": id
|
||||||
|
}
|
||||||
|
|
||||||
|
resp = requests.post(
|
||||||
|
f"{url}librarian",
|
||||||
|
json=input,
|
||||||
|
)
|
||||||
|
|
||||||
|
print(resp.text)
|
||||||
|
resp = resp.json()
|
||||||
|
|
||||||
|
print(resp)
|
||||||
|
|
||||||
|
if "error" in resp:
|
||||||
|
print(f"Error: {resp['error']}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# print(resp["response"])
|
||||||
|
print(resp)
|
||||||
|
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
41
test-api/test-library-remove-processing
Executable file
41
test-api/test-library-remove-processing
Executable file
|
|
@ -0,0 +1,41 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import base64
|
||||||
|
import time
|
||||||
|
|
||||||
|
url = "http://localhost:8088/api/v1/"
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
|
proc_id = "2714fc72-44ab-45f2-94dd-6773fc336535"
|
||||||
|
|
||||||
|
input = {
|
||||||
|
"operation": "remove-processing",
|
||||||
|
"user": "trustgraph",
|
||||||
|
"processing-id": proc_id,
|
||||||
|
}
|
||||||
|
|
||||||
|
resp = requests.post(
|
||||||
|
f"{url}librarian",
|
||||||
|
json=input,
|
||||||
|
)
|
||||||
|
|
||||||
|
print(resp.text)
|
||||||
|
resp = resp.json()
|
||||||
|
|
||||||
|
print(resp)
|
||||||
|
|
||||||
|
if "error" in resp:
|
||||||
|
print(f"Error: {resp['error']}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# print(resp["response"])
|
||||||
|
print(resp)
|
||||||
|
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
75
test-api/test-library-update-doc
Executable file
75
test-api/test-library-update-doc
Executable file
|
|
@ -0,0 +1,75 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import base64
|
||||||
|
import time
|
||||||
|
|
||||||
|
url = "http://localhost:8088/api/v1/"
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
|
id = "http://trustgraph.ai/doc/9fdee98b-b259-40ac-bcb9-8e82ccedeb04"
|
||||||
|
|
||||||
|
input = {
|
||||||
|
"operation": "update-document",
|
||||||
|
"document-metadata": {
|
||||||
|
"id": id,
|
||||||
|
"time": int(time.time()),
|
||||||
|
"title": "Mark's cats - a story",
|
||||||
|
"comments": "Information about Mark's cats",
|
||||||
|
"metadata": [
|
||||||
|
{
|
||||||
|
"s": {
|
||||||
|
"v": id,
|
||||||
|
"e": True,
|
||||||
|
},
|
||||||
|
"p": {
|
||||||
|
"v": "http://www.w3.org/2000/01/rdf-schema#label",
|
||||||
|
"e": True,
|
||||||
|
},
|
||||||
|
"o": {
|
||||||
|
"v": "Mark's pets", "e": False,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"s": {
|
||||||
|
"v": id,
|
||||||
|
"e": True,
|
||||||
|
},
|
||||||
|
"p": {
|
||||||
|
"v": 'https://schema.org/keywords',
|
||||||
|
"e": True,
|
||||||
|
},
|
||||||
|
"o": {
|
||||||
|
"v": "cats", "e": False,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
"user": "trustgraph",
|
||||||
|
"tags": ["mark", "cats", "pets"],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
resp = requests.post(
|
||||||
|
f"{url}librarian",
|
||||||
|
json=input,
|
||||||
|
)
|
||||||
|
|
||||||
|
print(resp.text)
|
||||||
|
resp = resp.json()
|
||||||
|
|
||||||
|
print(resp)
|
||||||
|
|
||||||
|
if "error" in resp:
|
||||||
|
print(f"Error: {resp['error']}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# print(resp["response"])
|
||||||
|
print(resp)
|
||||||
|
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
|
|
@ -6,16 +6,52 @@ from . types import Error
|
||||||
from . metadata import Metadata
|
from . metadata import Metadata
|
||||||
from . documents import Document, TextDocument
|
from . documents import Document, TextDocument
|
||||||
|
|
||||||
# add
|
# add-document
|
||||||
# -> (id, document)
|
# -> (document_id, document_metadata, content)
|
||||||
# <- ()
|
# <- ()
|
||||||
# <- (error)
|
# <- (error)
|
||||||
|
|
||||||
# list
|
# remove-document
|
||||||
# -> (user, collection?)
|
# -> (document_id)
|
||||||
# <- (info)
|
# <- ()
|
||||||
# <- (error)
|
# <- (error)
|
||||||
|
|
||||||
|
# update-document
|
||||||
|
# -> (document_id, document_metadata)
|
||||||
|
# <- ()
|
||||||
|
# <- (error)
|
||||||
|
|
||||||
|
# get-document-metadata
|
||||||
|
# -> (document_id)
|
||||||
|
# <- (document_metadata)
|
||||||
|
# <- (error)
|
||||||
|
|
||||||
|
# get-document-content
|
||||||
|
# -> (document_id)
|
||||||
|
# <- (content)
|
||||||
|
# <- (error)
|
||||||
|
|
||||||
|
# add-processing
|
||||||
|
# -> (processing_id, processing_metadata)
|
||||||
|
# <- ()
|
||||||
|
# <- (error)
|
||||||
|
|
||||||
|
# remove-processing
|
||||||
|
# -> (processing_id)
|
||||||
|
# <- ()
|
||||||
|
# <- (error)
|
||||||
|
|
||||||
|
# list-documents
|
||||||
|
# -> (user, collection?)
|
||||||
|
# <- (document_metadata[])
|
||||||
|
# <- (error)
|
||||||
|
|
||||||
|
# list-processing
|
||||||
|
# -> (user, collection?)
|
||||||
|
# <- (processing_metadata[])
|
||||||
|
# <- (error)
|
||||||
|
|
||||||
|
# OLD:
|
||||||
# add(Metadata, Bytes) : error?
|
# add(Metadata, Bytes) : error?
|
||||||
# copy(id, user, collection)
|
# copy(id, user, collection)
|
||||||
# move(id, user, collection)
|
# move(id, user, collection)
|
||||||
|
|
@ -26,26 +62,24 @@ from . documents import Document, TextDocument
|
||||||
# info(id[]) : DocumentInfo[]
|
# info(id[]) : DocumentInfo[]
|
||||||
# search(<key,op,value>[]) : id[]
|
# search(<key,op,value>[]) : id[]
|
||||||
|
|
||||||
class DocumentPackage(Record):
|
class DocumentMetadata(Record):
|
||||||
id = String()
|
id = String()
|
||||||
document = Bytes()
|
time = Long()
|
||||||
kind = String()
|
kind = String()
|
||||||
user = String()
|
|
||||||
collection = String()
|
|
||||||
title = String()
|
title = String()
|
||||||
comments = String()
|
comments = String()
|
||||||
time = Long()
|
|
||||||
metadata = Array(Triple())
|
metadata = Array(Triple())
|
||||||
|
user = String()
|
||||||
|
tags = Array(String())
|
||||||
|
|
||||||
class DocumentInfo(Record):
|
class ProcessingMetadata(Record):
|
||||||
id = String()
|
id = String()
|
||||||
kind = String()
|
document_id = String()
|
||||||
|
time = Long()
|
||||||
|
flow = String()
|
||||||
user = String()
|
user = String()
|
||||||
collection = String()
|
collection = String()
|
||||||
title = String()
|
tags = Array(String())
|
||||||
comments = String()
|
|
||||||
time = Long()
|
|
||||||
metadata = Array(Triple())
|
|
||||||
|
|
||||||
class Criteria(Record):
|
class Criteria(Record):
|
||||||
key = String()
|
key = String()
|
||||||
|
|
@ -53,17 +87,43 @@ class Criteria(Record):
|
||||||
operator = String()
|
operator = String()
|
||||||
|
|
||||||
class LibrarianRequest(Record):
|
class LibrarianRequest(Record):
|
||||||
|
|
||||||
|
# add-document, remove-document, update-document, get-document-metadata,
|
||||||
|
# get-document-content, add-processing, remove-processing, list-documents,
|
||||||
|
# list-processing
|
||||||
operation = String()
|
operation = String()
|
||||||
id = String()
|
|
||||||
document = DocumentPackage()
|
# add-document, remove-document, update-document, get-document-metadata,
|
||||||
|
# get-document-content
|
||||||
|
document_id = String()
|
||||||
|
|
||||||
|
# add-processing, remove-processing
|
||||||
|
processing_id = String()
|
||||||
|
|
||||||
|
# add-document, update-document
|
||||||
|
document_metadata = DocumentMetadata()
|
||||||
|
|
||||||
|
# add-processing
|
||||||
|
processing_metadata = ProcessingMetadata()
|
||||||
|
|
||||||
|
# add-document
|
||||||
|
content = Bytes()
|
||||||
|
|
||||||
|
# list-documents, list-processing
|
||||||
user = String()
|
user = String()
|
||||||
|
|
||||||
|
# list-documents?, list-processing?
|
||||||
collection = String()
|
collection = String()
|
||||||
|
|
||||||
|
#
|
||||||
criteria = Array(Criteria())
|
criteria = Array(Criteria())
|
||||||
|
|
||||||
class LibrarianResponse(Record):
|
class LibrarianResponse(Record):
|
||||||
error = Error()
|
error = Error()
|
||||||
document = DocumentPackage()
|
document_metadata = DocumentMetadata()
|
||||||
info = Array(DocumentInfo())
|
content = Bytes()
|
||||||
|
document_metadatas = Array(DocumentMetadata())
|
||||||
|
processing_metadatas = Array(ProcessingMetadata())
|
||||||
|
|
||||||
librarian_request_queue = topic(
|
librarian_request_queue = topic(
|
||||||
'librarian', kind='non-persistent', namespace='request'
|
'librarian', kind='non-persistent', namespace='request'
|
||||||
|
|
|
||||||
|
|
@ -3,8 +3,6 @@
|
||||||
Config service. Manages system global configuration state
|
Config service. Manages system global configuration state
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from pulsar.schema import JsonSchema
|
|
||||||
|
|
||||||
from trustgraph.schema import Error
|
from trustgraph.schema import Error
|
||||||
|
|
||||||
from trustgraph.schema import ConfigRequest, ConfigResponse, ConfigPush
|
from trustgraph.schema import ConfigRequest, ConfigResponse, ConfigPush
|
||||||
|
|
@ -14,7 +12,6 @@ from trustgraph.schema import config_push_queue
|
||||||
from trustgraph.schema import FlowRequest, FlowResponse
|
from trustgraph.schema import FlowRequest, FlowResponse
|
||||||
from trustgraph.schema import flow_request_queue, flow_response_queue
|
from trustgraph.schema import flow_request_queue, flow_response_queue
|
||||||
|
|
||||||
from trustgraph.log_level import LogLevel
|
|
||||||
from trustgraph.base import AsyncProcessor, Consumer, Producer
|
from trustgraph.base import AsyncProcessor, Consumer, Producer
|
||||||
|
|
||||||
from . config import Configuration
|
from . config import Configuration
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,15 @@
|
||||||
|
|
||||||
|
import base64
|
||||||
|
|
||||||
from ... schema import LibrarianRequest, LibrarianResponse
|
from ... schema import LibrarianRequest, LibrarianResponse
|
||||||
from ... schema import librarian_request_queue
|
from ... schema import librarian_request_queue
|
||||||
from ... schema import librarian_response_queue
|
from ... schema import librarian_response_queue
|
||||||
|
|
||||||
from . requestor import ServiceRequestor
|
from . requestor import ServiceRequestor
|
||||||
from . serialize import serialize_document_package, serialize_document_info
|
from . serialize import serialize_document_metadata
|
||||||
from . serialize import to_document_package, to_document_info, to_criteria
|
from . serialize import serialize_processing_metadata
|
||||||
|
from . serialize import to_document_metadata, to_processing_metadata
|
||||||
|
from . serialize import to_criteria
|
||||||
|
|
||||||
class LibrarianRequestor(ServiceRequestor):
|
class LibrarianRequestor(ServiceRequestor):
|
||||||
def __init__(self, pulsar_client, consumer, subscriber, timeout=120):
|
def __init__(self, pulsar_client, consumer, subscriber, timeout=120):
|
||||||
|
|
@ -23,20 +27,37 @@ class LibrarianRequestor(ServiceRequestor):
|
||||||
|
|
||||||
def to_request(self, body):
|
def to_request(self, body):
|
||||||
|
|
||||||
if "document" in body:
|
# Content gets base64 decoded & encoded again. It at least makes
|
||||||
dp = to_document_package(body["document"])
|
# sure payload is valid base64.
|
||||||
|
|
||||||
|
if "document-metadata" in body:
|
||||||
|
dm = to_document_metadata(body["document-metadata"])
|
||||||
else:
|
else:
|
||||||
dp = None
|
dm = None
|
||||||
|
|
||||||
|
if "processing-metadata" in body:
|
||||||
|
pm = to_processing_metadata(body["processing-metadata"])
|
||||||
|
else:
|
||||||
|
pm = None
|
||||||
|
|
||||||
if "criteria" in body:
|
if "criteria" in body:
|
||||||
criteria = to_criteria(body["criteria"])
|
criteria = to_criteria(body["criteria"])
|
||||||
else:
|
else:
|
||||||
criteria = None
|
criteria = None
|
||||||
|
|
||||||
|
if "content" in body:
|
||||||
|
content = base64.b64decode(body["content"].encode("utf-8"))
|
||||||
|
content = base64.b64encode(content).decode("utf-8")
|
||||||
|
else:
|
||||||
|
content = None
|
||||||
|
|
||||||
return LibrarianRequest(
|
return LibrarianRequest(
|
||||||
operation = body.get("operation", None),
|
operation = body.get("operation", None),
|
||||||
id = body.get("id", None),
|
document_id = body.get("document-id", None),
|
||||||
document = dp,
|
processing_id = body.get("processing-id", None),
|
||||||
|
document_metadata = dm,
|
||||||
|
processing_metadata = pm,
|
||||||
|
content = content,
|
||||||
user = body.get("user", None),
|
user = body.get("user", None),
|
||||||
collection = body.get("collection", None),
|
collection = body.get("collection", None),
|
||||||
criteria = criteria,
|
criteria = criteria,
|
||||||
|
|
@ -44,15 +65,28 @@ class LibrarianRequestor(ServiceRequestor):
|
||||||
|
|
||||||
def from_response(self, message):
|
def from_response(self, message):
|
||||||
|
|
||||||
|
print(message)
|
||||||
|
|
||||||
response = {}
|
response = {}
|
||||||
|
|
||||||
if message.document:
|
if message.document_metadata:
|
||||||
response["document"] = serialize_document_package(message.document)
|
response["document-metadata"] = serialize_document_metadata(
|
||||||
|
message.document_metadata
|
||||||
|
)
|
||||||
|
|
||||||
if message.info:
|
if message.content:
|
||||||
response["info"] = [
|
response["content"] = message.content.decode("utf-8")
|
||||||
serialize_document_info(v)
|
|
||||||
for v in message.info
|
if message.document_metadatas != None:
|
||||||
|
response["document-metadatas"] = [
|
||||||
|
serialize_document_metadata(v)
|
||||||
|
for v in message.document_metadatas
|
||||||
|
]
|
||||||
|
|
||||||
|
if message.processing_metadatas != None:
|
||||||
|
response["processing-metadatas"] = [
|
||||||
|
serialize_processing_metadata(v)
|
||||||
|
for v in message.processing_metadatas
|
||||||
]
|
]
|
||||||
|
|
||||||
return response, True
|
return response, True
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
from ... schema import Value, Triple, DocumentPackage, DocumentInfo
|
from ... schema import Value, Triple, DocumentMetadata, ProcessingMetadata
|
||||||
|
|
||||||
def to_value(x):
|
def to_value(x):
|
||||||
return Value(value=x["v"], is_uri=x["e"])
|
return Value(value=x["v"], is_uri=x["e"])
|
||||||
|
|
@ -80,88 +80,86 @@ def serialize_document_embeddings(message):
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
def serialize_document_package(message):
|
def serialize_document_metadata(message):
|
||||||
|
|
||||||
ret = {}
|
ret = {}
|
||||||
|
|
||||||
if message.id:
|
if message.id:
|
||||||
ret["id"] = message.id
|
ret["id"] = message.id
|
||||||
|
|
||||||
if message.metadata:
|
if message.time:
|
||||||
ret["metadata"] = serialize_subgraph(message.metdata)
|
ret["time"] = message.time
|
||||||
|
|
||||||
if message.document:
|
|
||||||
blob = base64.b64encode(
|
|
||||||
message.document.encode("utf-8")
|
|
||||||
).decode("utf-8")
|
|
||||||
ret["document"] = blob
|
|
||||||
|
|
||||||
if message.kind:
|
if message.kind:
|
||||||
ret["kind"] = message.kind
|
ret["kind"] = message.kind
|
||||||
|
|
||||||
if message.user:
|
|
||||||
ret["user"] = message.user
|
|
||||||
|
|
||||||
if message.collection:
|
|
||||||
ret["collection"] = message.collection
|
|
||||||
|
|
||||||
return ret
|
|
||||||
|
|
||||||
def serialize_document_info(message):
|
|
||||||
|
|
||||||
ret = {}
|
|
||||||
|
|
||||||
if message.id:
|
|
||||||
ret["id"] = message.id
|
|
||||||
|
|
||||||
if message.kind:
|
|
||||||
ret["kind"] = message.kind
|
|
||||||
|
|
||||||
if message.user:
|
|
||||||
ret["user"] = message.user
|
|
||||||
|
|
||||||
if message.collection:
|
|
||||||
ret["collection"] = message.collection
|
|
||||||
|
|
||||||
if message.title:
|
if message.title:
|
||||||
ret["title"] = message.title
|
ret["title"] = message.title
|
||||||
|
|
||||||
if message.comments:
|
if message.comments:
|
||||||
ret["comments"] = message.comments
|
ret["comments"] = message.comments
|
||||||
|
|
||||||
if message.time:
|
|
||||||
ret["time"] = message.time
|
|
||||||
|
|
||||||
if message.metadata:
|
if message.metadata:
|
||||||
ret["metadata"] = serialize_subgraph(message.metadata)
|
ret["metadata"] = serialize_subgraph(message.metadata)
|
||||||
|
|
||||||
|
if message.user:
|
||||||
|
ret["user"] = message.user
|
||||||
|
|
||||||
|
if message.tags:
|
||||||
|
ret["tags"] = message.tags
|
||||||
|
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
def to_document_package(x):
|
def serialize_processing_metadata(message):
|
||||||
|
|
||||||
return DocumentPackage(
|
ret = {}
|
||||||
|
|
||||||
|
if message.id:
|
||||||
|
ret["id"] = message.id
|
||||||
|
|
||||||
|
if message.id:
|
||||||
|
ret["document-id"] = message.document_id
|
||||||
|
|
||||||
|
if message.time:
|
||||||
|
ret["time"] = message.time
|
||||||
|
|
||||||
|
if message.flow:
|
||||||
|
ret["flow"] = message.flow
|
||||||
|
|
||||||
|
if message.user:
|
||||||
|
ret["user"] = message.user
|
||||||
|
|
||||||
|
if message.collection:
|
||||||
|
ret["collection"] = message.collection
|
||||||
|
|
||||||
|
if message.tags:
|
||||||
|
ret["tags"] = message.tags
|
||||||
|
|
||||||
|
return ret
|
||||||
|
|
||||||
|
def to_document_metadata(x):
|
||||||
|
|
||||||
|
return DocumentMetadata(
|
||||||
id = x.get("id", None),
|
id = x.get("id", None),
|
||||||
|
time = x.get("time", None),
|
||||||
kind = x.get("kind", None),
|
kind = x.get("kind", None),
|
||||||
user = x.get("user", None),
|
|
||||||
collection = x.get("collection", None),
|
|
||||||
title = x.get("title", None),
|
title = x.get("title", None),
|
||||||
comments = x.get("comments", None),
|
comments = x.get("comments", None),
|
||||||
time = x.get("time", None),
|
|
||||||
document = x.get("document", None),
|
|
||||||
metadata = to_subgraph(x["metadata"]),
|
metadata = to_subgraph(x["metadata"]),
|
||||||
|
user = x.get("user", None),
|
||||||
|
tags = x.get("tags", None),
|
||||||
)
|
)
|
||||||
|
|
||||||
def to_document_info(x):
|
def to_processing_metadata(x):
|
||||||
|
|
||||||
return DocumentInfo(
|
return ProcessingMetadata(
|
||||||
id = x.get("id", None),
|
id = x.get("id", None),
|
||||||
kind = x.get("kind", None),
|
document_id = x.get("document-id", None),
|
||||||
|
time = x.get("time", None),
|
||||||
|
flow = x.get("flow", None),
|
||||||
user = x.get("user", None),
|
user = x.get("user", None),
|
||||||
collection = x.get("collection", None),
|
collection = x.get("collection", None),
|
||||||
title = x.get("title", None),
|
tags = x.get("tags", None),
|
||||||
comments = x.get("comments", None),
|
|
||||||
time = x.get("time", None),
|
|
||||||
metadata = to_subgraph(x["metadata"]),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def to_criteria(x):
|
def to_criteria(x):
|
||||||
|
|
@ -169,3 +167,4 @@ def to_criteria(x):
|
||||||
Critera(v["key"], v["value"], v["operator"])
|
Critera(v["key"], v["value"], v["operator"])
|
||||||
for v in x
|
for v in x
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -95,7 +95,6 @@ class Api:
|
||||||
|
|
||||||
await self.config_receiver.start()
|
await self.config_receiver.start()
|
||||||
|
|
||||||
|
|
||||||
for ep in self.endpoints:
|
for ep in self.endpoints:
|
||||||
ep.add_routes(self.app)
|
ep.add_routes(self.app)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -37,7 +37,7 @@ class BlobStore:
|
||||||
else:
|
else:
|
||||||
print("Bucket", self.bucket_name, "already exists", flush=True)
|
print("Bucket", self.bucket_name, "already exists", flush=True)
|
||||||
|
|
||||||
def add(self, object_id, blob, kind):
|
async def add(self, object_id, blob, kind):
|
||||||
|
|
||||||
# FIXME: Loop retry
|
# FIXME: Loop retry
|
||||||
self.minio.put_object(
|
self.minio.put_object(
|
||||||
|
|
@ -49,3 +49,25 @@ class BlobStore:
|
||||||
)
|
)
|
||||||
|
|
||||||
print("Add blob complete", flush=True)
|
print("Add blob complete", flush=True)
|
||||||
|
|
||||||
|
async def remove(self, object_id):
    """Delete the blob stored for ``object_id`` from the bucket.

    The object name is ``"doc/"`` followed by the string form of
    ``object_id``.  The MinIO client is called once, without retry;
    any exception it raises propagates to the caller.
    """

    # FIXME: Loop retry
    blob_name = "doc/" + str(object_id)

    self.minio.remove_object(
        bucket_name = self.bucket_name,
        object_name = blob_name,
    )

    print("Remove blob complete", flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
async def get(self, object_id):
    """Fetch and return the whole content of the blob for ``object_id``.

    The object is read from ``self.bucket_name`` under the name
    ``"doc/"`` followed by the string form of ``object_id``.

    Returns the value of ``read()`` on the MinIO response.  Exceptions
    raised by the client or while reading propagate to the caller.
    """

    # FIXME: Loop retry
    resp = self.minio.get_object(
        bucket_name = self.bucket_name,
        object_name = "doc/" + str(object_id),
    )

    # The MinIO response keeps a pooled HTTP connection open; it must be
    # closed and released explicitly, even when read() fails, otherwise
    # the connection is leaked.
    try:
        return resp.read()
    finally:
        resp.close()
        resp.release_conn()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,10 @@
|
||||||
|
|
||||||
from .. schema import LibrarianRequest, LibrarianResponse, Error, Triple
|
from .. schema import LibrarianRequest, LibrarianResponse, Error, Triple
|
||||||
from .. knowledge import hash
|
from .. knowledge import hash
|
||||||
from .. exceptions import RequestError
|
from .. exceptions import RequestError
|
||||||
from . table_store import TableStore
|
from . table_store import TableStore
|
||||||
from . blob_store import BlobStore
|
from . blob_store import BlobStore
|
||||||
|
import base64
|
||||||
|
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
|
|
@ -26,63 +28,240 @@ class Librarian:
|
||||||
self.load_document = load_document
|
self.load_document = load_document
|
||||||
self.load_text = load_text
|
self.load_text = load_text
|
||||||
|
|
||||||
async def add(self, document):
|
async def add_document(self, request):
|
||||||
|
|
||||||
if document.kind not in (
|
if request.document_metadata.kind not in (
|
||||||
"text/plain", "application/pdf"
|
"text/plain", "application/pdf"
|
||||||
):
|
):
|
||||||
raise RequestError("Invalid document kind: " + document.kind)
|
raise RequestError(
|
||||||
|
"Invalid document kind: " + request.document_metadata.kind
|
||||||
|
)
|
||||||
|
|
||||||
# Create object ID as a hash of the document
|
if await self.table_store.document_exists(
|
||||||
object_id = uuid.UUID(hash(document.document))
|
request.document_metadata.user,
|
||||||
|
request.document_metadata.id
|
||||||
|
):
|
||||||
|
raise RuntimeError("Document already exists")
|
||||||
|
|
||||||
self.blob_store.add(object_id, document.document, document.kind)
|
# Create object ID for blob
|
||||||
|
object_id = uuid.uuid4()
|
||||||
|
|
||||||
self.table_store.add(object_id, document)
|
print("Add blob...")
|
||||||
|
|
||||||
if document.kind == "application/pdf":
|
await self.blob_store.add(
|
||||||
await self.load_document(document)
|
object_id, base64.b64decode(request.content),
|
||||||
elif document.kind == "text/plain":
|
request.document_metadata.kind
|
||||||
await self.load_text(document)
|
)
|
||||||
|
|
||||||
|
print("Add table...")
|
||||||
|
|
||||||
|
await self.table_store.add_document(
|
||||||
|
request.document_metadata, object_id
|
||||||
|
)
|
||||||
|
|
||||||
print("Add complete", flush=True)
|
print("Add complete", flush=True)
|
||||||
|
|
||||||
return LibrarianResponse(
|
return LibrarianResponse(
|
||||||
error = None,
|
error = None,
|
||||||
document = None,
|
document_metadata = None,
|
||||||
info = None,
|
content = None,
|
||||||
|
document_metadatas = None,
|
||||||
|
processing_metadatas = None,
|
||||||
)
|
)
|
||||||
|
|
||||||
async def list(self, user, collection):
|
async def remove_document(self, request):
|
||||||
|
|
||||||
print("list")
|
print("Removing doc...")
|
||||||
|
|
||||||
info = self.table_store.list(user, collection)
|
if not await self.table_store.document_exists(
|
||||||
|
request.user,
|
||||||
|
request.document_id,
|
||||||
|
):
|
||||||
|
raise RuntimeError("Document does not exist")
|
||||||
|
|
||||||
print(">>", info)
|
object_id = await self.table_store.get_document_object_id(
|
||||||
|
request.user,
|
||||||
|
request.document_id
|
||||||
|
)
|
||||||
|
|
||||||
|
# Remove blob...
|
||||||
|
await self.blob_store.remove(object_id)
|
||||||
|
|
||||||
|
# Remove doc table row
|
||||||
|
await self.table_store.remove_document(
|
||||||
|
request.user,
|
||||||
|
request.document_id
|
||||||
|
)
|
||||||
|
|
||||||
|
print("Remove complete", flush=True)
|
||||||
|
|
||||||
return LibrarianResponse(
|
return LibrarianResponse(
|
||||||
error = None,
|
error = None,
|
||||||
document = None,
|
document_metadata = None,
|
||||||
info = info,
|
content = None,
|
||||||
|
document_metadatas = None,
|
||||||
|
processing_metadatas = None,
|
||||||
)
|
)
|
||||||
|
|
||||||
def handle_triples(self, m):
|
async def update_document(self, request):
|
||||||
self.table_store.add_triples(m)
|
|
||||||
|
|
||||||
def handle_graph_embeddings(self, m):
|
print("Updating doc...")
|
||||||
self.table_store.add_graph_embeddings(m)
|
|
||||||
|
|
||||||
def handle_document_embeddings(self, m):
|
# You can't update the document ID, user or kind.
|
||||||
self.table_store.add_document_embeddings(m)
|
|
||||||
|
if not await self.table_store.document_exists(
|
||||||
|
request.document_metadata.user,
|
||||||
|
request.document_metadata.id
|
||||||
|
):
|
||||||
|
raise RuntimeError("Document does not exist")
|
||||||
|
|
||||||
|
await self.table_store.update_document(request.document_metadata)
|
||||||
|
|
||||||
|
print("Update complete", flush=True)
|
||||||
|
|
||||||
|
return LibrarianResponse(
|
||||||
|
error = None,
|
||||||
|
document_metadata = None,
|
||||||
|
content = None,
|
||||||
|
document_metadatas = None,
|
||||||
|
processing_metadatas = None,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def get_document_metadata(self, request):
    """Look up the metadata record of one document.

    Reads ``request.user`` and ``request.document_id``, fetches the
    matching record from the table store and returns it in the
    ``document_metadata`` field of a ``LibrarianResponse``.  All other
    response fields are ``None``.
    """

    print("Get doc...")

    user, document_id = request.user, request.document_id
    metadata = await self.table_store.get_document(user, document_id)

    print("Get complete", flush=True)

    return LibrarianResponse(
        error = None,
        document_metadata = metadata,
        content = None,
        document_metadatas = None,
        processing_metadatas = None,
    )
|
||||||
|
|
||||||
|
async def get_document_content(self, request):
    """Return the stored content of one document, base64-encoded.

    The blob's object ID is resolved from ``request.user`` and
    ``request.document_id`` via the table store, the blob is read from
    the blob store, and the result of ``base64.b64encode`` on it is
    placed in the ``content`` field of a ``LibrarianResponse``.  All
    other response fields are ``None``.
    """

    print("Get doc content...")

    blob_id = await self.table_store.get_document_object_id(
        request.user, request.document_id
    )

    raw = await self.blob_store.get(blob_id)

    print("Get complete", flush=True)

    encoded = base64.b64encode(raw)

    return LibrarianResponse(
        error = None,
        document_metadata = None,
        content = encoded,
        document_metadatas = None,
        processing_metadatas = None,
    )
|
||||||
|
|
||||||
|
async def add_processing(self, request):
    """Record a processing entry for a stored document.

    Uses ``request.processing_metadata`` (its ``user``, ``id`` and
    ``document_id`` attributes are read here) and stores it through the
    table store.  Returns a ``LibrarianResponse`` with every field set
    to ``None``.

    Raises ``RuntimeError`` if the table store reports that a
    processing entry with the same user and id already exists.
    """

    print("Add processing")

    proc = request.processing_metadata

    if await self.table_store.processing_exists(proc.user, proc.id):
        raise RuntimeError("Processing already exists")

    # The document record, its blob object ID and the blob content are
    # all fetched, but nothing below uses the results.
    # NOTE(review): presumably these lookups double as a check that the
    # referenced document exists -- confirm before removing them.
    await self.table_store.get_document(proc.user, proc.document_id)

    blob_id = await self.table_store.get_document_object_id(
        proc.user, proc.document_id
    )

    await self.blob_store.get(blob_id)

    print("Got content")

    print("Add processing...")

    await self.table_store.add_processing(proc)

    print("Add complete", flush=True)

    return LibrarianResponse(
        error = None,
        document_metadata = None,
        content = None,
        document_metadatas = None,
        processing_metadatas = None,
    )
|
||||||
|
|
||||||
|
|
||||||
def handle_triples(self, m):
|
# if document.kind == "application/pdf":
|
||||||
self.table_store.add_triples(m)
|
# await self.load_document(document)
|
||||||
|
# elif document.kind == "text/plain":
|
||||||
|
# await self.load_text(document)
|
||||||
|
|
||||||
def handle_graph_embeddings(self, m):
|
async def remove_processing(self, request):
|
||||||
self.table_store.add_graph_embeddings(m)
|
|
||||||
|
|
||||||
def handle_document_embeddings(self, m):
|
print("Removing processing...")
|
||||||
self.table_store.add_document_embeddings(m)
|
|
||||||
|
if not await self.table_store.processing_exists(
|
||||||
|
request.user,
|
||||||
|
request.processing_id,
|
||||||
|
):
|
||||||
|
raise RuntimeError("Processing object does not exist")
|
||||||
|
|
||||||
|
# Remove doc table row
|
||||||
|
await self.table_store.remove_processing(
|
||||||
|
request.user,
|
||||||
|
request.processing_id
|
||||||
|
)
|
||||||
|
|
||||||
|
print("Remove complete", flush=True)
|
||||||
|
|
||||||
|
return LibrarianResponse(
|
||||||
|
error = None,
|
||||||
|
document_metadata = None,
|
||||||
|
content = None,
|
||||||
|
document_metadatas = None,
|
||||||
|
processing_metadatas = None,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def list_documents(self, request):
    """List the document metadata records of ``request.user``.

    The table store result is returned in the ``document_metadatas``
    field of a ``LibrarianResponse``; all other fields are ``None``.
    """

    return LibrarianResponse(
        error = None,
        document_metadata = None,
        content = None,
        document_metadatas = await self.table_store.list_documents(
            request.user
        ),
        processing_metadatas = None,
    )
|
||||||
|
|
||||||
|
async def list_processing(self, request):
    """List the processing records of ``request.user``.

    The table store result is returned in the ``processing_metadatas``
    field of a ``LibrarianResponse``; all other fields are ``None``.
    """

    return LibrarianResponse(
        error = None,
        document_metadata = None,
        content = None,
        document_metadatas = None,
        processing_metadatas = await self.table_store.list_processing(
            request.user
        ),
    )
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -5,41 +5,27 @@ Librarian service, manages documents in collections
|
||||||
|
|
||||||
from functools import partial
|
from functools import partial
|
||||||
import asyncio
|
import asyncio
|
||||||
import threading
|
|
||||||
import queue
|
|
||||||
import base64
|
import base64
|
||||||
|
import json
|
||||||
|
|
||||||
from pulsar.schema import JsonSchema
|
from .. base import AsyncProcessor, Consumer, Producer, Publisher, Subscriber
|
||||||
|
from .. base import ConsumerMetrics, ProducerMetrics
|
||||||
|
|
||||||
from .. schema import LibrarianRequest, LibrarianResponse, Error
|
from .. schema import LibrarianRequest, LibrarianResponse, Error
|
||||||
from .. schema import librarian_request_queue, librarian_response_queue
|
from .. schema import librarian_request_queue, librarian_response_queue
|
||||||
|
|
||||||
from .. schema import GraphEmbeddings
|
|
||||||
from .. schema import graph_embeddings_store_queue
|
|
||||||
from .. schema import Triples
|
|
||||||
from .. schema import triples_store_queue
|
|
||||||
from .. schema import DocumentEmbeddings
|
|
||||||
from .. schema import document_embeddings_store_queue
|
|
||||||
|
|
||||||
from .. schema import Document, Metadata
|
from .. schema import Document, Metadata
|
||||||
from .. schema import document_ingest_queue
|
|
||||||
from .. schema import TextDocument, Metadata
|
from .. schema import TextDocument, Metadata
|
||||||
from .. schema import text_ingest_queue
|
|
||||||
|
|
||||||
from .. base import Publisher
|
|
||||||
from .. base import Subscriber
|
|
||||||
|
|
||||||
from .. log_level import LogLevel
|
|
||||||
from .. base import ConsumerProducer
|
|
||||||
from .. exceptions import RequestError
|
from .. exceptions import RequestError
|
||||||
|
|
||||||
from . librarian import Librarian
|
from . librarian import Librarian
|
||||||
|
|
||||||
module = "librarian"
|
default_ident = "librarian"
|
||||||
|
|
||||||
|
default_librarian_request_queue = librarian_request_queue
|
||||||
|
default_librarian_response_queue = librarian_response_queue
|
||||||
|
|
||||||
default_input_queue = librarian_request_queue
|
|
||||||
default_output_queue = librarian_response_queue
|
|
||||||
default_subscriber = module
|
|
||||||
default_minio_host = "minio:9000"
|
default_minio_host = "minio:9000"
|
||||||
default_minio_access_key = "minioadmin"
|
default_minio_access_key = "minioadmin"
|
||||||
default_minio_secret_key = "minioadmin"
|
default_minio_secret_key = "minioadmin"
|
||||||
|
|
@ -50,15 +36,21 @@ bucket_name = "library"
|
||||||
# FIXME: How to ensure this doesn't conflict with other usage?
|
# FIXME: How to ensure this doesn't conflict with other usage?
|
||||||
keyspace = "librarian"
|
keyspace = "librarian"
|
||||||
|
|
||||||
class Processor(ConsumerProducer):
|
class Processor(AsyncProcessor):
|
||||||
|
|
||||||
def __init__(self, **params):
|
def __init__(self, **params):
|
||||||
|
|
||||||
self.running = True
|
id = params.get("id")
|
||||||
|
|
||||||
input_queue = params.get("input_queue", default_input_queue)
|
# self.running = True
|
||||||
output_queue = params.get("output_queue", default_output_queue)
|
|
||||||
subscriber = params.get("subscriber", default_subscriber)
|
librarian_request_queue = params.get(
|
||||||
|
"librarian_request_queue", default_librarian_request_queue
|
||||||
|
)
|
||||||
|
|
||||||
|
librarian_response_queue = params.get(
|
||||||
|
"librarian_response_queue", default_librarian_response_queue
|
||||||
|
)
|
||||||
|
|
||||||
minio_host = params.get("minio_host", default_minio_host)
|
minio_host = params.get("minio_host", default_minio_host)
|
||||||
minio_access_key = params.get(
|
minio_access_key = params.get(
|
||||||
|
|
@ -74,19 +66,10 @@ class Processor(ConsumerProducer):
|
||||||
cassandra_user = params.get("cassandra_user")
|
cassandra_user = params.get("cassandra_user")
|
||||||
cassandra_password = params.get("cassandra_password")
|
cassandra_password = params.get("cassandra_password")
|
||||||
|
|
||||||
triples_queue = params.get("triples_queue")
|
|
||||||
graph_embeddings_queue = params.get("graph_embeddings_queue")
|
|
||||||
document_embeddings_queue = params.get("document_embeddings_queue")
|
|
||||||
document_load_queue = params.get("document_load_queue")
|
|
||||||
text_load_queue = params.get("text_load_queue")
|
|
||||||
|
|
||||||
super(Processor, self).__init__(
|
super(Processor, self).__init__(
|
||||||
**params | {
|
**params | {
|
||||||
"input_queue": input_queue,
|
"librarian_request_queue": librarian_request_queue,
|
||||||
"output_queue": output_queue,
|
"librarian_response_queue": librarian_response_queue,
|
||||||
"subscriber": subscriber,
|
|
||||||
"input_schema": LibrarianRequest,
|
|
||||||
"output_schema": LibrarianResponse,
|
|
||||||
"minio_host": minio_host,
|
"minio_host": minio_host,
|
||||||
"minio_access_key": minio_access_key,
|
"minio_access_key": minio_access_key,
|
||||||
"cassandra_host": cassandra_host,
|
"cassandra_host": cassandra_host,
|
||||||
|
|
@ -94,38 +77,30 @@ class Processor(ConsumerProducer):
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
self.document_load = Publisher(
|
librarian_request_metrics = ConsumerMetrics(
|
||||||
self.client, document_load_queue, JsonSchema(Document),
|
processor = self.id, flow = None, name = "librarian-request"
|
||||||
)
|
)
|
||||||
|
|
||||||
self.text_load = Publisher(
|
librarian_response_metrics = ProducerMetrics(
|
||||||
self.client, text_load_queue, JsonSchema(TextDocument),
|
processor = self.id, flow = None, name = "librarian-response"
|
||||||
)
|
)
|
||||||
|
|
||||||
self.triples_brk = Subscriber(
|
self.librarian_request_consumer = Consumer(
|
||||||
self.client, triples_store_queue,
|
taskgroup = self.taskgroup,
|
||||||
"librarian", "librarian",
|
client = self.pulsar_client,
|
||||||
schema=JsonSchema(Triples),
|
flow = None,
|
||||||
)
|
topic = librarian_request_queue,
|
||||||
self.graph_embeddings_brk = Subscriber(
|
subscriber = id,
|
||||||
self.client, graph_embeddings_store_queue,
|
schema = LibrarianRequest,
|
||||||
"librarian", "librarian",
|
handler = self.on_librarian_request,
|
||||||
schema=JsonSchema(GraphEmbeddings),
|
metrics = librarian_request_metrics,
|
||||||
)
|
|
||||||
self.document_embeddings_brk = Subscriber(
|
|
||||||
self.client, document_embeddings_store_queue,
|
|
||||||
"librarian", "librarian",
|
|
||||||
schema=JsonSchema(DocumentEmbeddings),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
self.triples_reader = threading.Thread(
|
self.librarian_response_producer = Producer(
|
||||||
target=self.receive_triples
|
client = self.pulsar_client,
|
||||||
)
|
topic = librarian_response_queue,
|
||||||
self.graph_embeddings_reader = threading.Thread(
|
schema = LibrarianResponse,
|
||||||
target=self.receive_graph_embeddings
|
metrics = librarian_response_metrics,
|
||||||
)
|
|
||||||
self.document_embeddings_reader = threading.Thread(
|
|
||||||
target=self.receive_document_embeddings
|
|
||||||
)
|
)
|
||||||
|
|
||||||
self.librarian = Librarian(
|
self.librarian = Librarian(
|
||||||
|
|
@ -141,87 +116,34 @@ class Processor(ConsumerProducer):
|
||||||
load_text = self.load_text,
|
load_text = self.load_text,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.register_config_handler(self.on_librarian_config)
|
||||||
|
|
||||||
|
self.flows = {}
|
||||||
|
|
||||||
print("Initialised.", flush=True)
|
print("Initialised.", flush=True)
|
||||||
|
|
||||||
async def start(self):
|
async def start(self):
|
||||||
|
|
||||||
self.document_load.start()
|
|
||||||
self.text_load.start()
|
|
||||||
|
|
||||||
self.triples_brk.start()
|
await super(Processor, self).start()
|
||||||
self.graph_embeddings_brk.start()
|
await self.librarian_request_consumer.start()
|
||||||
self.document_embeddings_brk.start()
|
await self.librarian_response_producer.start()
|
||||||
|
|
||||||
self.triples_sub = self.triples_brk.subscribe_all("x")
|
async def on_librarian_config(self, config, version):
|
||||||
self.graph_embeddings_sub = self.graph_embeddings_brk.subscribe_all("x")
|
|
||||||
self.document_embeddings_sub = self.document_embeddings_brk.subscribe_all("x")
|
|
||||||
|
|
||||||
self.triples_reader.start()
|
print("config version", version)
|
||||||
self.graph_embeddings_reader.start()
|
|
||||||
self.document_embeddings_reader.start()
|
if "flows" in config:
|
||||||
|
|
||||||
|
self.flows = {
|
||||||
|
k: json.loads(v)
|
||||||
|
for k, v in config["flows"].items()
|
||||||
|
}
|
||||||
|
|
||||||
|
print(self.flows)
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
|
|
||||||
self.running = False
|
pass
|
||||||
|
|
||||||
if hasattr(self, "document_load"):
|
|
||||||
self.document_load.stop()
|
|
||||||
self.document_load.join()
|
|
||||||
|
|
||||||
if hasattr(self, "text_load"):
|
|
||||||
self.text_load.stop()
|
|
||||||
self.text_load.join()
|
|
||||||
|
|
||||||
if hasattr(self, "triples_sub"):
|
|
||||||
self.triples_sub.unsubscribe_all("x")
|
|
||||||
|
|
||||||
if hasattr(self, "graph_embeddings_sub"):
|
|
||||||
self.graph_embeddings_sub.unsubscribe_all("x")
|
|
||||||
|
|
||||||
if hasattr(self, "document_embeddings_sub"):
|
|
||||||
self.document_embeddings_sub.unsubscribe_all("x")
|
|
||||||
|
|
||||||
if hasattr(self, "triples_brk"):
|
|
||||||
self.triples_brk.stop()
|
|
||||||
self.triples_brk.join()
|
|
||||||
|
|
||||||
if hasattr(self, "graph_embeddings_brk"):
|
|
||||||
self.graph_embeddings_brk.stop()
|
|
||||||
self.graph_embeddings_brk.join()
|
|
||||||
|
|
||||||
if hasattr(self, "document_embeddings_brk"):
|
|
||||||
self.document_embeddings_brk.stop()
|
|
||||||
self.document_embeddings_brk.join()
|
|
||||||
|
|
||||||
def receive_triples(self):
|
|
||||||
|
|
||||||
while self.running:
|
|
||||||
try:
|
|
||||||
msg = self.triples_sub.get(timeout=1)
|
|
||||||
except queue.Empty:
|
|
||||||
continue
|
|
||||||
|
|
||||||
self.librarian.handle_triples(msg)
|
|
||||||
|
|
||||||
def receive_graph_embeddings(self):
|
|
||||||
|
|
||||||
while self.running:
|
|
||||||
try:
|
|
||||||
msg = self.graph_embeddings_sub.get(timeout=1)
|
|
||||||
except queue.Empty:
|
|
||||||
continue
|
|
||||||
|
|
||||||
self.librarian.handle_graph_embeddings(msg)
|
|
||||||
|
|
||||||
def receive_document_embeddings(self):
|
|
||||||
|
|
||||||
while self.running:
|
|
||||||
try:
|
|
||||||
msg = self.document_embeddings_sub.get(timeout=1)
|
|
||||||
except queue.Empty:
|
|
||||||
continue
|
|
||||||
|
|
||||||
self.librarian.handle_document_embeddings(msg)
|
|
||||||
|
|
||||||
async def load_document(self, document):
|
async def load_document(self, document):
|
||||||
|
|
||||||
|
|
@ -235,6 +157,8 @@ class Processor(ConsumerProducer):
|
||||||
data = document.document
|
data = document.document
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
self.document_load.send(None, doc)
|
self.document_load.send(None, doc)
|
||||||
|
|
||||||
async def load_text(self, document):
|
async def load_text(self, document):
|
||||||
|
|
@ -254,41 +178,31 @@ class Processor(ConsumerProducer):
|
||||||
|
|
||||||
self.text_load.send(None, doc)
|
self.text_load.send(None, doc)
|
||||||
|
|
||||||
def parse_request(self, v):
|
async def process_request(self, v):
|
||||||
|
|
||||||
if v.operation is None:
|
if v.operation is None:
|
||||||
raise RequestError("Null operation")
|
raise RequestError("Null operation")
|
||||||
|
|
||||||
print("op", v.operation)
|
print("requets", v.operation)
|
||||||
|
|
||||||
if v.operation == "add":
|
impls = {
|
||||||
if (
|
"add-document": self.librarian.add_document,
|
||||||
v.document and v.document.id and v.document.metadata and
|
"remove-document": self.librarian.remove_document,
|
||||||
v.document.document and v.document.kind
|
"update-document": self.librarian.update_document,
|
||||||
):
|
"get-document-metadata": self.librarian.get_document_metadata,
|
||||||
return partial(
|
"get-document-content": self.librarian.get_document_content,
|
||||||
self.librarian.add,
|
"add-processing": self.librarian.add_processing,
|
||||||
document = v.document,
|
"remove-processing": self.librarian.remove_processing,
|
||||||
)
|
"list-documents": self.librarian.list_documents,
|
||||||
else:
|
"list-processing": self.librarian.list_processing,
|
||||||
raise RequestError("Invalid call")
|
}
|
||||||
|
|
||||||
if v.operation == "list":
|
if v.operation not in impls:
|
||||||
print("list", v)
|
raise RequestError(f"Invalid operation: {v.operation}")
|
||||||
print(v.user)
|
|
||||||
if v.user:
|
|
||||||
return partial(
|
|
||||||
self.librarian.list,
|
|
||||||
user = v.user,
|
|
||||||
collection = v.collection,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
print("BROK")
|
|
||||||
raise RequestError("Invalid call")
|
|
||||||
|
|
||||||
raise RequestError("Invalid operation: " + v.operation)
|
return await impls[v.operation](v)
|
||||||
|
|
||||||
async def handle(self, msg):
|
async def on_librarian_request(self, msg, consumer, flow):
|
||||||
|
|
||||||
v = msg.value()
|
v = msg.value()
|
||||||
|
|
||||||
|
|
@ -299,20 +213,15 @@ class Processor(ConsumerProducer):
|
||||||
print(f"Handling input {id}...", flush=True)
|
print(f"Handling input {id}...", flush=True)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
func = self.parse_request(v)
|
|
||||||
except RequestError as e:
|
resp = await self.process_request(v)
|
||||||
resp = LibrarianResponse(
|
|
||||||
error = Error(
|
await self.librarian_response_producer.send(
|
||||||
type = "request-error",
|
resp, properties={"id": id}
|
||||||
message = str(e),
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
await self.send(resp, properties={"id": id})
|
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
|
||||||
resp = await func()
|
|
||||||
print("->", resp)
|
|
||||||
except RequestError as e:
|
except RequestError as e:
|
||||||
resp = LibrarianResponse(
|
resp = LibrarianResponse(
|
||||||
error = Error(
|
error = Error(
|
||||||
|
|
@ -320,31 +229,43 @@ class Processor(ConsumerProducer):
|
||||||
message = str(e),
|
message = str(e),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
await self.send(resp, properties={"id": id})
|
|
||||||
|
await self.librarian_response_producer.send(
|
||||||
|
resp, properties={"id": id}
|
||||||
|
)
|
||||||
|
|
||||||
return
|
return
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Exception:", e, flush=True)
|
|
||||||
resp = LibrarianResponse(
|
resp = LibrarianResponse(
|
||||||
error = Error(
|
error = Error(
|
||||||
type = "processing-error",
|
type = "unexpected-error",
|
||||||
message = "Unhandled error: " + str(e),
|
message = str(e),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
await self.send(resp, properties={"id": id})
|
|
||||||
|
await self.librarian_response_producer.send(
|
||||||
|
resp, properties={"id": id}
|
||||||
|
)
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
print("Send response..!.", flush=True)
|
|
||||||
|
|
||||||
await self.send(resp, properties={"id": id})
|
|
||||||
|
|
||||||
print("Done.", flush=True)
|
print("Done.", flush=True)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def add_args(parser):
|
def add_args(parser):
|
||||||
|
|
||||||
ConsumerProducer.add_args(
|
AsyncProcessor.add_args(parser)
|
||||||
parser, default_input_queue, default_subscriber,
|
|
||||||
default_output_queue,
|
parser.add_argument(
|
||||||
|
'--librarian-request-queue',
|
||||||
|
default=default_librarian_request_queue,
|
||||||
|
help=f'Config request queue (default: {default_librarian_request_queue})'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--librarian-response-queue',
|
||||||
|
default=default_librarian_response_queue,
|
||||||
|
help=f'Config response queue {default_librarian_response_queue}',
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
|
@ -385,40 +306,7 @@ class Processor(ConsumerProducer):
|
||||||
help=f'Cassandra password'
|
help=f'Cassandra password'
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
'--triples-queue',
|
|
||||||
default=triples_store_queue,
|
|
||||||
help=f'Triples queue (default: {triples_store_queue})'
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
'--graph-embeddings-queue',
|
|
||||||
default=graph_embeddings_store_queue,
|
|
||||||
help=f'Graph embeddings queue (default: {triples_store_queue})'
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
'--document-embeddings-queue',
|
|
||||||
default=document_embeddings_store_queue,
|
|
||||||
help='Document embeddings queue '
|
|
||||||
f'(default: {document_embeddings_store_queue})'
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
'--document-load-queue',
|
|
||||||
default=document_ingest_queue,
|
|
||||||
help='Document load queue '
|
|
||||||
f'(default: {document_ingest_queue})'
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
'--text-load-queue',
|
|
||||||
default=text_ingest_queue,
|
|
||||||
help='Text ingest queue '
|
|
||||||
f'(default: {text_ingest_queue})'
|
|
||||||
)
|
|
||||||
|
|
||||||
def run():
|
def run():
|
||||||
|
|
||||||
Processor.launch(module, __doc__)
|
Processor.launch(default_ident, __doc__)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,7 @@
|
||||||
|
|
||||||
from .. schema import LibrarianRequest, LibrarianResponse
|
from .. schema import LibrarianRequest, LibrarianResponse
|
||||||
from .. schema import DocumentInfo, Error, Triple, Value
|
from .. schema import DocumentMetadata, ProcessingMetadata
|
||||||
|
from .. schema import Error, Triple, Value
|
||||||
from .. knowledge import hash
|
from .. knowledge import hash
|
||||||
from .. exceptions import RequestError
|
from .. exceptions import RequestError
|
||||||
|
|
||||||
|
|
@ -7,8 +9,10 @@ from cassandra.cluster import Cluster
|
||||||
from cassandra.auth import PlainTextAuthProvider
|
from cassandra.auth import PlainTextAuthProvider
|
||||||
from cassandra.query import BatchStatement
|
from cassandra.query import BatchStatement
|
||||||
from ssl import SSLContext, PROTOCOL_TLSv1_2
|
from ssl import SSLContext, PROTOCOL_TLSv1_2
|
||||||
|
|
||||||
import uuid
|
import uuid
|
||||||
import time
|
import time
|
||||||
|
import asyncio
|
||||||
|
|
||||||
class TableStore:
|
class TableStore:
|
||||||
|
|
||||||
|
|
@ -63,18 +67,18 @@ class TableStore:
|
||||||
|
|
||||||
self.cassandra.execute("""
|
self.cassandra.execute("""
|
||||||
CREATE TABLE IF NOT EXISTS document (
|
CREATE TABLE IF NOT EXISTS document (
|
||||||
user text,
|
|
||||||
collection text,
|
|
||||||
id text,
|
id text,
|
||||||
|
user text,
|
||||||
time timestamp,
|
time timestamp,
|
||||||
|
kind text,
|
||||||
title text,
|
title text,
|
||||||
comments text,
|
comments text,
|
||||||
kind text,
|
|
||||||
object_id uuid,
|
|
||||||
metadata list<tuple<
|
metadata list<tuple<
|
||||||
text, boolean, text, boolean, text, boolean
|
text, boolean, text, boolean, text, boolean
|
||||||
>>,
|
>>,
|
||||||
PRIMARY KEY (user, collection, id)
|
tags list<text>,
|
||||||
|
object_id uuid,
|
||||||
|
PRIMARY KEY (user, id)
|
||||||
);
|
);
|
||||||
""");
|
""");
|
||||||
|
|
||||||
|
|
@ -85,6 +89,23 @@ class TableStore:
|
||||||
ON document (object_id)
|
ON document (object_id)
|
||||||
""");
|
""");
|
||||||
|
|
||||||
|
print("processing table...", flush=True)
|
||||||
|
|
||||||
|
self.cassandra.execute("""
|
||||||
|
CREATE TABLE IF NOT EXISTS processing (
|
||||||
|
id text,
|
||||||
|
document_id text,
|
||||||
|
time timestamp,
|
||||||
|
flow text,
|
||||||
|
user text,
|
||||||
|
collection text,
|
||||||
|
tags list<text>,
|
||||||
|
PRIMARY KEY (user, id)
|
||||||
|
);
|
||||||
|
""");
|
||||||
|
|
||||||
|
return
|
||||||
|
|
||||||
print("triples table...", flush=True)
|
print("triples table...", flush=True)
|
||||||
|
|
||||||
self.cassandra.execute("""
|
self.cassandra.execute("""
|
||||||
|
|
@ -155,26 +176,84 @@ class TableStore:
|
||||||
self.insert_document_stmt = self.cassandra.prepare("""
|
self.insert_document_stmt = self.cassandra.prepare("""
|
||||||
INSERT INTO document
|
INSERT INTO document
|
||||||
(
|
(
|
||||||
id, user, collection, kind, object_id, time, title, comments,
|
id, user, time,
|
||||||
metadata
|
kind, title, comments,
|
||||||
|
metadata, tags, object_id
|
||||||
)
|
)
|
||||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
""")
|
""")
|
||||||
|
|
||||||
|
self.update_document_stmt = self.cassandra.prepare("""
|
||||||
|
UPDATE document
|
||||||
|
SET time = ?, title = ?, comments = ?,
|
||||||
|
metadata = ?, tags = ?
|
||||||
|
WHERE user = ? AND id = ?
|
||||||
|
""")
|
||||||
|
|
||||||
|
self.get_document_stmt = self.cassandra.prepare("""
|
||||||
|
SELECT time, kind, title, comments, metadata, tags, object_id
|
||||||
|
FROM document
|
||||||
|
WHERE user = ? AND id = ?
|
||||||
|
""")
|
||||||
|
|
||||||
|
self.delete_document_stmt = self.cassandra.prepare("""
|
||||||
|
DELETE FROM document
|
||||||
|
WHERE user = ? AND id = ?
|
||||||
|
""")
|
||||||
|
|
||||||
|
self.test_document_exists_stmt = self.cassandra.prepare("""
|
||||||
|
SELECT id
|
||||||
|
FROM document
|
||||||
|
WHERE user = ? AND id = ?
|
||||||
|
LIMIT 1
|
||||||
|
""")
|
||||||
|
|
||||||
self.list_document_stmt = self.cassandra.prepare("""
|
self.list_document_stmt = self.cassandra.prepare("""
|
||||||
SELECT
|
SELECT
|
||||||
id, kind, user, collection, title, comments, time, metadata
|
id, time, kind, title, comments, metadata, tags, object_id
|
||||||
FROM document
|
FROM document
|
||||||
WHERE user = ?
|
WHERE user = ?
|
||||||
""")
|
""")
|
||||||
|
|
||||||
self.list_document_by_collection_stmt = self.cassandra.prepare("""
|
self.list_document_by_tag_stmt = self.cassandra.prepare("""
|
||||||
SELECT
|
SELECT
|
||||||
id, kind, user, collection, title, comments, time, metadata
|
id, time, kind, title, comments, metadata, tags, object_id
|
||||||
FROM document
|
FROM document
|
||||||
WHERE user = ? AND collection = ?
|
WHERE user = ? AND tags CONTAINS ?
|
||||||
|
ALLOW FILTERING
|
||||||
""")
|
""")
|
||||||
|
|
||||||
|
self.insert_processing_stmt = self.cassandra.prepare("""
|
||||||
|
INSERT INTO processing
|
||||||
|
(
|
||||||
|
id, document_id, time,
|
||||||
|
flow, user, collection,
|
||||||
|
tags
|
||||||
|
)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||||
|
""")
|
||||||
|
|
||||||
|
self.delete_processing_stmt = self.cassandra.prepare("""
|
||||||
|
DELETE FROM processing
|
||||||
|
WHERE user = ? AND id = ?
|
||||||
|
""")
|
||||||
|
|
||||||
|
self.test_processing_exists_stmt = self.cassandra.prepare("""
|
||||||
|
SELECT id
|
||||||
|
FROM processing
|
||||||
|
WHERE user = ? AND id = ?
|
||||||
|
LIMIT 1
|
||||||
|
""")
|
||||||
|
|
||||||
|
self.list_processing_stmt = self.cassandra.prepare("""
|
||||||
|
SELECT
|
||||||
|
id, document_id, time, flow, collection, tags
|
||||||
|
FROM processing
|
||||||
|
WHERE user = ?
|
||||||
|
""")
|
||||||
|
|
||||||
|
return
|
||||||
|
|
||||||
self.insert_triples_stmt = self.cassandra.prepare("""
|
self.insert_triples_stmt = self.cassandra.prepare("""
|
||||||
INSERT INTO triples
|
INSERT INTO triples
|
||||||
(
|
(
|
||||||
|
|
@ -202,17 +281,24 @@ class TableStore:
|
||||||
VALUES (?, ?, ?, ?, ?, ?, ?)
|
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||||
""")
|
""")
|
||||||
|
|
||||||
def add(self, object_id, document):
|
async def document_exists(self, user, id):
|
||||||
|
|
||||||
if document.kind not in (
|
resp = self.cassandra.execute(
|
||||||
"text/plain", "application/pdf"
|
self.test_document_exists_stmt,
|
||||||
):
|
( user, id )
|
||||||
raise RequestError("Invalid document kind: " + document.kind)
|
)
|
||||||
|
|
||||||
# Create random doc ID
|
# If a row exists, document exists. It's a cursor, can't just
|
||||||
when = int(time.time() * 1000)
|
# count the length
|
||||||
|
|
||||||
print("Adding", document.id, object_id)
|
for row in resp:
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def add_document(self, document, object_id):
|
||||||
|
|
||||||
|
print("Adding document", document.id, object_id)
|
||||||
|
|
||||||
metadata = [
|
metadata = [
|
||||||
(
|
(
|
||||||
|
|
@ -229,10 +315,9 @@ class TableStore:
|
||||||
resp = self.cassandra.execute(
|
resp = self.cassandra.execute(
|
||||||
self.insert_document_stmt,
|
self.insert_document_stmt,
|
||||||
(
|
(
|
||||||
document.id, document.user, document.collection,
|
document.id, document.user, int(document.time * 1000),
|
||||||
document.kind, object_id, when,
|
document.kind, document.title, document.comments,
|
||||||
document.title, document.comments,
|
metadata, document.tags, object_id
|
||||||
metadata
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -242,11 +327,71 @@ class TableStore:
|
||||||
|
|
||||||
print("Exception:", type(e))
|
print("Exception:", type(e))
|
||||||
print(f"{e}, retry...", flush=True)
|
print(f"{e}, retry...", flush=True)
|
||||||
time.sleep(1)
|
await asyncio.sleep(1)
|
||||||
|
|
||||||
print("Add complete", flush=True)
|
print("Add complete", flush=True)
|
||||||
|
|
||||||
def add_triples(self, m):
|
async def update_document(self, document):
    """Rewrite the editable fields of an existing document row.

    The prepared UPDATE sets time, title, comments, metadata and tags on
    the row identified by (document.user, document.id).  Any exception
    from the driver is printed and the write is retried after a one
    second pause, indefinitely.

    NOTE(review): document.time is multiplied by 1000 before being bound,
    presumably converting epoch seconds to milliseconds - confirm.
    """

    print("Updating document", document.id)

    # Each metadata triple is flattened to a 6-tuple:
    # (s value, s is_uri, p value, p is_uri, o value, o is_uri)
    triples = []
    for t in document.metadata:
        triples.append((
            t.s.value, t.s.is_uri, t.p.value, t.p.is_uri,
            t.o.value, t.o.is_uri
        ))

    written = False

    while not written:

        try:

            # Values are bound in statement order: SET columns first,
            # then the (user, id) key.  Built inside the try so that a
            # failure here is retried exactly like a driver failure.
            params = (
                int(document.time * 1000), document.title,
                document.comments, triples, document.tags,
                document.user, document.id
            )

            self.cassandra.execute(self.update_document_stmt, params)

            written = True

        except Exception as e:

            print("Exception:", type(e))
            print(f"{e}, retry...", flush=True)
            await asyncio.sleep(1)

    print("Update complete", flush=True)
|
||||||
|
|
||||||
|
async def remove_document(self, user, document_id):
    """Delete the document row identified by (user, document_id).

    Any exception from the driver is printed and the delete is retried
    after a one second pause, indefinitely.
    """

    print("Removing document", document_id)

    while True:

        try:
            self.cassandra.execute(
                self.delete_document_stmt, (user, document_id)
            )
        except Exception as e:
            print("Exception:", type(e))
            print(f"{e}, retry...", flush=True)
            await asyncio.sleep(1)
        else:
            # Statement accepted, stop retrying
            break

    print("Delete complete", flush=True)
|
||||||
|
|
||||||
|
async def add_triples(self, m):
|
||||||
|
|
||||||
when = int(time.time() * 1000)
|
when = int(time.time() * 1000)
|
||||||
|
|
||||||
|
|
@ -288,76 +433,235 @@ class TableStore:
|
||||||
|
|
||||||
print("Exception:", type(e))
|
print("Exception:", type(e))
|
||||||
print(f"{e}, retry...", flush=True)
|
print(f"{e}, retry...", flush=True)
|
||||||
time.sleep(1)
|
await asyncio.sleep(1)
|
||||||
|
|
||||||
def list(self, user, collection=None):
|
async def list_documents(self, user):
|
||||||
|
|
||||||
|
print("List documents...")
|
||||||
|
|
||||||
print("LIST")
|
|
||||||
while True:
|
while True:
|
||||||
|
|
||||||
print("TRY")
|
|
||||||
|
|
||||||
print(self.list_document_stmt)
|
|
||||||
try:
|
try:
|
||||||
|
|
||||||
if collection:
|
resp = self.cassandra.execute(
|
||||||
resp = self.cassandra.execute(
|
self.list_document_stmt,
|
||||||
self.list_document_by_collection_stmt,
|
(user,)
|
||||||
(user, collection)
|
)
|
||||||
)
|
|
||||||
else:
|
|
||||||
resp = self.cassandra.execute(
|
|
||||||
self.list_document_stmt,
|
|
||||||
(user,)
|
|
||||||
)
|
|
||||||
break
|
|
||||||
|
|
||||||
print("OK")
|
break
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Exception:", type(e))
|
print("Exception:", type(e))
|
||||||
print(f"{e}, retry...", flush=True)
|
print(f"{e}, retry...", flush=True)
|
||||||
time.sleep(1)
|
await asyncio.sleep(1)
|
||||||
|
|
||||||
print("OK2")
|
|
||||||
|
|
||||||
info = [
|
lst = [
|
||||||
DocumentInfo(
|
DocumentMetadata(
|
||||||
id = row[0],
|
id = row[0],
|
||||||
kind = row[1],
|
user = user,
|
||||||
user = row[2],
|
time = int(time.mktime(row[1].timetuple())),
|
||||||
collection = row[3],
|
kind = row[2],
|
||||||
title = row[4],
|
title = row[3],
|
||||||
comments = row[5],
|
comments = row[4],
|
||||||
time = int(1000 * row[6].timestamp()),
|
|
||||||
metadata = [
|
metadata = [
|
||||||
Triple(
|
Triple(
|
||||||
s=Value(value=m[0], is_uri=m[1]),
|
s=Value(value=m[0], is_uri=m[1]),
|
||||||
p=Value(value=m[2], is_uri=m[3]),
|
p=Value(value=m[2], is_uri=m[3]),
|
||||||
o=Value(value=m[4], is_uri=m[5])
|
o=Value(value=m[4], is_uri=m[5])
|
||||||
)
|
)
|
||||||
for m in row[7]
|
for m in row[5]
|
||||||
],
|
],
|
||||||
|
tags = row[6],
|
||||||
|
object_id = row[7],
|
||||||
)
|
)
|
||||||
for row in resp
|
for row in resp
|
||||||
]
|
]
|
||||||
|
|
||||||
print("OK3")
|
print("Done")
|
||||||
|
|
||||||
print(info[0])
|
return lst
|
||||||
|
|
||||||
print(info[0].user)
|
async def get_document(self, user, id):
|
||||||
print(info[0].time)
|
|
||||||
print(info[0].kind)
|
|
||||||
print(info[0].collection)
|
|
||||||
print(info[0].title)
|
|
||||||
print(info[0].comments)
|
|
||||||
print(info[0].metadata)
|
|
||||||
print(info[0].metadata)
|
|
||||||
|
|
||||||
return info
|
print("Get document")
|
||||||
|
|
||||||
def add_graph_embeddings(self, m):
|
while True:
|
||||||
|
|
||||||
|
try:
|
||||||
|
|
||||||
|
resp = self.cassandra.execute(
|
||||||
|
self.get_document_stmt,
|
||||||
|
(user, id)
|
||||||
|
)
|
||||||
|
|
||||||
|
break
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print("Exception:", type(e))
|
||||||
|
print(f"{e}, retry...", flush=True)
|
||||||
|
await asyncio.sleep(1)
|
||||||
|
|
||||||
|
|
||||||
|
for row in resp:
|
||||||
|
doc = DocumentMetadata(
|
||||||
|
id = id,
|
||||||
|
user = user,
|
||||||
|
time = int(time.mktime(row[0].timetuple())),
|
||||||
|
kind = row[1],
|
||||||
|
title = row[2],
|
||||||
|
comments = row[3],
|
||||||
|
metadata = [
|
||||||
|
Triple(
|
||||||
|
s=Value(value=m[0], is_uri=m[1]),
|
||||||
|
p=Value(value=m[2], is_uri=m[3]),
|
||||||
|
o=Value(value=m[4], is_uri=m[5])
|
||||||
|
)
|
||||||
|
for m in row[4]
|
||||||
|
],
|
||||||
|
tags = row[5],
|
||||||
|
object_id = row[6],
|
||||||
|
)
|
||||||
|
|
||||||
|
print("Done")
|
||||||
|
return doc
|
||||||
|
|
||||||
|
raise RuntimeError("No such document row?")
|
||||||
|
|
||||||
|
async def get_document_object_id(self, user, id):
    """Return the object_id stored on the document row for (user, id).

    Runs the same prepared SELECT used to fetch a whole document and
    returns the value at index 6 of the first row, which is the
    object_id column in that statement.  Driver exceptions are printed
    and the query is retried after a one second pause, indefinitely.

    Raises:
        RuntimeError: the query returned no rows.
    """

    print("Get document obj ID")

    fetched = False

    while not fetched:

        try:
            resp = self.cassandra.execute(
                self.get_document_stmt, (user, id)
            )
            fetched = True
        except Exception as e:
            print("Exception:", type(e))
            print(f"{e}, retry...", flush=True)
            await asyncio.sleep(1)

    # Only the first row matters; an exhausted result means no such row
    first = next(iter(resp), None)

    if first is None:
        raise RuntimeError("No such document row?")

    print("Done")
    return first[6]
|
||||||
|
|
||||||
|
async def processing_exists(self, user, id):
    """Report whether a processing row exists for (user, id).

    Executes the prepared existence query once (no retry on failure) and
    returns True if it yields at least one row, False otherwise.
    """

    rows = self.cassandra.execute(
        self.test_processing_exists_stmt, (user, id)
    )

    # The result is a cursor rather than a sized collection, so probe
    # for a first row instead of taking a length.
    return any(True for _ in rows)
|
||||||
|
|
||||||
|
async def add_processing(self, processing):
    """Insert one processing record.

    Binds id, document_id, time, flow, user, collection and tags from
    the given object to the prepared INSERT.  Driver exceptions are
    printed and the insert is retried after a one second pause,
    indefinitely.

    NOTE(review): processing.time is multiplied by 1000 before being
    bound, presumably converting epoch seconds to milliseconds - confirm.
    """

    print("Adding processing", processing.id)

    inserted = False

    while not inserted:

        try:

            # Built inside the try so that a failure while reading the
            # fields is retried exactly like a driver failure.
            params = (
                processing.id, processing.document_id,
                int(processing.time * 1000), processing.flow,
                processing.user, processing.collection,
                processing.tags
            )

            self.cassandra.execute(self.insert_processing_stmt, params)

            inserted = True

        except Exception as e:

            print("Exception:", type(e))
            print(f"{e}, retry...", flush=True)
            await asyncio.sleep(1)

    print("Add complete", flush=True)
|
||||||
|
|
||||||
|
async def remove_processing(self, user, processing_id):
    """Delete the processing row identified by (user, processing_id).

    Any exception from the driver is printed and the delete is retried
    after a one second pause, indefinitely.
    """

    print("Removing processing", processing_id)

    while True:

        try:
            self.cassandra.execute(
                self.delete_processing_stmt, (user, processing_id)
            )
        except Exception as e:
            print("Exception:", type(e))
            print(f"{e}, retry...", flush=True)
            await asyncio.sleep(1)
        else:
            # Statement accepted, stop retrying
            break

    print("Delete complete", flush=True)
|
||||||
|
|
||||||
|
async def list_processing(self, user):
    """Return every processing record stored for a user.

    Runs the prepared list query for `user` and converts each row
    (id, document_id, time, flow, collection, tags) into a
    ProcessingMetadata.  The `user` field of each result is taken from
    the argument, not from the row.  Driver exceptions are printed and
    the query is retried after a one second pause, indefinitely.

    The time field is returned as whole epoch seconds.
    """

    # Function-scope import keeps this block self-contained
    import calendar

    print("List processing objects")

    while True:

        try:

            resp = self.cassandra.execute(
                self.list_processing_stmt,
                (user,)
            )

            break

        except Exception as e:
            print("Exception:", type(e))
            print(f"{e}, retry...", flush=True)
            await asyncio.sleep(1)

    lst = [
        ProcessingMetadata(
            id = row[0],
            document_id = row[1],
            # The Cassandra driver returns timestamps as naive datetimes
            # in UTC.  time.mktime() would read the tuple as local time
            # and shift the result by the host's UTC offset;
            # calendar.timegm() reads it as UTC.
            time = int(calendar.timegm(row[2].timetuple())),
            flow = row[3],
            user = user,
            collection = row[4],
            tags = row[5],
        )
        for row in resp
    ]

    print("Done")

    return lst
|
||||||
|
|
||||||
|
async def add_graph_embeddings(self, m):
|
||||||
|
|
||||||
when = int(time.time() * 1000)
|
when = int(time.time() * 1000)
|
||||||
|
|
||||||
|
|
@ -399,9 +703,9 @@ class TableStore:
|
||||||
|
|
||||||
print("Exception:", type(e))
|
print("Exception:", type(e))
|
||||||
print(f"{e}, retry...", flush=True)
|
print(f"{e}, retry...", flush=True)
|
||||||
time.sleep(1)
|
await asyncio.sleep(1)
|
||||||
|
|
||||||
def add_document_embeddings(self, m):
|
async def add_document_embeddings(self, m):
|
||||||
|
|
||||||
when = int(time.time() * 1000)
|
when = int(time.time() * 1000)
|
||||||
|
|
||||||
|
|
@ -443,6 +747,6 @@ class TableStore:
|
||||||
|
|
||||||
print("Exception:", type(e))
|
print("Exception:", type(e))
|
||||||
print(f"{e}, retry...", flush=True)
|
print(f"{e}, retry...", flush=True)
|
||||||
time.sleep(1)
|
await asyncio.sleep(1)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue