mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
Librarian doc submission (#362)
This commit is contained in:
parent
ff28d26f4d
commit
8146f0f2ff
3 changed files with 112 additions and 34 deletions
50
test-api/test-library-add-processing2
Executable file
50
test-api/test-library-add-processing2
Executable file
|
|
@ -0,0 +1,50 @@
|
||||||
|
#!/usr/bin/env python3
"""Submit an "add-processing" request to the librarian API.

Posts a processing-metadata record (fixed document ID and processing ID,
current time as the timestamp) to the ``librarian`` endpoint below ``url``.
The raw response text and the decoded response are printed.

Exit status is 1 when the reply is not valid JSON or contains an "error"
key, and 0 otherwise.
"""

import requests
import json
import sys
import base64
import time

url = "http://localhost:8088/api/v1/"

############################################################################

doc_id = "http://trustgraph.ai/doc/6d034da9-2759-45c2-af24-14db7f4c44c2"

proc_id = "72be9c56-a63a-4dde-8f3c-9b35f2598b83"


def main():
    """Send the add-processing request and report the outcome."""

    # Named `request` rather than `input` so the builtin is not shadowed.
    request = {
        "operation": "add-processing",
        "processing-metadata": {
            "id": proc_id,
            "document-id": doc_id,
            "time": int(time.time()),
            "flow": "0000",
            "user": "trustgraph",
            "collection": "default",
            "tags": ["test"],
        }
    }

    resp = requests.post(
        f"{url}librarian",
        json=request,
    )

    # Raw body first, so it is visible even if JSON decoding fails below.
    print(resp.text)

    try:
        resp = resp.json()
    except ValueError as e:
        # A non-JSON reply (e.g. a gateway error page) is reported through
        # the same error path as an API error instead of a traceback.
        print(f"Error: response is not valid JSON: {e}")
        sys.exit(1)

    print(resp)

    if "error" in resp:
        print(f"Error: {resp['error']}")
        sys.exit(1)

    print(resp)

    sys.exit(0)


if __name__ == "__main__":
    main()
|
||||||
|
|
||||||
|
############################################################################
|
||||||
|
|
||||||
|
|
@ -14,7 +14,7 @@ class Librarian:
|
||||||
self,
|
self,
|
||||||
cassandra_host, cassandra_user, cassandra_password,
|
cassandra_host, cassandra_user, cassandra_password,
|
||||||
minio_host, minio_access_key, minio_secret_key,
|
minio_host, minio_access_key, minio_secret_key,
|
||||||
bucket_name, keyspace, load_document, load_text,
|
bucket_name, keyspace, load_document,
|
||||||
):
|
):
|
||||||
|
|
||||||
self.blob_store = BlobStore(
|
self.blob_store = BlobStore(
|
||||||
|
|
@ -26,7 +26,6 @@ class Librarian:
|
||||||
)
|
)
|
||||||
|
|
||||||
self.load_document = load_document
|
self.load_document = load_document
|
||||||
self.load_text = load_text
|
|
||||||
|
|
||||||
async def add_document(self, request):
|
async def add_document(self, request):
|
||||||
|
|
||||||
|
|
@ -199,6 +198,14 @@ class Librarian:
|
||||||
|
|
||||||
await self.table_store.add_processing(request.processing_metadata)
|
await self.table_store.add_processing(request.processing_metadata)
|
||||||
|
|
||||||
|
print("Invoke document processing...")
|
||||||
|
|
||||||
|
await self.load_document(
|
||||||
|
document = doc,
|
||||||
|
processing = request.processing_metadata,
|
||||||
|
content = content,
|
||||||
|
)
|
||||||
|
|
||||||
print("Add complete", flush=True)
|
print("Add complete", flush=True)
|
||||||
|
|
||||||
return LibrarianResponse(
|
return LibrarianResponse(
|
||||||
|
|
@ -209,12 +216,6 @@ class Librarian:
|
||||||
processing_metadatas = None,
|
processing_metadatas = None,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# if document.kind == "application/pdf":
|
|
||||||
# await self.load_document(document)
|
|
||||||
# elif document.kind == "text/plain":
|
|
||||||
# await self.load_text(document)
|
|
||||||
|
|
||||||
async def remove_processing(self, request):
|
async def remove_processing(self, request):
|
||||||
|
|
||||||
print("Removing processing...")
|
print("Removing processing...")
|
||||||
|
|
|
||||||
|
|
@ -113,7 +113,6 @@ class Processor(AsyncProcessor):
|
||||||
bucket_name = bucket_name,
|
bucket_name = bucket_name,
|
||||||
keyspace = keyspace,
|
keyspace = keyspace,
|
||||||
load_document = self.load_document,
|
load_document = self.load_document,
|
||||||
load_text = self.load_text,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
self.register_config_handler(self.on_librarian_config)
|
self.register_config_handler(self.on_librarian_config)
|
||||||
|
|
@ -145,38 +144,66 @@ class Processor(AsyncProcessor):
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
async def load_document(self, document):
|
async def load_document(self, document, processing, content):
    """Submit a document to the loader queue of the flow in `processing`.

    Args:
        document: Document record; `kind`, `id` and `metadata` are read.
        processing: Processing record; `flow`, `user` and `collection`
            are read.
        content: Document content. Passed through unchanged as the text
            of a "text/plain" document and base64-encoded for
            "application/pdf" (so presumably bytes -- TODO confirm).

    Raises:
        RuntimeError: if `processing.flow` is not a key of `self.flows`,
            or if `document.kind` is neither "text/plain" nor
            "application/pdf".
    """

    print("Ready for processing...")

    print(document, processing, len(content))

    if processing.flow not in self.flows:
        raise RuntimeError("Invalid flow ID")

    flow = self.flows[processing.flow]

    # Map the MIME type onto the name of the flow interface to submit to.
    if document.kind == "text/plain":
        kind = "text-load"
    elif document.kind == "application/pdf":
        kind = "document-load"
    else:
        raise RuntimeError("Document with a MIME type I don't know")

    q = flow["interfaces"][kind]

    # User and collection come from the processing record, not from the
    # document itself.
    metadata = Metadata(
        id = document.id,
        metadata = document.metadata,
        user = processing.user,
        collection = processing.collection
    )

    if kind == "text-load":
        schema = TextDocument
        doc = TextDocument(
            metadata = metadata,
            text = content,
        )
    else:
        schema = Document
        doc = Document(
            metadata = metadata,
            data = base64.b64encode(content).decode("utf-8")
        )

    print(f"Submit on queue {q}...")

    pub = Publisher(
        self.pulsar_client, q, schema=schema
    )

    await pub.start()

    try:

        # FIXME: Time wait kludge?
        # NOTE(review): presumably gives the publisher time to become
        # ready before the send -- confirm whether this is still needed.
        await asyncio.sleep(1)

        await pub.send(None, doc)

    finally:

        # Always stop the publisher, so a failed send does not leave it
        # running.
        await pub.stop()

    print("Document submitted")
|
||||||
|
|
||||||
async def process_request(self, v):
|
async def process_request(self, v):
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue