Librarian doc submission (#362)

This commit is contained in:
cybermaggedon 2025-05-04 22:56:47 +01:00 committed by GitHub
parent ff28d26f4d
commit 8146f0f2ff
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 112 additions and 34 deletions

View file

@ -0,0 +1,50 @@
#!/usr/bin/env python3
import requests
import json
import sys
import base64
import time
# Base URL of the API gateway; the librarian endpoint is appended below.
url = "http://localhost:8088/api/v1/"

# Upper bound (seconds) on the HTTP call so the script cannot hang forever
# if the gateway is unreachable or never answers.
REQUEST_TIMEOUT = 60

############################################################################

# Fixed identifiers used by this test: the document to be processed and the
# ID under which the processing record is registered.
doc_id = "http://trustgraph.ai/doc/6d034da9-2759-45c2-af24-14db7f4c44c2"
proc_id = "72be9c56-a63a-4dde-8f3c-9b35f2598b83"

# Librarian "add-processing" request.  Named `request` rather than `input`
# so the builtin input() is not shadowed.
request = {
    "operation": "add-processing",
    "processing-metadata": {
        "id": proc_id,
        "document-id": doc_id,
        # Submission time as whole seconds since the Unix epoch.
        "time": int(time.time()),
        "flow": "0000",
        "user": "trustgraph",
        "collection": "default",
        "tags": ["test"],
    }
}

resp = requests.post(
    f"{url}librarian",
    json=request,
    timeout=REQUEST_TIMEOUT,
)

# Show the raw body first: it is the only useful diagnostic when the reply
# turns out not to be JSON.
print(resp.text)

try:
    result = resp.json()
except ValueError:
    # requests raises a ValueError subclass when the body is not valid JSON.
    print(f"Error: response is not valid JSON (HTTP {resp.status_code})")
    sys.exit(1)

print(result)

# The service reports failures through an "error" key in the JSON body.
if "error" in result:
    print(f"Error: {result['error']}")
    sys.exit(1)

print(result)

sys.exit(0)
############################################################################

View file

@ -14,7 +14,7 @@ class Librarian:
self,
cassandra_host, cassandra_user, cassandra_password,
minio_host, minio_access_key, minio_secret_key,
bucket_name, keyspace, load_document, load_text,
bucket_name, keyspace, load_document,
):
self.blob_store = BlobStore(
@ -26,7 +26,6 @@ class Librarian:
)
self.load_document = load_document
self.load_text = load_text
async def add_document(self, request):
@ -199,6 +198,14 @@ class Librarian:
await self.table_store.add_processing(request.processing_metadata)
print("Invoke document processing...")
await self.load_document(
document = doc,
processing = request.processing_metadata,
content = content,
)
print("Add complete", flush=True)
return LibrarianResponse(
@ -209,12 +216,6 @@ class Librarian:
processing_metadatas = None,
)
# if document.kind == "application/pdf":
# await self.load_document(document)
# elif document.kind == "text/plain":
# await self.load_text(document)
async def remove_processing(self, request):
print("Removing processing...")

View file

@ -113,7 +113,6 @@ class Processor(AsyncProcessor):
bucket_name = bucket_name,
keyspace = keyspace,
load_document = self.load_document,
load_text = self.load_text,
)
self.register_config_handler(self.on_librarian_config)
@ -145,38 +144,66 @@ class Processor(AsyncProcessor):
pass
async def load_document(self, document, processing, content):
    """Submit a stored document to the flow named by a processing record.

    The target queue is looked up in the selected flow's "interfaces"
    table under a key chosen from the document's MIME type, and the
    document is published on that queue with a short-lived Publisher.

    Args:
        document: Document metadata; `kind`, `id` and `metadata` are read.
        processing: Processing metadata; `flow`, `user` and `collection`
            are read.
        content: Raw document content.  It is passed through unchanged for
            text documents and base64-encoded for everything else.
            NOTE(review): presumably bytes -- confirm against the caller.

    Raises:
        RuntimeError: If `processing.flow` is not a known flow, or the
            document's MIME type is not supported.
    """

    print("Ready for processing...")

    print(document, processing, len(content))

    if processing.flow not in self.flows:
        raise RuntimeError("Invalid flow ID")

    flow = self.flows[processing.flow]

    # MIME type -> name of the flow interface which accepts that content.
    interface_by_mime = {
        "text/plain": "text-load",
        "application/pdf": "document-load",
    }

    kind = interface_by_mime.get(document.kind)

    if kind is None:
        raise RuntimeError("Document with a MIME type I don't know")

    q = flow["interfaces"][kind]

    # User and collection come from the processing request rather than
    # from the stored document.
    metadata = Metadata(
        id = document.id,
        metadata = document.metadata,
        user = processing.user,
        collection = processing.collection
    )

    if kind == "text-load":
        doc = TextDocument(
            metadata = metadata,
            text = content,
        )
        schema = TextDocument
    else:
        doc = Document(
            metadata = metadata,
            data = base64.b64encode(content).decode("utf-8")
        )
        schema = Document

    print(f"Submit on queue {q}...")

    pub = Publisher(
        self.pulsar_client, q, schema=schema
    )

    await pub.start()

    try:

        # FIXME: Time wait kludge?
        await asyncio.sleep(1)

        await pub.send(None, doc)

    finally:
        # Always stop the publisher, even when the send fails, so a failed
        # submission does not leave it running.
        await pub.stop()

    print("Document submitted")
async def process_request(self, v):