mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-29 02:23:44 +02:00
Librarian doc submission (#362)
This commit is contained in:
parent
ff28d26f4d
commit
8146f0f2ff
3 changed files with 112 additions and 34 deletions
|
|
@ -14,7 +14,7 @@ class Librarian:
|
|||
self,
|
||||
cassandra_host, cassandra_user, cassandra_password,
|
||||
minio_host, minio_access_key, minio_secret_key,
|
||||
bucket_name, keyspace, load_document, load_text,
|
||||
bucket_name, keyspace, load_document,
|
||||
):
|
||||
|
||||
self.blob_store = BlobStore(
|
||||
|
|
@ -26,7 +26,6 @@ class Librarian:
|
|||
)
|
||||
|
||||
self.load_document = load_document
|
||||
self.load_text = load_text
|
||||
|
||||
async def add_document(self, request):
|
||||
|
||||
|
|
@ -199,6 +198,14 @@ class Librarian:
|
|||
|
||||
await self.table_store.add_processing(request.processing_metadata)
|
||||
|
||||
print("Invoke document processing...")
|
||||
|
||||
await self.load_document(
|
||||
document = doc,
|
||||
processing = request.processing_metadata,
|
||||
content = content,
|
||||
)
|
||||
|
||||
print("Add complete", flush=True)
|
||||
|
||||
return LibrarianResponse(
|
||||
|
|
@ -208,12 +215,6 @@ class Librarian:
|
|||
document_metadatas = None,
|
||||
processing_metadatas = None,
|
||||
)
|
||||
|
||||
|
||||
# if document.kind == "application/pdf":
|
||||
# await self.load_document(document)
|
||||
# elif document.kind == "text/plain":
|
||||
# await self.load_text(document)
|
||||
|
||||
async def remove_processing(self, request):
|
||||
|
||||
|
|
|
|||
|
|
@ -113,7 +113,6 @@ class Processor(AsyncProcessor):
|
|||
bucket_name = bucket_name,
|
||||
keyspace = keyspace,
|
||||
load_document = self.load_document,
|
||||
load_text = self.load_text,
|
||||
)
|
||||
|
||||
self.register_config_handler(self.on_librarian_config)
|
||||
|
|
@ -145,38 +144,66 @@ class Processor(AsyncProcessor):
|
|||
|
||||
pass
|
||||
|
||||
async def load_document(self, document):
|
||||
async def load_document(self, document, processing, content):
|
||||
|
||||
doc = Document(
|
||||
metadata = Metadata(
|
||||
id = document.id,
|
||||
metadata = document.metadata,
|
||||
user = document.user,
|
||||
collection = document.collection
|
||||
),
|
||||
data = document.document
|
||||
print("Ready for processing...")
|
||||
|
||||
print(document, processing, len(content))
|
||||
|
||||
if processing.flow not in self.flows:
|
||||
raise RuntimeError("Invalid flow ID")
|
||||
|
||||
flow = self.flows[processing.flow]
|
||||
|
||||
if document.kind == "text/plain":
|
||||
kind = "text-load"
|
||||
elif document.kind == "application/pdf":
|
||||
kind = "document-load"
|
||||
else:
|
||||
raise RuntimeError("Document with a MIME type I don't know")
|
||||
|
||||
q = flow["interfaces"][kind]
|
||||
|
||||
if kind == "text-load":
|
||||
doc = TextDocument(
|
||||
metadata = Metadata(
|
||||
id = document.id,
|
||||
metadata = document.metadata,
|
||||
user = processing.user,
|
||||
collection = processing.collection
|
||||
),
|
||||
text = content,
|
||||
)
|
||||
schema = TextDocument
|
||||
else:
|
||||
doc = Document(
|
||||
metadata = Metadata(
|
||||
id = document.id,
|
||||
metadata = document.metadata,
|
||||
user = processing.user,
|
||||
collection = processing.collection
|
||||
),
|
||||
data = base64.b64encode(content).decode("utf-8")
|
||||
|
||||
)
|
||||
schema = Document
|
||||
|
||||
print(f"Submit on queue {q}...")
|
||||
|
||||
pub = Publisher(
|
||||
self.pulsar_client, q, schema=schema
|
||||
)
|
||||
|
||||
|
||||
await pub.start()
|
||||
|
||||
self.document_load.send(None, doc)
|
||||
# FIXME: Time wait kludge?
|
||||
await asyncio.sleep(1)
|
||||
|
||||
async def load_text(self, document):
|
||||
await pub.send(None, doc)
|
||||
|
||||
text = base64.b64decode(document.document)
|
||||
text = text.decode("utf-8")
|
||||
await pub.stop()
|
||||
|
||||
doc = TextDocument(
|
||||
metadata = Metadata(
|
||||
id = document.id,
|
||||
metadata = document.metadata,
|
||||
user = document.user,
|
||||
collection = document.collection
|
||||
),
|
||||
text = text,
|
||||
)
|
||||
|
||||
self.text_load.send(None, doc)
|
||||
print("Document submitted")
|
||||
|
||||
async def process_request(self, v):
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue