Updated utils to have command line

This commit is contained in:
Cyber MacGeddon 2024-07-12 15:06:51 +01:00
parent 603ad4e38f
commit da94865fb5
3 changed files with 175 additions and 36 deletions

View file

@ -1,47 +1,128 @@
#!/usr/bin/env python3
"""
Loads a PDF document into TrustGraph processing.
"""
import pulsar
from pulsar.schema import JsonSchema, Bytes, String
from pulsar.schema import JsonSchema
from trustgraph.schema import Document, Source
import base64
import hashlib
import argparse
import os
import time
# client = pulsar.Client("pulsar://localhost:6650")
host="10.89.1.246"
host="localhost"
client = pulsar.Client(f"pulsar://{host}:6650")
from trustgraph.log_level import LogLevel
producer = client.create_producer(
topic='document-load',
schema=JsonSchema(Document),
chunking_enabled=True,
)
class Loader:
    """Publish one file to a Pulsar topic as a ``Document`` message.

    The constructor opens the Pulsar client and creates a producer;
    ``run`` reads the file, base64-encodes it and sends it. The client
    is closed when the instance is garbage-collected.
    """

    def __init__(
            self,
            pulsar_host,
            output_queue,
            log_level,
            file,
    ):
        """Connect to Pulsar and prepare a producer.

        Args:
            pulsar_host: Service URL handed to ``pulsar.Client``.
            output_queue: Topic the document is published to.
            log_level: Object providing ``to_pulsar()``; the result is
                passed to ``pulsar.ConsoleLogger`` for client logging.
            file: Path of the file that ``run`` will load.
        """

        self.client = pulsar.Client(
            pulsar_host,
            logger=pulsar.ConsoleLogger(log_level.to_pulsar())
        )

        self.producer = self.client.create_producer(
            topic=output_queue,
            schema=JsonSchema(Document),
            chunking_enabled=True,
        )

        self.file = file

    def run(self):
        """Read the configured file and send it as a ``Document``.

        The document id is the first 8 hex characters of the SHA-256 of
        the path string (not of the file contents). Any exception is
        printed and swallowed, so callers are not notified of failure.
        """

        try:

            path = self.file

            # Context manager so the file handle is closed promptly
            # instead of being left for the garbage collector.
            with open(path, "rb") as f:
                data = f.read()

            # Named doc_id to avoid shadowing the builtin id().
            doc_id = hashlib.sha256(path.encode("utf-8")).hexdigest()[0:8]

            r = Document(
                source=Source(
                    source=path,
                    title=path,
                    id=doc_id,
                ),
                data=base64.b64encode(data),
            )

            self.producer.send(r)

        except Exception as e:
            print(e, flush=True)

    def __del__(self):
        """Close the Pulsar client, if one was created."""

        # __del__ also runs when __init__ raised before self.client was
        # assigned; guard so that case does not raise AttributeError.
        client = getattr(self, "client", None)
        if client is not None:
            client.close()
def main():
    """Parse command-line options and load the requested file.

    Options: ``-p/--pulsar-host`` (default from the ``PULSAR_HOST``
    environment variable, else ``pulsar://localhost:6650``),
    ``-o/--output-queue``, ``-l/--log-level`` and the required
    ``-f/--file``. A ``Loader`` is built and run; if that raises, the
    error is printed and the attempt is repeated after 10 seconds.
    """

    parser = argparse.ArgumentParser(
        prog='loader',
        description=__doc__,
    )

    default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
    default_output_queue = 'document-load'
    default_log_level = LogLevel.ERROR

    parser.add_argument(
        '-p', '--pulsar-host',
        default=default_pulsar_host,
        help=f'Pulsar host (default: {default_pulsar_host})',
    )

    parser.add_argument(
        '-o', '--output-queue',
        default=default_output_queue,
        help=f'Output queue (default: {default_output_queue})'
    )

    # Help text previously read "Output queue (default: info)", which
    # described neither this option nor its actual default.
    parser.add_argument(
        '-l', '--log-level',
        type=LogLevel,
        default=default_log_level,
        choices=list(LogLevel),
        help=f'Log level (default: {default_log_level})'
    )

    parser.add_argument(
        '-f', '--file',
        required=True,
        help='File to load'
    )

    args = parser.parse_args()

    # Retry until a Loader can be constructed and run without raising.
    while True:

        try:
            p = Loader(
                pulsar_host=args.pulsar_host,
                output_queue=args.output_queue,
                log_level=args.log_level,
                file=args.file,
            )

            p.run()

            print("File loaded.")
            break

        except Exception as e:

            print("Exception:", e, flush=True)
            print("Will retry...", flush=True)

        time.sleep(10)
# Invoke the command-line entry point; this runs unconditionally
# whenever the module is loaded (there is no __main__ guard).
main()