Review note: this patch was reconstructed from a whitespace-collapsed copy and its added lines were revised.
Hunk line counts are updated to match; the post-image blob ids on the "index" lines are kept from the original patch.
diff --git a/README.md b/README.md
index 8fc8f3bb..2fa82db5 100644
--- a/README.md
+++ b/README.md
@@ -181,13 +181,15 @@ chcon -Rt svirt_sandbox_file_t vertexai/
 
 Check that you have a set of containers running...
 
-```docker ps
+```
+docker ps
 ```
 
 You might want to look at containers which are down to see if any have
 exited unexpectedly - look at the STATUS field.
 
-```docker ps -a
+```
+docker ps -a
 ```
 
 ### Wait
@@ -223,7 +225,7 @@ curl -o sources/Challenger-Report-Vol1.pdf https://sma.nasa.gov/SignificantIncid
 Then load the file...
 
 ```
-scripts/loader
+scripts/loader -f sources/Challenger-Report-Vol1.pdf
 ```
 
 You get some output on the screen, if nothing looks like errors (has the
@@ -394,3 +396,24 @@ If you get an answer to your query, Graph RAG is working!
 If you want to try different queries try modifying the script you ran at
 `tests/test-graph-rag`.
 
+### Clearing everything down
+
+```
+docker-compose -f docker-compose-ollama.yaml down
+```
+
+You should also clean out unwanted volumes...
+
+```
+docker volume ls
+```
+
+And delete anything you don't need...
+
+```
+docker volume rm -f <volume-name>
+```
+
+If you want to experiment with your own data set, it would be best
+to clear down everything created so far and start from scratch.
+
diff --git a/scripts/graph-show b/scripts/graph-show
index 26e6cbff..b0dcdb19 100755
--- a/scripts/graph-show
+++ b/scripts/graph-show
@@ -1,10 +1,47 @@
 #!/usr/bin/env python3
 
+"""
+Connects to the trustgraph graph hosts and dumps all graph edges.
+"""
+
+import argparse
+import time
+
 from trustgraph.trustgraph import TrustGraph
 
-t = TrustGraph()
+def show_graph(graph_hosts):
+    """Print every (s, p, o) row returned by the graph, one per line."""
 
-rows = t.get_all(limit=100_000_000)
-for s, p, o in rows:
-    print(s, p, o)
+    t = TrustGraph(hosts=graph_hosts)
+
+    rows = t.get_all(limit=100_000_000)
+    for s, p, o in rows:
+        print(s, p, o)
+
+
+def main():
+    """Parse the command line and dump the graph."""
+
+    parser = argparse.ArgumentParser(
+        prog='graph-show',
+        description=__doc__,
+    )
+
+    parser.add_argument(
+        '-g', '--graph-hosts',
+        default="localhost",
+        help='Comma-separated graph hosts (default: localhost)',
+    )
+
+    args = parser.parse_args()
+
+    try:
+
+        show_graph(graph_hosts=args.graph_hosts.split(","))
+
+    except Exception as e:
+
+        print("Exception:", e, flush=True)
+
+main()
 
diff --git a/scripts/loader b/scripts/loader
index a6dc4450..d862a507 100755
--- a/scripts/loader
+++ b/scripts/loader
@@ -1,47 +1,140 @@
 #!/usr/bin/env python3
 
+"""
+Loads a PDF document into TrustGraph processing.
+"""
+
 import pulsar
-from pulsar.schema import JsonSchema, Bytes, String
+from pulsar.schema import JsonSchema
 from trustgraph.schema import Document, Source
 import base64
 import hashlib
+import argparse
+import os
+import time
 
-# client = pulsar.Client("pulsar://localhost:6650")
-host="10.89.1.246"
-host="localhost"
-client = pulsar.Client(f"pulsar://{host}:6650")
+from trustgraph.log_level import LogLevel
 
-producer = client.create_producer(
-    topic='document-load',
-    schema=JsonSchema(Document),
-    chunking_enabled=True,
-)
+class Loader:
+    """Sends one file to a Pulsar topic as a base64-encoded Document."""
 
-files=[
-    "Challenger-Report-Vol1.pdf",
-#    "columbia-accident-investigation-board-report-volume-1.pdf",
-#    "Proposed_CIRCIA_Rules.pdf",
-]
+    def __init__(
+            self,
+            pulsar_host,
+            output_queue,
+            log_level,
+            file,
+    ):
 
-for file in files:
+        self.client = pulsar.Client(
+            pulsar_host,
+            logger=pulsar.ConsoleLogger(log_level.to_pulsar())
+        )
 
-    path = "sources/" + file
-    data = open(path, "rb").read()
+        self.producer = self.client.create_producer(
+            topic=output_queue,
+            schema=JsonSchema(Document),
+            chunking_enabled=True,
+        )
 
-    id = hashlib.sha256(path.encode("utf-8")).hexdigest()[0:8]
+        self.file = file
 
-    r = Document(
-        source=Source(
-            source=path,
-            title=path,
-            id=id,
-        ),
-        data=base64.b64encode(data),
+    def run(self):
+        """Read the file and publish it as a single Document message.
+
+        Errors propagate so the caller can tell a failed load from a
+        successful one.
+        """
+
+        path = self.file
+
+        with open(path, "rb") as f:
+            data = f.read()
+
+        # Short identifier derived from the path, not the file content.
+        doc_id = hashlib.sha256(path.encode("utf-8")).hexdigest()[0:8]
+
+        r = Document(
+            source=Source(
+                source=path,
+                title=path,
+                id=doc_id,
+            ),
+            data=base64.b64encode(data),
+        )
+
+        self.producer.send(r)
+
+    def __del__(self):
+        # __init__ may have failed before self.client was assigned.
+        client = getattr(self, "client", None)
+        if client is not None:
+            client.close()
+
+def main():
+    """Parse arguments, then load the file, retrying every 10s on error."""
+
+    parser = argparse.ArgumentParser(
+        prog='loader',
+        description=__doc__,
     )
 
-    resp = producer.send(r)
+    default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
+    default_output_queue = 'document-load'
 
-    print(resp)
+    parser.add_argument(
+        '-p', '--pulsar-host',
+        default=default_pulsar_host,
+        help=f'Pulsar host (default: {default_pulsar_host})',
+    )
 
-client.close()
+    parser.add_argument(
+        '-o', '--output-queue',
+        default=default_output_queue,
+        help=f'Output queue (default: {default_output_queue})'
+    )
+
+    parser.add_argument(
+        '-l', '--log-level',
+        type=LogLevel,
+        default=LogLevel.ERROR,
+        choices=list(LogLevel),
+        help=f'Log level (default: {LogLevel.ERROR.value})'
+    )
+
+    parser.add_argument(
+        '-f', '--file',
+        required=True,
+        help='File to load'
+    )
+
+    args = parser.parse_args()
+
+    # A missing file will never fix itself, so fail now rather than retry.
+    if not os.path.isfile(args.file):
+        parser.error(f"file not found: {args.file}")
+
+    while True:
+
+        try:
+            p = Loader(
+                pulsar_host=args.pulsar_host,
+                output_queue=args.output_queue,
+                log_level=args.log_level,
+                file=args.file,
+            )
+
+            p.run()
+
+            print("File loaded.")
+            break
+
+        except Exception as e:
+
+            print("Exception:", e, flush=True)
+            print("Will retry...", flush=True)
+
+        time.sleep(10)
+
+main()
 