diff --git a/docs/README.quickstart-docker-compose.md b/docs/README.quickstart-docker-compose.md index 25752017..8166adbf 100644 --- a/docs/README.quickstart-docker-compose.md +++ b/docs/README.quickstart-docker-compose.md @@ -268,7 +268,7 @@ curl -o sources/Challenger-Report-Vol1.pdf https://sma.nasa.gov/SignificantIncid Load the file for knowledge extraction: ``` -scripts/loader -f sources/Challenger-Report-Vol1.pdf +scripts/load-pdf -f sources/Challenger-Report-Vol1.pdf ``` The console output `File loaded.` indicates the PDF has been successfully loaded to the processing queues and extraction will begin. @@ -391,9 +391,9 @@ scripts/graph-show | wc -l The Challenger report has a long introduction with quite a bit of administrative text commonly found in official reports. The first few hundred graph edges mostly capture this document formatting knowledge. To fully test the ability to extract complex knowledge, wait until at least `1000` graph edges have been extracted. The full extraction for this PDF will extract many thousands of graph edges. -### RAG Test Script +### RAG Test ``` -tests/test-graph-rag +scripts/query-graph-rag -q 'Give me 20 facts about the space shuttle Challenger' ``` This script forms an LM prompt asking for 20 facts regarding the Challenger disaster. Depending on how many graph edges have been extracted, the response will be similar to: @@ -428,7 +428,8 @@ docker logs -f trustgraph-graph-rag-1 ``` ### More RAG Test Queries -If you want to try different RAG queries, modify the `query` in the [test script](https://github.com/trustgraph-ai/trustgraph/blob/master/tests/test-graph-rag). +If you want to try different RAG queries, modify the parameter to the `-q` +option. 
### Shutting Down TrustGraph
diff --git a/scripts/graph-to-turtle b/scripts/graph-to-turtle
index 1bd48802..26e18774 100755
--- a/scripts/graph-to-turtle
+++ b/scripts/graph-to-turtle
@@ -1,37 +1,84 @@
 #!/usr/bin/env python3
 
-from trustgraph.trustgraph import TrustGraph
+"""
+Connects to the graph query service and dumps all graph edges.
+"""
+
+import argparse
+import os
+from trustgraph.clients.triples_query_client import TriplesQueryClient
 import rdflib
-import sys
 import io
+import sys
 
-t = TrustGraph()
+default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
 
-g = rdflib.Graph()
+def show_graph(pulsar):
+    """
+    Query the triples service and emit the result as Turtle.
+
+    The graph is written to output.ttl in the working directory and
+    echoed to standard output.
+
+    pulsar: Pulsar host URL handed to TriplesQueryClient.
+    """
 
-rows = t.get_all(limit=100_000_000)
-for s, p, o in rows:
+    # Honour the caller-supplied host so that -p / PULSAR_HOST take effect
+    tq = TriplesQueryClient(pulsar_host=pulsar)
 
-#    print(s, p, o)
-    sv = rdflib.term.URIRef(s)
-    pv = rdflib.term.URIRef(p)
+    rows = tq.request(None, None, None, limit=10_000_000)
 
-    if o.startswith("https://") or o.startswith("http://"):
+    g = rdflib.Graph()
 
-        # Skip malformed URLs with spaces in
-        if " " in o:
-            continue
+    for row in rows:
 
-        ov = rdflib.term.URIRef(o)
-    else:
-        ov = rdflib.term.Literal(o)
+        sv = rdflib.term.URIRef(row.s.value)
+        pv = rdflib.term.URIRef(row.p.value)
 
-    g.add((sv, pv, ov))
+        if row.o.is_uri:
 
-g.serialize(destination="output.ttl", format="turtle")
+            # Skip malformed URLs with spaces in
+            if " " in row.o.value:
+                continue
 
-buf = io.BytesIO()
+            ov = rdflib.term.URIRef(row.o.value)
+        else:
+            ov = rdflib.term.Literal(row.o.value)
 
-g.serialize(destination=buf, format="turtle")
+        g.add((sv, pv, ov))
+
+    g.serialize(destination="output.ttl", format="turtle")
+
+    buf = io.BytesIO()
+
+    g.serialize(destination=buf, format="turtle")
+
+    sys.stdout.write(buf.getvalue().decode("utf-8"))
+
+
+def main():
+    """Parse command-line options and dump the graph, reporting any error."""
+
+    parser = argparse.ArgumentParser(
+        prog='graph-to-turtle',
+        description=__doc__,
+    )
+
+    parser.add_argument(
+        '-p', '--pulsar-host',
+        default=default_pulsar_host,
+        help=f'Pulsar host (default: {default_pulsar_host})',
+    )
+
+    args = parser.parse_args()
+
+    try:
+
+        show_graph(args.pulsar_host)
+
+    except Exception as e:
+
+        print("Exception:", e, flush=True)
+
+main()
 
-sys.stdout.write(buf.getvalue().decode("utf-8"))
diff --git a/scripts/loader b/scripts/load-pdf
similarity index 100%
rename from scripts/loader
rename to scripts/load-pdf
diff --git a/scripts/query-document-rag b/scripts/query-document-rag
new file mode 100755
index 00000000..948dcd2f
--- /dev/null
+++ b/scripts/query-document-rag
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+
+"""
+Uses the Document RAG service to answer a query
+"""
+
+import argparse
+import os
+from trustgraph.clients.document_rag_client import DocumentRagClient
+
+default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
+
+def query(pulsar, query):
+    """Send the query to the Document RAG service and print the response."""
+
+    rag = DocumentRagClient(pulsar_host=pulsar)
+    resp = rag.request(query)
+    print(resp)
+
+def main():
+    """Parse command-line options and run the query, reporting any error."""
+
+    parser = argparse.ArgumentParser(
+        prog='query-document-rag',
+        description=__doc__,
+    )
+
+    parser.add_argument(
+        '-p', '--pulsar-host',
+        default=default_pulsar_host,
+        help=f'Pulsar host (default: {default_pulsar_host})',
+    )
+
+    parser.add_argument(
+        '-q', '--query',
+        required=True,
+        help='Query to execute',
+    )
+
+    args = parser.parse_args()
+
+    try:
+
+        query(args.pulsar_host, args.query)
+
+    except Exception as e:
+
+        print("Exception:", e, flush=True)
+
+main()
+
diff --git a/scripts/query-graph-rag b/scripts/query-graph-rag
new file mode 100755
index 00000000..5250bf15
--- /dev/null
+++ b/scripts/query-graph-rag
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+
+"""
+Uses the GraphRAG service to answer a query
+"""
+
+import argparse
+import os
+from trustgraph.clients.graph_rag_client import GraphRagClient
+
+default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
+
+def query(pulsar, query):
+    """Send the query to the GraphRAG service and print the response."""
+
+    rag = GraphRagClient(pulsar_host=pulsar)
+    resp = rag.request(query)
+    print(resp)
+
+def main():
+    """Parse command-line options and run the query, reporting any error."""
+
+    parser = argparse.ArgumentParser(
+        prog='query-graph-rag',
+        description=__doc__,
+    )
+
+    parser.add_argument(
+        '-p', '--pulsar-host',
+        default=default_pulsar_host,
+        help=f'Pulsar host (default: {default_pulsar_host})',
+    )
+
+    parser.add_argument(
+        '-q', '--query',
+        required=True,
+        help='Query to execute',
+    )
+
+    args = parser.parse_args()
+
+    try:
+
+        query(args.pulsar_host, args.query)
+
+    except Exception as e:
+
+        print("Exception:", e, flush=True)
+
+main()
+
diff --git a/setup.py b/setup.py
index b3926bfa..941e13bb 100644
--- a/setup.py
+++ b/setup.py
@@ -57,8 +57,8 @@ setuptools.setup(
         "scripts/chunker-token",
         "scripts/concat-parquet",
         "scripts/de-query-milvus",
-        "scripts/de-write-milvus",
         "scripts/de-query-qdrant",
+        "scripts/de-write-milvus",
         "scripts/de-write-qdrant",
         "scripts/document-rag",
         "scripts/dump-parquet",
@@ -67,8 +67,8 @@ setuptools.setup(
         "scripts/embeddings-vectorize",
         "scripts/ge-dump-parquet",
         "scripts/ge-query-milvus",
-        "scripts/ge-write-milvus",
         "scripts/ge-query-qdrant",
+        "scripts/ge-write-milvus",
         "scripts/ge-write-qdrant",
         "scripts/graph-rag",
         "scripts/graph-show",
@@ -77,14 +77,16 @@ setuptools.setup(
         "scripts/kg-extract-definitions",
         "scripts/kg-extract-relationships",
         "scripts/load-graph-embeddings",
+        "scripts/load-pdf",
+        "scripts/load-text",
         "scripts/load-triples",
-        "scripts/loader",
         "scripts/object-extract-row",
         "scripts/oe-write-milvus",
         "scripts/pdf-decoder",
         "scripts/prompt-generic",
         "scripts/prompt-template",
-        "scripts/query",
+        "scripts/query-document-rag",
+        "scripts/query-graph-rag",
         "scripts/rows-write-cassandra",
         "scripts/run-processing",
         "scripts/text-completion-azure",
diff --git a/scripts/query b/tests/query
similarity index 100%
rename from scripts/query
rename to tests/query