Tidied scripts, added 2 query scripts (#53)

2026-04-25 00:16:23 +02:00 · 2024-09-05 16:45:22 +01:00 · 2024-09-05 16:45:22 +01:00 · 6e4534e35c
commit 6e4534e35c
parent 65d7f6d261
7 changed files with 167 additions and 29 deletions
--- a/docs/README.quickstart-docker-compose.md
+++ b/docs/README.quickstart-docker-compose.md
@ -268,7 +268,7 @@ curl -o sources/Challenger-Report-Vol1.pdf https://sma.nasa.gov/SignificantIncid
 Load the file for knowledge extraction:

 ```
-scripts/loader -f sources/Challenger-Report-Vol1.pdf
+scripts/load-pdf -f sources/Challenger-Report-Vol1.pdf
 ```

 The console output `File loaded.` indicates the PDF has been successfully loaded into the processing queues and extraction will begin.
@ -391,9 +391,9 @@ scripts/graph-show  | wc -l

 The Challenger report has a long introduction with quite a bit of administrative text commonly found in official reports. The first few hundred graph edges mostly capture this document formatting knowledge. To fully test the ability to extract complex knowledge, wait until at least `1000` graph edges have been extracted. The full extraction for this PDF will produce many thousands of graph edges.

-### RAG Test Script
+### RAG Test
 ```
-tests/test-graph-rag
+scripts/query-graph-rag -q 'Give me 20 facts about the space shuttle Challenger'
 ```
 This script forms an LM prompt asking for 20 facts regarding the Challenger disaster. Depending on how many graph edges have been extracted, the response will be similar to:

@ -428,7 +428,8 @@ docker logs -f trustgraph-graph-rag-1
 ```
 ### More RAG Test Queries

-If you want to try different RAG queries, modify the `query` in the [test script](https://github.com/trustgraph-ai/trustgraph/blob/master/tests/test-graph-rag).
+If you want to try different RAG queries, modify the parameter to the `-q`
+option.

 ### Shutting Down TrustGraph

--- a/scripts/graph-to-turtle
+++ b/scripts/graph-to-turtle
@ -1,37 +1,74 @@
 #!/usr/bin/env python3

-from trustgraph.trustgraph import TrustGraph
+"""
+Connects to the graph query service and dumps all graph edges.
+"""
+
+import argparse
+import os
+from trustgraph.clients.triples_query_client import TriplesQueryClient
 import rdflib
-import sys
 import io
+import sys

-t = TrustGraph()
+default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://pulsar:6650')

-g = rdflib.Graph()
+def show_graph(pulsar):
+    """
+    Dump graph edges from the triples query service as Turtle.
+
+    Requests up to 10,000,000 triples, builds an rdflib graph from them,
+    writes it to `output.ttl` in the current directory and also prints the
+    Turtle serialization to stdout.
+
+    pulsar: Pulsar host URL used to reach the triples query service.
+    """

-rows = t.get_all(limit=100_000_000)
-for s, p, o in rows:
+    # Use the caller-supplied host so -p/--pulsar-host and PULSAR_HOST
+    # are honoured instead of a fixed localhost address.
+    tq = TriplesQueryClient(pulsar_host=pulsar)

-#     print(s, p, o)
-    sv = rdflib.term.URIRef(s)
-    pv = rdflib.term.URIRef(p)
+    rows = tq.request(None, None, None, limit=10_000_000)

-    if o.startswith("https://") or o.startswith("http://"):
+    g = rdflib.Graph()

-        # Skip malformed URLs with spaces in
-        if " " in o:
-            continue
+    for row in rows:

-        ov = rdflib.term.URIRef(o)
-    else:
-        ov = rdflib.term.Literal(o)
+        sv = rdflib.term.URIRef(row.s.value)
+        pv = rdflib.term.URIRef(row.p.value)

-    g.add((sv, pv, ov))
+        if row.o.is_uri:

-g.serialize(destination="output.ttl", format="turtle")
+            # Skip malformed URLs with spaces in
+            if " " in row.o.value:
+                continue

-buf = io.BytesIO()
+            ov = rdflib.term.URIRef(row.o.value)
+        else:
+            ov = rdflib.term.Literal(row.o.value)

-g.serialize(destination=buf, format="turtle")
+        g.add((sv, pv, ov))
+
+    g.serialize(destination="output.ttl", format="turtle")
+
+    buf = io.BytesIO()
+
+    g.serialize(destination=buf, format="turtle")
+
+    sys.stdout.write(buf.getvalue().decode("utf-8"))
+
+
+def main():
+    """
+    Parse command-line arguments and dump the graph as Turtle.
+
+    Accepts -p/--pulsar-host (defaulting to default_pulsar_host) and passes
+    it to show_graph.  Any exception raised while dumping is caught and
+    printed rather than propagated.
+    """
+
+    parser = argparse.ArgumentParser(
+        # Program name shown in usage/help output; must match this script.
+        prog='graph-to-turtle',
+        description=__doc__,
+    )
+
+    parser.add_argument(
+        '-p', '--pulsar-host',
+        default=default_pulsar_host,
+        help=f'Pulsar host (default: {default_pulsar_host})',
+    )
+
+    args = parser.parse_args()
+
+    try:
+
+        show_graph(args.pulsar_host)
+
+    except Exception as e:
+
+        print("Exception:", e, flush=True)
+
+main()

-sys.stdout.write(buf.getvalue().decode("utf-8"))
--- a/scripts/load-pdf
+++ b/scripts/load-pdf
--- a/scripts/query-document-rag
+++ b/scripts/query-document-rag
@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+
+"""
+Uses the Document RAG service to answer a query
+"""
+
+import argparse
+import os
+from trustgraph.clients.document_rag_client import DocumentRagClient
+
+default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
+
+def query(pulsar, query):
+    """
+    Send `query` to the Document RAG service and print the response.
+
+    pulsar: Pulsar host URL passed to the Document RAG client.
+    query: the query to submit.
+    """
+
+    client = DocumentRagClient(pulsar_host=pulsar)
+    print(client.request(query))
+
+def main():
+    """
+    Parse command-line arguments and run a Document RAG query.
+
+    Accepts -p/--pulsar-host (defaulting to default_pulsar_host) and the
+    required -q/--query.  Any exception raised while querying is caught and
+    printed rather than propagated.
+    """
+
+    parser = argparse.ArgumentParser(
+        # Program name shown in usage/help output; must match this script.
+        prog='query-document-rag',
+        description=__doc__,
+    )
+
+    parser.add_argument(
+        '-p', '--pulsar-host',
+        default=default_pulsar_host,
+        help=f'Pulsar host (default: {default_pulsar_host})',
+    )
+
+    parser.add_argument(
+        '-q', '--query',
+        required=True,
+        help='Query to execute',
+    )
+
+    args = parser.parse_args()
+
+    try:
+
+        query(args.pulsar_host, args.query)
+
+    except Exception as e:
+
+        print("Exception:", e, flush=True)
+
+main()
+
--- a/scripts/query-graph-rag
+++ b/scripts/query-graph-rag
@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+
+"""
+Uses the GraphRAG service to answer a query
+"""
+
+import argparse
+import os
+from trustgraph.clients.graph_rag_client import GraphRagClient
+
+default_pulsar_host = os.getenv("PULSAR_HOST", 'pulsar://localhost:6650')
+
+def query(pulsar, query):
+    """
+    Send `query` to the GraphRAG service and print the response.
+
+    pulsar: Pulsar host URL passed to the GraphRAG client.
+    query: the query to submit.
+    """
+
+    client = GraphRagClient(pulsar_host=pulsar)
+    print(client.request(query))
+
+def main():
+    """
+    Parse command-line arguments and run a GraphRAG query.
+
+    Accepts -p/--pulsar-host (defaulting to default_pulsar_host) and the
+    required -q/--query.  Any exception raised while querying is caught and
+    printed rather than propagated.
+    """
+
+    parser = argparse.ArgumentParser(
+        # Program name shown in usage/help output; must match this script.
+        prog='query-graph-rag',
+        description=__doc__,
+    )
+
+    parser.add_argument(
+        '-p', '--pulsar-host',
+        default=default_pulsar_host,
+        help=f'Pulsar host (default: {default_pulsar_host})',
+    )
+
+    parser.add_argument(
+        '-q', '--query',
+        required=True,
+        help='Query to execute',
+    )
+
+    args = parser.parse_args()
+
+    try:
+
+        query(args.pulsar_host, args.query)
+
+    except Exception as e:
+
+        print("Exception:", e, flush=True)
+
+main()
+
--- a/setup.py
+++ b/setup.py
@ -57,8 +57,8 @@ setuptools.setup(
        "scripts/chunker-token",
        "scripts/concat-parquet",
        "scripts/de-query-milvus",
-        "scripts/de-write-milvus",
        "scripts/de-query-qdrant",
+        "scripts/de-write-milvus",
        "scripts/de-write-qdrant",
        "scripts/document-rag",
        "scripts/dump-parquet",
@ -67,8 +67,8 @@ setuptools.setup(
        "scripts/embeddings-vectorize",
        "scripts/ge-dump-parquet",
        "scripts/ge-query-milvus",
-        "scripts/ge-write-milvus",
        "scripts/ge-query-qdrant",
+        "scripts/ge-write-milvus",
        "scripts/ge-write-qdrant",
        "scripts/graph-rag",
        "scripts/graph-show",
@ -77,14 +77,16 @@ setuptools.setup(
        "scripts/kg-extract-definitions",
        "scripts/kg-extract-relationships",
        "scripts/load-graph-embeddings",
+        "scripts/load-pdf",
+        "scripts/load-text",
        "scripts/load-triples",
-        "scripts/loader",
        "scripts/object-extract-row",
        "scripts/oe-write-milvus",
        "scripts/pdf-decoder",
        "scripts/prompt-generic",
        "scripts/prompt-template",
-        "scripts/query",
+        "scripts/query-document-rag",
+        "scripts/query-graph-rag",
        "scripts/rows-write-cassandra",
        "scripts/run-processing",
        "scripts/text-completion-azure",
--- a/scripts/query
+++ b/scripts/query