Streaming rag responses (#568)

* Tech spec for streaming RAG * Support for streaming Graph/Doc RAG
2026-04-25 08:26:21 +02:00 · 2025-11-26 19:47:39 +00:00 · 2025-11-26 19:47:39 +00:00 · 1948edaa50
commit 1948edaa50
parent b1cc724f7d
20 changed files with 3087 additions and 94 deletions
--- a/trustgraph-cli/trustgraph/cli/invoke_document_rag.py
+++ b/trustgraph-cli/trustgraph/cli/invoke_document_rag.py
@ -4,6 +4,10 @@ Uses the DocumentRAG service to answer a question

 import argparse
 import os
+import asyncio
+import json
+import uuid
+from websockets.asyncio.client import connect
 from trustgraph.api import Api

 default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
@ -11,7 +15,69 @@ default_user = 'trustgraph'
 default_collection = 'default'
 default_doc_limit = 10

-def question(url, flow_id, question, user, collection, doc_limit):
+async def question_streaming(url, flow_id, question, user, collection, doc_limit):
+    """Streaming version using websockets"""
+
+    # Convert http:// to ws://
+    if url.startswith('http://'):
+        url = 'ws://' + url[7:]
+    elif url.startswith('https://'):
+        url = 'wss://' + url[8:]
+
+    if not url.endswith("/"):
+        url += "/"
+
+    url = url + "api/v1/socket"
+
+    mid = str(uuid.uuid4())
+
+    async with connect(url) as ws:
+        req = {
+            "id": mid,
+            "service": "document-rag",
+            "flow": flow_id,
+            "request": {
+                "query": question,
+                "user": user,
+                "collection": collection,
+                "doc-limit": doc_limit,
+                "streaming": True
+            }
+        }
+
+        req = json.dumps(req)
+        await ws.send(req)
+
+        while True:
+            msg = await ws.recv()
+            obj = json.loads(msg)
+
+            if "error" in obj:
+                raise RuntimeError(obj["error"])
+
+            if obj["id"] != mid:
+                print("Ignore message")
+                continue
+
+            response = obj["response"]
+
+            # Handle streaming format (chunk)
+            if "chunk" in response:
+                chunk = response["chunk"]
+                print(chunk, end="", flush=True)
+            elif "response" in response:
+                # Final response with complete text
+                # Already printed via chunks, just add newline
+                pass
+
+            if obj["complete"]:
+                print()  # Final newline
+                break
+
+        await ws.close()
+
+def question_non_streaming(url, flow_id, question, user, collection, doc_limit):
+    """Non-streaming version using HTTP API"""

    api = Api(url).flow().id(flow_id)

@ -65,18 +131,36 @@ def main():
        help=f'Document limit (default: {default_doc_limit})'
    )

+    parser.add_argument(
+        '--no-streaming',
+        action='store_true',
+        help='Disable streaming (use non-streaming mode)'
+    )
+
    args = parser.parse_args()

    try:

-        question(
-            url=args.url,
-            flow_id = args.flow_id,
-            question=args.question,
-            user=args.user,
-            collection=args.collection,
-            doc_limit=args.doc_limit,
-        )
+        if not args.no_streaming:
+            asyncio.run(
+                question_streaming(
+                    url=args.url,
+                    flow_id=args.flow_id,
+                    question=args.question,
+                    user=args.user,
+                    collection=args.collection,
+                    doc_limit=args.doc_limit,
+                )
+            )
+        else:
+            question_non_streaming(
+                url=args.url,
+                flow_id=args.flow_id,
+                question=args.question,
+                user=args.user,
+                collection=args.collection,
+                doc_limit=args.doc_limit,
+            )

    except Exception as e:

--- a/trustgraph-cli/trustgraph/cli/invoke_graph_rag.py
+++ b/trustgraph-cli/trustgraph/cli/invoke_graph_rag.py
@ -4,6 +4,10 @@ Uses the GraphRAG service to answer a question

 import argparse
 import os
+import asyncio
+import json
+import uuid
+from websockets.asyncio.client import connect
 from trustgraph.api import Api

 default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
@ -14,10 +18,78 @@ default_triple_limit = 30
 default_max_subgraph_size = 150
 default_max_path_length = 2

-def question(
+async def question_streaming(
        url, flow_id, question, user, collection, entity_limit, triple_limit,
        max_subgraph_size, max_path_length
 ):
+    """Streaming version using websockets"""
+
+    # Convert http:// to ws://
+    if url.startswith('http://'):
+        url = 'ws://' + url[7:]
+    elif url.startswith('https://'):
+        url = 'wss://' + url[8:]
+
+    if not url.endswith("/"):
+        url += "/"
+
+    url = url + "api/v1/socket"
+
+    mid = str(uuid.uuid4())
+
+    async with connect(url) as ws:
+        req = {
+            "id": mid,
+            "service": "graph-rag",
+            "flow": flow_id,
+            "request": {
+                "query": question,
+                "user": user,
+                "collection": collection,
+                "entity-limit": entity_limit,
+                "triple-limit": triple_limit,
+                "max-subgraph-size": max_subgraph_size,
+                "max-path-length": max_path_length,
+                "streaming": True
+            }
+        }
+
+        req = json.dumps(req)
+        await ws.send(req)
+
+        while True:
+            msg = await ws.recv()
+            obj = json.loads(msg)
+
+            if "error" in obj:
+                raise RuntimeError(obj["error"])
+
+            if obj["id"] != mid:
+                print("Ignore message")
+                continue
+
+            response = obj["response"]
+
+            # Handle streaming format (chunk)
+            if "chunk" in response:
+                chunk = response["chunk"]
+                print(chunk, end="", flush=True)
+            elif "response" in response:
+                # Final response with complete text
+                # Already printed via chunks, just add newline
+                pass
+
+            if obj["complete"]:
+                print()  # Final newline
+                break
+
+        await ws.close()
+
+def question_non_streaming(
+        url, flow_id, question, user, collection, entity_limit, triple_limit,
+        max_subgraph_size, max_path_length
+):
+    """Non-streaming version using HTTP API"""

    api = Api(url).flow().id(flow_id)

@ -91,21 +163,42 @@ def main():
        help=f'Max path length (default: {default_max_path_length})'
    )

+    parser.add_argument(
+        '--no-streaming',
+        action='store_true',
+        help='Disable streaming (use non-streaming mode)'
+    )
+
    args = parser.parse_args()

    try:

-        question(
-            url=args.url,
-            flow_id = args.flow_id,
-            question=args.question,
-            user=args.user,
-            collection=args.collection,
-            entity_limit=args.entity_limit,
-            triple_limit=args.triple_limit,
-            max_subgraph_size=args.max_subgraph_size,
-            max_path_length=args.max_path_length,
-        )
+        if not args.no_streaming:
+            asyncio.run(
+                question_streaming(
+                    url=args.url,
+                    flow_id=args.flow_id,
+                    question=args.question,
+                    user=args.user,
+                    collection=args.collection,
+                    entity_limit=args.entity_limit,
+                    triple_limit=args.triple_limit,
+                    max_subgraph_size=args.max_subgraph_size,
+                    max_path_length=args.max_path_length,
+                )
+            )
+        else:
+            question_non_streaming(
+                url=args.url,
+                flow_id=args.flow_id,
+                question=args.question,
+                user=args.user,
+                collection=args.collection,
+                entity_limit=args.entity_limit,
+                triple_limit=args.triple_limit,
+                max_subgraph_size=args.max_subgraph_size,
+                max_path_length=args.max_path_length,
+            )

    except Exception as e: