Streaming rag responses (#568)

* Tech spec for streaming RAG

* Support for streaming Graph/Doc RAG
This commit is contained in:
cybermaggedon 2025-11-26 19:47:39 +00:00 committed by GitHub
parent b1cc724f7d
commit 1948edaa50
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 3087 additions and 94 deletions

View file

@ -4,6 +4,10 @@ Uses the DocumentRAG service to answer a question
import argparse
import os
import asyncio
import json
import uuid
from websockets.asyncio.client import connect
from trustgraph.api import Api
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
@ -11,7 +15,69 @@ default_user = 'trustgraph'
default_collection = 'default'
default_doc_limit = 10
def question(url, flow_id, question, user, collection, doc_limit):
async def question_streaming(url, flow_id, question, user, collection, doc_limit):
"""Streaming version using websockets"""
# Convert http:// to ws://
if url.startswith('http://'):
url = 'ws://' + url[7:]
elif url.startswith('https://'):
url = 'wss://' + url[8:]
if not url.endswith("/"):
url += "/"
url = url + "api/v1/socket"
mid = str(uuid.uuid4())
async with connect(url) as ws:
req = {
"id": mid,
"service": "document-rag",
"flow": flow_id,
"request": {
"query": question,
"user": user,
"collection": collection,
"doc-limit": doc_limit,
"streaming": True
}
}
req = json.dumps(req)
await ws.send(req)
while True:
msg = await ws.recv()
obj = json.loads(msg)
if "error" in obj:
raise RuntimeError(obj["error"])
if obj["id"] != mid:
print("Ignore message")
continue
response = obj["response"]
# Handle streaming format (chunk)
if "chunk" in response:
chunk = response["chunk"]
print(chunk, end="", flush=True)
elif "response" in response:
# Final response with complete text
# Already printed via chunks, just add newline
pass
if obj["complete"]:
print() # Final newline
break
await ws.close()
def question_non_streaming(url, flow_id, question, user, collection, doc_limit):
"""Non-streaming version using HTTP API"""
api = Api(url).flow().id(flow_id)
@ -65,18 +131,36 @@ def main():
help=f'Document limit (default: {default_doc_limit})'
)
parser.add_argument(
'--no-streaming',
action='store_true',
help='Disable streaming (use non-streaming mode)'
)
args = parser.parse_args()
try:
question(
url=args.url,
flow_id = args.flow_id,
question=args.question,
user=args.user,
collection=args.collection,
doc_limit=args.doc_limit,
)
if not args.no_streaming:
asyncio.run(
question_streaming(
url=args.url,
flow_id=args.flow_id,
question=args.question,
user=args.user,
collection=args.collection,
doc_limit=args.doc_limit,
)
)
else:
question_non_streaming(
url=args.url,
flow_id=args.flow_id,
question=args.question,
user=args.user,
collection=args.collection,
doc_limit=args.doc_limit,
)
except Exception as e:

View file

@ -4,6 +4,10 @@ Uses the GraphRAG service to answer a question
import argparse
import os
import asyncio
import json
import uuid
from websockets.asyncio.client import connect
from trustgraph.api import Api
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
@ -14,10 +18,78 @@ default_triple_limit = 30
default_max_subgraph_size = 150
default_max_path_length = 2
def question(
async def question_streaming(
url, flow_id, question, user, collection, entity_limit, triple_limit,
max_subgraph_size, max_path_length
):
"""Streaming version using websockets"""
# Convert http:// to ws://
if url.startswith('http://'):
url = 'ws://' + url[7:]
elif url.startswith('https://'):
url = 'wss://' + url[8:]
if not url.endswith("/"):
url += "/"
url = url + "api/v1/socket"
mid = str(uuid.uuid4())
async with connect(url) as ws:
req = {
"id": mid,
"service": "graph-rag",
"flow": flow_id,
"request": {
"query": question,
"user": user,
"collection": collection,
"entity-limit": entity_limit,
"triple-limit": triple_limit,
"max-subgraph-size": max_subgraph_size,
"max-path-length": max_path_length,
"streaming": True
}
}
req = json.dumps(req)
await ws.send(req)
while True:
msg = await ws.recv()
obj = json.loads(msg)
if "error" in obj:
raise RuntimeError(obj["error"])
if obj["id"] != mid:
print("Ignore message")
continue
response = obj["response"]
# Handle streaming format (chunk)
if "chunk" in response:
chunk = response["chunk"]
print(chunk, end="", flush=True)
elif "response" in response:
# Final response with complete text
# Already printed via chunks, just add newline
pass
if obj["complete"]:
print() # Final newline
break
await ws.close()
def question_non_streaming(
url, flow_id, question, user, collection, entity_limit, triple_limit,
max_subgraph_size, max_path_length
):
"""Non-streaming version using HTTP API"""
api = Api(url).flow().id(flow_id)
@ -91,21 +163,42 @@ def main():
help=f'Max path length (default: {default_max_path_length})'
)
parser.add_argument(
'--no-streaming',
action='store_true',
help='Disable streaming (use non-streaming mode)'
)
args = parser.parse_args()
try:
question(
url=args.url,
flow_id = args.flow_id,
question=args.question,
user=args.user,
collection=args.collection,
entity_limit=args.entity_limit,
triple_limit=args.triple_limit,
max_subgraph_size=args.max_subgraph_size,
max_path_length=args.max_path_length,
)
if not args.no_streaming:
asyncio.run(
question_streaming(
url=args.url,
flow_id=args.flow_id,
question=args.question,
user=args.user,
collection=args.collection,
entity_limit=args.entity_limit,
triple_limit=args.triple_limit,
max_subgraph_size=args.max_subgraph_size,
max_path_length=args.max_path_length,
)
)
else:
question_non_streaming(
url=args.url,
flow_id=args.flow_id,
question=args.question,
user=args.user,
collection=args.collection,
entity_limit=args.entity_limit,
triple_limit=args.triple_limit,
max_subgraph_size=args.max_subgraph_size,
max_path_length=args.max_path_length,
)
except Exception as e: